QHL067 committed · Commit f9567e5 · 1 Parent(s): 9772b52
app.py CHANGED
@@ -1,8 +1,83 @@
 import gradio as gr
+
+ from absl import flags
+ from absl import app
+ from ml_collections import config_flags
+ import os
+
+ import ml_collections
+ import torch
+ from torch import multiprocessing as mp
+ import torch.nn as nn
+ import accelerate
+ import utils
+ import tempfile
+ from absl import logging
+ import builtins
+ import einops
+ import math
 import numpy as np
+ import time
+ from PIL import Image
 import random

+ from diffusion.flow_matching import FlowMatching, ODEFlowMatchingSolver, ODEEulerFlowMatchingSolver
+ from tools.clip_score import ClipSocre
+ import libs.autoencoder
+ from libs.clip import FrozenCLIPEmbedder
+ from libs.t5 import T5Embedder
+
+
+ def unpreprocess(x):
+     x = 0.5 * (x + 1.)
+     x.clamp_(0., 1.)
+     return x
+
+ def batch_decode(_z, decode, batch_size=10):
+     """
+     The VAE decoder requires a lot of GPU memory. To run the interpolation model on GPUs with 24 GB of RAM or less, you can use this helper to reduce the VAE's memory usage.
+     It works by splitting the input tensor into smaller chunks.
+     """
+     num_samples = _z.size(0)
+     decoded_batches = []
+
+     for i in range(0, num_samples, batch_size):
+         batch = _z[i:i + batch_size]
+         decoded_batch = decode(batch)
+         decoded_batches.append(decoded_batch)
+
+     image_unprocessed = torch.cat(decoded_batches, dim=0)
+     return image_unprocessed
+
+ def get_caption(llm, text_model, prompt_dict, batch_size):
+
+     if batch_size == 3:
+         # only addition or only subtraction
+         assert len(prompt_dict) == 2
+         _batch_con = list(prompt_dict.values()) + [' ']
+     elif batch_size == 4:
+         # addition and subtraction
+         assert len(prompt_dict) == 3
+         _batch_con = list(prompt_dict.values()) + [' ']
+     elif batch_size >= 5:
+         # linear interpolation
+         assert len(prompt_dict) == 2
+         _batch_con = [prompt_dict['prompt_1']] + [' '] * (batch_size - 2) + [prompt_dict['prompt_2']]
+
+     if llm == "clip":
+         _latent, _latent_and_others = text_model.encode(_batch_con)
+         _con = _latent_and_others['token_embedding'].detach()
+     elif llm == "t5":
+         _latent, _latent_and_others = text_model.get_text_embeddings(_batch_con)
+         _con = (_latent_and_others['token_embedding'] * 10.0).detach()
+     else:
+         raise NotImplementedError
+     _con_mask = _latent_and_others['token_mask'].detach()
+     _batch_token = _latent_and_others['tokens'].detach()
+     _batch_caption = _batch_con
+     return (_con, _con_mask, _batch_token, _batch_caption)
+
- # import spaces #[uncomment to use ZeroGPU]
+ import spaces #[uncomment to use ZeroGPU]
 from diffusers import DiffusionPipeline
 import torch

@@ -21,7 +96,7 @@ MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 1024


- # @spaces.GPU #[uncomment to use ZeroGPU]
+ @spaces.GPU #[uncomment to use ZeroGPU]
 def infer(
     prompt1,
     prompt2,
@@ -69,7 +144,8 @@ css = """

 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
-         gr.Markdown(" # Text-to-Image Gradio Template")
+         gr.Markdown(" # CrossFlow")
+         gr.Markdown(" CrossFlow directly transforms text representations into images for text-to-image generation, enabling interpolation in the input text latent space.")

     with gr.Row():
         prompt1 = gr.Text(
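For reference, the new get_caption helper assembles its text batch differently for the three supported modes (arithmetic with 3 or 4 slots, interpolation with 5 or more). The minimal sketch below reproduces only that batch-construction branch with invented prompts and skips the CLIP/T5 encoding step; it is an illustration, not part of the diff.

def build_batch(prompt_dict, batch_size):
    # Mirrors the branching at the top of get_caption() in app.py above.
    if batch_size == 3:        # addition only, or subtraction only
        assert len(prompt_dict) == 2
        return list(prompt_dict.values()) + [' ']
    if batch_size == 4:        # addition and subtraction together
        assert len(prompt_dict) == 3
        return list(prompt_dict.values()) + [' ']
    if batch_size >= 5:        # linear interpolation between two prompts
        assert len(prompt_dict) == 2
        return [prompt_dict['prompt_1']] + [' '] * (batch_size - 2) + [prompt_dict['prompt_2']]
    raise ValueError(f"unsupported batch_size: {batch_size}")

print(build_batch({'prompt_1': 'a photo of a cat', 'prompt_2': 'a photo of a dog'}, 7))
# ['a photo of a cat', ' ', ' ', ' ', ' ', ' ', 'a photo of a dog']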
configs/t2i_256px_clip_dimr.py ADDED
@@ -0,0 +1,118 @@
1
+ import ml_collections
2
+ from dataclasses import dataclass
3
+
4
+ @dataclass
5
+ class Args:
6
+ def __init__(self, **kwargs):
7
+ for key, value in kwargs.items():
8
+ setattr(self, key, value)
9
+
10
+ model = Args(
11
+ channels = 4,
12
+ block_grad_to_lowres = False,
13
+ norm_type = "TDRMSN",
14
+ use_t2i = True,
15
+ clip_dim=768,
16
+ num_clip_token=77,
17
+ gradient_checking=True,
18
+ cfg_indicator=0.1,
19
+ textVAE = Args(
20
+ num_blocks = 11,
21
+ hidden_dim = 1024,
22
+ hidden_token_length = 256,
23
+ num_attention_heads = 8,
24
+ dropout_prob = 0.1,
25
+ ),
26
+ stage_configs = [
27
+ Args(
28
+ block_type = "TransformerBlock",
29
+ dim = 1024, # channel
30
+ hidden_dim = 2048,
31
+ num_attention_heads = 16,
32
+ num_blocks = 65, # depth
33
+ max_height = 16,
34
+ max_width = 16,
35
+ image_input_ratio = 1,
36
+ input_feature_ratio = 2,
37
+ final_kernel_size = 3,
38
+ dropout_prob = 0,
39
+ ),
40
+ Args(
41
+ block_type = "ConvNeXtBlock",
42
+ dim = 512,
43
+ hidden_dim = 1024,
44
+ kernel_size = 7,
45
+ num_blocks = 33,
46
+ max_height = 32,
47
+ max_width = 32,
48
+ image_input_ratio = 1,
49
+ input_feature_ratio = 1,
50
+ final_kernel_size = 3,
51
+ dropout_prob = 0,
52
+ ),
53
+ ],
54
+ )
55
+
56
+ def d(**kwargs):
57
+ """Helper for creating a config dict."""
58
+ return ml_collections.ConfigDict(initial_dictionary=kwargs)
59
+
60
+
61
+ def get_config():
62
+ config = ml_collections.ConfigDict()
63
+
64
+ config.seed = 1234
65
+ config.z_shape = (4, 32, 32)
66
+
67
+ config.autoencoder = d(
68
+ pretrained_path='assets/stable-diffusion/autoencoder_kl.pth',
69
+ scale_factor=0.23010
70
+ )
71
+
72
+ config.train = d(
73
+ n_steps=1000000,
74
+ batch_size=1024,
75
+ mode='cond',
76
+ log_interval=10,
77
+ eval_interval=5000,
78
+ save_interval=50000,
79
+ )
80
+
81
+ config.optimizer = d(
82
+ name='adamw',
83
+ lr=0.00001,
84
+ weight_decay=0.03,
85
+ betas=(0.9, 0.9),
86
+ )
87
+
88
+ config.lr_scheduler = d(
89
+ name='customized',
90
+ warmup_steps=5000
91
+ )
92
+
93
+ global model
94
+ config.nnet = d(
95
+ name='dimr',
96
+ model_args=model,
97
+ )
98
+ config.loss_coeffs = [1/4, 1]
99
+
100
+ config.dataset = d(
101
+ name='JDB_demo_features',
102
+ resolution=256,
103
+ llm='clip',
104
+ train_path='/data/qihao/dataset/JDB_demo_feature/',
105
+ val_path='/data/qihao/dataset/coco_val_features/',
106
+ cfg=False
107
+ )
108
+
109
+ config.sample = d(
110
+ sample_steps=50,
111
+ n_samples=30000,
112
+ mini_batch_size=20,
113
+ cfg=False,
114
+ scale=7,
115
+ path=''
116
+ )
117
+
118
+ return config
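These config modules are consumed through ml_collections: typically passed as --config to the demo/training scripts (via config_flags, as in demo_t2i.py later in this diff), or loaded directly. A minimal sketch of direct use, assuming the repository root is on PYTHONPATH; the dataset paths inside the config remain placeholders:

from configs.t2i_256px_clip_dimr import get_config

config = get_config()
print(config.z_shape)                    # (4, 32, 32): latent shape for 256px images
print(config.nnet.name)                  # 'dimr'
print(config.nnet.model_args.clip_dim)   # 768, i.e. CLIP text embeddings
config.sample.mini_batch_size = 5        # ConfigDict fields can be overridden before use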
configs/t2i_256px_t5_dimr.py ADDED
@@ -0,0 +1,118 @@
1
+ import ml_collections
2
+ from dataclasses import dataclass
3
+
4
+ @dataclass
5
+ class Args:
6
+ def __init__(self, **kwargs):
7
+ for key, value in kwargs.items():
8
+ setattr(self, key, value)
9
+
10
+ model = Args(
11
+ channels = 4,
12
+ block_grad_to_lowres = False,
13
+ norm_type = "TDRMSN",
14
+ use_t2i = True,
15
+ clip_dim=4096,
16
+ num_clip_token=77,
17
+ gradient_checking=True,
18
+ cfg_indicator=0.1,
19
+ textVAE = Args(
20
+ num_blocks = 11,
21
+ hidden_dim = 1024,
22
+ hidden_token_length = 256,
23
+ num_attention_heads = 8,
24
+ dropout_prob = 0.1,
25
+ ),
26
+ stage_configs = [
27
+ Args(
28
+ block_type = "TransformerBlock",
29
+ dim = 1024, # channel
30
+ hidden_dim = 2048,
31
+ num_attention_heads = 16,
32
+ num_blocks = 65, # depth
33
+ max_height = 16,
34
+ max_width = 16,
35
+ image_input_ratio = 1,
36
+ input_feature_ratio = 2,
37
+ final_kernel_size = 3,
38
+ dropout_prob = 0,
39
+ ),
40
+ Args(
41
+ block_type = "ConvNeXtBlock",
42
+ dim = 512,
43
+ hidden_dim = 1024,
44
+ kernel_size = 7,
45
+ num_blocks = 33,
46
+ max_height = 32,
47
+ max_width = 32,
48
+ image_input_ratio = 1,
49
+ input_feature_ratio = 1,
50
+ final_kernel_size = 3,
51
+ dropout_prob = 0,
52
+ ),
53
+ ],
54
+ )
55
+
56
+ def d(**kwargs):
57
+ """Helper of creating a config dict."""
58
+ return ml_collections.ConfigDict(initial_dictionary=kwargs)
59
+
60
+
61
+ def get_config():
62
+ config = ml_collections.ConfigDict()
63
+
64
+ config.seed = 1234
65
+ config.z_shape = (4, 32, 32)
66
+
67
+ config.autoencoder = d(
68
+ pretrained_path='assets/stable-diffusion/autoencoder_kl.pth',
69
+ scale_factor=0.23010
70
+ )
71
+
72
+ config.train = d(
73
+ n_steps=1000000,
74
+ batch_size=1024,
75
+ mode='cond',
76
+ log_interval=10,
77
+ eval_interval=5000,
78
+ save_interval=50000,
79
+ )
80
+
81
+ config.optimizer = d(
82
+ name='adamw',
83
+ lr=0.00005,
84
+ weight_decay=0.03,
85
+ betas=(0.9, 0.9),
86
+ )
87
+
88
+ config.lr_scheduler = d(
89
+ name='customized',
90
+ warmup_steps=5000
91
+ )
92
+
93
+ global model
94
+ config.nnet = d(
95
+ name='dimr',
96
+ model_args=model,
97
+ )
98
+ config.loss_coeffs = [1/4, 1]
99
+
100
+ config.dataset = d(
101
+ name='JDB_demo_features',
102
+ resolution=256,
103
+ llm='t5',
104
+ train_path='/data/qihao/dataset/JDB_demo_feature/',
105
+ val_path='/data/qihao/dataset/coco_val_features/',
106
+ cfg=False
107
+ )
108
+
109
+ config.sample = d(
110
+ sample_steps=50,
111
+ n_samples=30000,
112
+ mini_batch_size=20,
113
+ cfg=False,
114
+ scale=7,
115
+ path=''
116
+ )
117
+
118
+ return config
configs/t2i_512px_clip_dimr.py ADDED
@@ -0,0 +1,131 @@
1
+ import ml_collections
2
+ from dataclasses import dataclass
3
+
4
+ @dataclass
5
+ class Args:
6
+ def __init__(self, **kwargs):
7
+ for key, value in kwargs.items():
8
+ setattr(self, key, value)
9
+
10
+ model = Args(
11
+ channels = 4,
12
+ block_grad_to_lowres = False,
13
+ norm_type = "TDRMSN",
14
+ use_t2i = True,
15
+ clip_dim=768,
16
+ num_clip_token=77,
17
+ gradient_checking=True,
18
+ cfg_indicator=0.15,
19
+ textVAE = Args(
20
+ num_blocks = 11,
21
+ hidden_dim = 1024,
22
+ hidden_token_length = 256,
23
+ num_attention_heads = 8,
24
+ dropout_prob = 0.1,
25
+ ),
26
+ stage_configs = [
27
+ Args(
28
+ block_type = "TransformerBlock",
29
+ dim = 1024, # channel
30
+ hidden_dim = 2048,
31
+ num_attention_heads = 16,
32
+ num_blocks = 65, # depth
33
+ max_height = 16,
34
+ max_width = 16,
35
+ image_input_ratio = 1,
36
+ input_feature_ratio = 4,
37
+ final_kernel_size = 3,
38
+ dropout_prob = 0,
39
+ ),
40
+ Args(
41
+ block_type = "ConvNeXtBlock",
42
+ dim = 512,
43
+ hidden_dim = 1024,
44
+ kernel_size = 7,
45
+ num_blocks = 33,
46
+ max_height = 32,
47
+ max_width = 32,
48
+ image_input_ratio = 1,
49
+ input_feature_ratio = 2,
50
+ final_kernel_size = 3,
51
+ dropout_prob = 0,
52
+ ),
53
+ Args(
54
+ block_type = "ConvNeXtBlock",
55
+ dim = 256,
56
+ hidden_dim = 512,
57
+ kernel_size = 7,
58
+ num_blocks = 33,
59
+ max_height = 64,
60
+ max_width = 64,
61
+ image_input_ratio = 1,
62
+ input_feature_ratio = 1,
63
+ final_kernel_size = 3,
64
+ dropout_prob = 0,
65
+ ),
66
+ ],
67
+ )
68
+
69
+ def d(**kwargs):
70
+ """Helper of creating a config dict."""
71
+ return ml_collections.ConfigDict(initial_dictionary=kwargs)
72
+
73
+
74
+ def get_config():
75
+ config = ml_collections.ConfigDict()
76
+
77
+ config.seed = 1234
78
+ config.z_shape = (4, 64, 64)
79
+
80
+ config.autoencoder = d(
81
+ pretrained_path='assets/stable-diffusion/autoencoder_kl.pth',
82
+ scale_factor=0.23010
83
+ )
84
+
85
+ config.train = d(
86
+ n_steps=1000000,
87
+ batch_size=1024,
88
+ mode='cond',
89
+ log_interval=10,
90
+ eval_interval=5000,
91
+ save_interval=50000,
92
+ )
93
+
94
+ config.optimizer = d(
95
+ name='adamw',
96
+ lr=0.00001,
97
+ weight_decay=0.03,
98
+ betas=(0.9, 0.9),
99
+ )
100
+
101
+ config.lr_scheduler = d(
102
+ name='customized',
103
+ warmup_steps=5000
104
+ )
105
+
106
+ global model
107
+ config.nnet = d(
108
+ name='dimr',
109
+ model_args=model,
110
+ )
111
+ config.loss_coeffs = [1/4, 1/2, 1]
112
+
113
+ config.dataset = d(
114
+ name='JDB_demo_features',
115
+ resolution=512,
116
+ llm='clip',
117
+ train_path='/data/qihao/dataset/JDB_demo_feature/',
118
+ val_path='/data/qihao/dataset/coco_val_features/',
119
+ cfg=False
120
+ )
121
+
122
+ config.sample = d(
123
+ sample_steps=50,
124
+ n_samples=30000,
125
+ mini_batch_size=10,
126
+ cfg=False,
127
+ scale=7,
128
+ path=''
129
+ )
130
+
131
+ return config
configs/t2i_512px_t5_dimr.py ADDED
@@ -0,0 +1,131 @@
1
+ import ml_collections
2
+ from dataclasses import dataclass
3
+
4
+ @dataclass
5
+ class Args:
6
+ def __init__(self, **kwargs):
7
+ for key, value in kwargs.items():
8
+ setattr(self, key, value)
9
+
10
+ model = Args(
11
+ channels = 4,
12
+ block_grad_to_lowres = False,
13
+ norm_type = "TDRMSN",
14
+ use_t2i = True,
15
+ clip_dim=4096,
16
+ num_clip_token=77,
17
+ gradient_checking=True,
18
+ cfg_indicator=0.15,
19
+ textVAE = Args(
20
+ num_blocks = 11,
21
+ hidden_dim = 1024,
22
+ hidden_token_length = 256,
23
+ num_attention_heads = 8,
24
+ dropout_prob = 0.1,
25
+ ),
26
+ stage_configs = [
27
+ Args(
28
+ block_type = "TransformerBlock",
29
+ dim = 1024, # channel
30
+ hidden_dim = 2048,
31
+ num_attention_heads = 16,
32
+ num_blocks = 65, # depth
33
+ max_height = 16,
34
+ max_width = 16,
35
+ image_input_ratio = 1,
36
+ input_feature_ratio = 4,
37
+ final_kernel_size = 3,
38
+ dropout_prob = 0,
39
+ ),
40
+ Args(
41
+ block_type = "ConvNeXtBlock",
42
+ dim = 512,
43
+ hidden_dim = 1024,
44
+ kernel_size = 7,
45
+ num_blocks = 33,
46
+ max_height = 32,
47
+ max_width = 32,
48
+ image_input_ratio = 1,
49
+ input_feature_ratio = 2,
50
+ final_kernel_size = 3,
51
+ dropout_prob = 0,
52
+ ),
53
+ Args(
54
+ block_type = "ConvNeXtBlock",
55
+ dim = 256,
56
+ hidden_dim = 512,
57
+ kernel_size = 7,
58
+ num_blocks = 33,
59
+ max_height = 64,
60
+ max_width = 64,
61
+ image_input_ratio = 1,
62
+ input_feature_ratio = 1,
63
+ final_kernel_size = 3,
64
+ dropout_prob = 0,
65
+ ),
66
+ ],
67
+ )
68
+
69
+ def d(**kwargs):
70
+ """Helper of creating a config dict."""
71
+ return ml_collections.ConfigDict(initial_dictionary=kwargs)
72
+
73
+
74
+ def get_config():
75
+ config = ml_collections.ConfigDict()
76
+
77
+ config.seed = 1234
78
+ config.z_shape = (4, 64, 64)
79
+
80
+ config.autoencoder = d(
81
+ pretrained_path='assets/stable-diffusion/autoencoder_kl.pth',
82
+ scale_factor=0.23010
83
+ )
84
+
85
+ config.train = d(
86
+ n_steps=1000000,
87
+ batch_size=1024,
88
+ mode='cond',
89
+ log_interval=10,
90
+ eval_interval=5000,
91
+ save_interval=50000,
92
+ )
93
+
94
+ config.optimizer = d(
95
+ name='adamw',
96
+ lr=0.00001,
97
+ weight_decay=0.03,
98
+ betas=(0.9, 0.9),
99
+ )
100
+
101
+ config.lr_scheduler = d(
102
+ name='customized',
103
+ warmup_steps=5000
104
+ )
105
+
106
+ global model
107
+ config.nnet = d(
108
+ name='dimr',
109
+ model_args=model,
110
+ )
111
+ config.loss_coeffs = [1/4, 1/2, 1]
112
+
113
+ config.dataset = d(
114
+ name='JDB_demo_features',
115
+ resolution=512,
116
+ llm='t5',
117
+ train_path='/data/qihao/dataset/JDB_demo_feature/',
118
+ val_path='/data/qihao/dataset/coco_val_features/',
119
+ cfg=False
120
+ )
121
+
122
+ config.sample = d(
123
+ sample_steps=50,
124
+ n_samples=30000,
125
+ mini_batch_size=10,
126
+ cfg=False,
127
+ scale=7,
128
+ path=''
129
+ )
130
+
131
+ return config
configs/t2i_512px_t5_dit.py ADDED
@@ -0,0 +1,92 @@
1
+ import ml_collections
2
+ from dataclasses import dataclass
3
+
4
+ @dataclass
5
+ class Args:
6
+ def __init__(self, **kwargs):
7
+ for key, value in kwargs.items():
8
+ setattr(self, key, value)
9
+
10
+ model = Args(
11
+ latent_size = 64,
12
+ learn_sigma = False, # different from DiT, we directly predict the noise here
13
+ channels = 4,
14
+ block_grad_to_lowres = False,
15
+ norm_type = "TDRMSN",
16
+ use_t2i = True,
17
+ clip_dim=4096,
18
+ num_clip_token=77,
19
+ gradient_checking=True, # for larger model
20
+ cfg_indicator=0.10,
21
+ textVAE = Args(
22
+ num_blocks = 11,
23
+ hidden_dim = 1024,
24
+ hidden_token_length = 256,
25
+ num_attention_heads = 8,
26
+ dropout_prob = 0.1,
27
+ ),
28
+ )
29
+
30
+ def d(**kwargs):
31
+ """Helper for creating a config dict."""
32
+ return ml_collections.ConfigDict(initial_dictionary=kwargs)
33
+
34
+
35
+ def get_config():
36
+ config = ml_collections.ConfigDict()
37
+
38
+ config.seed = 1234
39
+ config.z_shape = (4, 64, 64)
40
+
41
+ config.autoencoder = d(
42
+ pretrained_path='assets/stable-diffusion/autoencoder_kl.pth',
43
+ scale_factor=0.23010
44
+ )
45
+
46
+ config.train = d(
47
+ n_steps=1000000,
48
+ batch_size=1024,
49
+ mode='cond',
50
+ log_interval=10,
51
+ eval_interval=5000,
52
+ save_interval=50000,
53
+ )
54
+
55
+ config.optimizer = d(
56
+ name='adamw',
57
+ lr=0.00002,
58
+ weight_decay=0.03,
59
+ betas=(0.9, 0.9),
60
+ )
61
+
62
+ config.lr_scheduler = d(
63
+ name='customized',
64
+ warmup_steps=5000
65
+ )
66
+
67
+ global model
68
+ config.nnet = d(
69
+ name='dit',
70
+ model_args=model,
71
+ )
72
+ config.loss_coeffs = []
73
+
74
+ config.dataset = d(
75
+ name='JDB_demo_features',
76
+ resolution=512,
77
+ llm='t5',
78
+ train_path='/data/qihao/dataset/JDB_demo_feature/',
79
+ val_path='/data/qihao/dataset/coco_val_features/',
80
+ cfg=False
81
+ )
82
+
83
+ config.sample = d(
84
+ sample_steps=50,
85
+ n_samples=30000,
86
+ mini_batch_size=10,
87
+ cfg=False,
88
+ scale=7,
89
+ path=''
90
+ )
91
+
92
+ return config
configs/t2i_training_demo.py ADDED
@@ -0,0 +1,132 @@
1
+ import ml_collections
2
+ from dataclasses import dataclass
3
+
4
+ @dataclass
5
+ class Args:
6
+ def __init__(self, **kwargs):
7
+ for key, value in kwargs.items():
8
+ setattr(self, key, value)
9
+
10
+ model = Args(
11
+ channels = 4,
12
+ block_grad_to_lowres = False,
13
+ norm_type = "TDRMSN",
14
+ use_t2i = True,
15
+ clip_dim=768, # 768 for CLIP, 4096 for T5-XXL
16
+ num_clip_token=77,
17
+ gradient_checking=True,
18
+ cfg_indicator=0.1,
19
+ textVAE = Args(
20
+ num_blocks = 11,
21
+ hidden_dim = 1024,
22
+ hidden_token_length = 256,
23
+ num_attention_heads = 8,
24
+ dropout_prob = 0.1,
25
+ ),
26
+ stage_configs = [ # this is just an example
27
+ Args(
28
+ block_type = "TransformerBlock",
29
+ dim = 960,
30
+ hidden_dim = 1920,
31
+ num_attention_heads = 16,
32
+ num_blocks = 29,
33
+ max_height = 16,
34
+ max_width = 16,
35
+ image_input_ratio = 1,
36
+ input_feature_ratio = 4,
37
+ final_kernel_size = 3,
38
+ dropout_prob = 0,
39
+ ),
40
+ Args(
41
+ block_type = "ConvNeXtBlock",
42
+ dim = 480,
43
+ hidden_dim = 960,
44
+ kernel_size = 7,
45
+ num_blocks = 15,
46
+ max_height = 32,
47
+ max_width = 32,
48
+ image_input_ratio = 1,
49
+ input_feature_ratio = 2,
50
+ final_kernel_size = 3,
51
+ dropout_prob = 0,
52
+ ),
53
+ Args(
54
+ block_type = "ConvNeXtBlock",
55
+ dim = 240,
56
+ hidden_dim = 480,
57
+ kernel_size = 7,
58
+ num_blocks = 15,
59
+ max_height = 64,
60
+ max_width = 64,
61
+ image_input_ratio = 1,
62
+ input_feature_ratio = 1,
63
+ final_kernel_size = 3,
64
+ dropout_prob = 0,
65
+ ),
66
+ ],
67
+ )
68
+
69
+ def d(**kwargs):
70
+ """Helper of creating a config dict."""
71
+ return ml_collections.ConfigDict(initial_dictionary=kwargs)
72
+
73
+
74
+ def get_config():
75
+ config = ml_collections.ConfigDict()
76
+
77
+ config.seed = 1234 # random seed
78
+ config.z_shape = (4, 64, 64) # image latent size
79
+
80
+ config.autoencoder = d(
81
+ pretrained_path='assets/stable-diffusion/autoencoder_kl.pth', # path of pretrained VAE CKPT from LDM
82
+ scale_factor=0.23010
83
+ )
84
+
85
+ config.train = d(
86
+ n_steps=1000000, # total training iterations
87
+ batch_size=4, # overall batch size across ALL gpus, where batch_size_per_gpu == batch_size / number_of_gpus
88
+ mode='cond',
89
+ log_interval=10,
90
+ eval_interval=10, # iteration interval for visual testing on the specified prompt
91
+ save_interval=100, # iteration interval for saving checkpoints and testing FID
92
+ n_samples_eval=5, # number of samples during visual testing. This depends on your GPU memory and can be any integer between 1 and 15 (as we provide only 15 prompts).
93
+ )
94
+
95
+ config.optimizer = d(
96
+ name='adamw',
97
+ lr=0.00001, # learning rate
98
+ weight_decay=0.03,
99
+ betas=(0.9, 0.9),
100
+ )
101
+
102
+ config.lr_scheduler = d(
103
+ name='customized',
104
+ warmup_steps=5000 # warmup steps
105
+ )
106
+
107
+ global model
108
+ config.nnet = d(
109
+ name='dimr',
110
+ model_args=model,
111
+ )
112
+ config.loss_coeffs = [1/4, 1/2, 1] # per-stage loss weights, only needed for DiMR. Here, loss = 1/4 * loss_block1 + 1/2 * loss_block2 + 1 * loss_block3
113
+
114
+ config.dataset = d(
115
+ name='JDB_demo_features', # dataset name
116
+ resolution=512, # dataset resolution
117
+ llm='clip', # language model to generate language embedding
118
+ train_path='/data/qihao/dataset/JDB_demo_feature/', # training set path
119
+ val_path='/data/qihao/dataset/coco_val_features/', # val set path
120
+ cfg=False
121
+ )
122
+
123
+ config.sample = d(
124
+ sample_steps=50, # sample steps during inference/testing
125
+ n_samples=30000, # number of samples for testing (during training, we sample 10K images, which is hardcoded in the training script)
126
+ mini_batch_size=10, # batch size for testing (i.e., the number of images generated per GPU)
127
+ cfg=False,
128
+ scale=7, # cfg scale
129
+ path=''
130
+ )
131
+
132
+ return config
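The loss_coeffs comment above spells out how the per-stage DiMR losses are combined. As a small numeric illustration (the per-stage loss values below are invented):

loss_coeffs = [1/4, 1/2, 1]          # one weight per entry in stage_configs (loss_block1..3 above)
per_stage_losses = [0.8, 0.5, 0.3]   # hypothetical values produced during training

total_loss = sum(w * l for w, l in zip(loss_coeffs, per_stage_losses))
print(total_loss)                    # 1/4*0.8 + 1/2*0.5 + 1*0.3 = 0.75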
datasets.py ADDED
@@ -0,0 +1,305 @@
1
+ from torch.utils.data import Dataset
2
+ from torchvision import datasets
3
+ import torchvision.transforms as transforms
4
+ from scipy.signal import convolve2d
5
+ import numpy as np
6
+ import torch
7
+ import math
8
+ import random
9
+ from PIL import Image
10
+ import os
11
+ import glob
12
+ import einops
13
+ import torchvision.transforms.functional as F
14
+ import time
15
+ from tqdm import tqdm
16
+ import json
17
+ import pickle
18
+ import io
19
+ import cv2
20
+
21
+ import libs.clip
22
+ import bisect
23
+
24
+
25
+ class UnlabeledDataset(Dataset):
26
+ def __init__(self, dataset):
27
+ self.dataset = dataset
28
+
29
+ def __len__(self):
30
+ return len(self.dataset)
31
+
32
+ def __getitem__(self, item):
33
+ data = tuple(self.dataset[item][:-1]) # remove label
34
+ if len(data) == 1:
35
+ data = data[0]
36
+ return data
37
+
38
+
39
+ class LabeledDataset(Dataset):
40
+ def __init__(self, dataset, labels):
41
+ self.dataset = dataset
42
+ self.labels = labels
43
+
44
+ def __len__(self):
45
+ return len(self.dataset)
46
+
47
+ def __getitem__(self, item):
48
+ return self.dataset[item], self.labels[item]
49
+
50
+
51
+ class DatasetFactory(object):
52
+
53
+ def __init__(self):
54
+ self.train = None
55
+ self.test = None
56
+
57
+ def get_split(self, split, labeled=False):
58
+ if split == "train":
59
+ dataset = self.train
60
+ elif split == "test":
61
+ dataset = self.test
62
+ else:
63
+ raise ValueError
64
+
65
+ if self.has_label:
66
+ return dataset if labeled else UnlabeledDataset(dataset)
67
+ else:
68
+ assert not labeled
69
+ return dataset
70
+
71
+ def unpreprocess(self, v): # to B C H W and [0, 1]
72
+ v = 0.5 * (v + 1.)
73
+ v.clamp_(0., 1.)
74
+ return v
75
+
76
+ @property
77
+ def has_label(self):
78
+ return True
79
+
80
+ @property
81
+ def data_shape(self):
82
+ raise NotImplementedError
83
+
84
+ @property
85
+ def data_dim(self):
86
+ return int(np.prod(self.data_shape))
87
+
88
+ @property
89
+ def fid_stat(self):
90
+ return None
91
+
92
+ def sample_label(self, n_samples, device):
93
+ raise NotImplementedError
94
+
95
+ def label_prob(self, k):
96
+ raise NotImplementedError
97
+
98
+
99
+ def center_crop_arr(pil_image, image_size):
100
+ # We are not on a new enough PIL to support the `reducing_gap`
101
+ # argument, which uses BOX downsampling at powers of two first.
102
+ # Thus, we do it by hand to improve downsample quality.
103
+ while min(*pil_image.size) >= 2 * image_size:
104
+ pil_image = pil_image.resize(
105
+ tuple(x // 2 for x in pil_image.size), resample=Image.BOX
106
+ )
107
+
108
+ scale = image_size / min(*pil_image.size)
109
+ pil_image = pil_image.resize(
110
+ tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
111
+ )
112
+
113
+ arr = np.array(pil_image)
114
+ crop_y = (arr.shape[0] - image_size) // 2
115
+ crop_x = (arr.shape[1] - image_size) // 2
116
+ return arr[crop_y : crop_y + image_size, crop_x : crop_x + image_size]
117
+
118
+
119
+ # MS COCO
120
+
121
+
122
+ def center_crop(width, height, img):
123
+ resample = {'box': Image.BOX, 'lanczos': Image.LANCZOS}['lanczos']
124
+ crop = np.min(img.shape[:2])
125
+ img = img[(img.shape[0] - crop) // 2: (img.shape[0] + crop) // 2,
126
+ (img.shape[1] - crop) // 2: (img.shape[1] + crop) // 2]
127
+ try:
128
+ img = Image.fromarray(img, 'RGB')
129
+ except:
130
+ img = Image.fromarray(img)
131
+ img = img.resize((width, height), resample)
132
+
133
+ return np.array(img).astype(np.uint8)
134
+
135
+
136
+ class MSCOCODatabase(Dataset):
137
+ def __init__(self, root, annFile, size=None):
138
+ from pycocotools.coco import COCO
139
+ self.root = root
140
+ self.height = self.width = size
141
+
142
+ self.coco = COCO(annFile)
143
+ self.keys = list(sorted(self.coco.imgs.keys()))
144
+
145
+ def _load_image(self, key: int):
146
+ path = self.coco.loadImgs(key)[0]["file_name"]
147
+ return Image.open(os.path.join(self.root, path)).convert("RGB")
148
+
149
+ def _load_target(self, key: int):
150
+ return self.coco.loadAnns(self.coco.getAnnIds(key))
151
+
152
+ def __len__(self):
153
+ return len(self.keys)
154
+
155
+ def __getitem__(self, index):
156
+ key = self.keys[index]
157
+ image = self._load_image(key)
158
+ image = np.array(image).astype(np.uint8)
159
+ image = center_crop(self.width, self.height, image).astype(np.float32)
160
+ image = (image / 127.5 - 1.0).astype(np.float32)
161
+ image = einops.rearrange(image, 'h w c -> c h w')
162
+
163
+ anns = self._load_target(key)
164
+ target = []
165
+ for ann in anns:
166
+ target.append(ann['caption'])
167
+
168
+ return image, target
169
+
170
+
171
+ def get_feature_dir_info(root):
172
+ files = glob.glob(os.path.join(root, '*.npy'))
173
+ files_caption = glob.glob(os.path.join(root, '*_*.npy'))
174
+ num_data = len(files) - len(files_caption)
175
+ n_captions = {k: 0 for k in range(num_data)}
176
+ for f in files_caption:
177
+ name = os.path.split(f)[-1]
178
+ k1, k2 = os.path.splitext(name)[0].split('_')
179
+ n_captions[int(k1)] += 1
180
+ return num_data, n_captions
181
+
182
+
183
+ class MSCOCOFeatureDataset(Dataset):
184
+ # the image features are obtained through sampling
185
+ def __init__(self, root, need_squeeze=False, full_feature=False, fix_test_order=False):
186
+ self.root = root
187
+ self.num_data, self.n_captions = get_feature_dir_info(root)
188
+ self.need_squeeze = need_squeeze
189
+ self.full_feature = full_feature
190
+ self.fix_test_order = fix_test_order
191
+
192
+ def __len__(self):
193
+ return self.num_data
194
+
195
+ def __getitem__(self, index):
196
+ if self.full_feature:
197
+ z = np.load(os.path.join(self.root, f'{index}.npy'))
198
+
199
+ if self.fix_test_order:
200
+ k = self.n_captions[index] - 1
201
+ else:
202
+ k = random.randint(0, self.n_captions[index] - 1)
203
+
204
+ test_item = np.load(os.path.join(self.root, f'{index}_{k}.npy'), allow_pickle=True).item()
205
+ token_embedding = test_item['token_embedding']
206
+ token_mask = test_item['token_mask']
207
+ token = test_item['token']
208
+ caption = test_item['promt']
209
+ return z, token_embedding, token_mask, token, caption
210
+ else:
211
+ z = np.load(os.path.join(self.root, f'{index}.npy'))
212
+ k = random.randint(0, self.n_captions[index] - 1)
213
+ c = np.load(os.path.join(self.root, f'{index}_{k}.npy'))
214
+ if self.need_squeeze:
215
+ return z, c.squeeze()
216
+ else:
217
+ return z, c
218
+
219
+
220
+ class JDBFeatureDataset(Dataset):
221
+ def __init__(self, root, resolution, llm):
222
+ super().__init__()
223
+ json_path = os.path.join(root,'img_text_pair.jsonl')
224
+ self.img_root = os.path.join(root,'imgs')
225
+ self.feature_root = os.path.join(root,'features')
226
+ self.resolution = resolution
227
+ self.llm = llm
228
+ self.file_list = []
229
+ with open(json_path, 'r', encoding='utf-8') as file:
230
+ for line in file:
231
+ self.file_list.append(json.loads(line)['img_path'])
232
+
233
+ def __len__(self):
234
+ return len(self.file_list)
235
+
236
+ def __getitem__(self, idx):
237
+ data_item = self.file_list[idx]
238
+ feature_path = os.path.join(self.feature_root, data_item.split('/')[-1].replace('.jpg','.npy'))
239
+ img_path = os.path.join(self.img_root, data_item)
240
+
241
+ train_item = np.load(feature_path, allow_pickle=True).item()
242
+ pil_image = Image.open(img_path)
243
+ pil_image.load()
244
+ pil_image = pil_image.convert("RGB")
245
+
246
+
247
+ z = train_item[f'image_latent_{self.resolution}']
248
+ token_embedding = train_item[f'token_embedding_{self.llm}']
249
+ token_mask = train_item[f'token_mask_{self.llm}']
250
+ token = train_item[f'token_{self.llm}']
251
+ caption = train_item['batch_caption']
252
+
253
+ img = center_crop_arr(pil_image, image_size=self.resolution)
254
+ img = (img / 127.5 - 1.0).astype(np.float32)
255
+ img = einops.rearrange(img, 'h w c -> c h w')
256
+
257
+ # return z, token_embedding, token_mask, token, caption, 0, img, 0, 0
258
+ return z, token_embedding, token_mask, token, caption, img
259
+
260
+
261
+ class JDBFullFeatures(DatasetFactory): # the moments calculated by Stable Diffusion image encoder & the contexts calculated by clip
262
+ def __init__(self, train_path, val_path, resolution, llm, cfg=False, p_uncond=None, fix_test_order=False):
263
+ super().__init__()
264
+ print('Prepare dataset...')
265
+ self.resolution = resolution
266
+
267
+ self.train = JDBFeatureDataset(train_path, resolution=resolution, llm=llm)
268
+ self.test = MSCOCOFeatureDataset(os.path.join(val_path, 'val'), full_feature=True, fix_test_order=fix_test_order)
269
+ assert len(self.test) == 40504
270
+
271
+ print('Prepare dataset ok')
272
+
273
+ self.empty_context = np.load(os.path.join(val_path, 'empty_context.npy'), allow_pickle=True).item()
274
+
275
+ assert not cfg
276
+
277
+ # text embedding extracted by clip
278
+ self.prompts, self.token_embedding, self.token_mask, self.token = [], [], [], []
279
+ for f in sorted(os.listdir(os.path.join(val_path, 'run_vis')), key=lambda x: int(x.split('.')[0])):
280
+ vis_item = np.load(os.path.join(val_path, 'run_vis', f), allow_pickle=True).item()
281
+ self.prompts.append(vis_item['promt'])
282
+ self.token_embedding.append(vis_item['token_embedding'])
283
+ self.token_mask.append(vis_item['token_mask'])
284
+ self.token.append(vis_item['token'])
285
+ self.token_embedding = np.array(self.token_embedding)
286
+ self.token_mask = np.array(self.token_mask)
287
+ self.token = np.array(self.token)
288
+
289
+ @property
290
+ def data_shape(self):
291
+ if self.resolution==512:
292
+ return 4, 64, 64
293
+ else:
294
+ return 4, 32, 32
295
+
296
+ @property
297
+ def fid_stat(self):
298
+ return f'assets/fid_stats/fid_stats_mscoco256_val.npz'
299
+
300
+
301
+ def get_dataset(name, **kwargs):
302
+ if name == 'JDB_demo_features':
303
+ return JDBFullFeatures(**kwargs)
304
+ else:
305
+ raise NotImplementedError(name)
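A sketch of how this module is typically wired up from the configs above. The paths are the placeholder values from the config files and must exist locally, and the import assumes the repo's own datasets.py (not the Hugging Face `datasets` package) resolves first:

import torch
from datasets import get_dataset  # the datasets.py in this diff, not HF's `datasets`

factory = get_dataset(
    name='JDB_demo_features',
    resolution=256,
    llm='clip',
    train_path='/data/qihao/dataset/JDB_demo_feature/',  # placeholder path from the configs
    val_path='/data/qihao/dataset/coco_val_features/',   # placeholder path from the configs
    cfg=False,
)
train_set = factory.get_split('train', labeled=True)     # JDBFeatureDataset
z, token_embedding, token_mask, token, caption, img = train_set[0]
loader = torch.utils.data.DataLoader(train_set, batch_size=4, shuffle=True)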
demo_t2i.py ADDED
@@ -0,0 +1,194 @@
1
+ """
2
+ This file is used for T2I generation; it also computes the CLIP similarity between the generated images and the input prompt.
3
+ """
4
+ from absl import flags
5
+ from absl import app
6
+ from ml_collections import config_flags
7
+ import os
8
+
9
+ import ml_collections
10
+ import torch
11
+ from torch import multiprocessing as mp
12
+ import torch.nn as nn
13
+ import accelerate
14
+ import utils
15
+ import tempfile
16
+ from absl import logging
17
+ import builtins
18
+ import einops
19
+ import math
20
+ import numpy as np
21
+ import time
22
+ from PIL import Image
23
+
24
+ from diffusion.flow_matching import FlowMatching, ODEFlowMatchingSolver, ODEEulerFlowMatchingSolver
25
+ from tools.clip_score import ClipSocre
26
+ import libs.autoencoder
27
+ from libs.clip import FrozenCLIPEmbedder
28
+ from libs.t5 import T5Embedder
29
+
30
+
31
+ def unpreprocess(x):
32
+ x = 0.5 * (x + 1.)
33
+ x.clamp_(0., 1.)
34
+ return x
35
+
36
+ def get_caption(llm, text_model, _batch_prompt):
37
+ _batch_con = _batch_prompt
38
+ if llm == "clip":
39
+ _latent, _latent_and_others = text_model.encode(_batch_con)
40
+ _con = _latent_and_others['token_embedding'].detach()
41
+ elif llm == "t5":
42
+ _latent, _latent_and_others = text_model.get_text_embeddings(_batch_con)
43
+ _con = (_latent_and_others['token_embedding'] * 10.0).detach()
44
+ else:
45
+ raise NotImplementedError
46
+ _con_mask = _latent_and_others['token_mask'].detach()
47
+ _batch_token = _latent_and_others['tokens'].detach()
48
+ _batch_caption = _batch_con
49
+ return (_con, _con_mask, _batch_token, _batch_caption)
50
+
51
+
52
+ def evaluate(config):
53
+
54
+ if config.get('benchmark', False):
55
+ torch.backends.cudnn.benchmark = True
56
+ torch.backends.cudnn.deterministic = False
57
+
58
+ mp.set_start_method('spawn')
59
+ accelerator = accelerate.Accelerator()
60
+ device = accelerator.device
61
+ accelerate.utils.set_seed(config.seed, device_specific=True)
62
+ logging.info(f'Process {accelerator.process_index} using device: {device}')
63
+
64
+ config.mixed_precision = accelerator.mixed_precision
65
+ config = ml_collections.FrozenConfigDict(config)
66
+ if accelerator.is_main_process:
67
+ utils.set_logger(log_level='info', fname=config.output_path)
68
+ else:
69
+ utils.set_logger(log_level='error')
70
+ builtins.print = lambda *args: None
71
+
72
+ nnet = utils.get_nnet(**config.nnet)
73
+ nnet = accelerator.prepare(nnet)
74
+ logging.info(f'load nnet from {config.nnet_path}')
75
+ accelerator.unwrap_model(nnet).load_state_dict(torch.load(config.nnet_path, map_location='cpu'))
76
+ nnet.eval()
77
+
78
+ ##
79
+
80
+ if config.nnet.model_args.clip_dim == 4096:
81
+ llm = "t5"
82
+ t5 = T5Embedder(device=device)
83
+ elif config.nnet.model_args.clip_dim == 768:
84
+ llm = "clip"
85
+ clip = FrozenCLIPEmbedder()
86
+ clip.eval()
87
+ clip.to(device)
88
+ else:
89
+ raise NotImplementedError
90
+
91
+ if llm == "clip":
92
+ context_generator = get_caption(llm, clip, _batch_prompt=[config.prompt]*config.sample.mini_batch_size)
93
+ elif llm == "t5":
94
+ context_generator = get_caption(llm, t5, _batch_prompt=[config.prompt]*config.sample.mini_batch_size)
95
+ else:
96
+ raise NotImplementedError
97
+
98
+ ##
99
+
100
+ autoencoder = libs.autoencoder.get_model(**config.autoencoder)
101
+ autoencoder.to(device)
102
+
103
+ @torch.cuda.amp.autocast()
104
+ def encode(_batch):
105
+ return autoencoder.encode(_batch)
106
+
107
+ @torch.cuda.amp.autocast()
108
+ def decode(_batch):
109
+ return autoencoder.decode(_batch)
110
+
111
+ bdv_nnet = None # We don't use Autoguidance
112
+ ClipSocre_model = ClipSocre(device=device) # we also return clip score
113
+
114
+ #######
115
+ logging.info(config.sample)
116
+ logging.info(f'sample: n_samples={config.sample.n_samples}, mode=t2i, mixed_precision={config.mixed_precision}')
117
+
118
+
119
+ def ode_fm_solver_sample(nnet_ema, _n_samples, _sample_steps, bdv_nnet=bdv_nnet, context=None, caption=None, testbatch_img_blurred=None, two_stage_generation=-1, token=None, token_mask=None, return_clipScore=False, ClipSocre_model=None):
120
+ with torch.no_grad():
121
+ del testbatch_img_blurred
122
+
123
+ _z_gaussian = torch.randn(_n_samples, *config.z_shape, device=device)
124
+
125
+ if 'dimr' in config.nnet.name or 'dit' in config.nnet.name:
126
+ _z_x0, _mu, _log_var = nnet_ema(context, text_encoder = True, shape = _z_gaussian.shape, mask=token_mask)
127
+ _z_init = _z_x0.reshape(_z_gaussian.shape)
128
+ else:
129
+ raise NotImplementedError
130
+
131
+ assert config.sample.scale > 1
132
+ if config.cfg != -1:
133
+ _cfg = config.cfg
134
+ else:
135
+ _cfg = config.sample.scale
136
+
137
+ has_null_indicator = hasattr(config.nnet.model_args, "cfg_indicator")
138
+
139
+ _sample_steps = config.sample.sample_steps
140
+
141
+ ode_solver = ODEEulerFlowMatchingSolver(nnet_ema, bdv_model_fn=bdv_nnet, step_size_type="step_in_dsigma", guidance_scale=_cfg)
142
+ _z, _ = ode_solver.sample(x_T=_z_init, batch_size=_n_samples, sample_steps=_sample_steps, unconditional_guidance_scale=_cfg, has_null_indicator=has_null_indicator)
143
+
144
+ image_unprocessed = decode(_z)
145
+ clip_score = ClipSocre_model.calculate_clip_score(caption, image_unprocessed)
146
+
147
+ return image_unprocessed, clip_score
148
+
149
+
150
+ def sample_fn(_n_samples, return_caption=False, return_clipScore=False, ClipSocre_model=None, config=None):
151
+ _context, _token_mask, _token, _caption = context_generator
152
+ assert _context.size(0) == _n_samples
153
+ assert return_clipScore
154
+ assert not return_caption
155
+ return ode_fm_solver_sample(nnet, _n_samples, config.sample.sample_steps, bdv_nnet=bdv_nnet, context=_context, token=_token, token_mask=_token_mask, return_clipScore=return_clipScore, ClipSocre_model=ClipSocre_model, caption=_caption)
156
+
157
+
158
+ with tempfile.TemporaryDirectory() as temp_path:
159
+ path = config.img_save_path or config.sample.path or temp_path
160
+ if accelerator.is_main_process:
161
+ os.makedirs(path, exist_ok=True)
162
+ logging.info(f'Samples are saved in {path}')
163
+
164
+ clip_score_list = utils.sample2dir_wCLIP(accelerator, path, config.sample.n_samples, config.sample.mini_batch_size, sample_fn, unpreprocess, return_clipScore=True, ClipSocre_model=ClipSocre_model, config=config)
165
+ if clip_score_list is not None:
166
+ _clip_score_list = torch.cat(clip_score_list)
167
+ if accelerator.is_main_process:
168
+ logging.info(f'nnet_path={config.nnet_path}, clip_score{len(_clip_score_list)}={_clip_score_list.mean().item()}')
169
+
170
+
171
+ FLAGS = flags.FLAGS
172
+ config_flags.DEFINE_config_file(
173
+ "config", None, "Training configuration.", lock_config=False)
174
+
175
+ flags.mark_flags_as_required(["config"])
176
+ flags.DEFINE_string("nnet_path", None, "The nnet to evaluate.")
177
+ flags.DEFINE_string("prompt", None, "The prompt used for generation.")
178
+ flags.DEFINE_string("output_path", None, "The path to output log.")
179
+ flags.DEFINE_float("cfg", -1, 'cfg scale; if not assigned, the scale defined in the config file is used')
180
+ flags.DEFINE_string("img_save_path", None, "The path to image log.")
181
+
182
+
183
+ def main(argv):
184
+ config = FLAGS.config
185
+ config.nnet_path = FLAGS.nnet_path
186
+ config.prompt = FLAGS.prompt
187
+ config.output_path = FLAGS.output_path
188
+ config.img_save_path = FLAGS.img_save_path
189
+ config.cfg = FLAGS.cfg
190
+ evaluate(config)
191
+
192
+
193
+ if __name__ == "__main__":
194
+ app.run(main)
demo_t2i_arith.py ADDED
@@ -0,0 +1,290 @@
1
+ """
2
+ This file is used for T2I generation; it also computes the CLIP similarity between the generated images and the input prompt.
3
+ """
4
+ from absl import flags
5
+ from absl import app
6
+ from ml_collections import config_flags
7
+ import os
8
+
9
+ import ml_collections
10
+ import torch
11
+ from torch import multiprocessing as mp
12
+ import torch.nn as nn
13
+ import accelerate
14
+ import utils
15
+ import tempfile
16
+ from absl import logging
17
+ import builtins
18
+ import einops
19
+ import math
20
+ import numpy as np
21
+ import time
22
+ from PIL import Image
23
+
24
+ from diffusion.flow_matching import FlowMatching, ODEFlowMatchingSolver, ODEEulerFlowMatchingSolver
25
+ from tools.clip_score import ClipSocre
26
+ import libs.autoencoder
27
+ from libs.clip import FrozenCLIPEmbedder
28
+ from libs.t5 import T5Embedder
29
+
30
+
31
+ def unpreprocess(x):
32
+ x = 0.5 * (x + 1.)
33
+ x.clamp_(0., 1.)
34
+ return x
35
+
36
+
37
+ def batch_decode(_z, decode, batch_size=10):
38
+ """
39
+ The VAE decoder requires a lot of GPU memory. To run the interpolation model on GPUs with 24 GB of RAM or less, you can use this helper to reduce the VAE's memory usage.
40
+ It works by splitting the input tensor into smaller chunks.
41
+ """
42
+ num_samples = _z.size(0)
43
+ decoded_batches = []
44
+
45
+ for i in range(0, num_samples, batch_size):
46
+ batch = _z[i:i + batch_size]
47
+ decoded_batch = decode(batch)
48
+ decoded_batches.append(decoded_batch)
49
+
50
+ image_unprocessed = torch.cat(decoded_batches, dim=0)
51
+ return image_unprocessed
52
+
53
+ def get_caption(llm, text_model, prompt_dict, batch_size):
54
+
55
+ if batch_size == 3:
56
+ # only addition or only subtraction
57
+ assert len(prompt_dict) == 2
58
+ _batch_con = list(prompt_dict.values()) + [' ']
59
+ elif batch_size == 4:
60
+ # addition and subtraction
61
+ assert len(prompt_dict) == 3
62
+ _batch_con = list(prompt_dict.values()) + [' ']
63
+ elif batch_size >= 5:
64
+ # linear interpolation
65
+ assert len(prompt_dict) == 2
66
+ _batch_con = [prompt_dict['prompt_1']] + [' '] * (batch_size-2) + [prompt_dict['prompt_2']]
67
+
68
+ if llm == "clip":
69
+ _latent, _latent_and_others = text_model.encode(_batch_con)
70
+ _con = _latent_and_others['token_embedding'].detach()
71
+ elif llm == "t5":
72
+ _latent, _latent_and_others = text_model.get_text_embeddings(_batch_con)
73
+ _con = (_latent_and_others['token_embedding'] * 10.0).detach()
74
+ else:
75
+ raise NotImplementedError
76
+ _con_mask = _latent_and_others['token_mask'].detach()
77
+ _batch_token = _latent_and_others['tokens'].detach()
78
+ _batch_caption = _batch_con
79
+ return (_con, _con_mask, _batch_token, _batch_caption)
80
+
81
+
82
+ def evaluate(config):
83
+
84
+ if config.get('benchmark', False):
85
+ torch.backends.cudnn.benchmark = True
86
+ torch.backends.cudnn.deterministic = False
87
+
88
+ mp.set_start_method('spawn')
89
+ accelerator = accelerate.Accelerator()
90
+ device = accelerator.device
91
+ accelerate.utils.set_seed(config.seed, device_specific=True)
92
+ logging.info(f'Process {accelerator.process_index} using device: {device}')
93
+
94
+ config.mixed_precision = accelerator.mixed_precision
95
+ config = ml_collections.FrozenConfigDict(config)
96
+ if accelerator.is_main_process:
97
+ utils.set_logger(log_level='info', fname=config.output_path)
98
+ else:
99
+ utils.set_logger(log_level='error')
100
+ builtins.print = lambda *args: None
101
+
102
+ nnet = utils.get_nnet(**config.nnet)
103
+ nnet = accelerator.prepare(nnet)
104
+ logging.info(f'load nnet from {config.nnet_path}')
105
+ accelerator.unwrap_model(nnet).load_state_dict(torch.load(config.nnet_path, map_location='cpu'))
106
+ nnet.eval()
107
+
108
+ ##
109
+
110
+ if config.nnet.model_args.clip_dim == 4096:
111
+ llm = "t5"
112
+ t5 = T5Embedder(device=device)
113
+ elif config.nnet.model_args.clip_dim == 768:
114
+ llm = "clip"
115
+ clip = FrozenCLIPEmbedder()
116
+ clip.eval()
117
+ clip.to(device)
118
+ else:
119
+ raise NotImplementedError
120
+
121
+
122
+ config = ml_collections.ConfigDict(config)
123
+
124
+ if config.test_type == 'interpolation':
125
+ prompt_dict = {'prompt_1':config.prompt_1, 'prompt_2':config.prompt_2}
126
+ for key in prompt_dict.keys():
127
+ assert prompt_dict[key] is not None
128
+ config.sample.mini_batch_size = config.num_of_interpolation
129
+ assert config.sample.mini_batch_size >= 5, "for linear interpolation, please sample at least five images"
130
+ elif config.test_type == 'arithmetic':
131
+ prompt_dict = {'prompt_ori':config.prompt_ori, 'prompt_a':config.prompt_a, 'prompt_s':config.prompt_s}
132
+ keys_to_remove = [key for key, value in prompt_dict.items() if value is None]
133
+ for key in keys_to_remove:
134
+ del prompt_dict[key]
135
+ counter = len(prompt_dict)
136
+ assert prompt_dict['prompt_ori'] is not None
137
+ assert counter == 2 or counter == 3
138
+ config.sample.mini_batch_size = counter + 1
139
+ else:
140
+ raise NotImplementedError
141
+
142
+ config = ml_collections.FrozenConfigDict(config)
143
+
144
+ if llm == "clip":
145
+ context_generator = get_caption(llm, clip, prompt_dict=prompt_dict, batch_size=config.sample.mini_batch_size)
146
+ elif llm == "t5":
147
+ context_generator = get_caption(llm, t5, prompt_dict=prompt_dict, batch_size=config.sample.mini_batch_size)
148
+ else:
149
+ raise NotImplementedError
150
+
151
+ ##
152
+
153
+ autoencoder = libs.autoencoder.get_model(**config.autoencoder)
154
+ autoencoder.to(device)
155
+
156
+ @torch.cuda.amp.autocast()
157
+ def encode(_batch):
158
+ return autoencoder.encode(_batch)
159
+
160
+ @torch.cuda.amp.autocast()
161
+ def decode(_batch):
162
+ return autoencoder.decode(_batch)
163
+
164
+ bdv_nnet = None # We don't use Autoguidance
165
+ ClipSocre_model = ClipSocre(device=device) # we also return clip score
166
+
167
+ #######
168
+ logging.info(config.sample)
169
+ logging.info(f'sample: n_samples={config.sample.n_samples}, mode=t2i, mixed_precision={config.mixed_precision}')
170
+
171
+
172
+ def ode_fm_solver_sample(nnet_ema, _n_samples, _sample_steps, bdv_nnet=bdv_nnet, context=None, caption=None, testbatch_img_blurred=None, two_stage_generation=-1, token=None, token_mask=None, return_clipScore=False, ClipSocre_model=None):
173
+ with torch.no_grad():
174
+ del testbatch_img_blurred
175
+
176
+ _z_gaussian = torch.randn(_n_samples, *config.z_shape, device=device)
177
+
178
+ if 'dimr' in config.nnet.name or 'dit' in config.nnet.name:
179
+ _z_x0, _mu, _log_var = nnet_ema(context, text_encoder = True, shape = _z_gaussian.shape, mask=token_mask)
180
+ _z_init = _z_x0.reshape(_z_gaussian.shape)
181
+ else:
182
+ raise NotImplementedError
183
+
184
+ if len(_z_init) == 3:
185
+ if config.prompt_a is not None:
186
+ assert config.prompt_s is None
187
+ _z_x0_temp = _z_x0[0] + _z_x0[1]
188
+ elif config.prompt_s is not None:
189
+ assert config.prompt_a is None
190
+ _z_x0_temp = _z_x0[0] - _z_x0[1]
191
+ else:
192
+ raise NotImplementedError
193
+ mean = _z_x0_temp.mean()
194
+ std = _z_x0_temp.std()
195
+ _z_x0[2] = (_z_x0_temp - mean) / std
196
+ elif len(_z_init) == 4:
197
+ _z_x0_temp = _z_x0[0] + _z_x0[1] - _z_x0[2]
198
+ mean = _z_x0_temp.mean()
199
+ std = _z_x0_temp.std()
200
+ _z_x0[3] = (_z_x0_temp - mean) / std
201
+ elif len(_z_init) >= 5:
202
+ tensor_a = _z_init[0]
203
+ tensor_b = _z_init[-1]
204
+ num_interpolations = len(_z_init) - 2
205
+ interpolations = [tensor_a + (tensor_b - tensor_a) * (i / (num_interpolations + 1)) for i in range(1, num_interpolations + 1)]
206
+ _z_init = torch.stack([tensor_a] + interpolations + [tensor_b], dim=0)
207
+
208
+ assert config.sample.scale > 1
209
+ if config.cfg != -1:
210
+ _cfg = config.cfg
211
+ else:
212
+ _cfg = config.sample.scale
213
+
214
+ has_null_indicator = hasattr(config.nnet.model_args, "cfg_indicator")
215
+
216
+ _sample_steps = config.sample.sample_steps
217
+
218
+ ode_solver = ODEEulerFlowMatchingSolver(nnet_ema, bdv_model_fn=bdv_nnet, step_size_type="step_in_dsigma", guidance_scale=_cfg)
219
+ _z, _ = ode_solver.sample(x_T=_z_init, batch_size=_n_samples, sample_steps=_sample_steps, unconditional_guidance_scale=_cfg, has_null_indicator=has_null_indicator)
220
+
221
+ if config.save_gpu_memory:
222
+ image_unprocessed = batch_decode(_z, decode)
223
+ else:
224
+ image_unprocessed = decode(_z)
225
+ clip_score = ClipSocre_model.calculate_clip_score(caption, image_unprocessed)
226
+
227
+ return image_unprocessed, clip_score
228
+
229
+
230
+ def sample_fn(_n_samples, return_caption=False, return_clipScore=False, ClipSocre_model=None, config=None):
231
+ _context, _token_mask, _token, _caption = context_generator
232
+ assert return_clipScore
233
+ assert not return_caption
234
+ return ode_fm_solver_sample(nnet, _n_samples, config.sample.sample_steps, bdv_nnet=bdv_nnet, context=_context, token=_token, token_mask=_token_mask, return_clipScore=return_clipScore, ClipSocre_model=ClipSocre_model, caption=_caption)
235
+
236
+
237
+ with tempfile.TemporaryDirectory() as temp_path:
238
+ path = config.img_save_path or config.sample.path or temp_path
239
+ if accelerator.is_main_process:
240
+ os.makedirs(path, exist_ok=True)
241
+ logging.info(f'Samples are saved in {path}')
242
+
243
+ clip_score_list = utils.sample2dir_wCLIP(accelerator, path, config.sample.n_samples, config.sample.mini_batch_size, sample_fn, unpreprocess, return_clipScore=True, ClipSocre_model=ClipSocre_model, config=config)
244
+ if clip_score_list is not None:
245
+ _clip_score_list = torch.cat(clip_score_list)
246
+ if accelerator.is_main_process:
247
+ logging.info(f'nnet_path={config.nnet_path}, clip_score{len(_clip_score_list)}={_clip_score_list.mean().item()}')
248
+
249
+
250
+ FLAGS = flags.FLAGS
251
+ config_flags.DEFINE_config_file(
252
+ "config", None, "Training configuration.", lock_config=False)
253
+
254
+ flags.mark_flags_as_required(["config"])
255
+ flags.DEFINE_string("nnet_path", None, "The nnet to evaluate.")
256
+ flags.DEFINE_string("output_path", None, "The path to output log.")
257
+ flags.DEFINE_float("cfg", -1, 'cfg scale; if not assigned, the scale defined in the config file is used')
258
+ flags.DEFINE_string("img_save_path", None, "The path to image log.")
259
+
260
+ flags.DEFINE_string("test_type", None, "The test type: 'interpolation' or 'arithmetic'.")
261
+
262
+ flags.DEFINE_string("prompt_1", None, "The prompt used for linear interpolation.")
263
+ flags.DEFINE_string("prompt_2", None, "The prompt used for linear interpolation.")
264
+ flags.DEFINE_integer("num_of_interpolation", -1, 'number of images to sample for linear interpolation')
265
+ flags.DEFINE_boolean('save_gpu_memory', False, 'To save VRAM')
266
+
267
+ flags.DEFINE_string("prompt_ori", None, "The prompt used for arithmetic operations.")
268
+ flags.DEFINE_string("prompt_a", None, "The prompt used for arithmetic operations (addition).")
269
+ flags.DEFINE_string("prompt_s", None, "The prompt used for arithmetic operations (subtraction).")
270
+
271
+
272
+ def main(argv):
273
+ config = FLAGS.config
274
+ config.nnet_path = FLAGS.nnet_path
275
+ config.output_path = FLAGS.output_path
276
+ config.img_save_path = FLAGS.img_save_path
277
+ config.cfg = FLAGS.cfg
278
+ config.test_type = FLAGS.test_type
279
+ config.prompt_1 = FLAGS.prompt_1
280
+ config.prompt_2 = FLAGS.prompt_2
281
+ config.num_of_interpolation = FLAGS.num_of_interpolation
282
+ config.save_gpu_memory = FLAGS.save_gpu_memory
283
+ config.prompt_ori = FLAGS.prompt_ori
284
+ config.prompt_a = FLAGS.prompt_a
285
+ config.prompt_s = FLAGS.prompt_s
286
+ evaluate(config)
287
+
288
+
289
+ if __name__ == "__main__":
290
+ app.run(main)
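The latent-space arithmetic and interpolation performed inside ode_fm_solver_sample above reduce to the following operations; the tensors here are random stand-ins for the text-latent x0 predictions, shown only to make the index conventions explicit:

import torch

# Arithmetic, batch of 4 (prompt_ori, prompt_a, prompt_s, result slot):
# combine as ori + a - s, then re-standardize before writing into the last slot.
z = torch.randn(4, 4, 64, 64)
combo = z[0] + z[1] - z[2]
z[3] = (combo - combo.mean()) / combo.std()

# Linear interpolation, batch of >= 5: keep the two endpoint latents and
# replace the interior slots with evenly spaced blends between them.
z = torch.randn(7, 4, 64, 64)
a, b = z[0], z[-1]
n = len(z) - 2
z = torch.stack([a] + [a + (b - a) * (i / (n + 1)) for i in range(1, n + 1)] + [b], dim=0)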
diffusion/base_solver.py ADDED
@@ -0,0 +1,203 @@
1
+ """
2
+ This file contains the solver base class, including the cfg indicator
3
+ """
4
+
5
+ import enum
6
+ import logging
7
+ from collections import defaultdict
8
+ from typing import Callable, Dict, List, Union
9
+
10
+ import numpy as np
11
+ import torch
12
+
13
+ import random
14
+
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ _default_cfg_processor = {"caption": lambda x, T, t: x}
19
+
20
+
21
+ class ConditionTypes(enum.Enum):
22
+ IMAGE_EMBED: str = "image_conditioning" # not implemented yet
23
+ TEXT_EMBED: str = "caption"
24
+ HINT_EMBED: str = "hint" # not implemented yet
25
+
26
+ class Solver:
27
+ def __init__(
28
+ self,
29
+ model_fn,
30
+ bdv_model_fn=None,
31
+ schedule="linear",
32
+ conditioning_types: List[str] = ["caption"],
33
+ guidance_scale: Union[float, Dict[ConditionTypes, float]] = 1.0,
34
+ cfg_processor: Callable = _default_cfg_processor,
35
+ **kwargs,
36
+ ):
37
+ self.model = model_fn
38
+ self.bdv_model = bdv_model_fn
39
+ self.schedule = schedule
40
+ # This list (conditioning_types) determines which conditioning variable is given priority
41
+ # For multi_cfg with 2 variables c,i, the cfg equation is
42
+ # output = e(null,null) + scale_c * (e(i,c) - e(i,null)) + scale_i * (e(i,null) - e(null,null))
43
+ # Note that the marginalization can be changed slightly to obtain a different equation
44
+ # output = e(null,null) + scale_i * (e(c,i) - e(c,null)) + scale_c * (e(c,null) - e(null,null))
45
+ # The order of the conditioning variables in the list decides which of the two equations above are used
46
+ # If the list is ["image", "caption"] then the first equation is used and
47
+ # if the list is ["caption", "image"] then the second is used
48
+ self.condition_types = [ConditionTypes(el) for el in conditioning_types]
49
+
50
+ self.unconditional_guidance_scale = guidance_scale
51
+ if isinstance(guidance_scale, dict):
52
+ self.unconditional_guidance_scale = {
53
+ ConditionTypes(k): v for k, v in guidance_scale.items()
54
+ }
55
+ else:
56
+ # If a single float is provided, we assume it is for text conditioning
57
+ self.unconditional_guidance_scale = {
58
+ ConditionTypes.TEXT_EMBED: guidance_scale
59
+ }
60
+ assert all(
61
+ [
62
+ el in self.unconditional_guidance_scale.keys()
63
+ for el in self.condition_types
64
+ ]
65
+ )
66
+ self.cfg_processor = cfg_processor
67
+ if self.cfg_processor is None:
68
+ self.cfg_processor = _default_cfg_processor
69
+ if isinstance(self.cfg_processor, dict):
70
+ assert all(callable(v) for k, v in self.cfg_processor.items())
71
+ self.cfg_processor = {
72
+ ConditionTypes(k): v for k, v in self.cfg_processor.items()
73
+ }
74
+ else:
75
+ assert callable(self.cfg_processor)
76
+ self.cfg_processor = {ConditionTypes.TEXT_EMBED: cfg_processor}
77
+
78
+ if self.cfg_processor is not None:
79
+ assert all([el in self.cfg_processor.keys() for el in self.condition_types])
80
+ self.inf_steps_completed = 0
81
+
82
+ @property
83
+ def device(self):
84
+ return self.model.device
85
+
86
+ def register_buffer(self, name, attr):
87
+ if isinstance(attr, torch.Tensor):
88
+ attr = attr.to(self.device)
89
+ setattr(self, name, attr)
90
+
91
+ def _check_the_conditioning(self, conditioning, batch_size):
92
+ # Checks if batch sizes match
93
+ if conditioning is not None:
94
+ if isinstance(conditioning, dict):
95
+ ctmp = conditioning[list(conditioning.keys())[0]]
96
+ while isinstance(ctmp, list):
97
+ ctmp = ctmp[0]
98
+ if isinstance(ctmp, dict):
99
+ if isinstance(ctmp["c"], list):
100
+ cbs = ctmp["c"][0].shape[0]
101
+ else:
102
+ cbs = ctmp["c"].shape[0]
103
+ else:
104
+ cbs = ctmp.shape[0]
105
+ if cbs != batch_size:
106
+ logger.info(
107
+ f"Warning: Got {cbs} conditionings but batch-size is {batch_size}"
108
+ )
109
+
110
+ elif isinstance(conditioning, list):
111
+ for ctmp in conditioning:
112
+ if ctmp.shape[0] != batch_size:
113
+ logger.info(
114
+ f"Warning: Got {ctmp.shape[0]} conditionings but batch-size is {batch_size}"
115
+ )
116
+
117
+ else:
118
+ if conditioning.shape[0] != batch_size:
119
+ logger.info(
120
+ f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}"
121
+ )
122
+
123
+ def sample(
124
+ self,
125
+ sample_steps,
126
+ batch_size,
127
+ sampling_method,
128
+ unconditional_guidance_scale,
129
+ has_null_indicator,
130
+ shape=None, # no longer used
131
+ callback=None,
132
+ normals_sequence=None,
133
+ img_callback=None,
134
+ quantize_x0=False,
135
+ eta=0.0,
136
+ mask=None,
137
+ x0=None,
138
+ temperature=1.0,
139
+ noise_dropout=0.0,
140
+ verbose=True,
141
+ x_T=None,
142
+ log_every_t=100,
143
+ dynamic_threshold=None,
144
+ ucg_schedule=None,
145
+ t_schedule=None, # Default value is set below
146
+ skip_type=None, # Deprecated, kept for backward compatibility. Use `t_schedule` instead.
147
+ start_timestep=None,
148
+ num_timesteps=None,
149
+ do_make_schedule=True,
150
+ **kwargs,
151
+ ):
152
+ self.num_inf_timesteps = sample_steps
153
+ assert skip_type is None
154
+
155
+ t_schedule = t_schedule or "time_uniform"
156
+
157
+ if self.unconditional_guidance_scale is None:
158
+ self.unconditional_guidance_scale = unconditional_guidance_scale
159
+
160
+ assert isinstance(sampling_method, Callable)
161
+ samples, intermediates = sampling_method(
162
+ x_T=x_T,
163
+ # Hardcoded in PLMS file
164
+ ddim_use_original_steps=False,
165
+ callback=callback,
166
+ num_timesteps=num_timesteps,
167
+ quantize_denoised=quantize_x0,
168
+ mask=mask,
169
+ x0=x0,
170
+ img_callback=img_callback,
171
+ log_every_t=log_every_t,
172
+ temperature=temperature,
173
+ noise_dropout=noise_dropout,
174
+ unconditional_guidance_scale=unconditional_guidance_scale,
175
+ has_null_indicator=has_null_indicator,
176
+ dynamic_threshold=dynamic_threshold,
177
+ verbose=verbose,
178
+ ucg_schedule=ucg_schedule,
179
+ start_timestep=start_timestep,
180
+ )
181
+ return samples, intermediates
182
+
183
+ @torch.no_grad()
184
+ def get_model_output_dimr(
185
+ self,
186
+ x,
187
+ t_continuous,
188
+ unconditional_guidance_scale,
189
+ has_null_indicator,
190
+ ):
191
+
192
+ log_snr = 4 - t_continuous * 8 # inverted schedule: maps t in [0, 1] to log-SNR in [4, -4]
193
+
194
+ if has_null_indicator:
195
+ _cond = self.model(x, t=t_continuous, log_snr=log_snr, null_indicator=torch.tensor([False] * x.shape[0]).to(x.device))[-1]
196
+ _uncond = self.model(x, t=t_continuous, log_snr=log_snr, null_indicator=torch.tensor([True] * x.shape[0]).to(x.device))[-1]
197
+
198
+ assert unconditional_guidance_scale > 1
199
+ return _uncond + unconditional_guidance_scale * (_cond - _uncond)
200
+ else:
201
+ _cond = self.model(x, log_snr=log_snr)[-1]
202
+ return _cond
203
+
diffusion/flow_matching.py ADDED
@@ -0,0 +1,702 @@
1
+
2
+ import logging
3
+ from typing import Callable, Dict, Optional, Tuple
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn import functional as F
8
+ import torchdiffeq
9
+ import random
10
+
11
+ from sde import multi_scale_targets
12
+ from diffusion.base_solver import Solver
13
+ import numpy as np
14
+ from torchvision import transforms
15
+
16
+
17
+ def check_zip(*args):
18
+ args = [list(arg) for arg in args]
19
+ length = len(args[0])
20
+ for arg in args:
21
+ assert len(arg) == length
22
+ return zip(*args)
23
+
24
+
25
+ def kl_divergence(source, target):
26
+ q_raw = source.view(-1)
27
+ p_raw = target.view(-1)
28
+
29
+ p = F.softmax(p_raw, dim=0)
30
+ q = F.softmax(q_raw, dim=0)
31
+
32
+
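+ # F.kl_div(log q, p) computes KL(p || q), with p = softmax(target) and q = softmax(source).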
33
+ q_log = torch.log(q)
34
+ kl_div_1 = F.kl_div(q_log, p, reduction='sum')
35
+
36
+ return kl_div_1
37
+
38
+
39
+
40
+ class TimeStepSampler:
41
+ """
42
+ Abstract class to sample timesteps for flow matching.
43
+ """
44
+
45
+ def sample_time(self, x_start):
46
+ # In flow matching, time is in range [0, 1] and 1 indicates the original image; 0 is pure noise
47
+ # this convention is the *reverse* of the usual diffusion convention
48
+ raise NotImplementedError
49
+
50
+ class ClipLoss(nn.Module):
51
+
52
+ def __init__(
53
+ self,
54
+ local_loss=False,
55
+ gather_with_grad=False,
56
+ cache_labels=False,
57
+ rank=0,
58
+ world_size=1,
59
+ use_horovod=False,
60
+ ):
61
+ super().__init__()
62
+ self.local_loss = local_loss
63
+ self.gather_with_grad = gather_with_grad
64
+ self.cache_labels = cache_labels
65
+ self.rank = rank
66
+ self.world_size = world_size
67
+ self.use_horovod = use_horovod
68
+
69
+ # cache state
70
+ self.prev_num_logits = 0
71
+ self.labels = {}
72
+
73
+ def get_ground_truth(self, device, num_logits) -> torch.Tensor:
74
+ # calculate ground-truth labels and cache them if enabled
75
+ if self.prev_num_logits != num_logits or device not in self.labels:
76
+ labels = torch.arange(num_logits, device=device, dtype=torch.long)
77
+ if self.world_size > 1 and self.local_loss:
78
+ labels = labels + num_logits * self.rank
79
+ if self.cache_labels:
80
+ self.labels[device] = labels
81
+ self.prev_num_logits = num_logits
82
+ else:
83
+ labels = self.labels[device]
84
+ return labels
85
+
86
+ def get_logits(self, image_features, text_features, logit_scale):
87
+ if self.world_size > 1:
88
+ all_image_features, all_text_features = gather_features(
89
+ image_features, text_features,
90
+ self.local_loss, self.gather_with_grad, self.rank, self.world_size, self.use_horovod)
91
+
92
+ if self.local_loss:
93
+ logits_per_image = logit_scale * image_features @ all_text_features.T
94
+ logits_per_text = logit_scale * text_features @ all_image_features.T
95
+ else:
96
+ logits_per_image = logit_scale * all_image_features @ all_text_features.T
97
+ logits_per_text = logits_per_image.T
98
+ else:
99
+ logits_per_image = logit_scale * image_features @ text_features.T
100
+ logits_per_text = logit_scale * text_features @ image_features.T
101
+
102
+ return logits_per_image, logits_per_text
103
+
104
+ def forward(self, image_features, text_features, logit_scale, output_dict=False):
105
+ device = image_features.device
106
+ logits_per_image, logits_per_text = self.get_logits(image_features, text_features, logit_scale)
107
+
108
+ labels = self.get_ground_truth(device, logits_per_image.shape[0])
109
+
110
+ total_loss = (
111
+ F.cross_entropy(logits_per_image, labels) +
112
+ F.cross_entropy(logits_per_text, labels)
113
+ ) / 2
114
+
115
+ return {"contrastive_loss": total_loss} if output_dict else total_loss
116
+
117
+
118
+ class SigLipLoss(nn.Module):
119
+ """ Sigmoid Loss for Language Image Pre-Training (SigLIP) - https://arxiv.org/abs/2303.15343
120
+
121
+ @article{zhai2023sigmoid,
122
+ title={Sigmoid loss for language image pre-training},
123
+ author={Zhai, Xiaohua and Mustafa, Basil and Kolesnikov, Alexander and Beyer, Lucas},
124
+ journal={arXiv preprint arXiv:2303.15343},
125
+ year={2023}
126
+ }
127
+ """
128
+ def __init__(
129
+ self,
130
+ cache_labels=False,
131
+ rank=0,
132
+ world_size=1,
133
+ bidir=True,
134
+ use_horovod=False,
135
+ ):
136
+ super().__init__()
137
+ self.cache_labels = cache_labels
138
+ self.rank = rank
139
+ self.world_size = world_size
140
+ assert not use_horovod # FIXME need to look at hvd ops for ring transfers
141
+ self.use_horovod = use_horovod
142
+ self.bidir = bidir
143
+
144
+ # cache state FIXME cache not currently used, worthwhile?
145
+ self.prev_num_logits = 0
146
+ self.labels = {}
147
+
148
+ def get_ground_truth(self, device, dtype, num_logits, negative_only=False) -> torch.Tensor:
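+ # Labels are +1 on the diagonal (matched image-text pairs) and -1 elsewhere; with negative_only=True all entries are -1.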
149
+ labels = -torch.ones((num_logits, num_logits), device=device, dtype=dtype)
150
+ if not negative_only:
151
+ labels = 2 * torch.eye(num_logits, device=device, dtype=dtype) + labels
152
+ return labels
153
+
154
+ def get_logits(self, image_features, text_features, logit_scale, logit_bias=None):
155
+ logits = logit_scale * image_features @ text_features.T
156
+ if logit_bias is not None:
157
+ logits += logit_bias
158
+ return logits
159
+
160
+ def _loss(self, image_features, text_features, logit_scale, logit_bias=None, negative_only=False):
161
+ logits = self.get_logits(image_features, text_features, logit_scale, logit_bias)
162
+ labels = self.get_ground_truth(
163
+ image_features.device,
164
+ image_features.dtype,
165
+ image_features.shape[0],
166
+ negative_only=negative_only,
167
+ )
168
+ loss = -F.logsigmoid(labels * logits).sum() / image_features.shape[0]
169
+ return loss
170
+
171
+ def forward(self, image_features, text_features, logit_scale, logit_bias, output_dict=False):
172
+ loss = self._loss(image_features, text_features, logit_scale, logit_bias)
173
+
174
+ if self.world_size > 1:
175
+ # exchange text features w/ neighbour world_size - 1 times
176
+ right_rank = (self.rank + 1) % self.world_size
177
+ left_rank = (self.rank - 1 + self.world_size) % self.world_size
178
+ if self.bidir:
179
+ text_features_to_right = text_features_to_left = text_features
180
+ num_bidir, remainder = divmod(self.world_size - 1, 2)
181
+ for i in range(num_bidir):
182
+ text_features_recv = neighbour_exchange_bidir_with_grad(
183
+ left_rank,
184
+ right_rank,
185
+ text_features_to_left,
186
+ text_features_to_right,
187
+ )
188
+
189
+ for f in text_features_recv:
190
+ loss += self._loss(
191
+ image_features,
192
+ f,
193
+ logit_scale,
194
+ logit_bias,
195
+ negative_only=True,
196
+ )
197
+ text_features_to_left, text_features_to_right = text_features_recv
198
+
199
+ if remainder:
200
+ text_features_recv = neighbour_exchange_with_grad(
201
+ left_rank, right_rank, text_features_to_right)
202
+
203
+ loss += self._loss(
204
+ image_features,
205
+ text_features_recv,
206
+ logit_scale,
207
+ logit_bias,
208
+ negative_only=True,
209
+ )
210
+ else:
211
+ text_features_to_right = text_features
212
+ for i in range(self.world_size - 1):
213
+ text_features_from_left = neighbour_exchange_with_grad(
214
+ left_rank, right_rank, text_features_to_right)
215
+
216
+ loss += self._loss(
217
+ image_features,
218
+ text_features_from_left,
219
+ logit_scale,
220
+ logit_bias,
221
+ negative_only=True,
222
+ )
223
+ text_features_to_right = text_features_from_left
224
+
225
+ return {"contrastive_loss": loss} if output_dict else loss
226
+
227
+
228
+ class ResolutionScaledTimeStepSampler(TimeStepSampler):
229
+ def __init__(self, scale: float, base_time_step_sampler: TimeStepSampler):
230
+ self.scale = scale
231
+ self.base_time_step_sampler = base_time_step_sampler
232
+
233
+ @torch.no_grad()
234
+ def sample_time(self, x_start):
235
+ base_time = self.base_time_step_sampler.sample_time(x_start)
236
+ # based on eq (23) of https://arxiv.org/abs/2403.03206
237
+ scaled_time = (base_time * self.scale) / (1 + (self.scale - 1) * base_time)
238
+ return scaled_time
239
+
240
+
241
+ class LogitNormalSampler(TimeStepSampler):
242
+ def __init__(self, normal_mean: float = 0, normal_std: float = 1):
243
+ # follows https://arxiv.org/pdf/2403.03206.pdf
244
+ # sample from a normal distribution
245
+ # pass the output through standard logistic function, i.e., sigmoid
246
+ self.normal_mean = float(normal_mean)
247
+ self.normal_std = float(normal_std)
248
+
249
+ @torch.no_grad()
250
+ def sample_time(self, x_start):
251
+ x_normal = torch.normal(
252
+ mean=self.normal_mean,
253
+ std=self.normal_std,
254
+ size=(x_start.shape[0],),
255
+ device=x_start.device,
256
+ )
257
+ x_logistic = torch.nn.functional.sigmoid(x_normal)
258
+ return x_logistic
259
+
260
+
261
+ class UniformTimeSampler(TimeStepSampler):
262
+ @torch.no_grad()
263
+ def sample_time(self, x_start):
264
+ # [0, 1] and 1 indicates the original image; 0 is pure noise
265
+ return torch.rand(x_start.shape[0], device=x_start.device)
266
+
267
+
268
+ class FlowMatching(nn.Module):
269
+ def __init__(
270
+ self,
271
+ sigma_min: float = 1e-5,
272
+ sigma_max: float = 1.0,
273
+ timescale: float = 1.0,
274
+ **kwargs,
275
+ ):
276
+ # LatentDiffusion/DDPM will create too many class variables we do not need
277
+ super().__init__(**kwargs)
278
+ self.time_step_sampler = LogitNormalSampler()
279
+ self.sigma_min = sigma_min
280
+ self.sigma_max = sigma_max
281
+ self.timescale = timescale
282
+
283
+ self.clip_loss = ClipLoss()
284
+ # self.SigLipLoss = SigLipLoss()
285
+
286
+ self.resizer = transforms.Resize(256) # for CLIP
287
+
288
+ def sample_noise(self, x_start):
289
+ # simple IID noise
290
+ return torch.randn_like(x_start, device=x_start.device) * self.sigma_max
291
+
292
+ def mos(self, err, start_dim=1, con_mask=None): # mean of squares
293
+ if con_mask is not None:
294
+ return (err.pow(2).mean(dim=-1) * con_mask).sum(dim=-1) / con_mask.sum(dim=-1)
295
+ else:
296
+ return err.pow(2).flatten(start_dim=start_dim).mean(dim=-1)
297
+
298
+
299
+ def Xentropy(self, pred, tar, con_mask=None):
300
+ if con_mask is not None:
301
+ return (nn.functional.cross_entropy(pred, tar, reduction='none') * con_mask).sum(dim=-1) / con_mask.sum(dim=-1)
302
+ else:
303
+ return nn.functional.cross_entropy(pred, tar, reduction='none').mean(dim=-1)
304
+
305
+ def l2_reg(self, pred, lam = 0.0001):
306
+ return lam * torch.norm(pred, p=2, dim=(1, 2, 3)) ** 2
307
+
308
+ # model forward and prediction
309
+ def forward(
310
+ self,
311
+ x,
312
+ nnet,
313
+ loss_coeffs,
314
+ cond,
315
+ con_mask,
316
+ nnet_style,
317
+ training_step,
318
+ cond_ori=None, # not used
319
+ con_mask_ori=None, # not used
320
+ batch_img_clip=None, # forwarded to the VE reconstruction loss
321
+ model_config=None,
322
+ all_config=None,
323
+ text_token=None,
324
+ return_raw_loss=False,
325
+ additional_embeddings=None,
326
+ timesteps: Optional[Tuple[int, int]] = None,
327
+ *args,
328
+ **kwargs,
329
+ ):
330
+ assert timesteps is None, "timesteps must be None"
331
+
332
+ timesteps = self.time_step_sampler.sample_time(x)
333
+
334
+ if nnet_style == 'dimr':
335
+ if hasattr(model_config, "standard_diffusion") and model_config.standard_diffusion:
336
+ standard_diffusion=True
337
+ else:
338
+ standard_diffusion=False
339
+ return self.p_losses_textVAE(
340
+ x, cond, con_mask, timesteps, nnet, batch_img_clip=batch_img_clip, cond_ori=cond_ori, con_mask_ori=con_mask_ori, text_token=text_token, loss_coeffs=loss_coeffs, return_raw_loss=return_raw_loss, nnet_style=nnet_style, standard_diffusion=standard_diffusion, all_config=all_config, training_step=training_step, *args, **kwargs
341
+ )
342
+ elif nnet_style == 'dit':
343
+ if hasattr(model_config, "standard_diffusion") and model_config.standard_diffusion:
344
+ standard_diffusion=True
345
+ raise NotImplementedError("need update")
346
+ else:
347
+ standard_diffusion=False
348
+ return self.p_losses_textVAE_dit(
349
+ x, cond, con_mask, timesteps, nnet, batch_img_clip=batch_img_clip, cond_ori=cond_ori, con_mask_ori=con_mask_ori, text_token=text_token, loss_coeffs=loss_coeffs, return_raw_loss=return_raw_loss, nnet_style=nnet_style, standard_diffusion=standard_diffusion, all_config=all_config, training_step=training_step, *args, **kwargs
350
+ )
351
+ else:
352
+ raise NotImplementedError
353
+
354
+
355
+
356
+ def p_losses_textVAE(
357
+ self,
358
+ x_start,
359
+ cond,
360
+ con_mask,
361
+ t,
362
+ nnet,
363
+ loss_coeffs,
364
+ training_step,
365
+ text_token=None,
366
+ nnet_style=None,
367
+ all_config=None,
368
+ batch_img_clip=None,
369
+ cond_ori=None, # not used
370
+ con_mask_ori=None, # not used
371
+ return_raw_loss=False,
372
+ additional_embeddings=None,
373
+ standard_diffusion=False,
374
+ noise=None,
375
+ ):
376
+ """
377
+ CrossFlow training for DiMR
378
+ """
379
+
380
+ assert noise is None
381
+
382
+ x0, mu, log_var = nnet(cond, text_encoder = True, shape = x_start.shape, mask = con_mask)
383
+
384
+ ############ loss for the Text Variational Encoder (Text VE)
385
+ if batch_img_clip.shape[-1] == 512:
386
+ recon_gt = self.resizer(batch_img_clip)
387
+ else:
388
+ recon_gt = batch_img_clip
389
+ recon_gt_clip, logit_scale = nnet(recon_gt, image_clip = True)
390
+ image_features = recon_gt_clip / recon_gt_clip.norm(dim=-1, keepdim=True)
391
+ text_features = x0 / x0.norm(dim=-1, keepdim=True)
392
+ recons_loss = self.clip_loss(image_features, text_features, logit_scale)
393
+
394
+ # kld_loss = -0.5 * torch.sum(1 + log_var - mu ** 2 - log_var.exp(), dim = 1)
395
+ kld_loss = -0.5 * torch.sum(1 + log_var - (0.3 * mu) ** 6 - log_var.exp(), dim = 1) # modified KL loss: the (0.3 * mu) ** 6 term pushes mu -> 0 and the remaining terms push var -> 1
396
+ kld_loss_weight = 1e-2 # 0.0005
397
+
398
+ loss_mlp = recons_loss + kld_loss * kld_loss_weight
399
+
400
+
401
+ ############ loss for FM
402
+ noise = x0.reshape(x_start.shape)
403
+
404
+ if hasattr(all_config.nnet.model_args, "cfg_indicator"):
405
+ null_indicator = torch.from_numpy(np.array([random.random() < all_config.nnet.model_args.cfg_indicator for _ in range(x_start.shape[0])])).to(x_start.device)
406
+ if null_indicator.sum()<=1:
407
+ null_indicator[null_indicator==True] = False
408
+ assert null_indicator.sum() == 0
409
+ pass
410
+ else:
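+ # CFG dropout: cyclically shift the image targets among the flagged samples so target and text no longer correspond (these act as the "null" examples).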
411
+ target_null = x_start[null_indicator]
412
+ target_null = torch.cat((target_null[1:], target_null[:1]))
413
+ x_start[null_indicator] = target_null
414
+ else:
415
+ null_indicator = None
416
+
417
+
418
+ x_noisy = self.psi(t, x=noise, x1=x_start)
419
+ target_velocity = self.Dt_psi(t, x=noise, x1=x_start)
420
+ log_snr = 4 - t * 8 # log-SNR computed from the timestep, inverted: maps t in [0, 1] to [4, -4]
421
+
422
+ prediction = nnet(x_noisy, log_snr = log_snr, null_indicator=null_indicator)
423
+
424
+ target = multi_scale_targets(target_velocity, levels = len(prediction), scale_correction = True)
425
+
426
+ loss_diff = 0
427
+ for pred, coeff in check_zip(prediction, loss_coeffs):
428
+ loss_diff = loss_diff + coeff * self.mos(pred - target[pred.shape[-1]])
429
+
430
+ ###########
431
+
432
+ loss = loss_diff + loss_mlp
433
+
434
+ return loss, {'loss_diff': loss_diff, 'clip_loss': recons_loss, 'kld_loss': kld_loss, 'kld_loss_weight': torch.tensor(kld_loss_weight, device=kld_loss.device), 'clip_logit_scale': logit_scale}
435
+
436
+
437
+ def p_losses_textVAE_dit(
438
+ self,
439
+ x_start,
440
+ cond,
441
+ con_mask,
442
+ t,
443
+ nnet,
444
+ loss_coeffs,
445
+ training_step,
446
+ text_token=None,
447
+ nnet_style=None,
448
+ all_config=None,
449
+ batch_img_clip=None,
450
+ cond_ori=None, # not used
451
+ con_mask_ori=None, # not used
452
+ return_raw_loss=False,
453
+ additional_embeddings=None,
454
+ standard_diffusion=False,
455
+ noise=None,
456
+ ):
457
+ """
458
+ CrossFlow training for DiT
459
+ """
460
+
461
+ assert noise is None
462
+
463
+ x0, mu, log_var = nnet(cond, text_encoder = True, shape = x_start.shape, mask = con_mask)
464
+
465
+ ############ loss for the Text Variational Encoder (Text VE)
466
+ if batch_img_clip.shape[-1] == 512:
467
+ recon_gt = self.resizer(batch_img_clip)
468
+ else:
469
+ recon_gt = batch_img_clip
470
+ recon_gt_clip, logit_scale = nnet(recon_gt, image_clip = True)
471
+ image_features = recon_gt_clip / recon_gt_clip.norm(dim=-1, keepdim=True)
472
+ text_features = x0 / x0.norm(dim=-1, keepdim=True)
473
+ recons_loss = self.clip_loss(image_features, text_features, logit_scale)
474
+
475
+ # kld_loss = -0.5 * torch.sum(1 + log_var - mu ** 2 - log_var.exp(), dim = 1)
476
+ kld_loss = -0.5 * torch.sum(1 + log_var - (0.3 * mu) ** 6 - log_var.exp(), dim = 1)
477
+ kld_loss_weight = 1e-2 # 0.0005
478
+
479
+ loss_mlp = recons_loss + kld_loss * kld_loss_weight
480
+
481
+ ############ loss for FM
482
+ noise = x0.reshape(x_start.shape)
483
+
484
+ if hasattr(all_config.nnet.model_args, "cfg_indicator"):
485
+ null_indicator = torch.from_numpy(np.array([random.random() < all_config.nnet.model_args.cfg_indicator for _ in range(x_start.shape[0])])).to(x_start.device)
486
+ if null_indicator.sum()<=1:
487
+ null_indicator[null_indicator==True] = False
488
+ assert null_indicator.sum() == 0
489
+ pass
490
+ else:
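+ # CFG dropout: shuffle image targets among the flagged samples, as in p_losses_textVAE above.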
491
+ target_null = x_start[null_indicator]
492
+ target_null = torch.cat((target_null[1:], target_null[:1]))
493
+ x_start[null_indicator] = target_null
494
+ else:
495
+ null_indicator = None
496
+
497
+ x_noisy = self.psi(t, x=noise, x1=x_start)
498
+ target_velocity = self.Dt_psi(t, x=noise, x1=x_start)
499
+
500
+ prediction = nnet(x_noisy, t = t, null_indicator = null_indicator)[0]
501
+
502
+ loss_diff = self.mos(prediction - target_velocity)
503
+
504
+ ###########
505
+
506
+ loss = loss_diff + loss_mlp
507
+
508
+ return loss, {'loss_diff': loss_diff, 'clip_loss': recons_loss, 'kld_loss': kld_loss, 'kld_loss_weight': torch.tensor(kld_loss_weight, device=kld_loss.device), 'clip_logit_scale': logit_scale}
509
+
510
+
511
+ ## flow matching specific functions
512
+ def psi(self, t, x, x1):
513
+ assert (
514
+ t.shape[0] == x.shape[0]
515
+ ), f"Batch size of t and x does not agree {t.shape[0]} vs. {x.shape[0]}"
516
+ assert (
517
+ t.shape[0] == x1.shape[0]
518
+ ), f"Batch size of t and x1 does not agree {t.shape[0]} vs. {x1.shape[0]}"
519
+ assert t.ndim == 1
520
+ t = self.expand_t(t, x)
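+ # Interpolation path: psi_t(x, x1) = (1 - (1 - sigma_min / sigma_max) * t) * x + t * x1, where x is noise and x1 is data.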
521
+ return (t * (self.sigma_min / self.sigma_max - 1) + 1) * x + t * x1
522
+
523
+ def Dt_psi(self, t: torch.Tensor, x: torch.Tensor, x1: torch.Tensor):
524
+ assert x.shape[0] == x1.shape[0]
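+ # Time derivative of psi: the constant velocity (sigma_min / sigma_max - 1) * x + x1, used as the flow-matching regression target.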
525
+ return (self.sigma_min / self.sigma_max - 1) * x + x1
526
+
527
+ def expand_t(self, t: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
528
+ t_expanded = t
529
+ while t_expanded.ndim < x.ndim:
530
+ t_expanded = t_expanded.unsqueeze(-1)
531
+ return t_expanded.expand_as(x)
532
+
533
+
534
+
535
+
536
+ class ODEEulerFlowMatchingSolver(Solver):
537
+ """
538
+ ODE Solver for Flow matching that uses an Euler discretization
539
+ Supports choosing the number of time steps at inference
540
+ """
541
+
542
+ def __init__(self, *args, **kwargs):
543
+ super().__init__(*args, **kwargs)
544
+ self.step_size_type = kwargs.get("step_size_type", "step_in_dsigma")
545
+ assert self.step_size_type in ["step_in_dsigma", "step_in_dt"]
546
+ self.sample_timescale = 1.0 - 1e-5
547
+
548
+ @torch.no_grad()
549
+ def sample_euler(
550
+ self,
551
+ x_T,
552
+ unconditional_guidance_scale,
553
+ has_null_indicator,
554
+ t=[0, 1.0],
555
+ **kwargs,
556
+ ):
557
+ """
558
+ Euler solver for flow matching.
559
+ Based on https://github.com/VinAIResearch/LFM/blob/main/sampler/karras_sample.py
560
+ """
561
+ t = torch.tensor(t)
562
+ t = t * self.sample_timescale
563
+ sigma_min = 1e-5
564
+ sigma_max = 1.0
565
+ sigma_steps = torch.linspace(
566
+ sigma_min, sigma_max, self.num_time_steps + 1, device=x_T.device
567
+ )
568
+ discrete_time_steps_for_step = torch.linspace(
569
+ t[0], t[1], self.num_time_steps + 1, device=x_T.device
570
+ )
571
+ discrete_time_steps_to_eval_model_at = torch.linspace(
572
+ t[0], t[1], self.num_time_steps, device=x_T.device
573
+ )
574
+
575
+ print("num_time_steps : " + str(self.num_time_steps))
576
+
577
+ for i in range(self.num_time_steps):
578
+ t_i = discrete_time_steps_to_eval_model_at[i]
579
+ velocity = self.get_model_output_dimr(
580
+ x_T,
581
+ has_null_indicator = has_null_indicator,
582
+ t_continuous = t_i.repeat(x_T.shape[0]),
583
+ unconditional_guidance_scale = unconditional_guidance_scale,
584
+ )
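+ # Explicit Euler update: x <- x + velocity * step_size, with the step taken either in sigma or in t.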
585
+ if self.step_size_type == "step_in_dsigma":
586
+ step_size = sigma_steps[i + 1] - sigma_steps[i]
587
+ elif self.step_size_type == "step_in_dt":
588
+ step_size = (
589
+ discrete_time_steps_for_step[i + 1]
590
+ - discrete_time_steps_for_step[i]
591
+ )
592
+ x_T = x_T + velocity * step_size
593
+
594
+ intermediates = None
595
+ return x_T, intermediates
596
+
597
+ @torch.no_grad()
598
+ def sample(
599
+ self,
600
+ *args,
601
+ **kwargs,
602
+ ):
603
+ assert kwargs.get("ucg_schedule", None) is None
604
+ assert kwargs.get("skip_type", None) is None
605
+ assert kwargs.get("dynamic_threshold", None) is None
606
+ assert kwargs.get("x0", None) is None
607
+ assert kwargs.get("x_T") is not None
608
+ assert kwargs.get("score_corrector", None) is None
609
+ assert kwargs.get("normals_sequence", None) is None
610
+ assert kwargs.get("callback", None) is None
611
+ assert kwargs.get("quantize_x0", False) is False
612
+ assert kwargs.get("eta", 0.0) == 0.0
613
+ assert kwargs.get("mask", None) is None
614
+ assert kwargs.get("noise_dropout", 0.0) == 0.0
615
+
616
+ self.num_time_steps = kwargs.get("sample_steps")
617
+ self.x_T_uncon = kwargs.get("x_T_uncon")
618
+
619
+ samples, intermediates = super().sample(
620
+ *args,
621
+ sampling_method=self.sample_euler,
622
+ do_make_schedule=False,
623
+ **kwargs,
624
+ )
625
+ return samples, intermediates
626
+
627
+
628
+ class ODEFlowMatchingSolver(Solver):
629
+ """
630
+ ODE Solver for Flow matching that uses `dopri5`
631
+ Does not support number of time steps based control
632
+ """
633
+
634
+ def __init__(self, *args, **kwargs):
635
+ super().__init__(*args, **kwargs)
636
+ self.sample_timescale = 1.0 - 1e-5
637
+
638
+ # sampling for inference
639
+ @torch.no_grad()
640
+ def sample_transport(
641
+ self,
642
+ x_T,
643
+ unconditional_guidance_scale,
644
+ has_null_indicator,
645
+ t=[0, 1.0],
646
+ ode_opts={},
647
+ **kwargs,
648
+ ):
649
+ num_evals = 0
650
+ t = torch.tensor(t, device=x_T.device)
651
+ if "options" not in ode_opts:
652
+ ode_opts["options"] = {}
653
+ ode_opts["options"]["step_t"] = [self.sample_timescale + 1e-6]
654
+
655
+ def ode_func(t, x_T):
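+ # Velocity field queried by the adaptive dopri5 solver; num_evals counts model evaluations.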
656
+ nonlocal num_evals
657
+ num_evals += 1
658
+ model_output = self.get_model_output_dimr(
659
+ x_T,
660
+ has_null_indicator = has_null_indicator,
661
+ t_continuous = t.repeat(x_T.shape[0]),
662
+ unconditional_guidance_scale = unconditional_guidance_scale,
663
+ )
664
+ return model_output
665
+
666
+ z = torchdiffeq.odeint(
667
+ ode_func,
668
+ x_T,
669
+ t * self.sample_timescale,
670
+ **{"atol": 1e-5, "rtol": 1e-5, "method": "dopri5", **ode_opts},
671
+ )
672
+ # first dimension of z contains solutions to different timepoints
673
+ # we only need the last one (corresponding to t=1, i.e., image)
674
+ z = z[-1]
675
+ intermediates = None
676
+ return z, intermediates
677
+
678
+ @torch.no_grad()
679
+ def sample(
680
+ self,
681
+ *args,
682
+ **kwargs,
683
+ ):
684
+ assert kwargs.get("ucg_schedule", None) is None
685
+ assert kwargs.get("skip_type", None) is None
686
+ assert kwargs.get("dynamic_threshold", None) is None
687
+ assert kwargs.get("x0", None) is None
688
+ assert kwargs.get("x_T") is not None
689
+ assert kwargs.get("score_corrector", None) is None
690
+ assert kwargs.get("normals_sequence", None) is None
691
+ assert kwargs.get("callback", None) is None
692
+ assert kwargs.get("quantize_x0", False) is False
693
+ assert kwargs.get("eta", 0.0) == 0.0
694
+ assert kwargs.get("mask", None) is None
695
+ assert kwargs.get("noise_dropout", 0.0) == 0.0
696
+ samples, intermediates = super().sample(
697
+ *args,
698
+ sampling_method=self.sample_transport,
699
+ do_make_schedule=False,
700
+ **kwargs,
701
+ )
702
+ return samples, intermediates
libs/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # codes from third party
libs/autoencoder.py ADDED
@@ -0,0 +1,519 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from einops import rearrange
5
+
6
+
7
+ class LinearAttention(nn.Module):
8
+ def __init__(self, dim, heads=4, dim_head=32):
9
+ super().__init__()
10
+ self.heads = heads
11
+ hidden_dim = dim_head * heads
12
+ self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
13
+ self.to_out = nn.Conv2d(hidden_dim, dim, 1)
14
+
15
+ def forward(self, x):
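+ # Linear attention: softmax over keys along the spatial axis, accumulate a d x d context, then read it out with the queries (linear in the number of positions).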
16
+ b, c, h, w = x.shape
17
+ qkv = self.to_qkv(x)
18
+ q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
19
+ k = k.softmax(dim=-1)
20
+ context = torch.einsum('bhdn,bhen->bhde', k, v)
21
+ out = torch.einsum('bhde,bhdn->bhen', context, q)
22
+ out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
23
+ return self.to_out(out)
24
+
25
+
26
+ def nonlinearity(x):
27
+ # swish
28
+ return x*torch.sigmoid(x)
29
+
30
+
31
+ def Normalize(in_channels, num_groups=32):
32
+ return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
33
+
34
+
35
+ class Upsample(nn.Module):
36
+ def __init__(self, in_channels, with_conv):
37
+ super().__init__()
38
+ self.with_conv = with_conv
39
+ if self.with_conv:
40
+ self.conv = torch.nn.Conv2d(in_channels,
41
+ in_channels,
42
+ kernel_size=3,
43
+ stride=1,
44
+ padding=1)
45
+
46
+ def forward(self, x):
47
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
48
+ if self.with_conv:
49
+ x = self.conv(x)
50
+ return x
51
+
52
+
53
+ class Downsample(nn.Module):
54
+ def __init__(self, in_channels, with_conv):
55
+ super().__init__()
56
+ self.with_conv = with_conv
57
+ if self.with_conv:
58
+ # no asymmetric padding in torch conv, must do it ourselves
59
+ self.conv = torch.nn.Conv2d(in_channels,
60
+ in_channels,
61
+ kernel_size=3,
62
+ stride=2,
63
+ padding=0)
64
+
65
+ def forward(self, x):
66
+ if self.with_conv:
67
+ pad = (0,1,0,1)
68
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
69
+ x = self.conv(x)
70
+ else:
71
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
72
+ return x
73
+
74
+
75
+ class ResnetBlock(nn.Module):
76
+ def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
77
+ dropout, temb_channels=512):
78
+ super().__init__()
79
+ self.in_channels = in_channels
80
+ out_channels = in_channels if out_channels is None else out_channels
81
+ self.out_channels = out_channels
82
+ self.use_conv_shortcut = conv_shortcut
83
+
84
+ self.norm1 = Normalize(in_channels)
85
+ self.conv1 = torch.nn.Conv2d(in_channels,
86
+ out_channels,
87
+ kernel_size=3,
88
+ stride=1,
89
+ padding=1)
90
+ if temb_channels > 0:
91
+ self.temb_proj = torch.nn.Linear(temb_channels,
92
+ out_channels)
93
+ self.norm2 = Normalize(out_channels)
94
+ self.dropout = torch.nn.Dropout(dropout)
95
+ self.conv2 = torch.nn.Conv2d(out_channels,
96
+ out_channels,
97
+ kernel_size=3,
98
+ stride=1,
99
+ padding=1)
100
+ if self.in_channels != self.out_channels:
101
+ if self.use_conv_shortcut:
102
+ self.conv_shortcut = torch.nn.Conv2d(in_channels,
103
+ out_channels,
104
+ kernel_size=3,
105
+ stride=1,
106
+ padding=1)
107
+ else:
108
+ self.nin_shortcut = torch.nn.Conv2d(in_channels,
109
+ out_channels,
110
+ kernel_size=1,
111
+ stride=1,
112
+ padding=0)
113
+
114
+ def forward(self, x, temb):
115
+ h = x
116
+ h = self.norm1(h)
117
+ h = nonlinearity(h)
118
+ h = self.conv1(h)
119
+
120
+ if temb is not None:
121
+ h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
122
+
123
+ h = self.norm2(h)
124
+ h = nonlinearity(h)
125
+ h = self.dropout(h)
126
+ h = self.conv2(h)
127
+
128
+ if self.in_channels != self.out_channels:
129
+ if self.use_conv_shortcut:
130
+ x = self.conv_shortcut(x)
131
+ else:
132
+ x = self.nin_shortcut(x)
133
+
134
+ return x+h
135
+
136
+
137
+ class LinAttnBlock(LinearAttention):
138
+ """to match AttnBlock usage"""
139
+ def __init__(self, in_channels):
140
+ super().__init__(dim=in_channels, heads=1, dim_head=in_channels)
141
+
142
+
143
+ class AttnBlock(nn.Module):
144
+ def __init__(self, in_channels):
145
+ super().__init__()
146
+ self.in_channels = in_channels
147
+
148
+ self.norm = Normalize(in_channels)
149
+ self.q = torch.nn.Conv2d(in_channels,
150
+ in_channels,
151
+ kernel_size=1,
152
+ stride=1,
153
+ padding=0)
154
+ self.k = torch.nn.Conv2d(in_channels,
155
+ in_channels,
156
+ kernel_size=1,
157
+ stride=1,
158
+ padding=0)
159
+ self.v = torch.nn.Conv2d(in_channels,
160
+ in_channels,
161
+ kernel_size=1,
162
+ stride=1,
163
+ padding=0)
164
+ self.proj_out = torch.nn.Conv2d(in_channels,
165
+ in_channels,
166
+ kernel_size=1,
167
+ stride=1,
168
+ padding=0)
169
+
170
+
171
+ def forward(self, x):
172
+ h_ = x
173
+ h_ = self.norm(h_)
174
+ q = self.q(h_)
175
+ k = self.k(h_)
176
+ v = self.v(h_)
177
+
178
+ # compute attention
179
+ b,c,h,w = q.shape
180
+ q = q.reshape(b,c,h*w)
181
+ q = q.permute(0,2,1) # b,hw,c
182
+ k = k.reshape(b,c,h*w) # b,c,hw
183
+ w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
184
+ w_ = w_ * (int(c)**(-0.5))
185
+ w_ = torch.nn.functional.softmax(w_, dim=2)
186
+
187
+ # attend to values
188
+ v = v.reshape(b,c,h*w)
189
+ w_ = w_.permute(0,2,1) # b,hw,hw (first hw of k, second of q)
190
+ h_ = torch.bmm(v,w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
191
+ h_ = h_.reshape(b,c,h,w)
192
+
193
+ h_ = self.proj_out(h_)
194
+
195
+ return x+h_
196
+
197
+
198
+ def make_attn(in_channels, attn_type="vanilla"):
199
+ assert attn_type in ["vanilla", "linear", "none"], f'attn_type {attn_type} unknown'
200
+ print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
201
+ if attn_type == "vanilla":
202
+ return AttnBlock(in_channels)
203
+ elif attn_type == "none":
204
+ return nn.Identity(in_channels)
205
+ else:
206
+ return LinAttnBlock(in_channels)
207
+
208
+
209
+ class Encoder(nn.Module):
210
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
211
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
212
+ resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla",
213
+ **ignore_kwargs):
214
+ super().__init__()
215
+ if use_linear_attn: attn_type = "linear"
216
+ self.ch = ch
217
+ self.temb_ch = 0
218
+ self.num_resolutions = len(ch_mult)
219
+ self.num_res_blocks = num_res_blocks
220
+ self.resolution = resolution
221
+ self.in_channels = in_channels
222
+
223
+ # downsampling
224
+ self.conv_in = torch.nn.Conv2d(in_channels,
225
+ self.ch,
226
+ kernel_size=3,
227
+ stride=1,
228
+ padding=1)
229
+
230
+ curr_res = resolution
231
+ in_ch_mult = (1,)+tuple(ch_mult)
232
+ self.in_ch_mult = in_ch_mult
233
+ self.down = nn.ModuleList()
234
+ for i_level in range(self.num_resolutions):
235
+ block = nn.ModuleList()
236
+ attn = nn.ModuleList()
237
+ block_in = ch*in_ch_mult[i_level]
238
+ block_out = ch*ch_mult[i_level]
239
+ for i_block in range(self.num_res_blocks):
240
+ block.append(ResnetBlock(in_channels=block_in,
241
+ out_channels=block_out,
242
+ temb_channels=self.temb_ch,
243
+ dropout=dropout))
244
+ block_in = block_out
245
+ if curr_res in attn_resolutions:
246
+ attn.append(make_attn(block_in, attn_type=attn_type))
247
+ down = nn.Module()
248
+ down.block = block
249
+ down.attn = attn
250
+ if i_level != self.num_resolutions-1:
251
+ down.downsample = Downsample(block_in, resamp_with_conv)
252
+ curr_res = curr_res // 2
253
+ self.down.append(down)
254
+
255
+ # middle
256
+ self.mid = nn.Module()
257
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
258
+ out_channels=block_in,
259
+ temb_channels=self.temb_ch,
260
+ dropout=dropout)
261
+ self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
262
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
263
+ out_channels=block_in,
264
+ temb_channels=self.temb_ch,
265
+ dropout=dropout)
266
+
267
+ # end
268
+ self.norm_out = Normalize(block_in)
269
+ self.conv_out = torch.nn.Conv2d(block_in,
270
+ 2*z_channels if double_z else z_channels,
271
+ kernel_size=3,
272
+ stride=1,
273
+ padding=1)
274
+
275
+ def forward(self, x):
276
+ # timestep embedding
277
+ temb = None
278
+
279
+ # downsampling
280
+ hs = [self.conv_in(x)]
281
+ for i_level in range(self.num_resolutions):
282
+ for i_block in range(self.num_res_blocks):
283
+ h = self.down[i_level].block[i_block](hs[-1], temb)
284
+ if len(self.down[i_level].attn) > 0:
285
+ h = self.down[i_level].attn[i_block](h)
286
+ hs.append(h)
287
+ if i_level != self.num_resolutions-1:
288
+ hs.append(self.down[i_level].downsample(hs[-1]))
289
+
290
+ # middle
291
+ h = hs[-1]
292
+ h = self.mid.block_1(h, temb)
293
+ h = self.mid.attn_1(h)
294
+ h = self.mid.block_2(h, temb)
295
+
296
+ # end
297
+ h = self.norm_out(h)
298
+ h = nonlinearity(h)
299
+ h = self.conv_out(h)
300
+ return h
301
+
302
+
303
+ class Decoder(nn.Module):
304
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
305
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
306
+ resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
307
+ attn_type="vanilla", **ignorekwargs):
308
+ super().__init__()
309
+ if use_linear_attn: attn_type = "linear"
310
+ self.ch = ch
311
+ self.temb_ch = 0
312
+ self.num_resolutions = len(ch_mult)
313
+ self.num_res_blocks = num_res_blocks
314
+ self.resolution = resolution
315
+ self.in_channels = in_channels
316
+ self.give_pre_end = give_pre_end
317
+ self.tanh_out = tanh_out
318
+
319
+ # compute in_ch_mult, block_in and curr_res at lowest res
320
+ in_ch_mult = (1,)+tuple(ch_mult)
321
+ block_in = ch*ch_mult[self.num_resolutions-1]
322
+ curr_res = resolution // 2**(self.num_resolutions-1)
323
+ self.z_shape = (1,z_channels,curr_res,curr_res)
324
+ print("Working with z of shape {} = {} dimensions.".format(
325
+ self.z_shape, np.prod(self.z_shape)))
326
+
327
+ # z to block_in
328
+ self.conv_in = torch.nn.Conv2d(z_channels,
329
+ block_in,
330
+ kernel_size=3,
331
+ stride=1,
332
+ padding=1)
333
+
334
+ # middle
335
+ self.mid = nn.Module()
336
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
337
+ out_channels=block_in,
338
+ temb_channels=self.temb_ch,
339
+ dropout=dropout)
340
+ self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
341
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
342
+ out_channels=block_in,
343
+ temb_channels=self.temb_ch,
344
+ dropout=dropout)
345
+
346
+ # upsampling
347
+ self.up = nn.ModuleList()
348
+ for i_level in reversed(range(self.num_resolutions)):
349
+ block = nn.ModuleList()
350
+ attn = nn.ModuleList()
351
+ block_out = ch*ch_mult[i_level]
352
+ for i_block in range(self.num_res_blocks+1):
353
+ block.append(ResnetBlock(in_channels=block_in,
354
+ out_channels=block_out,
355
+ temb_channels=self.temb_ch,
356
+ dropout=dropout))
357
+ block_in = block_out
358
+ if curr_res in attn_resolutions:
359
+ attn.append(make_attn(block_in, attn_type=attn_type))
360
+ up = nn.Module()
361
+ up.block = block
362
+ up.attn = attn
363
+ if i_level != 0:
364
+ up.upsample = Upsample(block_in, resamp_with_conv)
365
+ curr_res = curr_res * 2
366
+ self.up.insert(0, up) # prepend to get consistent order
367
+
368
+ # end
369
+ self.norm_out = Normalize(block_in)
370
+ self.conv_out = torch.nn.Conv2d(block_in,
371
+ out_ch,
372
+ kernel_size=3,
373
+ stride=1,
374
+ padding=1)
375
+
376
+ def forward(self, z):
377
+ #assert z.shape[1:] == self.z_shape[1:]
378
+ self.last_z_shape = z.shape
379
+
380
+ # timestep embedding
381
+ temb = None
382
+
383
+ # z to block_in
384
+ h = self.conv_in(z)
385
+
386
+ # middle
387
+ h = self.mid.block_1(h, temb)
388
+ h = self.mid.attn_1(h)
389
+ h = self.mid.block_2(h, temb)
390
+
391
+ # upsampling
392
+ for i_level in reversed(range(self.num_resolutions)):
393
+ for i_block in range(self.num_res_blocks+1):
394
+ h = self.up[i_level].block[i_block](h, temb)
395
+ if len(self.up[i_level].attn) > 0:
396
+ h = self.up[i_level].attn[i_block](h)
397
+ if i_level != 0:
398
+ h = self.up[i_level].upsample(h)
399
+
400
+ # end
401
+ if self.give_pre_end:
402
+ return h
403
+
404
+ h = self.norm_out(h)
405
+ h = nonlinearity(h)
406
+ h = self.conv_out(h)
407
+ if self.tanh_out:
408
+ h = torch.tanh(h)
409
+ return h
410
+
411
+
412
+ class FrozenAutoencoderKL(nn.Module):
413
+ def __init__(self, ddconfig, embed_dim, pretrained_path, scale_factor=0.18215):
414
+ super().__init__()
415
+ print(f'Create autoencoder with scale_factor={scale_factor}')
416
+ self.encoder = Encoder(**ddconfig)
417
+ self.decoder = Decoder(**ddconfig)
418
+ assert ddconfig["double_z"]
419
+ self.quant_conv = torch.nn.Conv2d(2 * ddconfig["z_channels"], 2 * embed_dim, 1)
420
+ self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
421
+ self.embed_dim = embed_dim
422
+ self.scale_factor = scale_factor
423
+ m, u = self.load_state_dict(torch.load(pretrained_path, map_location='cpu'))
424
+ assert len(m) == 0 and len(u) == 0
425
+ self.eval()
426
+ self.requires_grad_(False)
427
+
428
+ def encode_moments(self, x):
429
+ h = self.encoder(x)
430
+ moments = self.quant_conv(h)
431
+ return moments
432
+
433
+ def sample(self, moments):
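+ # Reparameterization: split moments into mean and log-variance, sample z = mean + std * eps, then apply the latent scale factor.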
434
+ mean, logvar = torch.chunk(moments, 2, dim=1)
435
+ logvar = torch.clamp(logvar, -30.0, 20.0)
436
+ std = torch.exp(0.5 * logvar)
437
+ z = mean + std * torch.randn_like(mean)
438
+ z = self.scale_factor * z
439
+ return z
440
+
441
+ def encode(self, x):
442
+ moments = self.encode_moments(x)
443
+ z = self.sample(moments)
444
+ return z
445
+
446
+ def decode(self, z):
447
+ z = (1. / self.scale_factor) * z
448
+ z = self.post_quant_conv(z)
449
+ dec = self.decoder(z)
450
+ return dec
451
+
452
+ def forward(self, inputs, fn):
453
+ if fn == 'encode_moments':
454
+ return self.encode_moments(inputs)
455
+ elif fn == 'encode':
456
+ return self.encode(inputs)
457
+ elif fn == 'decode':
458
+ return self.decode(inputs)
459
+ else:
460
+ raise NotImplementedError
461
+
462
+
463
+ def get_model(pretrained_path, scale_factor=0.18215):
464
+ ddconfig = dict(
465
+ double_z=True,
466
+ z_channels=4,
467
+ resolution=256,
468
+ in_channels=3,
469
+ out_ch=3,
470
+ ch=128,
471
+ ch_mult=[1, 2, 4, 4],
472
+ num_res_blocks=2,
473
+ attn_resolutions=[],
474
+ dropout=0.0
475
+ )
476
+ return FrozenAutoencoderKL(ddconfig, 4, pretrained_path, scale_factor)
477
+
478
+
479
+ def main():
480
+ import torchvision.transforms as transforms
481
+ from torchvision.utils import save_image
482
+ import os
483
+ from PIL import Image
484
+
485
+ model = get_model('assets/stable-diffusion/autoencoder_kl.pth')
486
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
487
+ model = model.to(device)
488
+
489
+ scale_factor = 0.18215
490
+ T = transforms.Compose([transforms.Resize(256), transforms.CenterCrop(256), transforms.ToTensor()])
491
+ path = 'imgs'
492
+ fnames = os.listdir(path)
493
+ for fname in fnames:
494
+ p = os.path.join(path, fname)
495
+ img = Image.open(p)
496
+ img = T(img)
497
+ img = img * 2. - 1
498
+ img = img[None, ...]
499
+ img = img.to(device)
500
+
501
+ # with torch.cuda.amp.autocast():
502
+ # moments = model.encode_moments(img)
503
+ # mean, logvar = torch.chunk(moments, 2, dim=1)
504
+ # logvar = torch.clamp(logvar, -30.0, 20.0)
505
+ # std = torch.exp(0.5 * logvar)
506
+ # zs = [(mean + std * torch.randn_like(mean)) * scale_factor for _ in range(4)]
507
+ # recons = [model.decode(z) for z in zs]
508
+
509
+ with torch.cuda.amp.autocast():
510
+ print('test encode & decode')
511
+ recons = [model.decode(model.encode(img)) for _ in range(4)]
512
+
513
+ out = torch.cat([img, *recons], dim=0)
514
+ out = (out + 1) * 0.5
515
+ save_image(out, f'recons_{fname}')
516
+
517
+
518
+ if __name__ == "__main__":
519
+ main()
libs/clip.py ADDED
@@ -0,0 +1,68 @@
1
+ import torch.nn as nn
2
+ from transformers import CLIPTokenizer, CLIPTextModel
3
+ import time
4
+
5
+
6
+ class AbstractEncoder(nn.Module):
7
+ def __init__(self):
8
+ super().__init__()
9
+
10
+ def encode(self, *args, **kwargs):
11
+ raise NotImplementedError
12
+
13
+
14
+ class FrozenCLIPEmbedder(AbstractEncoder):
15
+ """Uses the CLIP transformer encoder for text (from Hugging Face)"""
16
+ def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):
17
+ super().__init__()
18
+ self.tokenizer = CLIPTokenizer.from_pretrained(version)
19
+ self.transformer = CLIPTextModel.from_pretrained(version)
20
+ self.device = device
21
+ self.max_length = max_length
22
+ self.freeze()
23
+
24
+ def freeze(self):
25
+ self.transformer = self.transformer.eval()
26
+ for param in self.parameters():
27
+ param.requires_grad = False
28
+
29
+ def forward(self, text):
30
+ batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
31
+ return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
32
+ tokens = batch_encoding["input_ids"].to(self.device)
33
+ outputs = self.transformer(input_ids=tokens)
34
+
35
+ z = outputs.last_hidden_state
36
+ return z, {'token_embedding': outputs.last_hidden_state, 'pooler_output': outputs.pooler_output, 'token_mask': batch_encoding['attention_mask'].to(self.device), 'tokens': batch_encoding["input_ids"].to(self.device)}
37
+
38
+ def encode_from_token(self, tokens):
39
+ tokens = tokens.to(self.device)
40
+ outputs = self.transformer(input_ids=tokens)
41
+
42
+ z = outputs.last_hidden_state
43
+ return z
44
+
45
+ def encode(self, text):
46
+ return self(text)
47
+
48
+
49
+ class FrozenCLIPTokenizer(AbstractEncoder):
50
+ """Uses the CLIP transformer encoder for text (from Hugging Face)"""
51
+ def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):
52
+ super().__init__()
53
+ self.tokenizer = CLIPTokenizer.from_pretrained(version)
54
+ self.max_length = max_length
55
+ self.freeze()
56
+
57
+ def freeze(self):
58
+ for param in self.parameters():
59
+ param.requires_grad = False
60
+
61
+ def forward(self, text):
62
+ batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
63
+ return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
64
+ tokens = batch_encoding["input_ids"]
65
+ return tokens
66
+
67
+ def encode(self, text):
68
+ return self(text)
libs/model/axial_rope.py ADDED
@@ -0,0 +1,109 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch._dynamo
5
+ from torch import nn
6
+
7
+ from . import flags
8
+
9
+ if flags.get_use_compile():
10
+ torch._dynamo.config.suppress_errors = True
11
+
12
+
13
+ def rotate_half(x):
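+ # Rotate channel pairs (x_even, x_odd) -> (-x_odd, x_even), the 90-degree rotation used by rotary embeddings.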
14
+ x1, x2 = x[..., 0::2], x[..., 1::2]
15
+ x = torch.stack((-x2, x1), dim=-1)
16
+ *shape, d, r = x.shape
17
+ return x.view(*shape, d * r)
18
+
19
+
20
+ @flags.compile_wrap
21
+ def apply_rotary_emb(freqs, t, start_index=0, scale=1.0):
22
+ freqs = freqs.to(t)
23
+ rot_dim = freqs.shape[-1]
24
+ end_index = start_index + rot_dim
25
+ assert rot_dim <= t.shape[-1], f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}"
26
+ t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:]
27
+ t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
28
+ return torch.cat((t_left, t, t_right), dim=-1)
29
+
30
+
31
+ def centers(start, stop, num, dtype=None, device=None):
32
+ edges = torch.linspace(start, stop, num + 1, dtype=dtype, device=device)
33
+ return (edges[:-1] + edges[1:]) / 2
34
+
35
+
36
+ def make_grid(h_pos, w_pos):
37
+ grid = torch.stack(torch.meshgrid(h_pos, w_pos, indexing='ij'), dim=-1)
38
+ h, w, d = grid.shape
39
+ return grid.view(h * w, d)
40
+
41
+
42
+ def bounding_box(h, w, pixel_aspect_ratio=1.0):
43
+ # Adjusted dimensions
44
+ w_adj = w
45
+ h_adj = h * pixel_aspect_ratio
46
+
47
+ # Adjusted aspect ratio
48
+ ar_adj = w_adj / h_adj
49
+
50
+ # Determine bounding box based on the adjusted aspect ratio
51
+ y_min, y_max, x_min, x_max = -1.0, 1.0, -1.0, 1.0
52
+ if ar_adj > 1:
53
+ y_min, y_max = -1 / ar_adj, 1 / ar_adj
54
+ elif ar_adj < 1:
55
+ x_min, x_max = -ar_adj, ar_adj
56
+
57
+ return y_min, y_max, x_min, x_max
58
+
59
+
60
+ def make_axial_pos(h, w, pixel_aspect_ratio=1.0, align_corners=False, dtype=None, device=None):
61
+ y_min, y_max, x_min, x_max = bounding_box(h, w, pixel_aspect_ratio)
62
+ if align_corners:
63
+ h_pos = torch.linspace(y_min, y_max, h, dtype=dtype, device=device)
64
+ w_pos = torch.linspace(x_min, x_max, w, dtype=dtype, device=device)
65
+ else:
66
+ h_pos = centers(y_min, y_max, h, dtype=dtype, device=device)
67
+ w_pos = centers(x_min, x_max, w, dtype=dtype, device=device)
68
+ return make_grid(h_pos, w_pos)
69
+
70
+
71
+ def freqs_pixel(max_freq=10.0):
72
+ def init(shape):
73
+ freqs = torch.linspace(1.0, max_freq / 2, shape[-1]) * math.pi
74
+ return freqs.log().expand(shape)
75
+ return init
76
+
77
+
78
+ def freqs_pixel_log(max_freq=10.0):
79
+ def init(shape):
80
+ log_min = math.log(math.pi)
81
+ log_max = math.log(max_freq * math.pi / 2)
82
+ return torch.linspace(log_min, log_max, shape[-1]).expand(shape)
83
+ return init
84
+
85
+
86
+ class AxialRoPE(nn.Module):
87
+ def __init__(self, dim, n_heads, start_index=0, freqs_init=freqs_pixel_log(max_freq=10.0)):
88
+ super().__init__()
89
+ self.n_heads = n_heads
90
+ self.start_index = start_index
91
+ log_freqs = freqs_init((n_heads, dim // 4))
92
+ self.freqs_h = nn.Parameter(log_freqs.clone())
93
+ self.freqs_w = nn.Parameter(log_freqs.clone())
94
+
95
+ def extra_repr(self):
96
+ dim = (self.freqs_h.shape[-1] + self.freqs_w.shape[-1]) * 2
97
+ return f"dim={dim}, n_heads={self.n_heads}, start_index={self.start_index}"
98
+
99
+ def get_freqs(self, pos):
100
+ if pos.shape[-1] != 2:
101
+ raise ValueError("input shape must be (..., 2)")
102
+ freqs_h = pos[..., None, None, 0] * self.freqs_h.exp()
103
+ freqs_w = pos[..., None, None, 1] * self.freqs_w.exp()
104
+ freqs = torch.cat((freqs_h, freqs_w), dim=-1).repeat_interleave(2, dim=-1)
105
+ return freqs.transpose(-2, -3)
106
+
107
+ def forward(self, x, pos):
108
+ freqs = self.get_freqs(pos)
109
+ return apply_rotary_emb(freqs, x, self.start_index)
libs/model/common_layers.py ADDED
@@ -0,0 +1,104 @@
1
+
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from torch import nn
5
+ from timm.models.layers import trunc_normal_
6
+
7
+ class Linear(nn.Linear):
8
+ def __init__(self, *args, **kwargs):
9
+ super().__init__(*args, **kwargs)
10
+ trunc_normal_(self.weight, mean = 0, std = 0.02)
11
+ if self.bias is not None:
12
+ nn.init.zeros_(self.bias)
13
+
14
+ class LayerNorm(nn.LayerNorm):
15
+ def __init__(self, *args, **kwargs):
16
+ super().__init__(*args, **kwargs)
17
+ trunc_normal_(self.weight, mean = 0, std = 0.02)
18
+ if self.bias is not None:
19
+ nn.init.zeros_(self.bias)
20
+
21
+ class Conv2d(nn.Conv2d):
22
+ def __init__(self, *args, **kwargs):
23
+ super().__init__(*args, **kwargs)
24
+ trunc_normal_(self.weight, mean = 0, std = 0.02)
25
+ if self.bias is not None:
26
+ nn.init.zeros_(self.bias)
27
+
28
+ class Embedding(nn.Embedding):
29
+ def __init__(self, *args, **kwargs):
30
+ super().__init__(*args, **kwargs)
31
+ trunc_normal_(self.weight, mean = 0, std = 0.02)
32
+
33
+ class ImageNorm(nn.Module):
34
+ def forward(self, x):
35
+ assert x.dim() == 4
36
+ eps = 1e-05
37
+ x = x / (x.var(dim = (1, 2, 3), keepdim = True) + eps).sqrt()
38
+ return x
39
+
40
+ class Flatten(nn.Module):
41
+ def forward(self, x):
42
+ B, H, W, C = x.shape
43
+ x = x.reshape(B, H * W, C)
44
+ return x
45
+
46
+ class ChannelLast(nn.Module):
47
+ def forward(self, x):
48
+ assert x.dim() == 4
49
+ x = x.permute(0, 2, 3, 1) # [B, H, W, C]
50
+ return x
51
+
52
+ class ChannelFirst(nn.Module):
53
+ def forward(self, x):
54
+ assert x.dim() == 4
55
+ x = x.permute(0, 3, 1, 2) # [B, C, H, W]
56
+ return x
57
+
58
+ class OddUpInterpolate(nn.Module):
59
+ def __init__(self, ratio):
60
+ super().__init__()
61
+ self.ratio = ratio
62
+
63
+ def forward(self, x):
64
+ if self.ratio == 1:
65
+ return x
66
+ assert x.dim() == 4
67
+ B, C, H, W = x.shape
68
+ x = F.interpolate(x, size = ((H - 1) * self.ratio + 1, (W - 1) * self.ratio + 1), mode = "bilinear", align_corners = True)
69
+ return x
70
+
71
+ def __repr__(self):
72
+ return f"UpInterpolate(ratio={self.ratio})"
73
+
74
+ class OddDownInterpolate(nn.Module):
75
+ def __init__(self, ratio):
76
+ super().__init__()
77
+ self.ratio = ratio
78
+
79
+ def forward(self, x):
80
+ if self.ratio == 1:
81
+ return x
82
+ assert x.dim() == 4
83
+ B, C, H, W = x.shape
84
+ x = F.interpolate(x, size = ((H - 1) // self.ratio + 1, (W - 1) // self.ratio + 1), mode = "area")
85
+ return x
86
+
87
+ def __repr__(self):
88
+ return f"DownInterpolate(ratio={self.ratio})"
89
+
90
+ class EvenDownInterpolate(nn.Module):
91
+ def __init__(self, ratio):
92
+ super().__init__()
93
+ self.ratio = ratio
94
+
95
+ def forward(self, x):
96
+ if self.ratio == 1:
97
+ return x
98
+ assert len(x.shape) == 4
99
+ B, C, H, W = x.shape
100
+ x = F.interpolate(x, size = (H // self.ratio, W // self.ratio), mode = "area")
101
+ return x
102
+
103
+ def __repr__(self):
104
+ return f"DownInterpolate(ratio={self.ratio})"
libs/model/dimr_t2i.py ADDED
@@ -0,0 +1,443 @@
1
+ from re import A
2
+ import torch
3
+ import torch.nn as nn
4
+ import torchvision.transforms as transforms
5
+ import math
6
+ import einops
7
+ import torch.utils.checkpoint
8
+ from functools import partial
9
+ import open_clip
10
+ import numpy as np
11
+ from PIL import Image
12
+
13
+ import torch.nn.functional as F
14
+ import timm
15
+ from timm.models.layers import trunc_normal_, Mlp
16
+ from .sigmoid.module import LayerNorm, RMSNorm, AdaRMSNorm, TDRMSNorm, QKNorm, TimeDependentParameter
17
+ from .common_layers import Linear, EvenDownInterpolate, ChannelFirst, ChannelLast, Embedding
18
+ from .axial_rope import AxialRoPE, make_axial_pos
19
+ from .trans_autoencoder import TransEncoder, Adaptor
20
+
21
+ def check_zip(*args):
22
+ args = [list(arg) for arg in args]
23
+ length = len(args[0])
24
+ for arg in args:
25
+ assert len(arg) == length
26
+ return zip(*args)
27
+
28
+ class PixelShuffleUpsample(nn.Module):
29
+ def __init__(self, dim_in, dim_out, ratio = 2):
30
+ super().__init__()
31
+ self.ratio = ratio
32
+ self.kernel = Linear(dim_in, dim_out * self.ratio * self.ratio)
33
+
34
+ def forward(self, x):
35
+ x = self.kernel(x)
36
+ B, H, W, C = x.shape
37
+ x = x.reshape(B, H, W, self.ratio, self.ratio, C // self.ratio // self.ratio)
38
+ x = x.transpose(2, 3)
39
+ x = x.reshape(B, H * self.ratio, W * self.ratio, C // self.ratio // self.ratio)
40
+ return x
41
+
42
+ class PositionEmbeddings(nn.Module):
43
+ def __init__(self, max_height, max_width, dim):
44
+ super().__init__()
45
+ self.max_height = max_height
46
+ self.max_width = max_width
47
+ self.position_embeddings = Embedding(self.max_height * self.max_width, dim)
48
+
49
+ def forward(self, x):
50
+ B, H, W, C = x.shape
51
+ height_idxes = torch.arange(H, device = x.device)[:, None].repeat(1, W)
52
+ width_idxes = torch.arange(W, device = x.device)[None, :].repeat(H, 1)
53
+ idxes = height_idxes * self.max_width + width_idxes
54
+ x = x + self.position_embeddings(idxes[None])
55
+ return x
56
+
57
+ class TextPositionEmbeddings(nn.Module):
58
+ def __init__(self, num_embeddings, embedding_dim):
59
+ super().__init__()
60
+ self.embedding = Embedding(num_embeddings, embedding_dim)
61
+
62
+ def forward(self, x):
63
+ batch_size, num_embeddings, embedding_dim = x.shape
64
+ # positions = torch.arange(height * width, device=x.device).reshape(1, height, width)
65
+ positions = torch.arange(num_embeddings, device=x.device).unsqueeze(0).expand(batch_size, num_embeddings)
66
+ x = x + self.embedding(positions)
67
+ return x
68
+
69
+
70
+ class MLPBlock(nn.Module):
71
+ def __init__(self, config):
72
+ super().__init__()
73
+ if config.norm_type == 'LN':
74
+ self.norm_type = 'LN'
75
+ self.norm = LayerNorm(config.dim)
76
+ elif config.norm_type == 'RMSN':
77
+ self.norm_type = 'RMSN'
78
+ self.norm = RMSNorm(config.dim)
79
+ elif config.norm_type == 'TDRMSN':
80
+ self.norm_type = 'TDRMSN'
81
+ self.norm = TDRMSNorm(config.dim)
82
+ elif config.norm_type == 'ADARMSN':
83
+ self.norm_type = 'ADARMSN'
84
+ self.norm = AdaRMSNorm(config.dim, config.dim)
85
+ self.act = nn.GELU()
86
+ self.w0 = Linear(config.dim, config.hidden_dim)
87
+ self.w1 = Linear(config.dim, config.hidden_dim)
88
+ self.w2 = Linear(config.hidden_dim, config.dim)
89
+
90
+ def forward(self, x):
91
+ if self.norm_type == 'LN' or self.norm_type == 'RMSN' or self.norm_type == 'TDRMSN':
92
+ x = self.norm(x)
93
+ elif self.norm_type == 'ADARMSN':
94
+ condition = x[:,0]
95
+ x = self.norm(x, condition)
96
+ x = self.act(self.w0(x)) * self.w1(x)
97
+ x = self.w2(x)
98
+ return x
99
+
100
+ class SelfAttention(nn.Module):
101
+ def __init__(self, config):
102
+ super().__init__()
103
+ assert config.dim % config.num_attention_heads == 0
104
+
105
+ self.num_heads = config.num_attention_heads
106
+ self.head_dim = config.dim // config.num_attention_heads
107
+
108
+ if hasattr(config, "self_att_prompt") and config.self_att_prompt:
109
+ self.condition_key_value = Linear(config.clip_dim, 2 * config.dim, bias = False)
110
+
111
+ if config.norm_type == 'LN':
112
+ self.norm_type = 'LN'
113
+ self.norm = LayerNorm(config.dim)
114
+ elif config.norm_type == 'RMSN':
115
+ self.norm_type = 'RMSN'
116
+ self.norm = RMSNorm(config.dim)
117
+ elif config.norm_type == 'TDRMSN':
118
+ self.norm_type = 'TDRMSN'
119
+ self.norm = TDRMSNorm(config.dim)
120
+ elif config.norm_type == 'ADARMSN':
121
+ self.norm_type = 'ADARMSN'
122
+ self.norm = AdaRMSNorm(config.dim, config.dim)
123
+
124
+ self.pe_type = config.pe_type
125
+ if config.pe_type == 'Axial_RoPE':
126
+ self.pos_emb = AxialRoPE(self.head_dim, self.num_heads)
127
+ self.qk_norm = QKNorm(self.num_heads)
128
+
129
+ self.query_key_value = Linear(config.dim, 3 * config.dim, bias = False)
130
+ self.dense = Linear(config.dim, config.dim)
131
+
132
+ def forward(self, x, condition_embeds, condition_masks, pos=None):
133
+ B, N, C = x.shape
134
+
135
+ if self.norm_type == 'LN' or self.norm_type == 'RMSN' or self.norm_type == 'TDRMSN':
136
+ qkv = self.query_key_value(self.norm(x))
137
+ elif self.norm_type == 'ADARMSN':
138
+ condition = x[:,0]
139
+ qkv = self.query_key_value(self.norm(x, condition))
140
+ q, k, v = qkv.reshape(B, N, 3 * self.num_heads, self.head_dim).permute(0, 2, 1, 3).float().chunk(3, dim = 1)
141
+
142
+ if self.pe_type == 'Axial_RoPE':
143
+ q = self.pos_emb(self.qk_norm(q), pos)
144
+ k = self.pos_emb(self.qk_norm(k), pos)
145
+
146
+ if condition_embeds is not None:
147
+ _, L, D = condition_embeds.shape
148
+ kcvc = self.condition_key_value(condition_embeds)
149
+ kc, vc = kcvc.reshape(B, L, 2 * self.num_heads, self.head_dim).permute(0, 2, 1, 3).float().chunk(2, dim = 1)
150
+ k = torch.cat([k, kc], dim = 2)
151
+ v = torch.cat([v, vc], dim = 2)
152
+ mask = torch.cat([torch.ones(B, N, dtype = torch.bool, device = condition_masks.device), condition_masks], dim = -1)
153
+ mask = mask[:, None, None, :]
154
+ else:
155
+ mask = None
156
+
157
+ x = F.scaled_dot_product_attention(q, k, v, attn_mask = mask)
158
+ x = self.dense(x.permute(0, 2, 1, 3).reshape(B, N, C))
159
+
160
+ return x
161
+
162
+ class TransformerBlock(nn.Module):
163
+ def __init__(self, config):
164
+ super().__init__()
165
+ self.block1 = SelfAttention(config)
166
+ self.block2 = MLPBlock(config)
167
+ self.dropout = nn.Dropout(config.dropout_prob)
168
+ self.gradient_checking = config.gradient_checking
169
+
170
+ def forward(self, x, condition_embeds, condition_masks, pos):
171
+ if self.gradient_checking:
172
+ return torch.utils.checkpoint.checkpoint(self._forward, x, condition_embeds, condition_masks, pos)
173
+ else:
174
+ return self._forward(x, condition_embeds, condition_masks, pos)
175
+
176
+ def _forward(self, x, condition_embeds, condition_masks, pos):
177
+ x = x + self.dropout(self.block1(x, condition_embeds, condition_masks, pos))
178
+ x = x + self.dropout(self.block2(x))
179
+ return x
180
+
181
+ class ConvNeXtBlock(nn.Module):
182
+ def __init__(self, config):
183
+ super().__init__()
184
+ self.block1 = nn.Sequential(
185
+ ChannelFirst(),
186
+ nn.Conv2d(config.dim, config.dim, kernel_size = config.kernel_size, padding = config.kernel_size // 2, stride = 1, groups = config.dim),
187
+ ChannelLast()
188
+ )
189
+ self.block2 = MLPBlock(config)
190
+ self.dropout = nn.Dropout(config.dropout_prob)
191
+ self.gradient_checking = config.gradient_checking
192
+
193
+ def forward(self, x, condition_embeds, condition_masks, pos):
194
+ if self.gradient_checking:
195
+ return torch.utils.checkpoint.checkpoint(self._forward, x)
196
+ else:
197
+ return self._forward(x)
198
+
199
+ def _forward(self, x):
200
+ x = x + self.dropout(self.block1(x))
201
+ x = x + self.dropout(self.block2(x))
202
+ return x
203
+
204
+
205
+ class Stage(nn.Module):
206
+ def __init__(self, channels, config, lowres_dim = None, lowres_height = None):
207
+ super().__init__()
208
+ if config.block_type == "TransformerBlock":
209
+ self.encoder_cls = TransformerBlock
210
+ elif config.block_type == "ConvNeXtBlock":
211
+ self.encoder_cls = ConvNeXtBlock
212
+ else:
213
+ raise Exception()
214
+
215
+ self.pe_type = config.pe_type
216
+
217
+ self.input_layer = nn.Sequential(
218
+ EvenDownInterpolate(config.image_input_ratio),
219
+ nn.Conv2d(channels, config.dim, kernel_size = config.input_feature_ratio, stride = config.input_feature_ratio),
220
+ ChannelLast(),
221
+ PositionEmbeddings(config.max_height, config.max_width, config.dim)
222
+ )
223
+
224
+
225
+ if lowres_dim is not None:
226
+ ratio = config.max_height // lowres_height
227
+ self.upsample = nn.Sequential(
228
+ LayerNorm(lowres_dim),
229
+ PixelShuffleUpsample(lowres_dim, config.dim, ratio = ratio),
230
+ LayerNorm(config.dim),
231
+ )
232
+
233
+ self.blocks = nn.ModuleList([self.encoder_cls(config) for _ in range(config.num_blocks // 2 * 2 + 1)])
234
+ self.skip_denses = nn.ModuleList([Linear(config.dim * 2, config.dim) for _ in range(config.num_blocks // 2)])
235
+
236
+ self.output_layer = nn.Sequential(
237
+ LayerNorm(config.dim),
238
+ ChannelFirst(),
239
+ nn.Conv2d(config.dim, channels, kernel_size = config.final_kernel_size, padding = config.final_kernel_size // 2),
240
+ )
241
+
242
+ self.tensor_true = torch.nn.Parameter(torch.tensor([-1.0])) if self.encoder_cls is TransformerBlock else None
243
+ self.tensor_false = torch.nn.Parameter(torch.tensor([1.0])) if self.encoder_cls is TransformerBlock else None
244
+
245
+
246
+
247
+
248
+ def forward(self, images, lowres_skips = None, condition_context = None, condition_embeds = None, condition_masks = None, null_indicator=None):
249
+ if self.pe_type == 'Axial_RoPE' and self.encoder_cls is TransformerBlock:
250
+ x = self.input_layer(images)
251
+ _, H, W, _ = x.shape
252
+ pos = make_axial_pos(H, W)
253
+ else:
254
+ x = self.input_layer(images)
255
+ pos = None
256
+
257
+ if lowres_skips is not None:
258
+ x = x + self.upsample(lowres_skips)
259
+
260
+ if self.encoder_cls is TransformerBlock:
261
+ B, H, W, C = x.shape
262
+ x = x.reshape(B, H * W, C)
263
+
264
+ if null_indicator is not None:
265
+ indicator_tensor = torch.where(null_indicator, self.tensor_true, self.tensor_false)
266
+ indicator_tensor = indicator_tensor.view(B, 1, 1).expand(-1, -1, C)
267
+
268
+ x = torch.cat([indicator_tensor, x], dim = 1)
269
+
270
+ external_skips = [x]
271
+
272
+ num_blocks = len(self.blocks)
273
+ in_blocks = self.blocks[:(num_blocks // 2)]
274
+ mid_block = self.blocks[(num_blocks // 2)]
275
+ out_blocks = self.blocks[(num_blocks // 2 + 1):]
276
+
277
+ skips = []
278
+ for block in in_blocks:
279
+ x = block(x, condition_embeds, condition_masks, pos=pos)
280
+ external_skips.append(x)
281
+ skips.append(x)
282
+
283
+ x = mid_block(x, condition_embeds, condition_masks, pos=pos)
284
+ external_skips.append(x)
285
+
286
+ for dense, block in check_zip(self.skip_denses, out_blocks):
287
+ x = dense(torch.cat([x, skips.pop()], dim = -1))
288
+ x = block(x, condition_embeds, condition_masks, pos=pos)
289
+ external_skips.append(x)
290
+
291
+ if self.encoder_cls is TransformerBlock:
292
+
293
+ if null_indicator is not None:
294
+ x = x[:, 1:, :]
295
+ external_skips = [skip[:, 1:, :] for skip in external_skips]
296
+
297
+ x = x.reshape(B, H, W, C)
298
+ external_skips = [skip.reshape(B, H, W, C) for skip in external_skips]
299
+
300
+ output = self.output_layer(x)
301
+
302
+ return output, external_skips
303
+
304
+
305
+ class MRModel(nn.Module):
306
+ def __init__(self, config):
307
+ super().__init__()
308
+ self.channels = config.channels
309
+ self.block_grad_to_lowres = config.block_grad_to_lowres
310
+
311
+ for stage_config in config.stage_configs:
312
+ if hasattr(config, "use_t2i"):
313
+ stage_config.use_t2i = config.use_t2i
314
+ if hasattr(config, "clip_dim"):
315
+ stage_config.clip_dim = config.clip_dim
316
+ if hasattr(config, "num_clip_token"):
317
+ stage_config.num_clip_token = config.num_clip_token
318
+ if hasattr(config, "gradient_checking"):
319
+ stage_config.gradient_checking = config.gradient_checking
320
+ if hasattr(config, "pe_type"):
321
+ stage_config.pe_type = config.pe_type
322
+ else:
323
+ stage_config.pe_type = 'APE'
324
+ if hasattr(config, "norm_type"):
325
+ stage_config.norm_type = config.norm_type
326
+ else:
327
+ stage_config.norm_type = 'LN'
328
+
329
+
330
+ #### diffusion model
331
+ if hasattr(config, "not_training_diff") and config.not_training_diff:
332
+ self.has_diff = False
333
+ else:
334
+ self.has_diff = True
335
+
336
+ lowres_dims = [None] + [stage_config.dim * (stage_config.num_blocks // 2 * 2 + 2) for stage_config in config.stage_configs[:-1]]
337
+ lowres_heights = [None] + [stage_config.max_height for stage_config in config.stage_configs[:-1]]
338
+ self.stages = nn.ModuleList([
339
+ Stage(self.channels, stage_config, lowres_dim = lowres_dim, lowres_height=lowres_height)
340
+ for stage_config, lowres_dim, lowres_height in check_zip(config.stage_configs, lowres_dims, lowres_heights)]
341
+ )
342
+
343
+
344
+ #### Text VE
345
+ if hasattr(config.textVAE, "num_down_sample_block"):
346
+ down_sample_block = config.textVAE.num_down_sample_block
347
+ else:
348
+ down_sample_block = 3
349
+
350
+ self.context_encoder = TransEncoder(d_model=config.clip_dim, N=config.textVAE.num_blocks, num_token=config.num_clip_token,
351
+ head_num=config.textVAE.num_attention_heads, d_ff=config.textVAE.hidden_dim,
352
+ latten_size=config.channels*config.stage_configs[-1].max_height*config.stage_configs[-1].max_width * 2,
353
+ down_sample_block=down_sample_block, dropout=config.textVAE.dropout_prob, last_norm=False)
354
+
355
+
356
+
357
+ #### image encoder to train VE
358
+ self.open_clip, _, self.open_clip_preprocess = open_clip.create_model_and_transforms('ViT-L-16-SigLIP-256', pretrained=None)
359
+ if config.stage_configs[-1].max_width==32:
360
+ # for 256px generation
361
+ self.open_clip_output = Mlp(in_features=1024,
362
+ hidden_features=config.channels*config.stage_configs[-1].max_height*config.stage_configs[-1].max_width,
363
+ out_features=config.channels*config.stage_configs[-1].max_height*config.stage_configs[-1].max_width,
364
+ norm_layer=nn.LayerNorm,
365
+ )
366
+ else:
367
+ # for 512px generation
368
+ self.open_clip_output = Adaptor(input_dim=1024,
369
+ tar_dim=config.channels*config.stage_configs[-1].max_height*config.stage_configs[-1].max_width
370
+ )
371
+ del self.open_clip.text
372
+ del self.open_clip.logit_bias
373
+
374
+
375
+ def _forward(self, images, log_snr, condition_context = None, condition_text_embeds = None, condition_text_masks = None, condition_drop_prob = None, null_indicator=None):
376
+ if self.has_diff:
377
+ TimeDependentParameter.seed_time(self, log_snr)
378
+
379
+ assert condition_context is None
380
+ assert condition_text_embeds is None
381
+
382
+ if condition_text_embeds is not None:
383
+ condition_embeds = self.text_conditioning(condition_text_embeds)
384
+ condition_masks = condition_text_masks
385
+ else:
386
+ condition_embeds = None
387
+ condition_masks = None
388
+
389
+ outputs = []
390
+ lowres_skips = None
391
+ for stage in self.stages:
392
+ output, lowres_skips = stage(images, lowres_skips = lowres_skips, condition_context = condition_context, condition_embeds = condition_embeds, condition_masks = condition_masks, null_indicator=null_indicator)
393
+ outputs.append(output)
394
+ lowres_skips = torch.cat(lowres_skips, dim = -1)
395
+ if self.block_grad_to_lowres:
396
+ lowres_skips = lowres_skips.detach()
397
+
398
+ return outputs
399
+
400
+ else:
401
+ return [images]
402
+
403
+
404
+ def _reparameterize(self, mu, logvar):
405
+ std = torch.exp(0.5 * logvar)
406
+ eps = torch.randn_like(std)
407
+ return eps * std + mu
408
+
409
+ def _text_encoder(self, condition_context, tar_shape, mask):
410
+
411
+ output = self.context_encoder(condition_context, mask)
412
+ mu, log_var = torch.chunk(output, 2, dim=-1)
413
+
414
+ z = self._reparameterize(mu, log_var)
415
+
416
+ return [z, mu, log_var]
417
+
418
+ def _text_decoder(self, condition_enbedding, tar_shape):
419
+
420
+ context_token = self.context_decoder(condition_enbedding)
421
+
422
+ return context_token
423
+
424
+ def _img_clip(self, image_input):
425
+
426
+ image_latent = self.open_clip.encode_image(image_input)
427
+ image_latent = self.open_clip_output(image_latent)
428
+
429
+ return image_latent, self.open_clip.logit_scale
430
+
431
+
432
+
433
+ def forward(self, x, t = None, log_snr = None, text_encoder=False, text_decoder=False, image_clip=False, shape=None, mask=None, null_indicator=None):
434
+ if text_encoder:
435
+ return self._text_encoder(condition_context = x, tar_shape=shape, mask=mask)
436
+ elif text_decoder:
437
+ return self._text_decoder(condition_enbedding = x, tar_shape=shape) # mask is not needed for decoder
438
+ elif image_clip:
439
+ return self._img_clip(image_input = x)
440
+ else:
441
+ assert log_snr.dtype == torch.float32
442
+ return self._forward(images = x, log_snr = log_snr, null_indicator=null_indicator)
443
+
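A minimal sketch of the reparameterization step used by _text_encoder above, assuming nothing beyond torch: the context encoder's output is split along the last dimension into mu and log_var, and a latent is sampled as z = mu + exp(0.5 * log_var) * eps.

import torch

def reparameterize(mu, logvar):
    # z = mu + sigma * eps, with sigma = exp(0.5 * logvar)
    std = torch.exp(0.5 * logvar)
    eps = torch.randn_like(std)
    return eps * std + mu

# illustrative shapes only: batch of 2, encoder output dim 8 -> mu / log_var of dim 4 each
output = torch.randn(2, 8)
mu, log_var = torch.chunk(output, 2, dim=-1)
z = reparameterize(mu, log_var)
assert z.shape == mu.shape == (2, 4)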
libs/model/dit_t2i.py ADDED
@@ -0,0 +1,405 @@
1
+ # DiT: https://github.com/facebookresearch/DiT/blob/main/models.py
2
+ # --------------------------------------------------------
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import numpy as np
7
+ import math
8
+ from timm.models.vision_transformer import PatchEmbed, Attention, Mlp
9
+
10
+ import open_clip
11
+ import torch.utils.checkpoint
12
+
13
+ from .trans_autoencoder import TransEncoder, Adaptor
14
+
15
+
16
+ def modulate(x, shift, scale):
17
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
18
+
19
+
20
+ #################################################################################
21
+ # Embedding Layers for Timesteps and Class Labels #
22
+ #################################################################################
23
+
24
+ class TimestepEmbedder(nn.Module):
25
+ """
26
+ Embeds scalar timesteps into vector representations.
27
+ """
28
+ def __init__(self, hidden_size, frequency_embedding_size=256):
29
+ super().__init__()
30
+ self.mlp = nn.Sequential(
31
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
32
+ nn.SiLU(),
33
+ nn.Linear(hidden_size, hidden_size, bias=True),
34
+ )
35
+ self.frequency_embedding_size = frequency_embedding_size
36
+
37
+ @staticmethod
38
+ def timestep_embedding(t, dim, max_period=10000):
39
+ """
40
+ Create sinusoidal timestep embeddings.
41
+ :param t: a 1-D Tensor of N indices, one per batch element.
42
+ These may be fractional.
43
+ :param dim: the dimension of the output.
44
+ :param max_period: controls the minimum frequency of the embeddings.
45
+ :return: an (N, D) Tensor of positional embeddings.
46
+ """
47
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
48
+ half = dim // 2
49
+ freqs = torch.exp(
50
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
51
+ ).to(device=t.device)
52
+ args = t[:, None].float() * freqs[None]
53
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
54
+ if dim % 2:
55
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
56
+ return embedding
57
+
58
+ def forward(self, t):
59
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
60
+ t_emb = self.mlp(t_freq)
61
+ return t_emb
62
+
63
+
64
+ class LabelEmbedder(nn.Module):
65
+ """
66
+ CrossFlow: updated for classifier-free guidance (CFG) with a null indicator.
67
+ """
68
+ def __init__(self, num_classes, hidden_size):
69
+ super().__init__()
70
+ self.embedding_table = nn.Embedding(num_classes, hidden_size)
71
+
72
+ def forward(self, labels):
73
+ embeddings = self.embedding_table(labels.int())
74
+ return embeddings
75
+
76
+
77
+ #################################################################################
78
+ # Core DiT Model #
79
+ #################################################################################
80
+
81
+ class DiTBlock(nn.Module):
82
+ """
83
+ A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.
84
+ """
85
+ def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
86
+ super().__init__()
87
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
88
+ self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
89
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
90
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
91
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
92
+ self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
93
+ self.adaLN_modulation = nn.Sequential(
94
+ nn.SiLU(),
95
+ nn.Linear(hidden_size, 6 * hidden_size, bias=True)
96
+ )
97
+
98
+ def forward(self, x, c):
99
+ return torch.utils.checkpoint.checkpoint(self._forward, x, c)
100
+ # return self._forward(x, c)
101
+
102
+ def _forward(self, x, c):
103
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
104
+ x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
105
+ x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
106
+ return x
107
+
108
+
109
+ class FinalLayer(nn.Module):
110
+ """
111
+ The final layer of DiT.
112
+ """
113
+ def __init__(self, hidden_size, patch_size, out_channels):
114
+ super().__init__()
115
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
116
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
117
+ self.adaLN_modulation = nn.Sequential(
118
+ nn.SiLU(),
119
+ nn.Linear(hidden_size, 2 * hidden_size, bias=True)
120
+ )
121
+
122
+ def forward(self, x, c):
123
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
124
+ x = modulate(self.norm_final(x), shift, scale)
125
+ x = self.linear(x)
126
+ return x
127
+
128
+
129
+ class DiT(nn.Module):
130
+ """
131
+ Diffusion model with a Transformer backbone.
132
+ """
133
+ def __init__(
134
+ self,
135
+ config,
136
+ patch_size=2,
137
+ hidden_size=1152,
138
+ depth=28,
139
+ num_heads=16,
140
+ mlp_ratio=4.0,
141
+ num_classes=2, # for cfg indicator
142
+ ):
143
+ super().__init__()
144
+ self.input_size = config.latent_size
145
+ self.learn_sigma = config.learn_sigma
146
+ self.in_channels = config.channels
147
+ self.out_channels = self.in_channels * 2 if self.learn_sigma else self.in_channels
148
+ self.patch_size = patch_size
149
+ self.num_heads = num_heads
150
+
151
+ self.x_embedder = PatchEmbed(self.input_size, patch_size, self.in_channels, hidden_size, bias=True)
152
+ self.t_embedder = TimestepEmbedder(hidden_size)
153
+ self.y_embedder = LabelEmbedder(num_classes, hidden_size)
154
+ num_patches = self.x_embedder.num_patches
155
+ # Will use fixed sin-cos embedding:
156
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, hidden_size), requires_grad=False)
157
+
158
+ self.blocks = nn.ModuleList([
159
+ DiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)
160
+ ])
161
+ self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
162
+ self.initialize_weights()
163
+
164
+ ######### CrossFlow related
165
+ if hasattr(config.textVAE, "num_down_sample_block"):
166
+ down_sample_block = config.textVAE.num_down_sample_block
167
+ else:
168
+ down_sample_block = 3
169
+ self.context_encoder = TransEncoder(d_model=config.clip_dim, N=config.textVAE.num_blocks, num_token=config.num_clip_token,
170
+ head_num=config.textVAE.num_attention_heads, d_ff=config.textVAE.hidden_dim,
171
+ latten_size=config.channels * config.latent_size * config.latent_size * 2,
172
+ down_sample_block=down_sample_block, dropout=config.textVAE.dropout_prob, last_norm=False)
173
+
174
+
175
+ self.open_clip, _, self.open_clip_preprocess = open_clip.create_model_and_transforms('ViT-L-16-SigLIP-256', pretrained=None)
176
+ self.open_clip_output = Adaptor(input_dim=1024,
177
+ tar_dim=config.channels * config.latent_size * config.latent_size
178
+ )
179
+ del self.open_clip.text
180
+ del self.open_clip.logit_bias
181
+
182
+
183
+
184
+ def initialize_weights(self):
185
+ # Initialize transformer layers:
186
+ def _basic_init(module):
187
+ if isinstance(module, nn.Linear):
188
+ torch.nn.init.xavier_uniform_(module.weight)
189
+ if module.bias is not None:
190
+ nn.init.constant_(module.bias, 0)
191
+ self.apply(_basic_init)
192
+
193
+ # Initialize (and freeze) pos_embed by sin-cos embedding:
194
+ pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5))
195
+ self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
196
+
197
+ # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
198
+ w = self.x_embedder.proj.weight.data
199
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
200
+ nn.init.constant_(self.x_embedder.proj.bias, 0)
201
+
202
+ # Initialize label embedding table:
203
+ nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)
204
+
205
+ # Initialize timestep embedding MLP:
206
+ nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
207
+ nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
208
+
209
+ # Zero-out adaLN modulation layers in DiT blocks:
210
+ for block in self.blocks:
211
+ nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
212
+ nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
213
+
214
+ # Zero-out output layers:
215
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
216
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
217
+ nn.init.constant_(self.final_layer.linear.weight, 0)
218
+ nn.init.constant_(self.final_layer.linear.bias, 0)
219
+
220
+ def unpatchify(self, x):
221
+ """
222
+ x: (N, T, patch_size**2 * C)
223
+ imgs: (N, H, W, C)
224
+ """
225
+ c = self.out_channels
226
+ p = self.x_embedder.patch_size[0]
227
+ h = w = int(x.shape[1] ** 0.5)
228
+ assert h * w == x.shape[1]
229
+
230
+ x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
231
+ x = torch.einsum('nhwpqc->nchpwq', x)
232
+ imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
233
+ return imgs
234
+
235
+ def _forward(self, x, t, null_indicator):
236
+ """
237
+ Forward pass of DiT.
238
+ x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
239
+ t: (N,) tensor of diffusion timesteps
240
+ """
241
+ x = self.x_embedder(x) + self.pos_embed # (N, T, D), where T = H * W / patch_size ** 2
242
+ t = self.t_embedder(t) # (N, D)
243
+ y = self.y_embedder(null_indicator) # (N, D)
244
+ c = t + y # (N, D)
245
+ for block in self.blocks:
246
+ x = block(x, c) # (N, T, D)
247
+ x = self.final_layer(x, c) # (N, T, patch_size ** 2 * out_channels)
248
+ x = self.unpatchify(x) # (N, out_channels, H, W)
249
+ return [x]
250
+
251
+ def _forward_with_cfg(self, x, t, cfg_scale):
252
+ """
253
+ Forward pass of DiT, but also batches the unconditional forward pass for classifier-free guidance.
254
+ """
255
+ # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
256
+ half = x[: len(x) // 2]
257
+ combined = torch.cat([half, half], dim=0)
258
+ model_out = self.forward(combined, t)
259
+ # For exact reproducibility reasons, we apply classifier-free guidance on only
260
+ # three channels by default. The standard approach to cfg applies it to all channels.
261
+ # This can be done by uncommenting the following line and commenting-out the line following that.
262
+ # eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
263
+ eps, rest = model_out[:, :3], model_out[:, 3:]
264
+ cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
265
+ half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
266
+ eps = torch.cat([half_eps, half_eps], dim=0)
267
+ return torch.cat([eps, rest], dim=1)
268
+
269
+ def _reparameterize(self, mu, logvar):
270
+ std = torch.exp(0.5 * logvar)
271
+ eps = torch.randn_like(std)
272
+ return eps * std + mu
273
+
274
+ def _text_encoder(self, condition_context, tar_shape, mask):
275
+
276
+ output = self.context_encoder(condition_context, mask)
277
+ mu, log_var = torch.chunk(output, 2, dim=-1)
278
+ z = self._reparameterize(mu, log_var)
279
+
280
+ return [z, mu, log_var]
281
+
282
+ def _img_clip(self, image_input):
283
+
284
+ image_latent = self.open_clip.encode_image(image_input)
285
+ image_latent = self.open_clip_output(image_latent)
286
+
287
+ return image_latent, self.open_clip.logit_scale
288
+
289
+ def forward(self, x, t = None, log_snr = None, text_encoder=False, text_decoder=False, image_clip=False, shape=None, mask=None, null_indicator=None):
290
+ if text_encoder:
291
+ return self._text_encoder(condition_context = x, tar_shape=shape, mask=mask)
292
+ elif text_decoder:
293
+ raise NotImplementedError
294
+ return self._text_decoder(condition_enbedding = x, tar_shape=shape) # mask is not needed for decoder
295
+ elif image_clip:
296
+ return self._img_clip(image_input = x)
297
+ else:
298
+ return self._forward(x = x, t = t, null_indicator=null_indicator)
299
+
300
+
301
+ #################################################################################
302
+ # Sine/Cosine Positional Embedding Functions #
303
+ #################################################################################
304
+ # https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
305
+
306
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
307
+ """
308
+ grid_size: int of the grid height and width
309
+ return:
310
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
311
+ """
312
+ grid_h = np.arange(grid_size, dtype=np.float32)
313
+ grid_w = np.arange(grid_size, dtype=np.float32)
314
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
315
+ grid = np.stack(grid, axis=0)
316
+
317
+ grid = grid.reshape([2, 1, grid_size, grid_size])
318
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
319
+ if cls_token and extra_tokens > 0:
320
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
321
+ return pos_embed
322
+
323
+
324
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
325
+ assert embed_dim % 2 == 0
326
+
327
+ # use half of dimensions to encode grid_h
328
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
329
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
330
+
331
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
332
+ return emb
333
+
334
+
335
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
336
+ """
337
+ embed_dim: output dimension for each position
338
+ pos: a list of positions to be encoded: size (M,)
339
+ out: (M, D)
340
+ """
341
+ assert embed_dim % 2 == 0
342
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
343
+ omega /= embed_dim / 2.
344
+ omega = 1. / 10000**omega # (D/2,)
345
+
346
+ pos = pos.reshape(-1) # (M,)
347
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
348
+
349
+ emb_sin = np.sin(out) # (M, D/2)
350
+ emb_cos = np.cos(out) # (M, D/2)
351
+
352
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
353
+ return emb
354
+
355
+
356
+ #################################################################################
357
+ # DiT Configs #
358
+ #################################################################################
359
+
360
+ def DiT_H_2(config, **kwargs):
361
+ return DiT(config=config, depth=36, hidden_size=1280, patch_size=2, num_heads=20, **kwargs)
362
+
363
+ def DiT_XL_2(config, **kwargs):
364
+ return DiT(config=config, depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs)
365
+
366
+ def DiT_XL_4(config, **kwargs):
367
+ return DiT(config=config, depth=28, hidden_size=1152, patch_size=4, num_heads=16, **kwargs)
368
+
369
+ def DiT_XL_8(config, **kwargs):
370
+ return DiT(config=config, depth=28, hidden_size=1152, patch_size=8, num_heads=16, **kwargs)
371
+
372
+ def DiT_L_2(config, **kwargs):
373
+ return DiT(config=config, depth=24, hidden_size=1024, patch_size=2, num_heads=16, **kwargs)
374
+
375
+ def DiT_L_4(config, **kwargs):
376
+ return DiT(config=config, depth=24, hidden_size=1024, patch_size=4, num_heads=16, **kwargs)
377
+
378
+ def DiT_L_8(config, **kwargs):
379
+ return DiT(config=config, depth=24, hidden_size=1024, patch_size=8, num_heads=16, **kwargs)
380
+
381
+ def DiT_B_2(config, **kwargs):
382
+ return DiT(config=config, depth=12, hidden_size=768, patch_size=2, num_heads=12, **kwargs)
383
+
384
+ def DiT_B_4(config, **kwargs):
385
+ return DiT(config=config, depth=12, hidden_size=768, patch_size=4, num_heads=12, **kwargs)
386
+
387
+ def DiT_B_8(config, **kwargs):
388
+ return DiT(config=config, depth=12, hidden_size=768, patch_size=8, num_heads=12, **kwargs)
389
+
390
+ def DiT_S_2(config, **kwargs):
391
+ return DiT(config=config, depth=12, hidden_size=384, patch_size=2, num_heads=6, **kwargs)
392
+
393
+ def DiT_S_4(config, **kwargs):
394
+ return DiT(config=config, depth=12, hidden_size=384, patch_size=4, num_heads=6, **kwargs)
395
+
396
+ def DiT_S_8(config, **kwargs):
397
+ return DiT(config=config, depth=12, hidden_size=384, patch_size=8, num_heads=6, **kwargs)
398
+
399
+
400
+ DiT_models = {
401
+ 'DiT-XL/2': DiT_XL_2, 'DiT-XL/4': DiT_XL_4, 'DiT-XL/8': DiT_XL_8,
402
+ 'DiT-L/2': DiT_L_2, 'DiT-L/4': DiT_L_4, 'DiT-L/8': DiT_L_8,
403
+ 'DiT-B/2': DiT_B_2, 'DiT-B/4': DiT_B_4, 'DiT-B/8': DiT_B_8,
404
+ 'DiT-S/2': DiT_S_2, 'DiT-S/4': DiT_S_4, 'DiT-S/8': DiT_S_8,
405
+ }
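A minimal check of the fixed sin/cos positional-embedding helpers defined above, with the import path assumed from the file name: for a 32 x 32 latent and patch size 2 there are 16 x 16 patches, so the table has 256 rows that initialize_weights copies into self.pos_embed.

import numpy as np
from libs.model.dit_t2i import get_2d_sincos_pos_embed   # assumed import path

hidden_size, grid = 1152, 16                  # DiT-XL/2 hidden size, 16 x 16 patch grid
pos = get_2d_sincos_pos_embed(hidden_size, grid)
assert pos.shape == (grid * grid, hidden_size)
assert np.abs(pos).max() <= 1.0               # entries are pure sin / cos values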
libs/model/flags.py ADDED
@@ -0,0 +1,56 @@
1
+ from contextlib import contextmanager
2
+ from functools import update_wrapper
3
+ import os
4
+ import threading
5
+
6
+ import torch
7
+
8
+
9
+ def get_use_compile():
10
+ return os.environ.get("K_DIFFUSION_USE_COMPILE", "1") == "1"
11
+
12
+
13
+ def get_use_flash_attention_2():
14
+ return os.environ.get("K_DIFFUSION_USE_FLASH_2", "1") == "1"
15
+
16
+
17
+ state = threading.local()
18
+ state.checkpointing = False
19
+
20
+
21
+ @contextmanager
22
+ def checkpointing(enable=True):
23
+ try:
24
+ old_checkpointing, state.checkpointing = state.checkpointing, enable
25
+ yield
26
+ finally:
27
+ state.checkpointing = old_checkpointing
28
+
29
+
30
+ def get_checkpointing():
31
+ return getattr(state, "checkpointing", False)
32
+
33
+
34
+ class compile_wrap:
35
+ def __init__(self, function, *args, **kwargs):
36
+ self.function = function
37
+ self.args = args
38
+ self.kwargs = kwargs
39
+ self._compiled_function = None
40
+ update_wrapper(self, function)
41
+
42
+ @property
43
+ def compiled_function(self):
44
+ if self._compiled_function is not None:
45
+ return self._compiled_function
46
+ if get_use_compile():
47
+ try:
48
+ self._compiled_function = torch.compile(self.function, *self.args, **self.kwargs)
49
+ except RuntimeError:
50
+ self._compiled_function = self.function
51
+ else:
52
+ self._compiled_function = self.function
53
+ return self._compiled_function
54
+
55
+ def __call__(self, *args, **kwargs):
56
+ return self.compiled_function(*args, **kwargs)
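A minimal usage sketch for the helpers in flags.py, assuming a torch build that provides torch.compile (otherwise set K_DIFFUSION_USE_COMPILE=0): compile_wrap compiles the wrapped function lazily on first call and falls back to the plain function if compilation raises a RuntimeError, while checkpointing(...) toggles a thread-local flag read by get_checkpointing().

import torch
from libs.model.flags import compile_wrap, checkpointing, get_checkpointing  # assumed import path

@compile_wrap
def gelu_mul(a, b):
    return torch.nn.functional.gelu(a) * b

x = torch.randn(4, 8)
assert gelu_mul(x, x).shape == x.shape        # first call compiles (or falls back), later calls reuse it

assert get_checkpointing() is False
with checkpointing():
    assert get_checkpointing() is True        # flag is only set inside the context
assert get_checkpointing() is False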
libs/model/sigmoid/kernel.py ADDED
@@ -0,0 +1,316 @@
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.utils.cpp_extension import load
5
+ import os
6
+ import time
7
+ import random
8
+ import math
9
+ from torch.utils.checkpoint import checkpoint
10
+ from torch.autograd import Function
11
+ from functools import partial
12
+ import warnings
13
+
14
+ # curr_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "extension")
15
+ # src_files = ['tdp.cu', 'torch_extension.cpp']
16
+ # src_files = [os.path.join(curr_path, file) for file in src_files]
17
+ # tdp = load('tdp', src_files, verbose = True)
18
+
19
+ # import tdp
20
+
21
+ def exported_tdp(param0, param1, weight, bias, times, custom = True):
22
+ original_shape = param0.shape
23
+ param0 = param0.reshape(-1)
24
+ param1 = param1.reshape(-1)
25
+ weight = weight.reshape(-1)
26
+ bias = bias.reshape(-1)
27
+ if custom and param0.shape[0] % 2 == 0:
28
+ result = TDP.apply(param0, param1, weight, bias, times)
29
+ else:
30
+ warnings.warn(f'Using slower tdp_torch implementation for a tensor with shape {param0.shape}')
31
+ result = tdp_torch(param0, param1, weight, bias, times)
32
+ result = result.reshape(*([times.shape[0]] + [d for d in original_shape]))
33
+ return result
34
+
35
+ class TDP(Function):
36
+ @staticmethod
37
+ def forward(ctx, param0, param1, weight, bias, times):
38
+ assert param0.shape[0] % 2 == 0
39
+ param0 = param0.contiguous()
40
+ param1 = param1.contiguous()
41
+ weight = weight.contiguous()
42
+ bias = bias.contiguous()
43
+ times = times.contiguous()
44
+ assert param0.shape[0] == param1.shape[0] and param0.shape[0] == weight.shape[0] and param0.shape[0] == bias.shape[0]
45
+ assert param0.dim() == 1 and param1.dim() == 1 and weight.dim() == 1 and bias.dim() == 1 and times.dim() == 1
46
+ ctx.save_for_backward(param0, param1, weight, bias, times)
47
+ return tdp_cuda(param0, param1, weight, bias, times)
48
+
49
+ @staticmethod
50
+ def backward(ctx, g_result):
51
+ g_result = g_result.contiguous()
52
+ param0, param1, weight, bias, times = ctx.saved_tensors
53
+ g_param0, g_param1, g_weight, g_bias = backward_tdp_cuda(param0, param1, weight, bias, times, g_result)
54
+ return g_param0, g_param1, g_weight, g_bias, None
55
+
56
+ def backward_tdp_torch(param0, param1, weight, bias, times, g_result):
57
+ param0 = param0[None]
58
+ param1 = param1[None]
59
+ weight = weight[None]
60
+ bias = bias[None]
61
+
62
+ a = times[:, None] * weight + bias
63
+ s = torch.sigmoid(a)
64
+ g_param0 = (s * g_result).sum(0)
65
+ g_param1 = ((1 - s) * g_result).sum(0)
66
+ g_s = (param0 - param1) * g_result
67
+ g_a = g_s * s * (1 - s)
68
+ g_weight = (g_a * times[:, None]).sum(0)
69
+ g_bias = g_a.sum(0)
70
+
71
+ return g_param0, g_param1, g_weight, g_bias
72
+
73
+ def backward_tdp_cuda(param0, param1, weight, bias, times, g_result):
74
+ g_param0 = torch.empty_like(param0)
75
+ g_param1 = torch.empty_like(param0)
76
+ g_weight = torch.empty_like(param0)
77
+ g_bias = torch.empty_like(param0)
78
+ if param0.dtype == torch.half:
79
+ tdp.backward_tdp_fp16(param0, param1, weight, bias, times, g_result, g_param0, g_param1, g_weight, g_bias)
80
+ elif param0.dtype == torch.float:
81
+ tdp.backward_tdp_fp32(param0, param1, weight, bias, times, g_result, g_param0, g_param1, g_weight, g_bias)
82
+ else:
83
+ raise NotImplementedError
84
+ return g_param0, g_param1, g_weight, g_bias
85
+
86
+ def tdp_torch(param0, param1, weight, bias, times):
87
+ a = torch.addcmul(bias[None], times[:, None], weight[None])
88
+ s = torch.sigmoid(a)
89
+ result = torch.addcmul(param1[None], s, param0[None] - param1[None])
90
+ return result
91
+
92
+ def tdp_cuda(param0, param1, weight, bias, times):
93
+ result = torch.empty(times.shape[0], param0.shape[0], dtype = param0.dtype, device = param0.device)
94
+ if param0.dtype == torch.half:
95
+ tdp.tdp_fp16(param0, param1, weight, bias, times, result)
96
+ elif param0.dtype == torch.float:
97
+ tdp.tdp_fp32(param0, param1, weight, bias, times, result)
98
+ else:
99
+ raise NotImplementedError
100
+ return result
101
+
102
+ def corrcoef(x, y):
103
+ return torch.corrcoef(torch.stack([x.reshape(-1).float(), y.reshape(-1).float()], dim = 0))[0, 1]
104
+
105
+ def tdp_cuda_unit_test():
106
+ print("***** tdp_cuda_unit_test *****")
107
+
108
+ batch_size = random.randrange(1, 128)
109
+ num_params = random.randrange(1, 1000000) * 2
110
+ print("batch_size", batch_size, "num_params", num_params)
111
+
112
+ param0 = torch.randn(num_params).cuda()
113
+ param1 = torch.randn(num_params).cuda()
114
+ weight = torch.randn(num_params).cuda()
115
+ bias = torch.randn(num_params).cuda()
116
+ times = torch.rand(batch_size).cuda()
117
+
118
+ ref = tdp_torch(param0, param1, weight, bias, times)
119
+
120
+ out = tdp_cuda(param0, param1, weight, bias, times)
121
+ print(corrcoef(ref, out), (ref - out).abs().max())
122
+
123
+ out = tdp_cuda(param0.half(), param1.half(), weight.half(), bias.half(), times.half()).float()
124
+ print(corrcoef(ref, out), (ref - out).abs().max())
125
+
126
+ def backward_tdp_cuda_unit_test():
127
+ print("***** backward_tdp_cuda_unit_test *****")
128
+
129
+ batch_size = random.randrange(1, 128)
130
+ num_params = random.randrange(1, 100000) * 2
131
+ print("batch_size", batch_size, "num_params", num_params)
132
+
133
+ param0 = torch.randn(num_params).cuda()
134
+ param1 = torch.randn(num_params).cuda()
135
+ weight = torch.randn(num_params).cuda()
136
+ bias = torch.randn(num_params).cuda()
137
+ times = torch.rand(batch_size).cuda()
138
+ g_result = torch.randn(batch_size, num_params).cuda()
139
+
140
+ refs = backward_tdp_torch(param0, param1, weight, bias, times, g_result)
141
+
142
+ outs = backward_tdp_cuda(param0, param1, weight, bias, times, g_result)
143
+ for r, o in zip(refs, outs):
144
+ print(corrcoef(r, o), (r - o).abs().max())
145
+
146
+ outs = backward_tdp_cuda(param0.half(), param1.half(), weight.half(), bias.half(), times.half(), g_result.half())
147
+ for r, o in zip(refs, outs):
148
+ print(corrcoef(r, o), (r - o).abs().max())
149
+
150
+ def autograd_unit_test():
151
+ print("***** autograd_unit_test *****")
152
+ batch_size = random.randrange(1, 128)
153
+ num_params = random.randrange(1, 100000) * 2
154
+ print("batch_size", batch_size, "num_params", num_params)
155
+
156
+ def get_outputs(fn):
157
+ torch.manual_seed(1)
158
+ param0 = torch.randn(num_params, requires_grad = True).cuda()
159
+ param1 = torch.randn(num_params, requires_grad = True).cuda()
160
+ weight = torch.randn(num_params, requires_grad = True).cuda()
161
+ bias = torch.randn(num_params, requires_grad = True).cuda()
162
+ times = torch.rand(batch_size).cuda()
163
+
164
+ out = fn(param0, param1, weight, bias, times)
165
+ loss = ((out - 1.5) ** 2).mean()
166
+
167
+ param0.retain_grad()
168
+ param1.retain_grad()
169
+ weight.retain_grad()
170
+ bias.retain_grad()
171
+
172
+ loss.backward()
173
+ g_param0 = param0.grad
174
+ g_param1 = param1.grad
175
+ g_weight = weight.grad
176
+ g_bias = bias.grad
177
+
178
+ return out, g_param0, g_param1, g_weight, g_bias
179
+
180
+ refs = get_outputs(tdp_torch)
181
+ outs = get_outputs(TDP.apply)
182
+ for r, o in zip(refs, outs):
183
+ print(corrcoef(r, o), (r - o).abs().max())
184
+
185
+ def exported_tdp_unit_test():
186
+ print("***** exported_tdp_unit_test *****")
187
+ batch_size = random.randrange(1, 128)
188
+ num_params = random.randrange(1, 100000) * 2
189
+ print("batch_size", batch_size, "num_params", num_params)
190
+
191
+ def get_outputs(fn):
192
+ torch.manual_seed(1)
193
+ param0 = torch.randn(num_params, requires_grad = True).cuda()
194
+ param1 = torch.randn(num_params, requires_grad = True).cuda()
195
+ weight = torch.randn(num_params, requires_grad = True).cuda()
196
+ bias = torch.randn(num_params, requires_grad = True).cuda()
197
+ times = torch.rand(batch_size).cuda()
198
+
199
+ out = fn(param0, param1, weight, bias, times)
200
+ loss = ((out - 1.5) ** 2).mean()
201
+
202
+ param0.retain_grad()
203
+ param1.retain_grad()
204
+ weight.retain_grad()
205
+ bias.retain_grad()
206
+
207
+ loss.backward()
208
+ g_param0 = param0.grad
209
+ g_param1 = param1.grad
210
+ g_weight = weight.grad
211
+ g_bias = bias.grad
212
+
213
+ return out, g_param0, g_param1, g_weight, g_bias
214
+
215
+ refs = get_outputs(partial(exported_tdp, custom = False))
216
+ outs = get_outputs(partial(exported_tdp, custom = True))
217
+ for r, o in zip(refs, outs):
218
+ print(corrcoef(r, o), (r - o).abs().max())
219
+
220
+ def tdp_cuda_profile():
221
+ print("***** tdp_cuda_profile *****")
222
+ def profiler(fn, args):
223
+ for _ in range(10):
224
+ fn(*args)
225
+ torch.cuda.synchronize()
226
+ t0 = time.time()
227
+ for _ in range(100):
228
+ fn(*args)
229
+ torch.cuda.synchronize()
230
+ t1 = time.time()
231
+ return t1 - t0
232
+
233
+ batch_size = 16
234
+ num_params = 1024 * 1024
235
+ print("batch_size", batch_size, "num_params", num_params)
236
+
237
+ param0 = torch.randn(num_params).cuda()
238
+ param1 = torch.randn(num_params).cuda()
239
+ weight = torch.randn(num_params).cuda()
240
+ bias = torch.randn(num_params).cuda()
241
+ times = torch.rand(batch_size).cuda()
242
+
243
+ print("ref", profiler(tdp_torch, (param0, param1, weight, bias, times)))
244
+ print("cuda", profiler(tdp_cuda, (param0, param1, weight, bias, times)))
245
+
246
+ print("ref", profiler(tdp_torch, (param0.half(), param1.half(), weight.half(), bias.half(), times.half())))
247
+ print("cuda", profiler(tdp_cuda, (param0.half(), param1.half(), weight.half(), bias.half(), times.half())))
248
+
249
+ def backward_tdp_cuda_profile():
250
+ print("***** backward_tdp_cuda_profile *****")
251
+ def profiler(fn, args):
252
+ for _ in range(10):
253
+ fn(*args)
254
+ torch.cuda.synchronize()
255
+ t0 = time.time()
256
+ for _ in range(100):
257
+ fn(*args)
258
+ torch.cuda.synchronize()
259
+ t1 = time.time()
260
+ return t1 - t0
261
+
262
+ batch_size = 16
263
+ num_params = 1024 * 1024
264
+ print("batch_size", batch_size, "num_params", num_params)
265
+
266
+ param0 = torch.randn(num_params).cuda()
267
+ param1 = torch.randn(num_params).cuda()
268
+ weight = torch.randn(num_params).cuda()
269
+ bias = torch.randn(num_params).cuda()
270
+ times = torch.rand(batch_size).cuda()
271
+ g_result = torch.randn(batch_size, num_params).cuda()
272
+
273
+
274
+ print("ref", profiler(backward_tdp_torch, (param0, param1, weight, bias, times, g_result)))
275
+ print("cuda", profiler(backward_tdp_cuda, (param0, param1, weight, bias, times, g_result)))
276
+
277
+ print("ref", profiler(backward_tdp_torch, (param0.half(), param1.half(), weight.half(), bias.half(), times.half(), g_result.half())))
278
+ print("cuda", profiler(backward_tdp_cuda, (param0.half(), param1.half(), weight.half(), bias.half(), times.half(), g_result.half())))
279
+
280
+ def autograd_profile():
281
+ print("***** autogad_profile *****")
282
+ def profiler(fn, args):
283
+ for _ in range(10):
284
+ fn(*args).mean().backward()
285
+ torch.cuda.synchronize()
286
+ t0 = time.time()
287
+ for _ in range(100):
288
+ fn(*args).mean().backward()
289
+ torch.cuda.synchronize()
290
+ t1 = time.time()
291
+ return t1 - t0
292
+
293
+ batch_size = 16
294
+ num_params = 1024 * 1024
295
+ print("batch_size", batch_size, "num_params", num_params)
296
+
297
+ param0 = nn.Parameter(torch.randn(num_params)).cuda()
298
+ param1 = nn.Parameter(torch.randn(num_params)).cuda()
299
+ weight = nn.Parameter(torch.randn(num_params)).cuda()
300
+ bias = nn.Parameter(torch.randn(num_params)).cuda()
301
+ times = torch.rand(batch_size).cuda()
302
+
303
+ print("ref", profiler(tdp_torch, (param0, param1, weight, bias, times)))
304
+ print("cuda", profiler(TDP.apply, (param0, param1, weight, bias, times)))
305
+
306
+ print("ref", profiler(tdp_torch, (param0.half(), param1.half(), weight.half(), bias.half(), times.half())))
307
+ print("cuda", profiler(TDP.apply, (param0.half(), param1.half(), weight.half(), bias.half(), times.half())))
308
+
309
+ if __name__ == "__main__":
310
+ tdp_cuda_unit_test()
311
+ backward_tdp_cuda_unit_test()
312
+ autograd_unit_test()
313
+ exported_tdp_unit_test()
314
+ tdp_cuda_profile()
315
+ backward_tdp_cuda_profile()
316
+ autograd_profile()
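For reference, tdp_torch above blends two parameter banks with a per-parameter, time-dependent sigmoid gate: result[t] = s * param0 + (1 - s) * param1 with s = sigmoid(times[t] * weight + bias). A CPU-only sketch of the same formula (the custom CUDA kernel is left commented out in this file):

import torch

def tdp_reference(param0, param1, weight, bias, times):
    # gate s has shape (batch, num_params); one sigmoid blend per time step
    s = torch.sigmoid(times[:, None] * weight[None] + bias[None])
    return s * param0[None] + (1 - s) * param1[None]

p0, p1, w, b = (torch.randn(6) for _ in range(4))
t = torch.rand(3)

out = tdp_reference(p0, p1, w, b, t)
assert out.shape == (3, 6)
# tdp_torch writes the same blend as addcmul(param1, s, param0 - param1)
expected = torch.addcmul(p1[None],
                         torch.sigmoid(torch.addcmul(b[None], t[:, None], w[None])),
                         p0[None] - p1[None])
assert torch.allclose(out, expected, atol=1e-6)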
libs/model/sigmoid/module.py ADDED
@@ -0,0 +1,274 @@
1
+ from functools import reduce
2
+ import math
3
+ import torch
4
+ import torch.nn as nn
5
+ import numpy as np
6
+ from .kernel import exported_tdp
7
+ import torch.nn.functional as F
8
+ from functools import partial
9
+ from timm.models.layers import trunc_normal_
10
+
11
+ class TimeDependentParameter(nn.Module):
12
+ def __init__(self, shape, init_fn):
13
+ super().__init__()
14
+ self.shape = shape
15
+
16
+ w = torch.empty(*shape)
17
+ init_fn(w)
18
+
19
+ self.param0 = nn.Parameter(w.clone().detach())
20
+ self.param1 = nn.Parameter(w.clone().detach())
21
+
22
+ self.nodecay_weight = nn.Parameter(torch.zeros(*shape))
23
+ self.nodecay_bias = nn.Parameter(torch.zeros(*shape))
24
+ self.curr_weight = None
25
+
26
+ def forward(self):
27
+ weight = self.curr_weight
28
+ # self.curr_weight = None
29
+ return weight
30
+
31
+ def __repr__(self):
32
+ return f"TimeDependentParameter(shape={self.shape})"
33
+
34
+ @staticmethod
35
+ def seed_time(model, log_snr):
36
+ assert log_snr.dim() == 1
37
+ if torch.all(log_snr == log_snr[0]):
38
+ log_snr = log_snr[0][None]
39
+ time_condition = log_snr / 4.0
40
+
41
+ tdp_list = [module for module in model.modules() if isinstance(module, TimeDependentParameter)]
42
+ for tdp in tdp_list:
43
+ tdp.curr_weight = exported_tdp(tdp.param0, tdp.param1, tdp.nodecay_weight + 1, tdp.nodecay_bias, time_condition, custom = False)
44
+
45
+ class LayerNorm(nn.Module):
46
+ def __init__(self, dim, num_groups = 1, eps = 1e-05):
47
+ super().__init__()
48
+ self.eps = eps
49
+ self.dim = dim
50
+ self.num_groups = num_groups
51
+ self.weight = TimeDependentParameter((dim, ), nn.init.ones_)
52
+ self.bias = TimeDependentParameter((dim, ), nn.init.zeros_)
53
+
54
+ def _forward(self, x):
55
+ weight, bias = self.weight(), self.bias()
56
+ assert weight.shape[0] == bias.shape[0]
57
+
58
+ assert x.shape[-1] == self.dim
59
+
60
+ if weight.shape[0] == 1:
61
+ x = F.layer_norm(x, (self.dim, ), weight = weight[0], bias = bias[0], eps = self.eps)
62
+ else:
63
+ assert x.shape[0] == weight.shape[0]
64
+ x = F.layer_norm(x, (self.dim, ), eps = self.eps)
65
+ x = torch.addcmul(bias[:, None, :], weight[:, None, :], x)
66
+
67
+ return x
68
+
69
+ def forward(self, x):
70
+ original_shape = x.shape
71
+ batch_size = x.shape[0]
72
+ assert self.dim == x.shape[-1]
73
+
74
+ x = x.reshape(batch_size, -1, self.dim)
75
+ x = self._forward(x)
76
+ x = x.reshape(*original_shape)
77
+
78
+ return x
79
+
80
+ class Linear(nn.Module):
81
+ def __init__(self, din, dout, bias = True, weight_init_fn = partial(trunc_normal_, std = 0.02)):
82
+ super().__init__()
83
+ self.din = din
84
+ self.dout = dout
85
+ self.weight = TimeDependentParameter((din, dout), weight_init_fn)
86
+ if bias:
87
+ self.bias = TimeDependentParameter((dout, ), nn.init.zeros_)
88
+ else:
89
+ self.bias = None
90
+
91
+ def _forward(self, x):
92
+ weight = self.weight()
93
+ bias = self.bias() if self.bias is not None else None
94
+
95
+ # if weight.shape[0] == 1:
96
+ # B, L, D = x.shape
97
+ # if bias is not None:
98
+ # assert weight.shape[0] == bias.shape[0]
99
+ # x = torch.addmm(bias, x.reshape(B * L, D), weight[0])
100
+ # else:
101
+ # x = torch.matmul(x.reshape(B * L, D), weight[0])
102
+ # x = x.reshape(B, L, -1)
103
+ # else:
104
+ if bias is not None:
105
+ x = torch.baddbmm(bias[:, None, :], x, weight)
106
+ else:
107
+ x = torch.bmm(x, weight)
108
+
109
+ return x
110
+
111
+ def forward(self, x):
112
+ original_shape = x.shape
113
+ batch_size = x.shape[0]
114
+
115
+ x = x.reshape(batch_size, -1, self.din)
116
+ x = self._forward(x)
117
+ x = x.reshape(*(list(original_shape[:-1]) + [self.dout]))
118
+
119
+ return x
120
+
121
+ class RMSNorm(nn.Module):
122
+ def __init__(self, d, p=-1., eps=1e-8, bias=False):
123
+ """
124
+ Root Mean Square Layer Normalization
125
+ :param d: model size
126
+ :param p: partial RMSNorm, valid value [0, 1], default -1.0 (disabled)
127
+ :param eps: epsilon value, default 1e-8
128
+ :param bias: whether use bias term for RMSNorm, disabled by
129
+ default because RMSNorm doesn't enforce re-centering invariance.
130
+ """
131
+ super(RMSNorm, self).__init__()
132
+
133
+ self.eps = eps
134
+ self.d = d
135
+ self.p = p
136
+ self.bias = bias
137
+
138
+ self.scale = nn.Parameter(torch.ones(d))
139
+ self.register_parameter("scale", self.scale)
140
+
141
+ if self.bias:
142
+ self.offset = nn.Parameter(torch.zeros(d))
143
+ self.register_parameter("offset", self.offset)
144
+
145
+ def forward(self, x):
146
+ if self.p < 0. or self.p > 1.:
147
+ norm_x = x.norm(2, dim=-1, keepdim=True)
148
+ d_x = self.d
149
+ else:
150
+ partial_size = int(self.d * self.p)
151
+ partial_x, _ = torch.split(x, [partial_size, self.d - partial_size], dim=-1)
152
+
153
+ norm_x = partial_x.norm(2, dim=-1, keepdim=True)
154
+ d_x = partial_size
155
+
156
+ rms_x = norm_x * d_x ** (-1. / 2)
157
+ x_normed = x / (rms_x + self.eps)
158
+
159
+ if self.bias:
160
+ return self.scale * x_normed + self.offset
161
+
162
+ return self.scale * x_normed
163
+
164
+
165
+ class TDRMSNorm(nn.Module):
166
+ def __init__(self, d, p=-1., eps=1e-8, bias=False):
167
+ """
168
+ Root Mean Square Layer Normalization
169
+ :param d: model size
170
+ :param p: partial RMSNorm, valid value [0, 1], default -1.0 (disabled)
171
+ :param eps: epsilon value, default 1e-8
172
+ :param bias: whether use bias term for RMSNorm, disabled by
173
+ default because RMSNorm doesn't enforce re-centering invariance.
174
+ """
175
+ super(TDRMSNorm, self).__init__()
176
+
177
+ self.eps = eps
178
+ self.d = d
179
+ self.p = p
180
+ self.bias = bias
181
+
182
+ # self.scale = nn.Parameter(torch.ones(d))
183
+ self.scale = TimeDependentParameter((d, ), nn.init.ones_)
184
+ # self.register_parameter("scale", self.scale)
185
+
186
+ if self.bias:
187
+ # self.offset = nn.Parameter(torch.zeros(d))
188
+ self.offset = TimeDependentParameter((d, ), nn.init.zeros_)
189
+ # self.register_parameter("offset", self.offset)
190
+
191
+ def forward(self, x):
192
+ if self.p < 0. or self.p > 1.:
193
+ norm_x = x.norm(2, dim=-1, keepdim=True)
194
+ d_x = self.d
195
+ else:
196
+ partial_size = int(self.d * self.p)
197
+ partial_x, _ = torch.split(x, [partial_size, self.d - partial_size], dim=-1)
198
+
199
+ norm_x = partial_x.norm(2, dim=-1, keepdim=True)
200
+ d_x = partial_size
201
+
202
+ rms_x = norm_x * d_x ** (-1. / 2)
203
+ x_normed = x / (rms_x + self.eps)
204
+
205
+ _scale = self.scale()
206
+
207
+ if self.bias:
208
+ # return self.scale * x_normed + self.offset
209
+ _offset = self.offset()
210
+ if _scale.shape[0] == 1:
211
+ return _scale[0] * x_normed + _offset[0]
212
+ elif x_normed.dim() == 3:
213
+ return torch.addcmul(_offset[:, None, :], _scale[:, None, :], x_normed)
214
+ elif x_normed.dim() == 4:
215
+ return torch.addcmul(_offset[:, None, None, :], _scale[:, None, None, :], x_normed)
216
+ else:
217
+ raise NotImplementedError
218
+
219
+ # return self.scale * x_normed
220
+ if _scale.shape[0] == 1:
221
+ return _scale[0] * x_normed
222
+ elif x_normed.dim() == 3:
223
+ return _scale[:, None, :] * x_normed
224
+ elif x_normed.dim() == 4:
225
+ return _scale[:, None, None, :] * x_normed
226
+ else:
227
+ raise NotImplementedError
228
+
229
+
230
+ def zero_init(layer):
231
+ nn.init.zeros_(layer.weight)
232
+ if layer.bias is not None:
233
+ nn.init.zeros_(layer.bias)
234
+ return layer
235
+
236
+ def rms_norm(x, scale, eps):
237
+ dtype = reduce(torch.promote_types, (x.dtype, scale.dtype, torch.float32))
238
+ mean_sq = torch.mean(x.to(dtype)**2, dim=-1, keepdim=True)
239
+ scale = scale.to(dtype) * torch.rsqrt(mean_sq + eps)
240
+ return x * scale.to(x.dtype)
241
+
242
+ class AdaRMSNorm(nn.Module):
243
+ def __init__(self, features, cond_features, eps=1e-6):
244
+ super().__init__()
245
+ self.eps = eps
246
+ self.linear = zero_init(nn.Linear(cond_features, features, bias=False))
247
+
248
+ def extra_repr(self):
249
+ return f"eps={self.eps},"
250
+
251
+ def forward(self, x, cond):
252
+ return rms_norm(x, self.linear(cond)[:, None, :] + 1, self.eps)
253
+
254
+ class QKNorm(nn.Module):
255
+ def __init__(self, n_heads, eps=1e-6, max_scale=100.0):
256
+ super().__init__()
257
+ self.eps = eps
258
+ self.max_scale = math.log(max_scale)
259
+ self.scale = nn.Parameter(torch.full((n_heads,), math.log(10.0)))
260
+ self.proj_()
261
+
262
+ def extra_repr(self):
263
+ return f"n_heads={self.scale.shape[0]}, eps={self.eps}"
264
+
265
+ @torch.no_grad()
266
+ def proj_(self):
267
+ """Modify the scale in-place so it doesn't get "stuck" with zero gradient if it's clamped
268
+ to the max value."""
269
+ self.scale.clamp_(max=self.max_scale)
270
+
271
+ def forward(self, x):
272
+ self.proj_()
273
+ scale = torch.exp(0.5 * self.scale - 0.25 * math.log(x.shape[-1]))
274
+ return rms_norm(x, scale[:, None, None], self.eps)
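For reference, the RMSNorm variants above (RMSNorm, TDRMSNorm, and the rms_norm helper behind AdaRMSNorm and QKNorm) all normalize by the root mean square of the feature vector; the partial variant (p in [0, 1]) computes the statistic over only the first floor(p*d) channels, and TDRMSNorm swaps the learned gain for a time-dependent parameter. In compact form:

\[
\mathrm{RMS}(x) = \sqrt{\tfrac{1}{d}\sum_{i=1}^{d} x_i^{2}},
\qquad
y_i = g_i \,\frac{x_i}{\mathrm{RMS}(x) + \epsilon} \;\big(+\, b_i \text{ if bias is enabled}\big),
\]

with the rms_norm helper folding the epsilon inside the square root instead.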
libs/model/trans_autoencoder.py ADDED
@@ -0,0 +1,289 @@
1
+ """
2
+ Transformer-based variational encoder model.
3
+ """
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import math
9
+ import copy
10
+
11
+
12
+ def clones(module, N):
13
+ return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
14
+
15
+
16
+ def build_mask(base_mask):
17
+ assert len(base_mask.shape) == 2
18
+ batch_size, seq_len = base_mask.shape[0], base_mask.shape[-1]
19
+
20
+ # create subsequent token mask
21
+ sub_mask = torch.tril(torch.ones([seq_len, seq_len],
22
+ dtype=torch.uint8)).type_as(base_mask)
23
+ sub_mask = sub_mask.unsqueeze(0).expand(batch_size, -1, -1)
24
+ base_mask = base_mask.unsqueeze(1).expand(-1, seq_len, -1)
25
+ return sub_mask & base_mask
26
+
27
+
28
+ class Adaptor(nn.Module):
29
+ def __init__(self, input_dim, tar_dim):
30
+ super(Adaptor, self).__init__()
31
+
32
+ if tar_dim == 32768:
33
+ output_channel = 8
34
+ elif tar_dim == 16384:
35
+ output_channel = 4
36
+ else:
37
+ raise NotImplementedError("only 512px is supported; 256px does not need this")
38
+
39
+ self.tar_dim = tar_dim
40
+
41
+ self.fc1 = nn.Linear(input_dim, 4096)
42
+ self.ln_fc1 = nn.LayerNorm(4096)
43
+ self.fc2 = nn.Linear(4096, 4096)
44
+ self.ln_fc2 = nn.LayerNorm(4096)
45
+
46
+ self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
47
+ self.ln_conv1 = nn.LayerNorm([32, 64, 64])
48
+ self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
49
+ self.ln_conv2 = nn.LayerNorm([64, 64, 64])
50
+ self.conv3 = nn.Conv2d(in_channels=64, out_channels=output_channel, kernel_size=3, padding=1)
51
+
52
+ def forward(self, x):
53
+ x = torch.relu(self.ln_fc1(self.fc1(x)))
54
+ x = torch.relu(self.ln_fc2(self.fc2(x)))
55
+
56
+ x = x.view(-1, 1, 64, 64)
57
+
58
+ x = torch.relu(self.ln_conv1(self.conv1(x)))
59
+ x = torch.relu(self.ln_conv2(self.conv2(x)))
60
+
61
+ x = self.conv3(x)
62
+ x = x.view(-1, self.tar_dim)
63
+
64
+ return x
65
+
66
+
67
+ class Compressor(nn.Module):
68
+ def __init__(self, input_dim=4096, tar_dim=2048):
69
+ super(Compressor, self).__init__()
70
+
71
+ self.fc1 = nn.Linear(input_dim, tar_dim)
72
+ self.ln_fc1 = nn.LayerNorm(tar_dim)
73
+ self.fc2 = nn.Linear(tar_dim, tar_dim)
74
+
75
+
76
+ def forward(self, x):
77
+ x = torch.relu(self.ln_fc1(self.fc1(x)))
78
+ x = self.fc2(x)
79
+
80
+ return x
81
+
82
+
83
+ class TransEncoder(nn.Module):
84
+ def __init__(self, d_model, N, num_token, head_num, d_ff, latten_size, down_sample_block=3, dropout=0.1, last_norm=True):
85
+ super(TransEncoder, self).__init__()
86
+ self.N = N
87
+ if d_model==4096:
88
+ # for T5-XXL, first use MLP to compress into 1024
89
+ self.compressor = Compressor(input_dim=d_model, tar_dim=1024)
90
+ d_model = 1024
91
+ else:
92
+ self.compressor = None
93
+
94
+ self.layers = clones(EncoderLayer(MultiHeadAttentioin(d_model, head_num, dropout=dropout),
95
+ FeedForward(d_model, d_ff, dropout=dropout),
96
+ LayerNorm(d_model),
97
+ LayerNorm(d_model)), N)
98
+
99
+ self.reduction_layers = nn.ModuleList()
100
+ for _ in range(down_sample_block):
101
+ self.reduction_layers.append(
102
+ EncoderReductionLayer(MultiHeadAttentioin(d_model, head_num, dropout=dropout),
103
+ FeedForward(d_model, d_ff, dropout=dropout),
104
+ nn.Linear(d_model, d_model // 2),
105
+ LayerNorm(d_model),
106
+ LayerNorm(d_model)))
107
+ d_model = d_model // 2
108
+
109
+ if latten_size == 8192 or latten_size == 4096:
110
+ self.arc = 0
111
+ self.linear = nn.Linear(d_model*num_token, latten_size)
112
+ self.norm = LayerNorm(latten_size) if last_norm else None
113
+ else:
114
+ self.arc = 1
115
+ self.adaptor = Adaptor(d_model*num_token, latten_size)
116
+
117
+
118
+ def forward(self, x, mask):
119
+ mask = mask.unsqueeze(1)
120
+
121
+ if self.compressor is not None:
122
+ x = self.compressor(x)
123
+
124
+ for i, layer in enumerate(self.layers):
125
+ x = layer(x, mask)
126
+
127
+ for i, layer in enumerate(self.reduction_layers):
128
+ x = layer(x, mask)
129
+
130
+ if self.arc == 0:
131
+ x = self.linear(x.view(x.shape[0],-1))
132
+ x = self.norm(x) if self.norm else x
133
+ else:
134
+ x = self.adaptor(x.view(x.shape[0],-1))
135
+
136
+ return x
137
+
138
+
139
+ class EncoderLayer(nn.Module):
140
+ def __init__(self, attn, feed_forward, norm1, norm2, dropout=0.1):
141
+ super(EncoderLayer, self).__init__()
142
+ self.attn = attn
143
+ self.feed_forward = feed_forward
144
+ self.norm1, self.norm2 = norm1, norm2
145
+
146
+ self.dropout1 = nn.Dropout(dropout)
147
+ self.dropout2 = nn.Dropout(dropout)
148
+
149
+ def forward(self, x, mask):
150
+ # multihead attn & norm
151
+ a = self.attn(x, x, x, mask)
152
+ t = self.norm1(x + self.dropout1(a))
153
+
154
+ # feed forward & norm
155
+ z = self.feed_forward(t) # linear(dropout(act(linear(x))))
156
+ y = self.norm2(t + self.dropout2(z))
157
+
158
+ return y
159
+
160
+
161
+ class EncoderReductionLayer(nn.Module):
162
+ def __init__(self, attn, feed_forward, reduction, norm1, norm2, dropout=0.1):
163
+ super(EncoderReductionLayer, self).__init__()
164
+ self.attn = attn
165
+ self.feed_forward = feed_forward
166
+ self.reduction = reduction
167
+ self.norm1, self.norm2 = norm1, norm2
168
+
169
+ self.dropout1 = nn.Dropout(dropout)
170
+ self.dropout2 = nn.Dropout(dropout)
171
+
172
+ def forward(self, x, mask):
173
+ # multihead attn & norm
174
+ a = self.attn(x, x, x, mask)
175
+ t = self.norm1(x + self.dropout1(a))
176
+
177
+ # feed forward & norm
178
+ z = self.feed_forward(t) # linear(dropout(act(linear(x))))
179
+ y = self.norm2(t + self.dropout2(z))
180
+
181
+ # reduction
182
+ # y = self.reduction(y).view(x.shape[0], -1, x.shape[-1])
183
+ y = self.reduction(y)
184
+
185
+ return y
186
+
187
+
188
+ class MultiHeadAttentioin(nn.Module):
189
+ def __init__(self, d_model, head_num, dropout=0.1, d_v=None):
190
+ super(MultiHeadAttentioin, self).__init__()
191
+ assert d_model % head_num == 0, "d_model must be divisible by head_num"
192
+
193
+ self.d_model = d_model
194
+ self.head_num = head_num
195
+ self.d_k = d_model // head_num
196
+ self.d_v = self.d_k if d_v is None else d_v
197
+
198
+ # d_model = d_k * head_num
199
+ self.W_Q = nn.Linear(d_model, head_num * self.d_k)
200
+ self.W_K = nn.Linear(d_model, head_num * self.d_k)
201
+ self.W_V = nn.Linear(d_model, head_num * self.d_v)
202
+ self.W_O = nn.Linear(d_model, d_model)
203
+
204
+ self.dropout = nn.Dropout(dropout)
205
+
206
+ def scaled_dp_attn(self, query, key, value, mask=None):
207
+ assert self.d_k == query.shape[-1]
208
+
209
+ # scores: [batch_size, head_num, seq_len, seq_len]
210
+ scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
211
+
212
+ # if torch.isinf(scores).any():
213
+ # # to avoid leaking
214
+ # scores = torch.where(scores == float('-inf'), torch.tensor(-65504.0), scores)
215
+ # scores = torch.where(scores == float('inf'), torch.tensor(65504.0), scores)
216
+
217
+ if mask is not None:
218
+ assert mask.ndim == 3, "Mask shape {} doesn't seem right...".format(mask.shape)
219
+ mask = mask.unsqueeze(1)
220
+ try:
221
+ if scores.dtype == torch.float32:
222
+ scores = scores.masked_fill(mask == 0, -1e9)
223
+ else:
224
+ scores = scores.masked_fill(mask == 0, -1e4)
225
+ except RuntimeError:
226
+ print("- scores device: {}".format(scores.device))
227
+ print("- mask device: {}".format(mask.device))
228
+
229
+ # attn: [batch_size, head_num, seq_len, seq_len]
230
+ attn = F.softmax(scores, dim=-1)
231
+ attn = self.dropout(attn)
232
+ return torch.matmul(attn, value), attn
233
+
234
+ def forward(self, q, k, v, mask):
235
+ batch_size = q.shape[0]
236
+
237
+ query = self.W_Q(q).view(batch_size, -1, self.head_num, self.d_k).transpose(1, 2)
238
+ key = self.W_K(k).view(batch_size, -1, self.head_num, self.d_k).transpose(1, 2)
239
+ value = self.W_V(v).view(batch_size, -1, self.head_num, self.d_k).transpose(1, 2)
240
+
241
+ heads, attn = self.scaled_dp_attn(query, key, value, mask)
242
+ heads = heads.transpose(1, 2).contiguous().view(batch_size, -1,
243
+ self.head_num * self.d_k)
244
+ assert heads.shape[-1] == self.d_model and heads.shape[0] == batch_size
245
+
246
+ y = self.W_O(heads)
247
+
248
+ assert y.shape == q.shape
249
+ return y
250
+
251
+
252
+ class LayerNorm(nn.Module):
253
+ def __init__(self, layer_size, eps=1e-5):
254
+ super(LayerNorm, self).__init__()
255
+ self.g = nn.Parameter(torch.ones(layer_size))
256
+ self.b = nn.Parameter(torch.zeros(layer_size))
257
+ self.eps = eps
258
+
259
+ def forward(self, x):
260
+ mean = x.mean(-1, keepdim=True)
261
+ std = x.std(-1, keepdim=True)
262
+ x = (x - mean) / (std + self.eps)
263
+ return self.g * x + self.b
264
+
265
+
266
+ class FeedForward(nn.Module):
267
+ def __init__(self, d_model, d_ff, dropout=0.1, act='relu', d_output=None):
268
+ super(FeedForward, self).__init__()
269
+ self.d_model = d_model
270
+ self.d_ff = d_ff
271
+ d_output = d_model if d_output is None else d_output
272
+
273
+ self.ffn_1 = nn.Linear(d_model, d_ff)
274
+ self.ffn_2 = nn.Linear(d_ff, d_output)
275
+
276
+ if act == 'relu':
277
+ self.act = nn.ReLU()
278
+ elif act == 'rrelu':
279
+ self.act = nn.RReLU()
280
+ else:
281
+ raise NotImplementedError
282
+
283
+ self.dropout = nn.Dropout(dropout)
284
+
285
+ def forward(self, x):
286
+ y = self.ffn_2(self.dropout(self.act(self.ffn_1(x))))
287
+ return y
288
+
289
+
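A minimal shape-check sketch for the TransEncoder defined above; the hyperparameter values are illustrative assumptions, not taken from the repo's training configs:

import torch
from libs.model.trans_autoencoder import TransEncoder

# Illustrative settings: CLIP-style token embeddings (77 tokens, width 768).
# Passing d_model=4096 would instead trigger the T5 Compressor path in __init__.
encoder = TransEncoder(
    d_model=768,
    N=2,                  # plain encoder layers
    num_token=77,
    head_num=8,
    d_ff=2048,
    latten_size=4096,     # selects the Linear + LayerNorm head (self.arc == 0)
    down_sample_block=3,  # token width halves three times: 768 -> 384 -> 192 -> 96
)

x = torch.randn(4, 77, 768)                  # (batch, tokens, d_model)
mask = torch.ones(4, 77, dtype=torch.long)   # 1 = real token, 0 = padding
z = encoder(x, mask)
print(z.shape)  # expected: torch.Size([4, 4096])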
libs/t5.py ADDED
@@ -0,0 +1,237 @@
1
+ """
2
+ This file contains code for the T5 model.
3
+
4
+ Reference:
5
+ https://github.com/deep-floyd/IF/blob/develop/deepfloyd_if/modules/t5.py
6
+ """
7
+
8
+ # -*- coding: utf-8 -*-
9
+ import os
10
+ import re
11
+ import html
12
+ import urllib.parse as ul
13
+
14
+ import ftfy
15
+ import torch
16
+ from bs4 import BeautifulSoup
17
+ from transformers import T5EncoderModel, AutoTokenizer
18
+ from huggingface_hub import hf_hub_download
19
+
20
+
21
+ class T5Embedder:
22
+
23
+ available_models = ['t5-v1_1-xxl']
24
+ bad_punct_regex = re.compile(r'['+'#®•©™&@·º½¾¿¡§~'+'\)'+'\('+'\]'+'\['+'\}'+'\{'+'\|'+'\\'+'\/'+'\*' + r']{1,}') # noqa
25
+
26
+ def __init__(self, device, dir_or_name='t5-v1_1-xxl', *, cache_dir=None, hf_token=None, use_text_preprocessing=True,
27
+ t5_model_kwargs=None, torch_dtype=None, use_offload_folder=None):
28
+ self.device = torch.device(device)
29
+ self.torch_dtype = torch_dtype or torch.bfloat16
30
+ if t5_model_kwargs is None:
31
+ t5_model_kwargs = {'low_cpu_mem_usage': True, 'torch_dtype': self.torch_dtype}
32
+ if use_offload_folder is not None:
33
+ t5_model_kwargs['offload_folder'] = use_offload_folder
34
+ t5_model_kwargs['device_map'] = {
35
+ 'shared': self.device,
36
+ 'encoder.embed_tokens': self.device,
37
+ 'encoder.block.0': self.device,
38
+ 'encoder.block.1': self.device,
39
+ 'encoder.block.2': self.device,
40
+ 'encoder.block.3': self.device,
41
+ 'encoder.block.4': self.device,
42
+ 'encoder.block.5': self.device,
43
+ 'encoder.block.6': self.device,
44
+ 'encoder.block.7': self.device,
45
+ 'encoder.block.8': self.device,
46
+ 'encoder.block.9': self.device,
47
+ 'encoder.block.10': self.device,
48
+ 'encoder.block.11': self.device,
49
+ 'encoder.block.12': 'disk',
50
+ 'encoder.block.13': 'disk',
51
+ 'encoder.block.14': 'disk',
52
+ 'encoder.block.15': 'disk',
53
+ 'encoder.block.16': 'disk',
54
+ 'encoder.block.17': 'disk',
55
+ 'encoder.block.18': 'disk',
56
+ 'encoder.block.19': 'disk',
57
+ 'encoder.block.20': 'disk',
58
+ 'encoder.block.21': 'disk',
59
+ 'encoder.block.22': 'disk',
60
+ 'encoder.block.23': 'disk',
61
+ 'encoder.final_layer_norm': 'disk',
62
+ 'encoder.dropout': 'disk',
63
+ }
64
+ else:
65
+ t5_model_kwargs['device_map'] = {'shared': self.device, 'encoder': self.device}
66
+
67
+ self.use_text_preprocessing = use_text_preprocessing
68
+ self.hf_token = hf_token
69
+ self.cache_dir = cache_dir or os.path.expanduser('~/.cache/IF_')
70
+ self.dir_or_name = dir_or_name
71
+
72
+ tokenizer_path, path = dir_or_name, dir_or_name
73
+ if dir_or_name in self.available_models:
74
+ cache_dir = os.path.join(self.cache_dir, dir_or_name)
75
+ for filename in [
76
+ 'config.json', 'special_tokens_map.json', 'spiece.model', 'tokenizer_config.json',
77
+ 'pytorch_model.bin.index.json', 'pytorch_model-00001-of-00002.bin', 'pytorch_model-00002-of-00002.bin'
78
+ ]:
79
+ hf_hub_download(repo_id=f'DeepFloyd/{dir_or_name}', filename=filename, cache_dir=cache_dir,
80
+ force_filename=filename, token=self.hf_token)
81
+ tokenizer_path, path = cache_dir, cache_dir
82
+ else:
83
+ cache_dir = os.path.join(self.cache_dir, 't5-v1_1-xxl')
84
+ for filename in [
85
+ 'config.json', 'special_tokens_map.json', 'spiece.model', 'tokenizer_config.json',
86
+ ]:
87
+ hf_hub_download(repo_id='DeepFloyd/t5-v1_1-xxl', filename=filename, cache_dir=cache_dir,
88
+ force_filename=filename, token=self.hf_token)
89
+ tokenizer_path = cache_dir
90
+
91
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
92
+ self.model = T5EncoderModel.from_pretrained(path, **t5_model_kwargs).eval()
93
+
94
+ def get_text_embeddings(self, texts):
95
+ texts = [self.text_preprocessing(text) for text in texts]
96
+
97
+ text_tokens_and_mask = self.tokenizer(
98
+ texts,
99
+ max_length=77,
100
+ padding='max_length',
101
+ truncation=True,
102
+ return_attention_mask=True,
103
+ add_special_tokens=True,
104
+ return_tensors='pt'
105
+ )
106
+ text_tokens_and_mask['input_ids'] = text_tokens_and_mask['input_ids']
107
+ text_tokens_and_mask['attention_mask'] = text_tokens_and_mask['attention_mask']
108
+
109
+ with torch.no_grad():
110
+ text_encoder_embs = self.model(
111
+ input_ids=text_tokens_and_mask['input_ids'].to(self.device),
112
+ attention_mask=text_tokens_and_mask['attention_mask'].to(self.device),
113
+ )['last_hidden_state'].detach()
114
+
115
+ return text_encoder_embs, {'token_embedding': text_encoder_embs, 'token_mask': text_tokens_and_mask['attention_mask'].to(self.device), 'tokens': text_tokens_and_mask['input_ids'].to(self.device)}
116
+
117
+ def text_preprocessing(self, text):
118
+ if self.use_text_preprocessing:
119
+ # The exact text cleaning that was used in the training stage:
120
+ text = self.clean_caption(text)
121
+ text = self.clean_caption(text)
122
+ return text
123
+ else:
124
+ return text.lower().strip()
125
+
126
+ @staticmethod
127
+ def basic_clean(text):
128
+ text = ftfy.fix_text(text)
129
+ text = html.unescape(html.unescape(text))
130
+ return text.strip()
131
+
132
+ def clean_caption(self, caption):
133
+ caption = str(caption)
134
+ caption = ul.unquote_plus(caption)
135
+ caption = caption.strip().lower()
136
+ caption = re.sub('<person>', 'person', caption)
137
+ # urls:
138
+ caption = re.sub(
139
+ r'\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))', # noqa
140
+ '', caption) # regex for urls
141
+ caption = re.sub(
142
+ r'\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))', # noqa
143
+ '', caption) # regex for urls
144
+ # html:
145
+ caption = BeautifulSoup(caption, features='html.parser').text
146
+
147
+ # @<nickname>
148
+ caption = re.sub(r'@[\w\d]+\b', '', caption)
149
+
150
+ # 31C0—31EF CJK Strokes
151
+ # 31F0—31FF Katakana Phonetic Extensions
152
+ # 3200—32FF Enclosed CJK Letters and Months
153
+ # 3300—33FF CJK Compatibility
154
+ # 3400—4DBF CJK Unified Ideographs Extension A
155
+ # 4DC0—4DFF Yijing Hexagram Symbols
156
+ # 4E00—9FFF CJK Unified Ideographs
157
+ caption = re.sub(r'[\u31c0-\u31ef]+', '', caption)
158
+ caption = re.sub(r'[\u31f0-\u31ff]+', '', caption)
159
+ caption = re.sub(r'[\u3200-\u32ff]+', '', caption)
160
+ caption = re.sub(r'[\u3300-\u33ff]+', '', caption)
161
+ caption = re.sub(r'[\u3400-\u4dbf]+', '', caption)
162
+ caption = re.sub(r'[\u4dc0-\u4dff]+', '', caption)
163
+ caption = re.sub(r'[\u4e00-\u9fff]+', '', caption)
164
+ #######################################################
165
+
166
+ # все виды тире / all types of dash --> "-"
167
+ caption = re.sub(
168
+ r'[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+', # noqa
169
+ '-', caption)
170
+
171
+ # normalize all quotation marks to a single standard
172
+ caption = re.sub(r'[`´«»“”¨]', '"', caption)
173
+ caption = re.sub(r'[‘’]', "'", caption)
174
+
175
+ # &quot;
176
+ caption = re.sub(r'&quot;?', '', caption)
177
+ # &amp
178
+ caption = re.sub(r'&amp', '', caption)
179
+
180
+ # ip addresses:
181
+ caption = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' ', caption)
182
+
183
+ # article ids:
184
+ caption = re.sub(r'\d:\d\d\s+$', '', caption)
185
+
186
+ # \n
187
+ caption = re.sub(r'\\n', ' ', caption)
188
+
189
+ # "#123"
190
+ caption = re.sub(r'#\d{1,3}\b', '', caption)
191
+ # "#12345.."
192
+ caption = re.sub(r'#\d{5,}\b', '', caption)
193
+ # "123456.."
194
+ caption = re.sub(r'\b\d{6,}\b', '', caption)
195
+ # filenames:
196
+ caption = re.sub(r'[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)', '', caption)
197
+
198
+ #
199
+ caption = re.sub(r'[\"\']{2,}', r'"', caption) # """AUSVERKAUFT"""
200
+ caption = re.sub(r'[\.]{2,}', r' ', caption) # """AUSVERKAUFT"""
201
+
202
+ caption = re.sub(self.bad_punct_regex, r' ', caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
203
+ caption = re.sub(r'\s+\.\s+', r' ', caption) # " . "
204
+
205
+ # this-is-my-cute-cat / this_is_my_cute_cat
206
+ regex2 = re.compile(r'(?:\-|\_)')
207
+ if len(re.findall(regex2, caption)) > 3:
208
+ caption = re.sub(regex2, ' ', caption)
209
+
210
+ caption = self.basic_clean(caption)
211
+
212
+ caption = re.sub(r'\b[a-zA-Z]{1,3}\d{3,15}\b', '', caption) # jc6640
213
+ caption = re.sub(r'\b[a-zA-Z]+\d+[a-zA-Z]+\b', '', caption) # jc6640vc
214
+ caption = re.sub(r'\b\d+[a-zA-Z]+\d+\b', '', caption) # 6640vc231
215
+
216
+ caption = re.sub(r'(worldwide\s+)?(free\s+)?shipping', '', caption)
217
+ caption = re.sub(r'(free\s)?download(\sfree)?', '', caption)
218
+ caption = re.sub(r'\bclick\b\s(?:for|on)\s\w+', '', caption)
219
+ caption = re.sub(r'\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?', '', caption)
220
+ caption = re.sub(r'\bpage\s+\d+\b', '', caption)
221
+
222
+ caption = re.sub(r'\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b', r' ', caption) # j2d1a2a...
223
+
224
+ caption = re.sub(r'\b\d+\.?\d*[xх×]\d+\.?\d*\b', '', caption)
225
+
226
+ caption = re.sub(r'\b\s+\:\s+', r': ', caption)
227
+ caption = re.sub(r'(\D[,\./])\b', r'\1 ', caption)
228
+ caption = re.sub(r'\s+', ' ', caption)
229
+
230
+ caption = caption.strip()
231
+
232
+ caption = re.sub(r'^[\"\']([\w\W]+)[\"\']$', r'\1', caption)
233
+ caption = re.sub(r'^[\'\_,\-\:;]', r'', caption)
234
+ caption = re.sub(r'[\'\_,\-\:\-\+]$', r'', caption)
235
+ caption = re.sub(r'^\.\S+$', '', caption)
236
+
237
+ return caption.strip()
libs/timm.py ADDED
@@ -0,0 +1,114 @@
1
+ """
2
+ code from timm 0.3.2
3
+ """
4
+ import torch
5
+ import torch.nn as nn
6
+ import math
7
+ import warnings
8
+
9
+
10
+ def _no_grad_trunc_normal_(tensor, mean, std, a, b):
11
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
12
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
13
+ def norm_cdf(x):
14
+ # Computes standard normal cumulative distribution function
15
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
16
+
17
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
18
+ warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
19
+ "The distribution of values may be incorrect.",
20
+ stacklevel=2)
21
+
22
+ with torch.no_grad():
23
+ # Values are generated by using a truncated uniform distribution and
24
+ # then using the inverse CDF for the normal distribution.
25
+ # Get upper and lower cdf values
26
+ l = norm_cdf((a - mean) / std)
27
+ u = norm_cdf((b - mean) / std)
28
+
29
+ # Uniformly fill tensor with values from [l, u], then translate to
30
+ # [2l-1, 2u-1].
31
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
32
+
33
+ # Use inverse cdf transform for normal distribution to get truncated
34
+ # standard normal
35
+ tensor.erfinv_()
36
+
37
+ # Transform to proper mean, std
38
+ tensor.mul_(std * math.sqrt(2.))
39
+ tensor.add_(mean)
40
+
41
+ # Clamp to ensure it's in the proper range
42
+ tensor.clamp_(min=a, max=b)
43
+ return tensor
44
+
45
+
46
+ def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
47
+ # type: (Tensor, float, float, float, float) -> Tensor
48
+ r"""Fills the input Tensor with values drawn from a truncated
49
+ normal distribution. The values are effectively drawn from the
50
+ normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
51
+ with values outside :math:`[a, b]` redrawn until they are within
52
+ the bounds. The method used for generating the random values works
53
+ best when :math:`a \leq \text{mean} \leq b`.
54
+ Args:
55
+ tensor: an n-dimensional `torch.Tensor`
56
+ mean: the mean of the normal distribution
57
+ std: the standard deviation of the normal distribution
58
+ a: the minimum cutoff value
59
+ b: the maximum cutoff value
60
+ Examples:
61
+ >>> w = torch.empty(3, 5)
62
+ >>> nn.init.trunc_normal_(w)
63
+ """
64
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
65
+
66
+
67
+ def drop_path(x, drop_prob: float = 0., training: bool = False):
68
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
69
+
70
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
71
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
72
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
73
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
74
+ 'survival rate' as the argument.
75
+
76
+ """
77
+ if drop_prob == 0. or not training:
78
+ return x
79
+ keep_prob = 1 - drop_prob
80
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
81
+ random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
82
+ random_tensor.floor_() # binarize
83
+ output = x.div(keep_prob) * random_tensor
84
+ return output
85
+
86
+
87
+ class DropPath(nn.Module):
88
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
89
+ """
90
+ def __init__(self, drop_prob=None):
91
+ super(DropPath, self).__init__()
92
+ self.drop_prob = drop_prob
93
+
94
+ def forward(self, x):
95
+ return drop_path(x, self.drop_prob, self.training)
96
+
97
+
98
+ class Mlp(nn.Module):
99
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
100
+ super().__init__()
101
+ out_features = out_features or in_features
102
+ hidden_features = hidden_features or in_features
103
+ self.fc1 = nn.Linear(in_features, hidden_features)
104
+ self.act = act_layer()
105
+ self.fc2 = nn.Linear(hidden_features, out_features)
106
+ self.drop = nn.Dropout(drop)
107
+
108
+ def forward(self, x):
109
+ x = self.fc1(x)
110
+ x = self.act(x)
111
+ x = self.drop(x)
112
+ x = self.fc2(x)
113
+ x = self.drop(x)
114
+ return x
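A small sanity-check sketch for drop_path above (purely illustrative): surviving samples are rescaled by 1 / keep_prob, so the expected activation is unchanged while roughly drop_prob of the samples in a batch are zeroed.

import torch
from libs.timm import drop_path

x = torch.ones(10_000, 8)
y = drop_path(x, drop_prob=0.2, training=True)

zeroed = (y.sum(dim=1) == 0).float().mean().item()
print(f"zeroed rows ~ {zeroed:.2f}")               # close to 0.20
print(f"mean activation ~ {y.mean().item():.2f}")  # close to 1.00 (survivors scaled by 1/0.8)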
requirements.txt CHANGED
@@ -1,6 +1,21 @@
1
- accelerate
2
  diffusers
3
- invisible_watermark
4
  torch
5
- transformers
6
- xformers
1
  diffusers
 
2
  torch
3
+ xformers
4
+ openai-clip
5
+ scikit-learn
6
+ opencv-python
7
+ torchdiffeq
8
+ beautifulsoup4
9
+ open_clip_torch
10
+ scikit-image
11
+ cython
12
+ matplotlib
13
+ accelerate==0.12.0
14
+ absl-py
15
+ ml_collections
16
+ einops
17
+ wandb
18
+ ftfy==6.1.1
19
+ transformers==4.23.1
20
+ timm
21
+ tensorboard
scripts/extract_empty_feature.py ADDED
@@ -0,0 +1,56 @@
1
+ """
2
+ This file is used to extract the feature of the empty prompt.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
8
+
9
+ import torch
10
+ import os
11
+ import numpy as np
12
+ from libs.clip import FrozenCLIPEmbedder
13
+ from libs.t5 import T5Embedder
14
+
15
+
16
+ def main():
17
+ prompts = [
18
+ '',
19
+ ]
20
+
21
+ device = 'cuda'
22
+ llm = 'clip'
23
+
24
+ if llm=='clip':
25
+ clip = FrozenCLIPEmbedder()
26
+ clip.eval()
27
+ clip.to(device)
28
+ elif llm=='t5':
29
+ t5 = T5Embedder(device=device)
30
+ else:
31
+ raise NotImplementedError
32
+
33
+ save_dir = f'./'
34
+
35
+ if llm=='clip':
36
+ latent, latent_and_others = clip.encode(prompts)
37
+ token_embedding = latent_and_others['token_embedding']
38
+ token_mask = latent_and_others['token_mask']
39
+ token = latent_and_others['tokens']
40
+ elif llm=='t5':
41
+ latent, latent_and_others = t5.get_text_embeddings(prompts)
42
+ token_embedding = latent_and_others['token_embedding'].to(torch.float32) * 10.0
43
+ token_mask = latent_and_others['token_mask']
44
+ token = latent_and_others['tokens']
45
+
46
+ for i in range(len(prompts)):
47
+ data = {'token_embedding': token_embedding[i].detach().cpu().numpy(),
48
+ 'token_mask': token_mask[i].detach().cpu().numpy(),
49
+ 'token': token[i].detach().cpu().numpy(),
50
+ 'batch_caption': prompts[i]}
51
+ np.save(os.path.join(save_dir, f'empty_context.npy'), data)
52
+
53
+
54
+
55
+ if __name__ == '__main__':
56
+ main()
scripts/extract_mscoco_feature.py ADDED
@@ -0,0 +1,83 @@
1
+ """
2
+ This file is used to extract features of the COCO val set (to test zero-shot FID).
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
8
+
9
+ import torch
10
+ import os
11
+ import numpy as np
12
+ from datasets import MSCOCODatabase
13
+ import argparse
14
+ from tqdm import tqdm
15
+
16
+ import libs.autoencoder
17
+ from libs.clip import FrozenCLIPEmbedder
18
+ from libs.t5 import T5Embedder
19
+
20
+
21
+ def main(resolution=256):
22
+ parser = argparse.ArgumentParser()
23
+ parser.add_argument('--split', default='val')
24
+ args = parser.parse_args()
25
+ print(args)
26
+
27
+ if args.split == "val":
28
+ datas = MSCOCODatabase(root='/data/qihao/dataset/coco2014/val2014',
29
+ annFile='/data/qihao/dataset/coco2014/annotations/captions_val2014.json',
30
+ size=resolution)
31
+ save_dir = f'val'
32
+ else:
33
+ raise NotImplementedError
34
+
35
+ device = "cuda"
36
+ os.makedirs(save_dir, exist_ok=True)
37
+
38
+ autoencoder = libs.autoencoder.get_model('../assets/stable-diffusion/autoencoder_kl.pth')
39
+ autoencoder.to(device)
40
+
41
+ llm = 'clip'
42
+
43
+ if llm=='clip':
44
+ clip = FrozenCLIPEmbedder()
45
+ clip.eval()
46
+ clip.to(device)
47
+ elif llm=='t5':
48
+ t5 = T5Embedder(device=device)
49
+ else:
50
+ raise NotImplementedError
51
+
52
+ with torch.no_grad():
53
+ for idx, data in tqdm(enumerate(datas)):
54
+ x, captions = data
55
+
56
+ if len(x.shape) == 3:
57
+ x = x[None, ...]
58
+ x = torch.tensor(x, device=device)
59
+ moments = autoencoder(x, fn='encode_moments').squeeze(0)
60
+ moments = moments.detach().cpu().numpy()
61
+ np.save(os.path.join(save_dir, f'{idx}.npy'), moments)
62
+
63
+ if llm=='clip':
64
+ latent, latent_and_others = clip.encode(captions)
65
+ token_embedding = latent_and_others['token_embedding']
66
+ token_mask = latent_and_others['token_mask']
67
+ token = latent_and_others['tokens']
68
+ elif llm=='t5':
69
+ latent, latent_and_others = t5.get_text_embeddings(captions)
70
+ token_embedding = latent_and_others['token_embedding'].to(torch.float32) * 10.0
71
+ token_mask = latent_and_others['token_mask']
72
+ token = latent_and_others['tokens']
73
+
74
+ for i in range(len(captions)):
75
+ data = {'promt': captions[i],
76
+ 'token_embedding': token_embedding[i].detach().cpu().numpy(),
77
+ 'token_mask': token_mask[i].detach().cpu().numpy(),
78
+ 'token': token[i].detach().cpu().numpy()}
79
+ np.save(os.path.join(save_dir, f'{idx}_{i}.npy'), data)
80
+
81
+
82
+ if __name__ == '__main__':
83
+ main()
scripts/extract_test_prompt_feature.py ADDED
@@ -0,0 +1,72 @@
1
+ """
2
+ This file is used to extract features for visualization during training.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
8
+
9
+ import torch
10
+ import os
11
+ import numpy as np
12
+ from tqdm import tqdm
13
+
14
+ import libs.autoencoder
15
+ from libs.clip import FrozenCLIPEmbedder
16
+ from libs.t5 import T5Embedder
17
+
18
+
19
+ def main():
20
+ prompts = [
21
+ 'A road with traffic lights, street lights and cars.',
22
+ 'A bus driving in a city area with traffic signs.',
23
+ 'A bus pulls over to the curb close to an intersection.',
24
+ 'A group of people are walking and one is holding an umbrella.',
25
+ 'A baseball player taking a swing at an incoming ball.',
26
+ 'A dog next to a white cat with black-tipped ears.',
27
+ 'A tiger standing on a rooftop while singing and jamming on an electric guitar under a spotlight. anime illustration.',
28
+ 'A bird wearing headphones and speaking into a high-end microphone in a recording studio.',
29
+ 'A bus made of cardboard.',
30
+ 'A tower in the mountains.',
31
+ 'Two cups of coffee, one with latte art of a cat. The other has latte art of a bird.',
32
+ 'Oil painting of a robot made of sushi, holding chopsticks.',
33
+ 'Portrait of a dog wearing a hat and holding a flag that has a yin-yang symbol on it.',
34
+ 'A teddy bear wearing a motorcycle helmet and cape is standing in front of Loch Awe with Kilchurn Castle behind him. dslr photo.',
35
+ 'A man standing on the moon',
36
+ ]
37
+ save_dir = f'run_vis'
38
+ os.makedirs(save_dir, exist_ok=True)
39
+
40
+ device = 'cuda'
41
+ llm = 'clip'
42
+
43
+ if llm=='clip':
44
+ clip = FrozenCLIPEmbedder()
45
+ clip.eval()
46
+ clip.to(device)
47
+ elif llm=='t5':
48
+ t5 = T5Embedder(device=device)
49
+ else:
50
+ raise NotImplementedError
51
+
52
+ if llm=='clip':
53
+ latent, latent_and_others = clip.encode(prompts)
54
+ token_embedding = latent_and_others['token_embedding']
55
+ token_mask = latent_and_others['token_mask']
56
+ token = latent_and_others['tokens']
57
+ elif llm=='t5':
58
+ latent, latent_and_others = t5.get_text_embeddings(prompts)
59
+ token_embedding = latent_and_others['token_embedding'].to(torch.float32) * 10.0
60
+ token_mask = latent_and_others['token_mask']
61
+ token = latent_and_others['tokens']
62
+
63
+ for i in range(len(prompts)):
64
+ data = {'promt': prompts[i],
65
+ 'token_embedding': token_embedding[i].detach().cpu().numpy(),
66
+ 'token_mask': token_mask[i].detach().cpu().numpy(),
67
+ 'token': token[i].detach().cpu().numpy()}
68
+ np.save(os.path.join(save_dir, f'{i}.npy'), data)
69
+
70
+
71
+ if __name__ == '__main__':
72
+ main()
scripts/extract_train_feature.py ADDED
@@ -0,0 +1,159 @@
1
+ """
2
+ This file is used to extract features of the demo training data.
3
+ """
4
+
5
+ import os
6
+ import shutil
7
+ import sys
8
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import os
13
+ import numpy as np
14
+ from tqdm import tqdm
15
+ from PIL import Image
16
+ import io
17
+ import einops
18
+ import random
19
+ import json
20
+ import libs.autoencoder
21
+ from libs.clip import FrozenCLIPEmbedder
22
+ from libs.t5 import T5Embedder
23
+
24
+
25
+ def recreate_folder(folder_path):
26
+ if os.path.exists(folder_path):
27
+ shutil.rmtree(folder_path)
28
+ os.makedirs(folder_path)
29
+
30
+ def center_crop_arr(pil_image, image_size):
31
+ while min(*pil_image.size) >= 2 * image_size:
32
+ pil_image = pil_image.resize(
33
+ tuple(x // 2 for x in pil_image.size), resample=Image.BOX
34
+ )
35
+
36
+ scale = image_size / min(*pil_image.size)
37
+ pil_image = pil_image.resize(
38
+ tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
39
+ )
40
+
41
+ arr = np.array(pil_image)
42
+ crop_y = (arr.shape[0] - image_size) // 2
43
+ crop_x = (arr.shape[1] - image_size) // 2
44
+ return arr[crop_y : crop_y + image_size, crop_x : crop_x + image_size]
45
+
46
+
47
+ def main(bz = 16):
48
+
49
+ json_path = '/path/to/JourneyDB_demo/img_text_pair.jsonl'
50
+ root_path = '/path/to/JourneyDB_demo/imgs'
51
+
52
+ dicts_list = []
53
+ with open(json_path, 'r', encoding='utf-8') as file:
54
+ for line in file:
55
+ dicts_list.append(json.loads(line))
56
+
57
+ save_dir = f'feature'
58
+ device = "cuda"
59
+ recreate_folder(save_dir)
60
+
61
+ autoencoder = libs.autoencoder.get_model('../assets/stable-diffusion/autoencoder_kl.pth')
62
+ autoencoder.to(device)
63
+
64
+ # CLIP model:
65
+ clip = FrozenCLIPEmbedder()
66
+ clip.eval()
67
+ clip.to(device)
68
+
69
+ # T5 model:
70
+ t5 = T5Embedder(device=device)
71
+
72
+ idx = 0
73
+ batch_img_256 = []
74
+ batch_img_512 = []
75
+ batch_caption = []
76
+ batch_name = []
77
+ for i, sample in enumerate(tqdm(dicts_list)):
78
+ try:
79
+ pil_image = Image.open(os.path.join(root_path,sample['img_path']))
80
+ caption = sample['prompt']
81
+ img_name = sample['img_path'].replace('.jpg','')
82
+
83
+ pil_image.load()
84
+ pil_image = pil_image.convert("RGB")
85
+ except:
86
+ with open("failed_file.txt", 'a+') as file:
87
+ file.write(sample['img_path'] + "\n")
88
+ continue
89
+
90
+ image_256 = center_crop_arr(pil_image, image_size=256)
91
+ image_512 = center_crop_arr(pil_image, image_size=512)
92
+
93
+ # if True:
94
+ # image_id = random.randint(0,20)
95
+ # Image.fromarray(image_256.astype(np.uint8)).save(f"temp_img_{image_id}_256.jpg")
96
+ # Image.fromarray(image_512.astype(np.uint8)).save(f"temp_img_{image_id}_512.jpg")
97
+
98
+ image_256 = (image_256 / 127.5 - 1.0).astype(np.float32)
99
+ image_256 = einops.rearrange(image_256, 'h w c -> c h w')
100
+ batch_img_256.append(image_256)
101
+
102
+ image_512 = (image_512 / 127.5 - 1.0).astype(np.float32)
103
+ image_512 = einops.rearrange(image_512, 'h w c -> c h w')
104
+ batch_img_512.append(image_512)
105
+
106
+ batch_caption.append(caption)
107
+ batch_name.append(img_name)
108
+
109
+ if len(batch_name) == bz or i == len(dicts_list) - 1:
110
+ batch_img_256 = torch.tensor(np.stack(batch_img_256)).to(device)
111
+ moments_256 = autoencoder(batch_img_256, fn='encode_moments').squeeze(0)
112
+ moments_256 = moments_256.detach().cpu().numpy()
113
+
114
+ batch_img_512 = torch.tensor(np.stack(batch_img_512)).to(device)
115
+ moments_512 = autoencoder(batch_img_512, fn='encode_moments').squeeze(0)
116
+ moments_512 = moments_512.detach().cpu().numpy()
117
+
118
+ _latent_clip, latent_and_others_clip = clip.encode(batch_caption)
119
+ token_embedding_clip = latent_and_others_clip['token_embedding'].detach().cpu().numpy()
120
+ token_mask_clip = latent_and_others_clip['token_mask'].detach().cpu().numpy()
121
+ token_clip = latent_and_others_clip['tokens'].detach().cpu().numpy()
122
+
123
+ _latent_t5, latent_and_others_t5 = t5.get_text_embeddings(batch_caption)
124
+ token_embedding_t5 = (latent_and_others_t5['token_embedding'].to(torch.float32) * 10.0).detach().cpu().numpy()
125
+ token_mask_t5 = latent_and_others_t5['token_mask'].detach().cpu().numpy()
126
+ token_t5 = latent_and_others_t5['tokens'].detach().cpu().numpy()
127
+
128
+ for mt_256, mt_512, te_c, te_t, tm_c, tm_t, tk_c, tk_t, bc, bn in zip(moments_256, moments_512, token_embedding_clip, token_embedding_t5, token_mask_clip, token_mask_t5, token_clip, token_t5, batch_caption, batch_name):
129
+ assert mt_256.shape == (8,32,32)
130
+ assert mt_512.shape == (8,64,64)
131
+ assert te_c.shape == (77, 768)
132
+ assert te_t.shape == (77, 4096)
133
+ tar_path_name = os.path.join(save_dir, f'{bn}.npy')
134
+ if os.path.exists(tar_path_name):
135
+ os.remove(tar_path_name)
136
+ data = {'image_latent_256': mt_256,
137
+ 'image_latent_512': mt_512,
138
+ 'token_embedding_clip': te_c,
139
+ 'token_embedding_t5': te_t,
140
+ 'token_mask_clip': tm_c,
141
+ 'token_mask_t5': tm_t,
142
+ 'token_clip': tk_c,
143
+ 'token_t5': tk_t,
144
+ 'batch_caption': bc}
145
+ try:
146
+ np.save(tar_path_name, data)
147
+ idx += 1
148
+ except:
149
+ pass
150
+
151
+ batch_img_256 = []
152
+ batch_img_512 = []
153
+ batch_caption = []
154
+ batch_name = []
155
+
156
+ print(f'save {idx} files')
157
+
158
+ if __name__ == '__main__':
159
+ main()
sde.py ADDED
@@ -0,0 +1,326 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from absl import logging
4
+ import numpy as np
5
+ import math
6
+ from tqdm import tqdm
7
+ import torch.nn.functional as F
8
+
9
+
10
+ def check_zip(*args):
11
+ args = [list(arg) for arg in args]
12
+ length = len(args[0])
13
+ for arg in args:
14
+ assert len(arg) == length
15
+ return zip(*args)
16
+
17
+ def get_sde(name, **kwargs):
18
+ if name == 'vpsde':
19
+ return VPSDE(**kwargs)
20
+ elif name == 'vpsde_cosine':
21
+ return VPSDECosine(**kwargs)
22
+ else:
23
+ raise NotImplementedError
24
+
25
+
26
+ def stp(s, ts: torch.Tensor): # scalar tensor product
27
+ if isinstance(s, np.ndarray):
28
+ s = torch.from_numpy(s).type_as(ts)
29
+ extra_dims = (1,) * (ts.dim() - 1)
30
+ return s.view(-1, *extra_dims) * ts
31
+
32
+
33
+ def mos(a, start_dim=1): # mean of square
34
+ return a.pow(2).flatten(start_dim=start_dim).mean(dim=-1)
35
+
36
+
37
+ def duplicate(tensor, *size):
38
+ return tensor.unsqueeze(dim=0).expand(*size, *tensor.shape)
39
+
40
+
41
+ class SDE(object):
42
+ r"""
43
+ dx = f(x, t)dt + g(t) dw with 0 <= t <= 1
44
+ f(x, t) is the drift
45
+ g(t) is the diffusion
46
+ """
47
+ def drift(self, x, t):
48
+ raise NotImplementedError
49
+
50
+ def diffusion(self, t):
51
+ raise NotImplementedError
52
+
53
+ def cum_beta(self, t): # the variance of xt|x0
54
+ raise NotImplementedError
55
+
56
+ def cum_alpha(self, t):
57
+ raise NotImplementedError
58
+
59
+ def snr(self, t): # signal noise ratio
60
+ raise NotImplementedError
61
+
62
+ def nsr(self, t): # noise signal ratio
63
+ raise NotImplementedError
64
+
65
+ def marginal_prob(self, x0, t): # the mean and std of q(xt|x0)
66
+ alpha = self.cum_alpha(t)
67
+ beta = self.cum_beta(t)
68
+ mean = stp(alpha ** 0.5, x0) # E[xt|x0]
69
+ std = beta ** 0.5 # Cov[xt|x0] ** 0.5
70
+ return mean, std
71
+
72
+ def sample(self, x0, t_init=0): # sample from q(xn|x0), where n is uniform
73
+ t = torch.rand(x0.shape[0], device=x0.device) * (1. - t_init) + t_init
74
+ mean, std = self.marginal_prob(x0, t)
75
+ eps = torch.randn_like(x0)
76
+ xt = mean + stp(std, eps)
77
+ return t, eps, xt
78
+
79
+
80
+ class VPSDE(SDE):
81
+ def __init__(self, beta_min=0.1, beta_max=20):
82
+ # 0 <= t <= 1
83
+ self.beta_0 = beta_min
84
+ self.beta_1 = beta_max
85
+
86
+ def drift(self, x, t):
87
+ return -0.5 * stp(self.squared_diffusion(t), x)
88
+
89
+ def diffusion(self, t):
90
+ return self.squared_diffusion(t) ** 0.5
91
+
92
+ def squared_diffusion(self, t): # beta(t)
93
+ return self.beta_0 + t * (self.beta_1 - self.beta_0)
94
+
95
+ def squared_diffusion_integral(self, s, t): # \int_s^t beta(tau) d tau
96
+ return self.beta_0 * (t - s) + (self.beta_1 - self.beta_0) * (t ** 2 - s ** 2) * 0.5
97
+
98
+ def skip_beta(self, s, t): # beta_{t|s}, Cov[xt|xs]=beta_{t|s} I
99
+ return 1. - self.skip_alpha(s, t)
100
+
101
+ def skip_alpha(self, s, t): # alpha_{t|s}, E[xt|xs]=alpha_{t|s}**0.5 xs
102
+ x = -self.squared_diffusion_integral(s, t)
103
+ return x.exp()
104
+
105
+ def cum_beta(self, t):
106
+ return self.skip_beta(0, t)
107
+
108
+ def cum_alpha(self, t):
109
+ return self.skip_alpha(0, t)
110
+
111
+ def nsr(self, t):
112
+ nsr = self.squared_diffusion_integral(0, t).expm1()
113
+ nsr = nsr.clamp(max = 1e6, min = 1e-12)
114
+ return nsr
115
+
116
+ def snr(self, t):
117
+ snr = 1. / self.nsr(t)
118
+ snr = snr.clamp(max = 1e6, min = 1e-12)
119
+ return snr
120
+
121
+ def __str__(self):
122
+ return f'vpsde beta_0={self.beta_0} beta_1={self.beta_1}'
123
+
124
+ def __repr__(self):
125
+ return f'vpsde beta_0={self.beta_0} beta_1={self.beta_1}'
126
+
127
+
128
+ class VPSDECosine(SDE):
129
+ r"""
130
+ dx = f(x, t)dt + g(t) dw with 0 <= t <= 1
131
+ f(x, t) is the drift
132
+ g(t) is the diffusion
133
+ """
134
+ def __init__(self, s=0.008):
135
+ self.s = s
136
+ self.F = lambda t: torch.cos((t + s) / (1 + s) * math.pi / 2) ** 2
137
+ self.F0 = math.cos(s / (1 + s) * math.pi / 2) ** 2
138
+
139
+ def drift(self, x, t):
140
+ ft = - torch.tan((t + self.s) / (1 + self.s) * math.pi / 2) / (1 + self.s) * math.pi / 2
141
+ return stp(ft, x)
142
+
143
+ def diffusion(self, t):
144
+ return (torch.tan((t + self.s) / (1 + self.s) * math.pi / 2) / (1 + self.s) * math.pi) ** 0.5
145
+
146
+ def cum_beta(self, t): # the variance of xt|x0
147
+ return 1 - self.cum_alpha(t)
148
+
149
+ def cum_alpha(self, t):
150
+ return self.F(t) / self.F0
151
+
152
+ def snr(self, t): # signal noise ratio
153
+ Ft = self.F(t)
154
+ snr = Ft / (self.F0 - Ft)
155
+ snr = snr.clamp(max = 1e6, min = 1e-12)
156
+ return snr
157
+
158
+ def nsr(self, t): # noise signal ratio
159
+ Ft = self.F(t)
160
+ nsr = self.F0 / Ft - 1
161
+ nsr = nsr.clamp(max = 1e6, min = 1e-12)
162
+ return nsr
163
+
164
+ def __str__(self):
165
+ return 'vpsde_cosine'
166
+
167
+ def __repr__(self):
168
+ return 'vpsde_cosine'
169
+
170
+
171
+ class ScoreModel(object):
172
+ r"""
173
+ The forward process is q(x_[0,T])
174
+ """
175
+
176
+ def __init__(self, nnet: nn.Module, loss_coeffs:list, sde: SDE, using_cfg: bool = False, T=1):
177
+ assert T == 1
178
+ self.nnet = nnet
179
+ self.loss_coeffs = loss_coeffs
180
+ self.sde = sde
181
+ self.T = T
182
+ self.using_cfg = using_cfg
183
+ print(f'ScoreModel with loss_coeffs={loss_coeffs}, sde={sde}, T={T}')
184
+
185
+ def predict(self, xt, t, **kwargs):
186
+ if not isinstance(t, torch.Tensor):
187
+ t = torch.tensor(t)
188
+ t = t.to(xt.device)
189
+ if t.dim() == 0:
190
+ t = duplicate(t, xt.size(0))
191
+ log_snr = self.sde.snr(t).log()
192
+
193
+ return self.nnet(xt, t = t * 999, log_snr = log_snr, **kwargs) # follow SDE
194
+ # return self.nnet(xt, t = t, log_snr = log_snr, **kwargs) # follow SDE
195
+
196
+ def noise_pred(self, xt, t, sampling = True, **kwargs):
197
+ if sampling:
198
+ if self.using_cfg:
199
+ return self.predict(xt, t, **kwargs)
200
+ else:
201
+ return self.predict(xt, t, **kwargs)[-1]
202
+ else:
203
+ return self.predict(xt, t, **kwargs)
204
+
205
+ def score(self, xt, t, **kwargs):
206
+ cum_beta = self.sde.cum_beta(t)
207
+ noise_pred = self.noise_pred(xt, t, sampling = True, **kwargs)
208
+ return stp(-cum_beta.rsqrt(), noise_pred)
209
+
210
+
211
+ class ReverseSDE(object):
212
+ r"""
213
+ dx = [f(x, t) - g(t)^2 s(x, t)] dt + g(t) dw
214
+ """
215
+ def __init__(self, score_model):
216
+ self.sde = score_model.sde # the forward sde
217
+ self.score_model = score_model
218
+
219
+ def drift(self, x, t, **kwargs):
220
+ drift = self.sde.drift(x, t) # f(x, t)
221
+ diffusion = self.sde.diffusion(t) # g(t)
222
+ score = self.score_model.score(x, t, **kwargs)
223
+ return drift - stp(diffusion ** 2, score)
224
+
225
+ def diffusion(self, t):
226
+ return self.sde.diffusion(t)
227
+
228
+
229
+ class ODE(object):
230
+ r"""
231
+ dx = [f(x, t) - g(t)^2 s(x, t)] dt
232
+ """
233
+
234
+ def __init__(self, score_model):
235
+ self.sde = score_model.sde # the forward sde
236
+ self.score_model = score_model
237
+
238
+ def drift(self, x, t, **kwargs):
239
+ drift = self.sde.drift(x, t) # f(x, t)
240
+ diffusion = self.sde.diffusion(t) # g(t)
241
+ score = self.score_model.score(x, t, **kwargs)
242
+ return drift - 0.5 * stp(diffusion ** 2, score)
243
+
244
+ def diffusion(self, t):
245
+ return 0
246
+
247
+
248
+ def dct2str(dct):
249
+ return str({k: f'{v:.6g}' for k, v in dct.items()})
250
+
251
+
252
+ @ torch.no_grad()
253
+ def euler_maruyama(rsde, x_init, sample_steps, eps=1e-3, T=1, trace=None, verbose=False, **kwargs):
254
+ r"""
255
+ The Euler Maruyama sampler for reverse SDE / ODE
256
+ See `Score-Based Generative Modeling through Stochastic Differential Equations`
257
+ """
258
+ assert isinstance(rsde, ReverseSDE) or isinstance(rsde, ODE)
259
+ print(f"euler_maruyama with sample_steps={sample_steps}")
260
+ timesteps = np.append(0., np.linspace(eps, T, sample_steps))
261
+ timesteps = torch.tensor(timesteps).to(x_init)
262
+ x = x_init
263
+ if trace is not None:
264
+ trace.append(x)
265
+ for s, t in tqdm(list(zip(timesteps, timesteps[1:]))[::-1], disable=not verbose, desc='euler_maruyama'):
266
+ drift = rsde.drift(x, t, **kwargs)
267
+ diffusion = rsde.diffusion(t)
268
+ dt = s - t
269
+ mean = x + drift * dt
270
+ sigma = diffusion * (-dt).sqrt()
271
+ x = mean + stp(sigma, torch.randn_like(x)) if s != 0 else mean
272
+ if trace is not None:
273
+ trace.append(x)
274
+ statistics = dict(s=s, t=t, sigma=sigma.item())
275
+ logging.debug(dct2str(statistics))
276
+ return x
277
+
278
+
279
+ def LSimple(score_model: ScoreModel, x0, **kwargs):
280
+ t, noise, xt = score_model.sde.sample(x0)
281
+ prediction = score_model.noise_pred(xt, t, sampling = False, **kwargs)
282
+ target = multi_scale_targets(noise, levels = len(prediction), scale_correction = True)
283
+ loss = 0
284
+ for pred, coeff in check_zip(prediction, score_model.loss_coeffs):
285
+ loss = loss + coeff * mos(pred - target[pred.shape[-1]])
286
+ return loss
287
+
288
+
289
+ def odd_multi_scale_targets(target, levels, scale_correction):
290
+ B, C, H, W = target.shape
291
+ targets = {}
292
+ for l in range(levels):
293
+ ratio = int(2 ** l)
294
+ if ratio == 1:
295
+ targets[target.shape[-1]] = target
296
+ continue
297
+ assert (H - 1) % ratio == 0 and (W - 1) % ratio == 0
298
+ KS = ratio + 1
299
+ scale = KS if scale_correction else KS ** 2
300
+ kernel = torch.ones(C, 1, KS, KS, device = target.device) / scale
301
+ downsampled = F.conv2d(target, kernel, stride = ratio, padding = KS // 2, groups = C)
302
+ targets[downsampled.shape[-1]] = downsampled
303
+ return targets
304
+
305
+ def even_multi_scale_targets(target, levels, scale_correction):
306
+ B, C, H, W = target.shape
307
+ targets = {}
308
+ for l in range(levels):
309
+ ratio = int(2 ** l)
310
+ if ratio == 1:
311
+ targets[target.shape[-1]] = target
312
+ continue
313
+ assert H % ratio == 0 and W % ratio == 0
314
+ KS = ratio
315
+ scale = KS if scale_correction else KS ** 2
316
+ kernel = torch.ones(C, 1, KS, KS, device = target.device) / scale
317
+ downsampled = F.conv2d(target, kernel, stride = ratio, groups = C)
318
+ targets[downsampled.shape[-1]] = downsampled
319
+ return targets
320
+
321
+ def multi_scale_targets(target, levels, scale_correction):
322
+ B, C, H, W = target.shape
323
+ if H % 2 == 0:
324
+ return even_multi_scale_targets(target, levels, scale_correction)
325
+ else:
326
+ return odd_multi_scale_targets(target, levels, scale_correction)
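For reference, with the linear schedule beta(t) = beta_0 + t * (beta_1 - beta_0) used by VPSDE, the quantities returned by squared_diffusion_integral, cum_alpha, cum_beta and marginal_prob correspond to the closed-form forward marginal

\[
\bar{\alpha}(t) = \exp\!\Big(-\!\int_0^t \beta(s)\,ds\Big)
               = \exp\!\Big(-\beta_0 t - \tfrac{1}{2}(\beta_1 - \beta_0)\,t^2\Big),
\qquad
q(x_t \mid x_0) = \mathcal{N}\!\big(\sqrt{\bar{\alpha}(t)}\,x_0,\; (1 - \bar{\alpha}(t))\,I\big),
\]

so snr(t) = \bar{\alpha}(t) / (1 - \bar{\alpha}(t)) and nsr(t) is its reciprocal (both clamped in the code for numerical stability).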
tools/clip_score.py ADDED
@@ -0,0 +1,90 @@
1
+ """
2
+ This file computes the CLIP score given an image and text pair.
3
+ """
4
+ import clip
5
+ import torch
6
+ from PIL import Image
7
+ from sklearn.preprocessing import normalize
8
+ from torchvision.transforms import Compose, Normalize, Resize
9
+ import torch
10
+ import numpy as np
11
+
12
+ class ClipSocre:
13
+ def __init__(self,device='cuda', prefix='A photo depicts', weight=1.0): # weight=2.5
14
+ self.device = device
15
+
16
+ self.model, _ = clip.load("ViT-B/32", device=device, jit=False)
17
+ self.model.eval()
18
+
19
+ self.transform = Compose([
20
+ Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
21
+ ])
22
+
23
+ self.prefix = prefix
24
+ if self.prefix[-1] != ' ':
25
+ self.prefix += ' '
26
+
27
+ self.w = weight
28
+
29
+ def extract_all_images(self, images):
30
+ images_input = self.transform(images)
31
+ if self.device == 'cuda':
32
+ images_input = images_input.to(torch.float16)
33
+ image_feature = self.model.encode_image(images_input)
34
+ return image_feature
35
+
36
+ def extract_all_texts(self, texts,need_prefix):
37
+ if need_prefix:
38
+ c_data = clip.tokenize(self.prefix + texts, truncate=True).to(self.device)
39
+ else:
40
+ c_data = clip.tokenize(texts, truncate=True).to(self.device)
41
+ text_feature = self.model.encode_text(c_data)
42
+ return text_feature
43
+
44
+ def get_clip_score(self, img, text, need_prefix=False):
45
+
46
+ img_f = self.extract_all_images(img)
47
+ text_f = self.extract_all_texts(text,need_prefix)
48
+ images = img_f / torch.sqrt(torch.sum(img_f**2, axis=1, keepdims=True))
49
+ candidates = text_f / torch.sqrt(torch.sum(text_f**2, axis=1, keepdims=True))
50
+
51
+ clip_per = self.w * torch.clip(torch.sum(images * candidates, axis=1), 0, None)
52
+
53
+ return clip_per
54
+
55
+ def get_text_clip_score(self, text_1, text_2, need_prefix=False):
56
+ text_1_f = self.extract_all_texts(text_1,need_prefix)
57
+ text_2_f = self.extract_all_texts(text_2,need_prefix)
58
+
59
+ candidates_1 = text_1_f / torch.sqrt(torch.sum(text_1_f**2, axis=1, keepdims=True))
60
+ candidates_2 = text_2_f / torch.sqrt(torch.sum(text_2_f**2, axis=1, keepdims=True))
61
+
62
+ per = self.w * torch.clip(torch.sum(candidates_1 * candidates_2, axis=1), 0, None)
63
+
64
+
65
+ results = 'ClipS : ' + str(format(per.item(),'.4f'))
66
+
67
+ print(results)
68
+
69
+ return per.sum()
70
+
71
+ def get_img_clip_score(self, img_1, img_2, weight = 1):
72
+
73
+ img_f_1 = self.extract_all_images(img_1)
74
+ img_f_2 = self.extract_all_images(img_2)
75
+
76
+ images_1 = img_f_1 / torch.sqrt(torch.sum(img_f_1**2, axis=1, keepdims=True))
77
+ images_2 = img_f_2 / torch.sqrt(torch.sum(img_f_2**2, axis=1, keepdims=True))
78
+
79
+ # per = self.w * torch.clip(torch.sum(images_1 * images_2, axis=1), 0, None)
80
+ per = weight * torch.clip(torch.sum(images_1 * images_2, axis=1), 0, None)
81
+
82
+
83
+ return per.sum()
84
+
85
+
86
+ def calculate_clip_score(self, caption_list, image_unprocessed):
87
+ image_unprocessed = 0.5 * (image_unprocessed + 1.)
88
+ image_unprocessed.clamp_(0., 1.)
89
+ img_resize = Resize((224))(image_unprocessed)
90
+ return self.get_clip_score(img_resize,caption_list)
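A minimal usage sketch for the scorer above, assuming a CUDA device and the openai-clip package from requirements.txt; the image tensor and captions are placeholders:

import torch
from tools.clip_score import ClipSocre

clip_scorer = ClipSocre(device='cuda')  # weight=1.0; the prefix is only applied when need_prefix=True

# calculate_clip_score expects decoder-style images in [-1, 1] with shape (B, 3, H, W).
images = torch.rand(2, 3, 256, 256, device='cuda') * 2 - 1
captions = ['A dog next to a white cat', 'A tower in the mountains']

scores = clip_scorer.calculate_clip_score(captions, images)
print(scores.shape)  # torch.Size([2]) -- one similarity score per image-caption pair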
tools/fid_score.py ADDED
@@ -0,0 +1,268 @@
1
+ """Calculates the Frechet Inception Distance (FID) to evaluate GANs
2
+
3
+ The FID metric calculates the distance between two distributions of images.
4
+ Typically, we have summary statistics (mean & covariance matrix) of one
5
+ of these distributions, while the 2nd distribution is given by a GAN.
6
+
7
+ When run as a stand-alone program, it compares the distribution of
8
+ images that are stored as PNG/JPEG at a specified location with a
9
+ distribution given by summary statistics (in pickle format).
10
+
11
+ The FID is calculated by assuming that X_1 and X_2 are the activations of
12
+ the pool_3 layer of the inception net for generated samples and real world
13
+ samples respectively.
14
+
15
+ See --help to see further details.
16
+
17
+ Code adapted from https://github.com/bioinf-jku/TTUR to use PyTorch instead
18
+ of Tensorflow
19
+
20
+ Copyright 2018 Institute of Bioinformatics, JKU Linz
21
+
22
+ Licensed under the Apache License, Version 2.0 (the "License");
23
+ you may not use this file except in compliance with the License.
24
+ You may obtain a copy of the License at
25
+
26
+ http://www.apache.org/licenses/LICENSE-2.0
27
+
28
+ Unless required by applicable law or agreed to in writing, software
29
+ distributed under the License is distributed on an "AS IS" BASIS,
30
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31
+ See the License for the specific language governing permissions and
32
+ limitations under the License.
33
+ """
34
+ import os
35
+ import pathlib
36
+
37
+ import numpy as np
38
+ import torch
39
+ import torchvision.transforms as TF
40
+ from PIL import Image
41
+ from scipy import linalg
42
+ from torch.nn.functional import adaptive_avg_pool2d
43
+ from torchvision import transforms
44
+
45
+ try:
46
+ from tqdm import tqdm
47
+ except ImportError:
48
+ # If tqdm is not available, provide a mock version of it
49
+ def tqdm(x):
50
+ return x
51
+
52
+ from .inception import InceptionV3
53
+
54
+
55
+ IMAGE_EXTENSIONS = {'bmp', 'jpg', 'jpeg', 'pgm', 'png', 'ppm',
56
+ 'tif', 'tiff', 'webp'}
57
+
58
+
59
+ class ImagePathDataset(torch.utils.data.Dataset):
60
+ def __init__(self, files, transforms=None):
61
+ self.files = files
62
+ self.transforms = transforms
63
+
64
+ def __len__(self):
65
+ return len(self.files)
66
+
67
+ def __getitem__(self, i):
68
+ path = self.files[i]
69
+ img = Image.open(path).convert('RGB')
70
+
71
+ if img.size == (512,512):
72
+ img = img.resize((256, 256))
73
+
74
+ if self.transforms is not None:
75
+ img = self.transforms(img)
76
+ return img
77
+
78
+
79
+ def get_activations(files, model, batch_size=50, dims=2048, device='cpu', num_workers=8):
80
+ """Calculates the activations of the pool_3 layer for all images.
81
+
82
+ Params:
83
+ -- files : List of image files paths
84
+ -- model : Instance of inception model
85
+ -- batch_size : Batch size of images for the model to process at once.
86
+ Make sure that the number of samples is a multiple of
87
+ the batch size, otherwise some samples are ignored. This
88
+ behavior is retained to match the original FID score
89
+ implementation.
90
+ -- dims : Dimensionality of features returned by Inception
91
+ -- device : Device to run calculations
92
+ -- num_workers : Number of parallel dataloader workers
93
+
94
+ Returns:
95
+ -- A numpy array of dimension (num images, dims) that contains the
96
+ activations of the given tensor when feeding inception with the
97
+ query tensor.
98
+ """
99
+ model.eval()
100
+
101
+ if batch_size > len(files):
102
+ print(('Warning: batch size is bigger than the data size. '
103
+ 'Setting batch size to data size'))
104
+ batch_size = len(files)
105
+
106
+ dataset = ImagePathDataset(files, transforms=TF.ToTensor())
107
+ dataloader = torch.utils.data.DataLoader(dataset,
108
+ batch_size=batch_size,
109
+ shuffle=False,
110
+ drop_last=False,
111
+ num_workers=num_workers)
112
+
113
+ pred_arr = np.empty((len(files), dims))
114
+
115
+ start_idx = 0
116
+
117
+ # resizer = transforms.Resize(256) # for clip
118
+
119
+ for batch in tqdm(dataloader):
120
+ batch = batch.to(device)
121
+
122
+ with torch.no_grad():
123
+
124
+ pred = model(batch)[0]
125
+
126
+ # If model output is not scalar, apply global spatial average pooling.
127
+ # This happens if you choose a dimensionality not equal 2048.
128
+ if pred.size(2) != 1 or pred.size(3) != 1:
129
+ pred = adaptive_avg_pool2d(pred, output_size=(1, 1))
130
+
131
+ pred = pred.squeeze(3).squeeze(2).cpu().numpy()
132
+
133
+ pred_arr[start_idx:start_idx + pred.shape[0]] = pred
134
+
135
+ start_idx = start_idx + pred.shape[0]
136
+
137
+ return pred_arr
138
+
139
+
140
+ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
141
+ """Numpy implementation of the Frechet Distance.
142
+ The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
143
+ and X_2 ~ N(mu_2, C_2) is
144
+ d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
145
+
146
+ Stable version by Dougal J. Sutherland.
147
+
148
+ Params:
149
+ -- mu1 : Numpy array containing the activations of a layer of the
150
+ inception net (like returned by the function 'get_predictions')
151
+ for generated samples.
152
+ -- mu2 : The sample mean over activations, precalculated on a
153
+ representative data set.
154
+ -- sigma1: The covariance matrix over activations for generated samples.
155
+ -- sigma2: The covariance matrix over activations, precalculated on a
156
+ representative data set.
157
+
158
+ Returns:
159
+ -- : The Frechet Distance.
160
+ """
161
+
162
+ mu1 = np.atleast_1d(mu1)
163
+ mu2 = np.atleast_1d(mu2)
164
+
165
+ sigma1 = np.atleast_2d(sigma1)
166
+ sigma2 = np.atleast_2d(sigma2)
167
+
168
+ assert mu1.shape == mu2.shape, \
169
+ 'Training and test mean vectors have different lengths'
170
+ assert sigma1.shape == sigma2.shape, \
171
+ 'Training and test covariances have different dimensions'
172
+
173
+ diff = mu1 - mu2
174
+
175
+ # Product might be almost singular
176
+ covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
177
+ if not np.isfinite(covmean).all():
178
+ msg = ('fid calculation produces singular product; '
179
+ 'adding %s to diagonal of cov estimates') % eps
180
+ print(msg)
181
+ offset = np.eye(sigma1.shape[0]) * eps
182
+ covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
183
+
184
+ # Numerical error might give slight imaginary component
185
+ if np.iscomplexobj(covmean):
186
+ if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
187
+ m = np.max(np.abs(covmean.imag))
188
+ raise ValueError('Imaginary component {}'.format(m))
189
+ covmean = covmean.real
190
+
191
+ tr_covmean = np.trace(covmean)
192
+
193
+ return (diff.dot(diff) + np.trace(sigma1)
194
+ + np.trace(sigma2) - 2 * tr_covmean)
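# Quick sanity check of the formula above (illustrative only): for 1-D
# Gaussians it reduces to (mu1 - mu2)^2 + s1 + s2 - 2*sqrt(s1*s2), so
#   calculate_frechet_distance(np.zeros(1), np.ones((1, 1)),
#                              np.ones(1), np.ones((1, 1)))
# returns 1.0 (means one unit apart, identical unit covariances).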
195
+
196
+
197
+ def calculate_activation_statistics(files, model, batch_size=50, dims=2048,
198
+ device='cpu', num_workers=8):
199
+ """Calculation of the statistics used by the FID.
200
+ Params:
201
+ -- files : List of image files paths
202
+ -- model : Instance of inception model
203
+ -- batch_size : The images numpy array is split into batches with
204
+ batch size batch_size. A reasonable batch size
205
+ depends on the hardware.
206
+ -- dims : Dimensionality of features returned by Inception
207
+ -- device : Device to run calculations
208
+ -- num_workers : Number of parallel dataloader workers
209
+
210
+ Returns:
211
+ -- mu : The mean over samples of the activations of the pool_3 layer of
212
+ the inception model.
213
+ -- sigma : The covariance matrix of the activations of the pool_3 layer of
214
+ the inception model.
215
+ """
216
+ act = get_activations(files, model, batch_size, dims, device, num_workers)
217
+ mu = np.mean(act, axis=0)
218
+ sigma = np.cov(act, rowvar=False)
219
+ return mu, sigma
220
+
221
+
222
+ def compute_statistics_of_path(path, model, batch_size, dims, device, num_workers=8):
223
+ if path.endswith('.npz'):
224
+ with np.load(path) as f:
225
+ m, s = f['mu'][:], f['sigma'][:]
226
+ else:
227
+ path = pathlib.Path(path)
228
+ files = sorted([file for ext in IMAGE_EXTENSIONS
229
+ for file in path.glob('*.{}'.format(ext))])
230
+ m, s = calculate_activation_statistics(files, model, batch_size,
231
+ dims, device, num_workers)
232
+
233
+ return m, s
234
+
235
+
236
+ def save_statistics_of_path(path, out_path, device=None, batch_size=50, dims=2048, num_workers=8):
237
+ if device is None:
238
+ device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu')
239
+ else:
240
+ device = torch.device(device)
241
+ block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
242
+ model = InceptionV3([block_idx]).to(device)
243
+ m1, s1 = compute_statistics_of_path(path, model, batch_size, dims, device, num_workers)
244
+ np.savez(out_path, mu=m1, sigma=s1)
245
+
246
+
247
+ def calculate_fid_given_paths(paths, device=None, batch_size=50, dims=2048, num_workers=8):
248
+ """Calculates the FID of two paths"""
249
+ if device is None:
250
+ device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu')
251
+ else:
252
+ device = torch.device(device)
253
+
254
+ for p in paths:
255
+ if not os.path.exists(p):
256
+ raise RuntimeError('Invalid path: %s' % p)
257
+
258
+ block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
259
+
260
+ model = InceptionV3([block_idx]).to(device)
261
+
262
+ m1, s1 = compute_statistics_of_path(paths[0], model, batch_size,
263
+ dims, device, num_workers)
264
+ m2, s2 = compute_statistics_of_path(paths[1], model, batch_size,
265
+ dims, device, num_workers)
266
+ fid_value = calculate_frechet_distance(m1, s1, m2, s2)
267
+
268
+ return fid_value
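A hedged usage sketch of the two entry points above; the folder and .npz paths below are placeholders, not files shipped with this commit:

from tools.fid_score import save_statistics_of_path, calculate_fid_given_paths

# Pre-compute reference statistics once and cache them as an .npz file.
save_statistics_of_path('path/to/reference_images', 'path/to/ref_stats.npz')

# Either element of the pair may be an image folder or a pre-computed .npz file.
fid = calculate_fid_given_paths(('path/to/ref_stats.npz', 'path/to/generated_images'))
print(f'FID: {fid:.2f}')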
tools/inception.py ADDED
@@ -0,0 +1,328 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import torchvision
5
+
6
+ try:
7
+ from torchvision.models.utils import load_state_dict_from_url
8
+ except ImportError:
9
+ from torch.utils.model_zoo import load_url as load_state_dict_from_url
10
+
11
+ # Inception weights ported to Pytorch from
12
+ # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
13
+ FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth' # noqa: E501
14
+
15
+
16
+ class InceptionV3(nn.Module):
17
+ """Pretrained InceptionV3 network returning feature maps"""
18
+
19
+ # Index of default block of inception to return,
20
+ # corresponds to output of final average pooling
21
+ DEFAULT_BLOCK_INDEX = 3
22
+
23
+ # Maps feature dimensionality to their output blocks indices
24
+ BLOCK_INDEX_BY_DIM = {
25
+ 64: 0, # First max pooling features
26
+ 192: 1, # Second max pooling features
27
+ 768: 2, # Pre-aux classifier features
28
+ 2048: 3 # Final average pooling features
29
+ }
30
+
31
+ def __init__(self,
32
+ output_blocks=(DEFAULT_BLOCK_INDEX,),
33
+ resize_input=True,
34
+ normalize_input=True,
35
+ requires_grad=False,
36
+ use_fid_inception=True):
37
+ """Build pretrained InceptionV3
38
+
39
+ Parameters
40
+ ----------
41
+ output_blocks : list of int
42
+ Indices of blocks to return features of. Possible values are:
43
+ - 0: corresponds to output of first max pooling
44
+ - 1: corresponds to output of second max pooling
45
+ - 2: corresponds to output which is fed to aux classifier
46
+ - 3: corresponds to output of final average pooling
47
+ resize_input : bool
48
+ If true, bilinearly resizes input to width and height 299 before
49
+ feeding input to model. As the network without fully connected
50
+ layers is fully convolutional, it should be able to handle inputs
51
+ of arbitrary size, so resizing might not be strictly needed
52
+ normalize_input : bool
53
+ If true, scales the input from range (0, 1) to the range the
54
+ pretrained Inception network expects, namely (-1, 1)
55
+ requires_grad : bool
56
+ If true, parameters of the model require gradients. Possibly useful
57
+ for finetuning the network
58
+ use_fid_inception : bool
59
+ If true, uses the pretrained Inception model used in Tensorflow's
60
+ FID implementation. If false, uses the pretrained Inception model
61
+ available in torchvision. The FID Inception model has different
62
+ weights and a slightly different structure from torchvision's
63
+ Inception model. If you want to compute FID scores, you are
64
+ strongly advised to set this parameter to true to get comparable
65
+ results.
66
+ """
67
+ super(InceptionV3, self).__init__()
68
+
69
+ self.resize_input = resize_input
70
+ self.normalize_input = normalize_input
71
+ self.output_blocks = sorted(output_blocks)
72
+ self.last_needed_block = max(output_blocks)
73
+
74
+ assert self.last_needed_block <= 3, \
75
+ 'Last possible output block index is 3'
76
+
77
+ self.blocks = nn.ModuleList()
78
+
79
+ if use_fid_inception:
80
+ inception = fid_inception_v3()
81
+ else:
82
+ inception = _inception_v3(pretrained=True)
83
+
84
+ # Block 0: input to maxpool1
85
+ block0 = [
86
+ inception.Conv2d_1a_3x3,
87
+ inception.Conv2d_2a_3x3,
88
+ inception.Conv2d_2b_3x3,
89
+ nn.MaxPool2d(kernel_size=3, stride=2)
90
+ ]
91
+ self.blocks.append(nn.Sequential(*block0))
92
+
93
+ # Block 1: maxpool1 to maxpool2
94
+ if self.last_needed_block >= 1:
95
+ block1 = [
96
+ inception.Conv2d_3b_1x1,
97
+ inception.Conv2d_4a_3x3,
98
+ nn.MaxPool2d(kernel_size=3, stride=2)
99
+ ]
100
+ self.blocks.append(nn.Sequential(*block1))
101
+
102
+ # Block 2: maxpool2 to aux classifier
103
+ if self.last_needed_block >= 2:
104
+ block2 = [
105
+ inception.Mixed_5b,
106
+ inception.Mixed_5c,
107
+ inception.Mixed_5d,
108
+ inception.Mixed_6a,
109
+ inception.Mixed_6b,
110
+ inception.Mixed_6c,
111
+ inception.Mixed_6d,
112
+ inception.Mixed_6e,
113
+ ]
114
+ self.blocks.append(nn.Sequential(*block2))
115
+
116
+ # Block 3: aux classifier to final avgpool
117
+ if self.last_needed_block >= 3:
118
+ block3 = [
119
+ inception.Mixed_7a,
120
+ inception.Mixed_7b,
121
+ inception.Mixed_7c,
122
+ nn.AdaptiveAvgPool2d(output_size=(1, 1))
123
+ ]
124
+ self.blocks.append(nn.Sequential(*block3))
125
+
126
+ for param in self.parameters():
127
+ param.requires_grad = requires_grad
128
+
129
+ def forward(self, inp):
130
+ """Get Inception feature maps
131
+
132
+ Parameters
133
+ ----------
134
+ inp : torch.autograd.Variable
135
+ Input tensor of shape Bx3xHxW. Values are expected to be in
136
+ range (0, 1)
137
+
138
+ Returns
139
+ -------
140
+ List of torch.autograd.Variable, corresponding to the selected output
141
+ block, sorted ascending by index
142
+ """
143
+ outp = []
144
+ x = inp
145
+
146
+ if self.resize_input:
147
+ x = F.interpolate(x,
148
+ size=(299, 299),
149
+ mode='bilinear',
150
+ align_corners=False)
151
+
152
+ if self.normalize_input:
153
+ x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1)
154
+
155
+ for idx, block in enumerate(self.blocks):
156
+ x = block(x)
157
+ if idx in self.output_blocks:
158
+ outp.append(x)
159
+
160
+ if idx == self.last_needed_block:
161
+ break
162
+
163
+ return outp
164
+
165
+
166
+ def _inception_v3(*args, **kwargs):
167
+ """Wraps `torchvision.models.inception_v3`
168
+
169
+ Skips default weight initialization if supported by torchvision version.
170
+ See https://github.com/mseitzer/pytorch-fid/issues/28.
171
+ """
172
+ try:
173
+ version = tuple(map(int, torchvision.__version__.split('.')[:2]))
174
+ except ValueError:
175
+ # Just a caution against weird version strings
176
+ version = (0,)
177
+
178
+ if version >= (0, 6):
179
+ kwargs['init_weights'] = False
180
+
181
+ return torchvision.models.inception_v3(*args, **kwargs)
182
+
183
+
184
+ def fid_inception_v3():
185
+ """Build pretrained Inception model for FID computation
186
+
187
+ The Inception model for FID computation uses a different set of weights
188
+ and has a slightly different structure than torchvision's Inception.
189
+
190
+ This method first constructs torchvision's Inception and then patches the
191
+ necessary parts that are different in the FID Inception model.
192
+ """
193
+ inception = _inception_v3(num_classes=1008,
194
+ aux_logits=False,
195
+ pretrained=False)
196
+ inception.Mixed_5b = FIDInceptionA(192, pool_features=32)
197
+ inception.Mixed_5c = FIDInceptionA(256, pool_features=64)
198
+ inception.Mixed_5d = FIDInceptionA(288, pool_features=64)
199
+ inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128)
200
+ inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160)
201
+ inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160)
202
+ inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192)
203
+ inception.Mixed_7b = FIDInceptionE_1(1280)
204
+ inception.Mixed_7c = FIDInceptionE_2(2048)
205
+
206
+ state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, model_dir="checkpoints", progress=True)
207
+ inception.load_state_dict(state_dict)
208
+ return inception
209
+
210
+
211
+ class FIDInceptionA(torchvision.models.inception.InceptionA):
212
+ """InceptionA block patched for FID computation"""
213
+ def __init__(self, in_channels, pool_features):
214
+ super(FIDInceptionA, self).__init__(in_channels, pool_features)
215
+
216
+ def forward(self, x):
217
+ branch1x1 = self.branch1x1(x)
218
+
219
+ branch5x5 = self.branch5x5_1(x)
220
+ branch5x5 = self.branch5x5_2(branch5x5)
221
+
222
+ branch3x3dbl = self.branch3x3dbl_1(x)
223
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
224
+ branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
225
+
226
+ # Patch: Tensorflow's average pool does not use the padded zeros in
227
+ # its average calculation
228
+ branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
229
+ count_include_pad=False)
230
+ branch_pool = self.branch_pool(branch_pool)
231
+
232
+ outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
233
+ return torch.cat(outputs, 1)
234
+
235
+
236
+ class FIDInceptionC(torchvision.models.inception.InceptionC):
237
+ """InceptionC block patched for FID computation"""
238
+ def __init__(self, in_channels, channels_7x7):
239
+ super(FIDInceptionC, self).__init__(in_channels, channels_7x7)
240
+
241
+ def forward(self, x):
242
+ branch1x1 = self.branch1x1(x)
243
+
244
+ branch7x7 = self.branch7x7_1(x)
245
+ branch7x7 = self.branch7x7_2(branch7x7)
246
+ branch7x7 = self.branch7x7_3(branch7x7)
247
+
248
+ branch7x7dbl = self.branch7x7dbl_1(x)
249
+ branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
250
+ branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
251
+ branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
252
+ branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)
253
+
254
+ # Patch: Tensorflow's average pool does not use the padded zeros in
255
+ # its average calculation
256
+ branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
257
+ count_include_pad=False)
258
+ branch_pool = self.branch_pool(branch_pool)
259
+
260
+ outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool]
261
+ return torch.cat(outputs, 1)
262
+
263
+
264
+ class FIDInceptionE_1(torchvision.models.inception.InceptionE):
265
+ """First InceptionE block patched for FID computation"""
266
+ def __init__(self, in_channels):
267
+ super(FIDInceptionE_1, self).__init__(in_channels)
268
+
269
+ def forward(self, x):
270
+ branch1x1 = self.branch1x1(x)
271
+
272
+ branch3x3 = self.branch3x3_1(x)
273
+ branch3x3 = [
274
+ self.branch3x3_2a(branch3x3),
275
+ self.branch3x3_2b(branch3x3),
276
+ ]
277
+ branch3x3 = torch.cat(branch3x3, 1)
278
+
279
+ branch3x3dbl = self.branch3x3dbl_1(x)
280
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
281
+ branch3x3dbl = [
282
+ self.branch3x3dbl_3a(branch3x3dbl),
283
+ self.branch3x3dbl_3b(branch3x3dbl),
284
+ ]
285
+ branch3x3dbl = torch.cat(branch3x3dbl, 1)
286
+
287
+ # Patch: Tensorflow's average pool does not use the padded zeros in
288
+ # its average calculation
289
+ branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
290
+ count_include_pad=False)
291
+ branch_pool = self.branch_pool(branch_pool)
292
+
293
+ outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
294
+ return torch.cat(outputs, 1)
295
+
296
+
297
+ class FIDInceptionE_2(torchvision.models.inception.InceptionE):
298
+ """Second InceptionE block patched for FID computation"""
299
+ def __init__(self, in_channels):
300
+ super(FIDInceptionE_2, self).__init__(in_channels)
301
+
302
+ def forward(self, x):
303
+ branch1x1 = self.branch1x1(x)
304
+
305
+ branch3x3 = self.branch3x3_1(x)
306
+ branch3x3 = [
307
+ self.branch3x3_2a(branch3x3),
308
+ self.branch3x3_2b(branch3x3),
309
+ ]
310
+ branch3x3 = torch.cat(branch3x3, 1)
311
+
312
+ branch3x3dbl = self.branch3x3dbl_1(x)
313
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
314
+ branch3x3dbl = [
315
+ self.branch3x3dbl_3a(branch3x3dbl),
316
+ self.branch3x3dbl_3b(branch3x3dbl),
317
+ ]
318
+ branch3x3dbl = torch.cat(branch3x3dbl, 1)
319
+
320
+ # Patch: The FID Inception model uses max pooling instead of average
321
+ # pooling. This is likely an error in this specific Inception
322
+ # implementation, as other Inception models use average pooling here
323
+ # (which matches the description in the paper).
324
+ branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
325
+ branch_pool = self.branch_pool(branch_pool)
326
+
327
+ outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
328
+ return torch.cat(outputs, 1)
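A minimal sketch of how the wrapper above is typically driven to obtain the 2048-d pool3 features used for FID (inputs must already be scaled to [0, 1]; the FID weights are downloaded into ./checkpoints on first use):

import torch
from tools.inception import InceptionV3

block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[2048]   # final average-pooling block
model = InceptionV3([block_idx]).eval()

with torch.no_grad():
    feats = model(torch.rand(8, 3, 299, 299))[0]   # (8, 2048, 1, 1)
feats = feats.squeeze(3).squeeze(2)                # (8, 2048)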
train_t2i.py ADDED
@@ -0,0 +1,328 @@
1
+ import ml_collections
2
+ import torch
3
+ from torch import multiprocessing as mp
4
+ from datasets import get_dataset
5
+ from torchvision.utils import make_grid, save_image
6
+ import utils
7
+ import einops
8
+ from torch.utils._pytree import tree_map
9
+ import accelerate
10
+ from torch.utils.data import DataLoader
11
+ from tqdm.auto import tqdm
12
+ import tempfile
13
+ from absl import logging
14
+ import builtins
15
+ import os
16
+ import wandb
17
+ import numpy as np
18
+ import time
19
+ import random
20
+
21
+ import libs.autoencoder
22
+ from libs.t5 import T5Embedder
23
+ from libs.clip import FrozenCLIPEmbedder
24
+ from diffusion.flow_matching import FlowMatching, ODEFlowMatchingSolver, ODEEulerFlowMatchingSolver
25
+ from tools.fid_score import calculate_fid_given_paths
26
+ from tools.clip_score import ClipSocre
27
+
28
+
29
+ def train(config):
30
+ if config.get('benchmark', False):
31
+ torch.backends.cudnn.benchmark = True
32
+ torch.backends.cudnn.deterministic = False
33
+
34
+ mp.set_start_method('spawn')
35
+ accelerator = accelerate.Accelerator()
36
+ device = accelerator.device
37
+ accelerate.utils.set_seed(config.seed, device_specific=True)
38
+ logging.info(f'Process {accelerator.process_index} using device: {device}')
39
+
40
+ config.mixed_precision = accelerator.mixed_precision
41
+ config = ml_collections.FrozenConfigDict(config)
42
+
43
+ assert config.train.batch_size % accelerator.num_processes == 0
44
+ mini_batch_size = config.train.batch_size // accelerator.num_processes
45
+
46
+ if accelerator.is_main_process:
47
+ os.makedirs(config.ckpt_root, exist_ok=True)
48
+ os.makedirs(config.sample_dir, exist_ok=True)
49
+ accelerator.wait_for_everyone()
50
+ if accelerator.is_main_process:
51
+ wandb.init(dir=os.path.abspath(config.workdir), project=f'uvit_{config.dataset.name}', config=config.to_dict(),
52
+ name=config.hparams, job_type='train', mode='offline')
53
+ utils.set_logger(log_level='info', fname=os.path.join(config.workdir, 'output.log'))
54
+ logging.info(config)
55
+ else:
56
+ utils.set_logger(log_level='error')
57
+ builtins.print = lambda *args: None
58
+ logging.info(f'Run on {accelerator.num_processes} devices')
59
+
60
+ dataset = get_dataset(**config.dataset)
61
+ assert os.path.exists(dataset.fid_stat)
62
+
63
+ gpu_model = torch.cuda.get_device_name(torch.cuda.current_device())
64
+ num_workers = 8
65
+
66
+ train_dataset = dataset.get_split(split='train', labeled=True)
67
+ train_dataset_loader = DataLoader(train_dataset, batch_size=mini_batch_size, shuffle=True, drop_last=True,
68
+ num_workers=num_workers, pin_memory=True, persistent_workers=True)
69
+
70
+ test_dataset = dataset.get_split(split='test', labeled=True) # for sampling
71
+ test_dataset_loader = DataLoader(test_dataset, batch_size=config.sample.mini_batch_size, shuffle=True, drop_last=True,
72
+ num_workers=num_workers, pin_memory=True, persistent_workers=True)
73
+
74
+ train_state = utils.initialize_train_state(config, device)
75
+ nnet, nnet_ema, optimizer, train_dataset_loader, test_dataset_loader = accelerator.prepare(
76
+ train_state.nnet, train_state.nnet_ema, train_state.optimizer, train_dataset_loader, test_dataset_loader)
77
+ lr_scheduler = train_state.lr_scheduler
78
+ train_state.resume(config.ckpt_root)
79
+
80
+ autoencoder = libs.autoencoder.get_model(**config.autoencoder)
81
+ autoencoder.to(device)
82
+
83
+ if config.nnet.model_args.clip_dim == 4096:
84
+ llm = "t5"
85
+ t5 = T5Embedder(device=device)
86
+ elif config.nnet.model_args.clip_dim == 768:
87
+ llm = "clip"
88
+ clip = FrozenCLIPEmbedder()
89
+ clip.eval()
90
+ clip.to(device)
91
+ else:
92
+ raise NotImplementedError
93
+
94
+ ss_empty_context = None
95
+
96
+ ClipSocre_model = ClipSocre(device=device)
97
+
98
+ @ torch.cuda.amp.autocast()
99
+ def encode(_batch):
100
+ return autoencoder.encode(_batch)
101
+
102
+ @ torch.cuda.amp.autocast()
103
+ def decode(_batch):
104
+ return autoencoder.decode(_batch)
105
+
106
+ def get_data_generator():
107
+ while True:
108
+ for data in tqdm(train_dataset_loader, disable=not accelerator.is_main_process, desc='epoch'):
109
+ yield data
110
+
111
+ data_generator = get_data_generator()
112
+
113
+ def get_context_generator(autoencoder):
114
+ while True:
115
+ for data in test_dataset_loader:
116
+ if len(data) == 5:
117
+ _img, _context, _token_mask, _token, _caption = data
118
+ else:
119
+ _img, _context = data
120
+ _token_mask = None
121
+ _token = None
122
+ _caption = None
123
+
124
+ if len(_img.shape)==5:
125
+ _testbatch_img_blurred = autoencoder.sample(_img[:,1,:])
126
+ yield _context, _token_mask, _token, _caption, _testbatch_img_blurred
127
+ else:
128
+ assert len(_img.shape)==4
129
+ yield _context, _token_mask, _token, _caption, None
130
+
131
+ context_generator = get_context_generator(autoencoder)
132
+
133
+ _flow_matching_model = FlowMatching()
134
+
135
+ def train_step(_batch, _ss_empty_context):
136
+ _metrics = dict()
137
+ optimizer.zero_grad()
138
+
139
+ assert len(_batch)==6
140
+ assert not config.dataset.cfg
141
+ _batch_img = _batch[0]
142
+ _batch_con = _batch[1]
143
+ _batch_mask = _batch[2]
144
+ _batch_token = _batch[3]
145
+ _batch_caption = _batch[4]
146
+ _batch_img_ori = _batch[5]
147
+
148
+ _z = autoencoder.sample(_batch_img)
149
+
150
+ loss, loss_dict = _flow_matching_model(_z, nnet, loss_coeffs=config.loss_coeffs, cond=_batch_con, con_mask=_batch_mask, batch_img_clip=_batch_img_ori, \
151
+ nnet_style=config.nnet.name, text_token=_batch_token, model_config=config.nnet.model_args, all_config=config, training_step=train_state.step)
152
+
153
+ _metrics['loss'] = accelerator.gather(loss.detach()).mean()
154
+ for key in loss_dict.keys():
155
+ _metrics[key] = accelerator.gather(loss_dict[key].detach()).mean()
156
+ accelerator.backward(loss.mean())
157
+ optimizer.step()
158
+ lr_scheduler.step()
159
+ train_state.ema_update(config.get('ema_rate', 0.9999))
160
+ train_state.step += 1
161
+ return dict(lr=train_state.optimizer.param_groups[0]['lr'], **_metrics)
162
+
163
+ def ode_fm_solver_sample(nnet_ema, _n_samples, _sample_steps, context=None, caption=None, testbatch_img_blurred=None, two_stage_generation=-1, token_mask=None, return_clipScore=False, ClipSocre_model=None):
164
+ with torch.no_grad():
165
+ _z_gaussian = torch.randn(_n_samples, *config.z_shape, device=device)
166
+
167
+ _z_x0, _mu, _log_var = nnet_ema(context, text_encoder = True, shape = _z_gaussian.shape, mask=token_mask)
168
+ _z_init = _z_x0.reshape(_z_gaussian.shape)
169
+
170
+ assert config.sample.scale > 1
171
+ _cfg = config.sample.scale
172
+
173
+ has_null_indicator = hasattr(config.nnet.model_args, "cfg_indicator")
174
+
175
+ ode_solver = ODEEulerFlowMatchingSolver(nnet_ema, step_size_type="step_in_dsigma", guidance_scale=_cfg)
176
+ _z, _ = ode_solver.sample(x_T=_z_init, batch_size=_n_samples, sample_steps=_sample_steps, unconditional_guidance_scale=_cfg, has_null_indicator=has_null_indicator)
177
+
178
+ image_unprocessed = decode(_z)
179
+
180
+ if return_clipScore:
181
+ clip_score = ClipSocre_model.calculate_clip_score(caption, image_unprocessed)
182
+ return image_unprocessed, clip_score
183
+ else:
184
+ return image_unprocessed
185
+
186
+ def eval_step(n_samples, sample_steps):
187
+ logging.info(f'eval_step: n_samples={n_samples}, sample_steps={sample_steps}, algorithm=ODE_Euler_Flow_Matching_Solver, '
188
+ f'mini_batch_size={config.sample.mini_batch_size}')
189
+
190
+ def sample_fn(_n_samples, return_caption=False, return_clipScore=False, ClipSocre_model=None, config=None):
191
+ _context, _token_mask, _token, _caption, _testbatch_img_blurred = next(context_generator)
192
+ assert _context.size(0) == _n_samples
193
+ assert not return_caption # during training we should not use this
194
+ if return_caption:
195
+ return ode_fm_solver_sample(nnet_ema, _n_samples, sample_steps, context=_context, token_mask=_token_mask), _caption
196
+ elif return_clipScore:
197
+ return ode_fm_solver_sample(nnet_ema, _n_samples, sample_steps, context=_context, token_mask=_token_mask, return_clipScore=return_clipScore, ClipSocre_model=ClipSocre_model, caption=_caption)
198
+ else:
199
+ return ode_fm_solver_sample(nnet_ema, _n_samples, sample_steps, context=_context, token_mask=_token_mask)
200
+
201
+ with tempfile.TemporaryDirectory() as temp_path:
202
+ path = config.sample.path or temp_path
203
+ if accelerator.is_main_process:
204
+ os.makedirs(path, exist_ok=True)
205
+ clip_score_list = utils.sample2dir(accelerator, path, n_samples, config.sample.mini_batch_size, sample_fn, dataset.unpreprocess, return_clipScore=True, ClipSocre_model=ClipSocre_model, config=config)
206
+ _fid = 0
207
+ if accelerator.is_main_process:
208
+ _fid = calculate_fid_given_paths((dataset.fid_stat, path))
209
+ _clip_score_list = torch.cat(clip_score_list)
210
+ logging.info(f'step={train_state.step} fid{n_samples}={_fid} clip_score{len(_clip_score_list)} = {_clip_score_list.mean().item()}')
211
+ with open(os.path.join(config.workdir, 'eval.log'), 'a') as f:
212
+ print(f'step={train_state.step} fid{n_samples}={_fid} clip_score{len(_clip_score_list)} = {_clip_score_list.mean().item()}', file=f)
213
+ wandb.log({f'fid{n_samples}': _fid}, step=train_state.step)
214
+ _fid = torch.tensor(_fid, device=device)
215
+ _fid = accelerator.reduce(_fid, reduction='sum')
216
+
217
+ return _fid.item()
218
+
219
+ logging.info(f'Start fitting, step={train_state.step}, mixed_precision={config.mixed_precision}')
220
+
221
+ step_fid = []
222
+ while train_state.step < config.train.n_steps:
223
+ nnet.train()
224
+ batch = tree_map(lambda x: x, next(data_generator))
225
+ metrics = train_step(batch, ss_empty_context)
226
+
227
+ nnet.eval()
228
+ if accelerator.is_main_process and train_state.step % config.train.log_interval == 0:
229
+ logging.info(utils.dct2str(dict(step=train_state.step, **metrics)))
230
+ logging.info(config.workdir)
231
+ wandb.log(metrics, step=train_state.step)
232
+
233
+ ############# save a grid of sample images
234
+ if train_state.step % config.train.eval_interval == 0:
235
+ torch.cuda.empty_cache()
236
+ logging.info('Save a grid of images...')
237
+ if hasattr(dataset, "token_embedding"):
238
+ contexts = torch.tensor(dataset.token_embedding, device=device)[ : config.train.n_samples_eval]
239
+ token_mask = torch.tensor(dataset.token_mask, device=device)[ : config.train.n_samples_eval]
240
+ elif hasattr(dataset, "contexts"):
241
+ contexts = torch.tensor(dataset.contexts, device=device)[ : config.train.n_samples_eval]
242
+ token_mask = None
243
+ else:
244
+ raise NotImplementedError
245
+ samples = ode_fm_solver_sample(nnet_ema, _n_samples=config.train.n_samples_eval, _sample_steps=50, context=contexts, token_mask=token_mask)
246
+ samples = make_grid(dataset.unpreprocess(samples), 5)
247
+ if accelerator.is_main_process:
248
+ save_image(samples, os.path.join(config.sample_dir, f'{train_state.step}.png'))
249
+ wandb.log({'samples': wandb.Image(samples)}, step=train_state.step)
250
+ accelerator.wait_for_everyone()
251
+ torch.cuda.empty_cache()
252
+
253
+ ############ save checkpoint and evaluate results
254
+ if train_state.step % config.train.save_interval == 0 or train_state.step == config.train.n_steps:
255
+ torch.cuda.empty_cache()
256
+ logging.info(f'Save and eval checkpoint {train_state.step}...')
257
+
258
+ if accelerator.local_process_index == 0:
259
+ train_state.save(os.path.join(config.ckpt_root, f'{train_state.step}.ckpt'))
260
+ accelerator.wait_for_everyone()
261
+
262
+ fid = eval_step(n_samples=10000, sample_steps=50) # calculate fid of the saved checkpoint
263
+ step_fid.append((train_state.step, fid))
264
+
265
+ torch.cuda.empty_cache()
266
+ accelerator.wait_for_everyone()
267
+
268
+ logging.info(f'Finish fitting, step={train_state.step}')
269
+ logging.info(f'step_fid: {step_fid}')
270
+ step_best = sorted(step_fid, key=lambda x: x[1])[0][0]
271
+ logging.info(f'step_best: {step_best}')
272
+ train_state.load(os.path.join(config.ckpt_root, f'{step_best}.ckpt'))
273
+ del metrics
274
+ accelerator.wait_for_everyone()
275
+ eval_step(n_samples=config.sample.n_samples, sample_steps=config.sample.sample_steps)
276
+
277
+
278
+
279
+ from absl import flags
280
+ from absl import app
281
+ from ml_collections import config_flags
282
+ import sys
283
+ from pathlib import Path
284
+
285
+
286
+ FLAGS = flags.FLAGS
287
+ config_flags.DEFINE_config_file(
288
+ "config", None, "Training configuration.", lock_config=False)
289
+ flags.mark_flags_as_required(["config"])
290
+ flags.DEFINE_string("workdir", None, "Work unit directory.")
291
+
292
+
293
+ def get_config_name():
294
+ argv = sys.argv
295
+ for i in range(1, len(argv)):
296
+ if argv[i].startswith('--config='):
297
+ return Path(argv[i].split('=')[-1]).stem
298
+
299
+
300
+ def get_hparams():
301
+ argv = sys.argv
302
+ lst = []
303
+ for i in range(1, len(argv)):
304
+ assert '=' in argv[i]
305
+ if argv[i].startswith('--config.') and not argv[i].startswith('--config.dataset.path'):
306
+ hparam, val = argv[i].split('=')
307
+ hparam = hparam.split('.')[-1]
308
+ if hparam.endswith('path'):
309
+ val = Path(val).stem
310
+ lst.append(f'{hparam}={val}')
311
+ hparams = '-'.join(lst)
312
+ if hparams == '':
313
+ hparams = 'default'
314
+ return hparams
315
+
316
+
317
+ def main(argv):
318
+ config = FLAGS.config
319
+ config.config_name = get_config_name()
320
+ config.hparams = get_hparams()
321
+ config.workdir = FLAGS.workdir or os.path.join('workdir', config.config_name, config.hparams)
322
+ config.ckpt_root = os.path.join(config.workdir, 'ckpts')
323
+ config.sample_dir = os.path.join(config.workdir, 'samples')
324
+ train(config)
325
+
326
+
327
+ if __name__ == "__main__":
328
+ app.run(main)
utils.py ADDED
@@ -0,0 +1,274 @@
1
+ """
2
+ Utility helpers shared by the training and sampling scripts: logging setup, optimizer and LR-scheduler construction, EMA, checkpointing (TrainState), and batched sampling to disk.
3
+ """
4
+ import torch
5
+ import torch.nn as nn
6
+ import numpy as np
7
+ import os
8
+ from tqdm import tqdm
9
+ from torchvision import transforms
10
+ from torchvision.utils import save_image
11
+ from absl import logging
12
+ from PIL import Image, ImageDraw, ImageFont
13
+ import textwrap
14
+
15
+ def save_image_with_caption(image_tensor, caption, filename, font_size=20, font_path='/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf'):
16
+ """
17
+ Save an image with a caption
18
+ """
19
+ image_tensor = image_tensor.clone().detach()
20
+ image_tensor = torch.clamp(image_tensor, min=0, max=1)
21
+ image_pil = transforms.ToPILImage()(image_tensor)
22
+ draw = ImageDraw.Draw(image_pil)
23
+
24
+ font = ImageFont.truetype(font_path, font_size)
25
+ wrap_text = textwrap.wrap(caption, width=len(caption)//4 + 1)
26
+ text_sizes = [draw.textsize(line, font=font) for line in wrap_text]
27
+ max_text_width = max(size[0] for size in text_sizes)
28
+ total_text_height = sum(size[1] for size in text_sizes) + 15
29
+
30
+ new_height = image_pil.height + total_text_height + 25
31
+ new_image = Image.new('RGB', (image_pil.width, new_height), 'white')
32
+ new_image.paste(image_pil, (0, 0))
33
+ current_y = image_pil.height + 5
34
+ draw = ImageDraw.Draw(new_image)
35
+
36
+ for line, size in zip(wrap_text, text_sizes):
37
+ x = (new_image.width - size[0]) / 2
38
+ draw.text((x, current_y), line, font=font, fill='black')
39
+ current_y += size[1] + 5
40
+ new_image.save(filename)
41
+
42
+
43
+ def set_logger(log_level='info', fname=None):
44
+ import logging as _logging
45
+ handler = logging.get_absl_handler()
46
+ formatter = _logging.Formatter('%(asctime)s - %(filename)s - %(message)s')
47
+ handler.setFormatter(formatter)
48
+ logging.set_verbosity(log_level)
49
+ if fname is not None:
50
+ handler = _logging.FileHandler(fname)
51
+ handler.setFormatter(formatter)
52
+ logging.get_absl_logger().addHandler(handler)
53
+
54
+
55
+ def dct2str(dct):
56
+ return str({k: f'{v:.6g}' for k, v in dct.items()})
57
+
58
+
59
+ def get_nnet(name, **kwargs):
60
+ if name == 'dimr':
61
+ from libs.model.dimr_t2i import MRModel
62
+ return MRModel(kwargs["model_args"])
63
+ elif name == 'dit':
64
+ from libs.model.dit_t2i import DiT_H_2
65
+ return DiT_H_2(kwargs["model_args"])
66
+ else:
67
+ raise NotImplementedError(name)
68
+
69
+
70
+ def set_seed(seed: int):
71
+ if seed is not None:
72
+ torch.manual_seed(seed)
73
+ np.random.seed(seed)
74
+
75
+
76
+ def get_optimizer(params, name, **kwargs):
77
+ if name == 'adam':
78
+ from torch.optim import Adam
79
+ return Adam(params, **kwargs)
80
+ elif name == 'adamw':
81
+ from torch.optim import AdamW
82
+ return AdamW(params, **kwargs)
83
+ else:
84
+ raise NotImplementedError(name)
85
+
86
+
87
+ def customized_lr_scheduler(optimizer, warmup_steps=-1):
88
+ from torch.optim.lr_scheduler import LambdaLR
89
+ def fn(step):
90
+ if warmup_steps > 0:
91
+ return min(step / warmup_steps, 1)
92
+ else:
93
+ return 1
94
+ return LambdaLR(optimizer, fn)
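# Example: with warmup_steps=5000 the returned multiplier ramps linearly from
# 0 to 1 over the first 5000 optimizer steps and then stays at 1 (no decay);
# the default warmup_steps=-1 keeps the multiplier at 1 from the start.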
95
+
96
+
97
+ def get_lr_scheduler(optimizer, name, **kwargs):
98
+ if name == 'customized':
99
+ return customized_lr_scheduler(optimizer, **kwargs)
100
+ elif name == 'cosine':
101
+ from torch.optim.lr_scheduler import CosineAnnealingLR
102
+ return CosineAnnealingLR(optimizer, **kwargs)
103
+ else:
104
+ raise NotImplementedError(name)
105
+
106
+
107
+ def ema(model_dest: nn.Module, model_src: nn.Module, rate):
108
+ param_dict_src = dict(model_src.named_parameters())
109
+ for p_name, p_dest in model_dest.named_parameters():
110
+ p_src = param_dict_src[p_name]
111
+ assert p_src is not p_dest
112
+ p_dest.data.mul_(rate).add_((1 - rate) * p_src.data)
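# Example: with rate=0.9999 each call keeps 99.99% of the EMA weight and mixes
# in 0.01% of the current training weight, i.e. p_ema <- 0.9999*p_ema + 0.0001*p;
# ema_update(0), used when the train state is initialised below, copies the
# weights outright.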
113
+
114
+
115
+ class TrainState(object):
116
+ def __init__(self, optimizer, lr_scheduler, step, nnet=None, nnet_ema=None):
117
+ self.optimizer = optimizer
118
+ self.lr_scheduler = lr_scheduler
119
+ self.step = step
120
+ self.nnet = nnet
121
+ self.nnet_ema = nnet_ema
122
+
123
+ def ema_update(self, rate=0.9999):
124
+ if self.nnet_ema is not None:
125
+ ema(self.nnet_ema, self.nnet, rate)
126
+
127
+ def save(self, path):
128
+ os.makedirs(path, exist_ok=True)
129
+ torch.save(self.step, os.path.join(path, 'step.pth'))
130
+ for key, val in self.__dict__.items():
131
+ if key != 'step' and val is not None:
132
+ torch.save(val.state_dict(), os.path.join(path, f'{key}.pth'))
133
+
134
+ def load(self, path):
135
+ logging.info(f'load from {path}')
136
+ self.step = torch.load(os.path.join(path, 'step.pth'))
137
+ for key, val in self.__dict__.items():
138
+ if key != 'step' and val is not None:
139
+ val.load_state_dict(torch.load(os.path.join(path, f'{key}.pth'), map_location='cpu'))
140
+
141
+ def resume(self, ckpt_root, step=None):
142
+ if not os.path.exists(ckpt_root):
143
+ return
144
+ if step is None:
145
+ ckpts = list(filter(lambda x: '.ckpt' in x, os.listdir(ckpt_root)))
146
+ if not ckpts:
147
+ return
148
+ steps = map(lambda x: int(x.split(".")[0]), ckpts)
149
+ step = max(steps)
150
+ ckpt_path = os.path.join(ckpt_root, f'{step}.ckpt')
151
+ logging.info(f'resume from {ckpt_path}')
152
+ self.load(ckpt_path)
153
+
154
+ def to(self, device):
155
+ for key, val in self.__dict__.items():
156
+ if isinstance(val, nn.Module):
157
+ val.to(device)
158
+
159
+
160
+ def trainable_parameters(nnet):
161
+ params_decay = []
162
+ params_nodecay = []
163
+ for name, param in nnet.named_parameters():
164
+ if name.endswith(".nodecay_weight") or name.endswith(".nodecay_bias"):
165
+ params_nodecay.append(param)
166
+ else:
167
+ params_decay.append(param)
168
+ print("params_decay", len(params_decay))
169
+ print("params_nodecay", len(params_nodecay))
170
+ params = [
171
+ {'params': params_decay},
172
+ {'params': params_nodecay, 'weight_decay': 0.0}
173
+ ]
174
+ return params
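# The two groups above are passed to get_optimizer() in initialize_train_state()
# below, so the optimizer applies its configured weight decay to the first group
# and no decay at all to the second.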
175
+
176
+
177
+ def initialize_train_state(config, device):
178
+
179
+ nnet = get_nnet(**config.nnet)
180
+ nnet_ema = get_nnet(**config.nnet)
181
+ nnet_ema.eval()
182
+
183
+ optimizer = get_optimizer(trainable_parameters(nnet), **config.optimizer)
184
+ lr_scheduler = get_lr_scheduler(optimizer, **config.lr_scheduler)
185
+
186
+ train_state = TrainState(optimizer=optimizer, lr_scheduler=lr_scheduler, step=0,
187
+ nnet=nnet, nnet_ema=nnet_ema)
188
+ train_state.ema_update(0)
189
+ train_state.to(device)
190
+ return train_state
191
+
192
+
193
+ def amortize(n_samples, batch_size):
194
+ k = n_samples // batch_size
195
+ r = n_samples % batch_size
196
+ return k * [batch_size] if r == 0 else k * [batch_size] + [r]
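# Example: amortize(10, 4) -> [4, 4, 2]; the sampling loops below draw full
# batches plus one final partial batch so that exactly n_samples images are
# written.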
197
+
198
+
199
+ def sample2dir(accelerator, path, n_samples, mini_batch_size, sample_fn, unpreprocess_fn=None, return_clipScore=False, ClipSocre_model=None, config=None):
200
+ os.makedirs(path, exist_ok=True)
201
+ idx = 0
202
+ batch_size = mini_batch_size * accelerator.num_processes
203
+ clip_score_list = []
204
+
205
+ if return_clipScore:
206
+ assert ClipSocre_model is not None
207
+
208
+ for _batch_size in tqdm(amortize(n_samples, batch_size), disable=not accelerator.is_main_process, desc='sample2dir'):
209
+ samples, clip_score = sample_fn(mini_batch_size, return_clipScore=return_clipScore, ClipSocre_model=ClipSocre_model, config=config)
210
+ samples = unpreprocess_fn(samples)
211
+ samples = accelerator.gather(samples.contiguous())[:_batch_size]
212
+ clip_score_list.append(accelerator.gather(clip_score)[:_batch_size])
213
+ if accelerator.is_main_process:
214
+ for sample in samples:
215
+ save_image(sample, os.path.join(path, f"{idx}.png"))
216
+ idx += 1
217
+
218
+ if return_clipScore:
219
+ return clip_score_list
220
+ else:
221
+ return None
222
+
223
+
224
+ def sample2dir_wCLIP(accelerator, path, n_samples, mini_batch_size, sample_fn, unpreprocess_fn=None, return_clipScore=False, ClipSocre_model=None, config=None):
225
+ os.makedirs(path, exist_ok=True)
226
+ idx = 0
227
+ batch_size = mini_batch_size * accelerator.num_processes
228
+ clip_score_list = []
229
+
230
+ if return_clipScore:
231
+ assert ClipSocre_model is not None
232
+
233
+ for _batch_size in amortize(n_samples, batch_size):
234
+ samples, clip_score = sample_fn(mini_batch_size, return_clipScore=return_clipScore, ClipSocre_model=ClipSocre_model, config=config)
235
+ samples = unpreprocess_fn(samples)
236
+ samples = accelerator.gather(samples.contiguous())[:_batch_size]
237
+ clip_score_list.append(accelerator.gather(clip_score)[:_batch_size])
238
+ if accelerator.is_main_process:
239
+ for sample in samples:
240
+ save_image(sample, os.path.join(path, f"{idx}.png"))
241
+ idx += 1
242
+ break
243
+
244
+ if return_clipScore:
245
+ return clip_score_list
246
+ else:
247
+ return None
248
+
249
+
250
+ def sample2dir_wPrompt(accelerator, path, n_samples, mini_batch_size, sample_fn, unpreprocess_fn=None, config=None):
251
+ os.makedirs(path, exist_ok=True)
252
+ idx = 0
253
+ batch_size = mini_batch_size * accelerator.num_processes
254
+
255
+ for _batch_size in tqdm(amortize(n_samples, batch_size), disable=not accelerator.is_main_process, desc='sample2dir'):
256
+ samples, samples_caption = sample_fn(mini_batch_size, return_caption=True, config=config)
257
+ samples = unpreprocess_fn(samples)
258
+ samples = accelerator.gather(samples.contiguous())[:_batch_size]
259
+ if accelerator.is_main_process:
260
+ for sample, caption in zip(samples,samples_caption):
261
+ try:
262
+ save_image_with_caption(sample, caption, os.path.join(path, f"{idx}.png"))
263
+ except Exception:
264
+ save_image(sample, os.path.join(path, f"{idx}.png"))
265
+ idx += 1
266
+
267
+
268
+ def grad_norm(model):
269
+ total_norm = 0.
270
+ for p in model.parameters():
271
+ param_norm = p.grad.data.norm(2)
272
+ total_norm += param_norm.item() ** 2
273
+ total_norm = total_norm ** (1. / 2)
274
+ return total_norm