ProgramerSalar committed
Commit e661967 · 1 Parent(s): 32b7c72
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Yang Jin
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
diffusion_schedulers/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .scheduling_cosine_ddpm import DDPMCosineScheduler
+ from .scheduling_flow_matching import PyramidFlowMatchEulerDiscreteScheduler
diffusion_schedulers/scheduling_cosine_ddpm.py ADDED
@@ -0,0 +1,137 @@
1
+ import math
2
+ from dataclasses import dataclass
3
+ from typing import List, Optional, Tuple, Union
4
+
5
+ import torch
6
+
7
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
8
+ from diffusers.utils import BaseOutput
9
+ from diffusers.utils.torch_utils import randn_tensor
10
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
11
+
12
+
13
+ @dataclass
14
+ class DDPMSchedulerOutput(BaseOutput):
15
+ """
16
+ Output class for the scheduler's step function output.
17
+
18
+ Args:
19
+ prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
20
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
21
+ denoising loop.
22
+ """
23
+
24
+ prev_sample: torch.Tensor
25
+
26
+
27
+ class DDPMCosineScheduler(SchedulerMixin, ConfigMixin):
28
+
29
+ @register_to_config
30
+ def __init__(
31
+ self,
32
+ scaler: float = 1.0,
33
+ s: float = 0.008,
34
+ ):
35
+ self.scaler = scaler
36
+ self.s = torch.tensor([s])
37
+ self._init_alpha_cumprod = torch.cos(self.s / (1 + self.s) * torch.pi * 0.5) ** 2
38
+
39
+ # standard deviation of the initial noise distribution
40
+ self.init_noise_sigma = 1.0
41
+
42
+ def _alpha_cumprod(self, t, device):
43
+ if self.scaler > 1:
44
+ t = 1 - (1 - t) ** self.scaler
45
+ elif self.scaler < 1:
46
+ t = t**self.scaler
47
+ alpha_cumprod = torch.cos(
48
+ (t + self.s.to(device)) / (1 + self.s.to(device)) * torch.pi * 0.5
49
+ ) ** 2 / self._init_alpha_cumprod.to(device)
50
+ return alpha_cumprod.clamp(0.0001, 0.9999)
51
+
52
+ def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
53
+ """
54
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
55
+ current timestep.
56
+
57
+ Args:
58
+ sample (`torch.Tensor`): input sample
59
+ timestep (`int`, optional): current timestep
60
+
61
+ Returns:
62
+ `torch.Tensor`: scaled input sample
63
+ """
64
+ return sample
65
+
66
+ def set_timesteps(
67
+ self,
68
+ num_inference_steps: int = None,
69
+ timesteps: Optional[List[int]] = None,
70
+ device: Union[str, torch.device] = None,
71
+ ):
72
+ """
73
+ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
74
+
75
+ Args:
76
+ num_inference_steps (`Dict[float, int]`):
77
+ the number of diffusion steps used when generating samples with a pre-trained model. If passed, then
78
+ `timesteps` must be `None`.
79
+ device (`str` or `torch.device`, optional):
80
+ the device to which the timesteps are moved to. {2 / 3: 20, 0.0: 10}
81
+ """
82
+ if timesteps is None:
83
+ timesteps = torch.linspace(1.0, 0.0, num_inference_steps + 1, device=device)
84
+ if not isinstance(timesteps, torch.Tensor):
85
+ timesteps = torch.Tensor(timesteps).to(device)
86
+ self.timesteps = timesteps
87
+
88
+ def step(
89
+ self,
90
+ model_output: torch.Tensor,
91
+ timestep: int,
92
+ sample: torch.Tensor,
93
+ generator=None,
94
+ return_dict: bool = True,
95
+ ) -> Union[DDPMSchedulerOutput, Tuple]:
96
+ dtype = model_output.dtype
97
+ device = model_output.device
98
+ t = timestep
99
+
100
+ prev_t = self.previous_timestep(t)
101
+
102
+ alpha_cumprod = self._alpha_cumprod(t, device).view(t.size(0), *[1 for _ in sample.shape[1:]])
103
+ alpha_cumprod_prev = self._alpha_cumprod(prev_t, device).view(prev_t.size(0), *[1 for _ in sample.shape[1:]])
104
+ alpha = alpha_cumprod / alpha_cumprod_prev
105
+
106
+ mu = (1.0 / alpha).sqrt() * (sample - (1 - alpha) * model_output / (1 - alpha_cumprod).sqrt())
107
+
108
+ std_noise = randn_tensor(mu.shape, generator=generator, device=model_output.device, dtype=model_output.dtype)
109
+ std = ((1 - alpha) * (1.0 - alpha_cumprod_prev) / (1.0 - alpha_cumprod)).sqrt() * std_noise
110
+ pred = mu + std * (prev_t != 0).float().view(prev_t.size(0), *[1 for _ in sample.shape[1:]])
111
+
112
+ if not return_dict:
113
+ return (pred.to(dtype),)
114
+
115
+ return DDPMSchedulerOutput(prev_sample=pred.to(dtype))
116
+
117
+ def add_noise(
118
+ self,
119
+ original_samples: torch.Tensor,
120
+ noise: torch.Tensor,
121
+ timesteps: torch.Tensor,
122
+ ) -> torch.Tensor:
123
+ device = original_samples.device
124
+ dtype = original_samples.dtype
125
+ alpha_cumprod = self._alpha_cumprod(timesteps, device=device).view(
126
+ timesteps.size(0), *[1 for _ in original_samples.shape[1:]]
127
+ )
128
+ noisy_samples = alpha_cumprod.sqrt() * original_samples + (1 - alpha_cumprod).sqrt() * noise
129
+ return noisy_samples.to(dtype=dtype)
130
+
131
+ def __len__(self):
132
+ return self.config.num_train_timesteps
133
+
134
+ def previous_timestep(self, timestep):
135
+ index = (self.timesteps - timestep[0]).abs().argmin().item()
136
+ prev_t = self.timesteps[index + 1][None].expand(timestep.shape[0])
137
+ return prev_t
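Below is a minimal, hedged usage sketch for the cosine scheduler added above. The import path follows the package `__init__.py` in this commit; `model`, the latent shape, and the step count are placeholders for illustration only, not part of the repository.

```python
import torch
from diffusion_schedulers import DDPMCosineScheduler


def model(x, t):
    # Placeholder noise predictor; a real denoising network goes here.
    return torch.zeros_like(x)


scheduler = DDPMCosineScheduler(scaler=1.0, s=0.008)
scheduler.set_timesteps(num_inference_steps=20, device="cpu")

sample = torch.randn(1, 4, 32, 32)  # start from pure Gaussian noise
for i in range(len(scheduler.timesteps) - 1):
    # `step` expects a batched float timestep in [0, 1]; it looks up the
    # previous timestep from the schedule set above.
    t = scheduler.timesteps[i].expand(sample.shape[0])
    noise_pred = model(sample, t)
    sample = scheduler.step(noise_pred, t, sample).prev_sample
```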
diffusion_schedulers/scheduling_flow_matching.py ADDED
@@ -0,0 +1,298 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional, Tuple, Union, List
3
+ import math
4
+ import numpy as np
5
+ import torch
6
+
7
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
8
+ from diffusers.utils import BaseOutput, logging
9
+ from diffusers.utils.torch_utils import randn_tensor
10
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
12
+
13
+
14
+ @dataclass
15
+ class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput):
16
+ """
17
+ Output class for the scheduler's `step` function output.
18
+
19
+ Args:
20
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
21
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
22
+ denoising loop.
23
+ """
24
+
25
+ prev_sample: torch.FloatTensor
26
+
27
+
28
+ class PyramidFlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
29
+ """
30
+ Pyramid flow-matching Euler discrete scheduler (a separate Euler schedule is kept for each pyramid stage).
31
+
32
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
33
+ methods the library implements for all schedulers such as loading and saving.
34
+
35
+ Args:
36
+ num_train_timesteps (`int`, defaults to 1000):
37
+ The number of diffusion steps to train the model.
38
+ shift (`float`, defaults to 1.0):
+ The shift value for the timestep schedule, following Stable Diffusion 3.
+ stages (`int`, defaults to 3):
+ The number of pyramid stages.
+ stage_range (`List`, defaults to `[0, 1/3, 2/3, 1]`):
+ The boundaries of the stages, expressed as fractions of the training timesteps.
+ gamma (`float`, defaults to 1/3):
+ The hyperparameter controlling the corrected start sigma of every stage after the first.
43
+ """
44
+
45
+ _compatibles = []
46
+ order = 1
47
+
48
+ @register_to_config
49
+ def __init__(
50
+ self,
51
+ num_train_timesteps: int = 1000,
52
+ shift: float = 1.0,  # Following Stable Diffusion 3
53
+ stages: int = 3,
54
+ stage_range: List = [0, 1/3, 2/3, 1],
55
+ gamma: float = 1/3,
56
+ ):
57
+
58
+ self.timestep_ratios = {} # The timestep ratio for each stage
59
+ self.timesteps_per_stage = {} # The detailed timesteps per stage
60
+ self.sigmas_per_stage = {}
61
+ self.start_sigmas = {}
62
+ self.end_sigmas = {}
63
+ self.ori_start_sigmas = {}
64
+
65
+ # self.init_sigmas()
66
+ self.init_sigmas_for_each_stage()
67
+ self.sigma_min = self.sigmas[-1].item()
68
+ self.sigma_max = self.sigmas[0].item()
69
+ self.gamma = gamma
70
+
71
+ def init_sigmas(self):
72
+ """
73
+ initialize the global timesteps and sigmas
74
+ """
75
+ num_train_timesteps = self.config.num_train_timesteps
76
+ shift = self.config.shift
77
+
78
+ timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy()
79
+ timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32)
80
+
81
+ sigmas = timesteps / num_train_timesteps
82
+ sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
83
+
84
+ self.timesteps = sigmas * num_train_timesteps
85
+
86
+ self._step_index = None
87
+ self._begin_index = None
88
+
89
+ self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication
90
+
91
+ def init_sigmas_for_each_stage(self):
92
+ """
93
+ Init the timesteps for each stage
94
+ """
95
+ self.init_sigmas()
96
+
97
+ stage_distance = []
98
+ stages = self.config.stages
99
+ training_steps = self.config.num_train_timesteps
100
+ stage_range = self.config.stage_range
101
+
102
+ # Init the start and end point of each stage
103
+ for i_s in range(stages):
104
+ # Decide the start and end points of this stage
105
+ start_indice = int(stage_range[i_s] * training_steps)
106
+ start_indice = max(start_indice, 0)
107
+ end_indice = int(stage_range[i_s+1] * training_steps)
108
+ end_indice = min(end_indice, training_steps)
109
+ start_sigma = self.sigmas[start_indice].item()
110
+ end_sigma = self.sigmas[end_indice].item() if end_indice < training_steps else 0.0
111
+ self.ori_start_sigmas[i_s] = start_sigma
112
+
113
+ if i_s != 0:
114
+ ori_sigma = 1 - start_sigma
115
+ gamma = self.config.gamma
116
+ corrected_sigma = (1 / (math.sqrt(1 + (1 / gamma)) * (1 - ori_sigma) + ori_sigma)) * ori_sigma
117
+ # corrected_sigma = 1 / (2 - ori_sigma) * ori_sigma
118
+ start_sigma = 1 - corrected_sigma
119
+
120
+ stage_distance.append(start_sigma - end_sigma)
121
+ self.start_sigmas[i_s] = start_sigma
122
+ self.end_sigmas[i_s] = end_sigma
123
+
124
+ # Determine the ratio of each stage according to flow length
125
+ tot_distance = sum(stage_distance)
126
+ for i_s in range(stages):
127
+ if i_s == 0:
128
+ start_ratio = 0.0
129
+ else:
130
+ start_ratio = sum(stage_distance[:i_s]) / tot_distance
131
+ if i_s == stages - 1:
132
+ end_ratio = 1.0
133
+ else:
134
+ end_ratio = sum(stage_distance[:i_s+1]) / tot_distance
135
+
136
+ self.timestep_ratios[i_s] = (start_ratio, end_ratio)
137
+
138
+ # Determine the timesteps and sigmas for each stage
139
+ for i_s in range(stages):
140
+ timestep_ratio = self.timestep_ratios[i_s]
141
+ timestep_max = self.timesteps[int(timestep_ratio[0] * training_steps)]
142
+ timestep_min = self.timesteps[min(int(timestep_ratio[1] * training_steps), training_steps - 1)]
143
+ timesteps = np.linspace(
144
+ timestep_max, timestep_min, training_steps + 1,
145
+ )
146
+ self.timesteps_per_stage[i_s] = torch.from_numpy(timesteps[:-1])
147
+ stage_sigmas = np.linspace(
148
+ 1, 0, training_steps + 1,
149
+ )
150
+ self.sigmas_per_stage[i_s] = torch.from_numpy(stage_sigmas[:-1])
151
+
152
+ @property
153
+ def step_index(self):
154
+ """
155
+ The index counter for the current timestep. It increases by 1 after each scheduler step.
156
+ """
157
+ return self._step_index
158
+
159
+ @property
160
+ def begin_index(self):
161
+ """
162
+ The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
163
+ """
164
+ return self._begin_index
165
+
166
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
167
+ def set_begin_index(self, begin_index: int = 0):
168
+ """
169
+ Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
170
+
171
+ Args:
172
+ begin_index (`int`):
173
+ The begin index for the scheduler.
174
+ """
175
+ self._begin_index = begin_index
176
+
177
+ def _sigma_to_t(self, sigma):
178
+ return sigma * self.config.num_train_timesteps
179
+
180
+ def set_timesteps(self, num_inference_steps: int, stage_index: int, device: Union[str, torch.device] = None):
181
+ """
182
+ Set the timesteps and sigmas for the given stage.
183
+ """
184
+ self.num_inference_steps = num_inference_steps
185
+ training_steps = self.config.num_train_timesteps
186
+ self.init_sigmas()
187
+
188
+ stage_timesteps = self.timesteps_per_stage[stage_index]
189
+ timestep_max = stage_timesteps[0].item()
190
+ timestep_min = stage_timesteps[-1].item()
191
+
192
+ timesteps = np.linspace(
193
+ timestep_max, timestep_min, num_inference_steps,
194
+ )
195
+ self.timesteps = torch.from_numpy(timesteps).to(device=device)
196
+
197
+ stage_sigmas = self.sigmas_per_stage[stage_index]
198
+ sigma_max = stage_sigmas[0].item()
199
+ sigma_min = stage_sigmas[-1].item()
200
+
201
+ ratios = np.linspace(
202
+ sigma_max, sigma_min, num_inference_steps
203
+ )
204
+ sigmas = torch.from_numpy(ratios).to(device=device)
205
+ self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
206
+
207
+ self._step_index = None
208
+
209
+ def index_for_timestep(self, timestep, schedule_timesteps=None):
210
+ if schedule_timesteps is None:
211
+ schedule_timesteps = self.timesteps
212
+
213
+ indices = (schedule_timesteps == timestep).nonzero()
214
+
215
+ # The sigma index that is taken for the **very** first `step`
216
+ # is always the second index (or the last index if there is only 1)
217
+ # This way we can ensure we don't accidentally skip a sigma in
218
+ # case we start in the middle of the denoising schedule (e.g. for image-to-image)
219
+ pos = 1 if len(indices) > 1 else 0
220
+
221
+ return indices[pos].item()
222
+
223
+ def _init_step_index(self, timestep):
224
+ if self.begin_index is None:
225
+ if isinstance(timestep, torch.Tensor):
226
+ timestep = timestep.to(self.timesteps.device)
227
+ self._step_index = self.index_for_timestep(timestep)
228
+ else:
229
+ self._step_index = self._begin_index
230
+
231
+ def step(
232
+ self,
233
+ model_output: torch.FloatTensor,
234
+ timestep: Union[float, torch.FloatTensor],
235
+ sample: torch.FloatTensor,
236
+ generator: Optional[torch.Generator] = None,
237
+ return_dict: bool = True,
238
+ ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]:
239
+ """
240
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
241
+ process from the learned model outputs (most often the predicted noise).
242
+
243
+ Args:
244
+ model_output (`torch.FloatTensor`):
245
+ The direct output from learned diffusion model.
246
+ timestep (`float`):
247
+ The current discrete timestep in the diffusion chain.
248
+ sample (`torch.FloatTensor`):
249
+ A current instance of a sample created by the diffusion process.
250
+ generator (`torch.Generator`, *optional*):
251
+ A random number generator.
252
+ return_dict (`bool`):
253
+ Whether or not to return a [`FlowMatchEulerDiscreteSchedulerOutput`] or
254
+ tuple.
255
+
256
+ Returns:
257
+ [`FlowMatchEulerDiscreteSchedulerOutput`] or `tuple`:
258
+ If return_dict is `True`, [`FlowMatchEulerDiscreteSchedulerOutput`] is
259
+ returned, otherwise a tuple is returned where the first element is the sample tensor.
260
+ """
261
+
262
+ if (
263
+ isinstance(timestep, int)
264
+ or isinstance(timestep, torch.IntTensor)
265
+ or isinstance(timestep, torch.LongTensor)
266
+ ):
267
+ raise ValueError(
268
+ (
269
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
270
+ " `PyramidFlowMatchEulerDiscreteScheduler.step()` is not supported. Make sure to pass"
271
+ " one of the `scheduler.timesteps` as a timestep."
272
+ ),
273
+ )
274
+
275
+ if self.step_index is None:
276
+ self._step_index = 0
277
+
278
+ # Upcast to avoid precision issues when computing prev_sample
279
+ sample = sample.to(torch.float32)
280
+
281
+ sigma = self.sigmas[self.step_index]
282
+ sigma_next = self.sigmas[self.step_index + 1]
283
+
284
+ prev_sample = sample + (sigma_next - sigma) * model_output
285
+
286
+ # Cast sample back to model compatible dtype
287
+ prev_sample = prev_sample.to(model_output.dtype)
288
+
289
+ # upon completion increase step index by one
290
+ self._step_index += 1
291
+
292
+ if not return_dict:
293
+ return (prev_sample,)
294
+
295
+ return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample)
296
+
297
+ def __len__(self):
298
+ return self.config.num_train_timesteps
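A hedged sketch of driving the pyramid flow-matching scheduler stage by stage. The import path follows the package `__init__.py` above; `model` is a placeholder velocity predictor, the latent shape and step count are illustrative, and the renoising/upsampling the actual video pipeline performs between stages is omitted.

```python
import torch
from diffusion_schedulers import PyramidFlowMatchEulerDiscreteScheduler


def model(x, t):
    # Placeholder velocity predictor; the real MMDiT goes here.
    return torch.zeros_like(x)


scheduler = PyramidFlowMatchEulerDiscreteScheduler(shift=1.0, stages=3, gamma=1 / 3)
sample = torch.randn(1, 16, 8, 32, 32)  # (b, c, t, h, w) latent, illustrative shape

for stage in range(scheduler.config.stages):
    # Each stage gets its own Euler schedule between its start and end sigmas.
    scheduler.set_timesteps(num_inference_steps=10, stage_index=stage, device="cpu")
    for t in scheduler.timesteps:
        velocity = model(sample, t)
        sample = scheduler.step(velocity, t, sample).prev_sample
    # The real pipeline renoises and upsamples `sample` between stages; omitted here.
```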
pre-requirements.txt ADDED
@@ -0,0 +1,2 @@
+ wheel
+ torch
pyramid_dit/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .modeling_pyramid_mmdit import PyramidDiffusionMMDiT
+ from .pyramid_dit_for_video_gen_pipeline import PyramidDiTForVideoGeneration
+ from .modeling_text_encoder import SD3TextEncoderWithMask
pyramid_dit/modeling_embedding.py ADDED
@@ -0,0 +1,390 @@
1
+ from typing import Any, Dict, Optional, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import numpy as np
6
+ import math
7
+
8
+ from diffusers.models.activations import get_activation
9
+ from einops import rearrange
10
+
11
+
12
+ def get_1d_sincos_pos_embed(
13
+ embed_dim, num_frames, cls_token=False, extra_tokens=0,
14
+ ):
15
+ t = np.arange(num_frames, dtype=np.float32)
16
+ pos_embed = get_1d_sincos_pos_embed_from_grid(embed_dim, t) # (T, D)
17
+ if cls_token and extra_tokens > 0:
18
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
19
+ return pos_embed
20
+
21
+
22
+ def get_2d_sincos_pos_embed(
23
+ embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16
24
+ ):
25
+ """
26
+ grid_size: int of the grid height and width
+ return: pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
28
+ """
29
+ if isinstance(grid_size, int):
30
+ grid_size = (grid_size, grid_size)
31
+
32
+ grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size) / interpolation_scale
33
+ grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size) / interpolation_scale
34
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
35
+ grid = np.stack(grid, axis=0)
36
+
37
+ grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
38
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
39
+ if cls_token and extra_tokens > 0:
40
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
41
+ return pos_embed
42
+
43
+
44
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
45
+ if embed_dim % 2 != 0:
46
+ raise ValueError("embed_dim must be divisible by 2")
47
+
48
+ # use half of dimensions to encode grid_h
49
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
50
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
51
+
52
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
53
+ return emb
54
+
55
+
56
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
57
+ """
58
+ embed_dim: output dimension for each position
+ pos: a list of positions to be encoded, of size (M,)
+ out: (M, D)
59
+ """
60
+ if embed_dim % 2 != 0:
61
+ raise ValueError("embed_dim must be divisible by 2")
62
+
63
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
64
+ omega /= embed_dim / 2.0
65
+ omega = 1.0 / 10000**omega # (D/2,)
66
+
67
+ pos = pos.reshape(-1) # (M,)
68
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
69
+
70
+ emb_sin = np.sin(out) # (M, D/2)
71
+ emb_cos = np.cos(out) # (M, D/2)
72
+
73
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
74
+ return emb
75
+
76
+
77
+ def get_timestep_embedding(
78
+ timesteps: torch.Tensor,
79
+ embedding_dim: int,
80
+ flip_sin_to_cos: bool = False,
81
+ downscale_freq_shift: float = 1,
82
+ scale: float = 1,
83
+ max_period: int = 10000,
84
+ ):
85
+ """
86
+ This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
87
+ :param timesteps: a 1-D Tensor of N indices, one per batch element. These may be fractional.
88
+ :param embedding_dim: the dimension of the output.
+ :param max_period: controls the minimum frequency of the embeddings.
+ :return: an [N x dim] Tensor of positional embeddings.
90
+ """
91
+ assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
92
+
93
+ half_dim = embedding_dim // 2
94
+ exponent = -math.log(max_period) * torch.arange(
95
+ start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
96
+ )
97
+ exponent = exponent / (half_dim - downscale_freq_shift)
98
+
99
+ emb = torch.exp(exponent)
100
+ emb = timesteps[:, None].float() * emb[None, :]
101
+
102
+ # scale embeddings
103
+ emb = scale * emb
104
+
105
+ # concat sine and cosine embeddings
106
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
107
+
108
+ # flip sine and cosine embeddings
109
+ if flip_sin_to_cos:
110
+ emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
111
+
112
+ # zero pad
113
+ if embedding_dim % 2 == 1:
114
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
115
+ return emb
116
+
117
+
118
+ class Timesteps(nn.Module):
119
+ def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float):
120
+ super().__init__()
121
+ self.num_channels = num_channels
122
+ self.flip_sin_to_cos = flip_sin_to_cos
123
+ self.downscale_freq_shift = downscale_freq_shift
124
+
125
+ def forward(self, timesteps):
126
+ t_emb = get_timestep_embedding(
127
+ timesteps,
128
+ self.num_channels,
129
+ flip_sin_to_cos=self.flip_sin_to_cos,
130
+ downscale_freq_shift=self.downscale_freq_shift,
131
+ )
132
+ return t_emb
133
+
134
+
135
+ class TimestepEmbedding(nn.Module):
136
+ def __init__(
137
+ self,
138
+ in_channels: int,
139
+ time_embed_dim: int,
140
+ act_fn: str = "silu",
141
+ out_dim: int = None,
142
+ post_act_fn: Optional[str] = None,
143
+ sample_proj_bias=True,
144
+ ):
145
+ super().__init__()
146
+ self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)
147
+ self.act = get_activation(act_fn)
148
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim, sample_proj_bias)
149
+
150
+ def forward(self, sample):
151
+ sample = self.linear_1(sample)
152
+ sample = self.act(sample)
153
+ sample = self.linear_2(sample)
154
+ return sample
155
+
156
+
157
+ class TextProjection(nn.Module):
158
+ def __init__(self, in_features, hidden_size, act_fn="silu"):
159
+ super().__init__()
160
+ self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True)
161
+ self.act_1 = get_activation(act_fn)
162
+ self.linear_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True)
163
+
164
+ def forward(self, caption):
165
+ hidden_states = self.linear_1(caption)
166
+ hidden_states = self.act_1(hidden_states)
167
+ hidden_states = self.linear_2(hidden_states)
168
+ return hidden_states
169
+
170
+
171
+ class CombinedTimestepConditionEmbeddings(nn.Module):
172
+ def __init__(self, embedding_dim, pooled_projection_dim):
173
+ super().__init__()
174
+
175
+ self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
176
+ self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
177
+ self.text_embedder = TextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
178
+
179
+ def forward(self, timestep, pooled_projection):
180
+ timesteps_proj = self.time_proj(timestep)
181
+ timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype)) # (N, D)
182
+ pooled_projections = self.text_embedder(pooled_projection)
183
+ conditioning = timesteps_emb + pooled_projections
184
+ return conditioning
185
+
186
+
187
+ class CombinedTimestepEmbeddings(nn.Module):
188
+ def __init__(self, embedding_dim):
189
+ super().__init__()
190
+ self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
191
+ self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
192
+
193
+ def forward(self, timestep):
194
+ timesteps_proj = self.time_proj(timestep)
195
+ timesteps_emb = self.timestep_embedder(timesteps_proj) # (N, D)
196
+ return timesteps_emb
197
+
198
+
199
+ class PatchEmbed3D(nn.Module):
200
+ """Support the 3D Tensor input"""
201
+
202
+ def __init__(
203
+ self,
204
+ height=128,
205
+ width=128,
206
+ patch_size=2,
207
+ in_channels=16,
208
+ embed_dim=1536,
209
+ layer_norm=False,
210
+ bias=True,
211
+ interpolation_scale=1,
212
+ pos_embed_type="sincos",
213
+ temp_pos_embed_type='rope',
214
+ pos_embed_max_size=192, # For SD3 cropping
215
+ max_num_frames=64,
216
+ add_temp_pos_embed=False,
217
+ interp_condition_pos=False,
218
+ ):
219
+ super().__init__()
220
+
221
+ num_patches = (height // patch_size) * (width // patch_size)
222
+ self.layer_norm = layer_norm
223
+ self.pos_embed_max_size = pos_embed_max_size
224
+
225
+ self.proj = nn.Conv2d(
226
+ in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
227
+ )
228
+ if layer_norm:
229
+ self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
230
+ else:
231
+ self.norm = None
232
+
233
+ self.patch_size = patch_size
234
+ self.height, self.width = height // patch_size, width // patch_size
235
+ self.base_size = height // patch_size
236
+ self.interpolation_scale = interpolation_scale
237
+ self.add_temp_pos_embed = add_temp_pos_embed
238
+
239
+ # Calculate positional embeddings based on max size or default
240
+ if pos_embed_max_size:
241
+ grid_size = pos_embed_max_size
242
+ else:
243
+ grid_size = int(num_patches**0.5)
244
+
245
+ if pos_embed_type is None:
246
+ self.pos_embed = None
247
+
248
+ elif pos_embed_type == "sincos":
249
+ pos_embed = get_2d_sincos_pos_embed(
250
+ embed_dim, grid_size, base_size=self.base_size, interpolation_scale=self.interpolation_scale
251
+ )
252
+ persistent = True if pos_embed_max_size else False
253
+ self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=persistent)
254
+
255
+ if add_temp_pos_embed and temp_pos_embed_type == 'sincos':
256
+ time_pos_embed = get_1d_sincos_pos_embed(embed_dim, max_num_frames)
257
+ self.register_buffer("temp_pos_embed", torch.from_numpy(time_pos_embed).float().unsqueeze(0), persistent=True)
258
+
259
+ elif pos_embed_type == "rope":
260
+ print("Using the rotary position embedding")
261
+
262
+ else:
263
+ raise ValueError(f"Unsupported pos_embed_type: {pos_embed_type}")
264
+
265
+ self.pos_embed_type = pos_embed_type
266
+ self.temp_pos_embed_type = temp_pos_embed_type
267
+ self.interp_condition_pos = interp_condition_pos
268
+
269
+ def cropped_pos_embed(self, height, width, ori_height, ori_width):
270
+ """Crops positional embeddings for SD3 compatibility."""
271
+ if self.pos_embed_max_size is None:
272
+ raise ValueError("`pos_embed_max_size` must be set for cropping.")
273
+
274
+ height = height // self.patch_size
275
+ width = width // self.patch_size
276
+ ori_height = ori_height // self.patch_size
277
+ ori_width = ori_width // self.patch_size
278
+
279
+ assert ori_height >= height, "ori_height must be >= height"
+ assert ori_width >= width, "ori_width must be >= width"
281
+
282
+ if height > self.pos_embed_max_size:
283
+ raise ValueError(
284
+ f"Height ({height}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
285
+ )
286
+ if width > self.pos_embed_max_size:
287
+ raise ValueError(
288
+ f"Width ({width}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
289
+ )
290
+
291
+ if self.interp_condition_pos:
292
+ top = (self.pos_embed_max_size - ori_height) // 2
293
+ left = (self.pos_embed_max_size - ori_width) // 2
294
+ spatial_pos_embed = self.pos_embed.reshape(1, self.pos_embed_max_size, self.pos_embed_max_size, -1)
295
+ spatial_pos_embed = spatial_pos_embed[:, top : top + ori_height, left : left + ori_width, :] # [b h w c]
296
+ if ori_height != height or ori_width != width:
297
+ spatial_pos_embed = spatial_pos_embed.permute(0, 3, 1, 2)
298
+ spatial_pos_embed = torch.nn.functional.interpolate(spatial_pos_embed, size=(height, width), mode='bilinear')
299
+ spatial_pos_embed = spatial_pos_embed.permute(0, 2, 3, 1)
300
+ else:
301
+ top = (self.pos_embed_max_size - height) // 2
302
+ left = (self.pos_embed_max_size - width) // 2
303
+ spatial_pos_embed = self.pos_embed.reshape(1, self.pos_embed_max_size, self.pos_embed_max_size, -1)
304
+ spatial_pos_embed = spatial_pos_embed[:, top : top + height, left : left + width, :]
305
+
306
+ spatial_pos_embed = spatial_pos_embed.reshape(1, -1, spatial_pos_embed.shape[-1])
307
+
308
+ return spatial_pos_embed
309
+
310
+ def forward_func(self, latent, time_index=0, ori_height=None, ori_width=None):
311
+ if self.pos_embed_max_size is not None:
312
+ height, width = latent.shape[-2:]
313
+ else:
314
+ height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
315
+
316
+ bs = latent.shape[0]
317
+ temp = latent.shape[2]
318
+
319
+ latent = rearrange(latent, 'b c t h w -> (b t) c h w')
320
+ latent = self.proj(latent)
321
+ latent = latent.flatten(2).transpose(1, 2) # (BT)CHW -> (BT)NC
322
+
323
+ if self.layer_norm:
324
+ latent = self.norm(latent)
325
+
326
+ if self.pos_embed_type == 'sincos':
327
+ # Spatial position embedding, Interpolate or crop positional embeddings as needed
328
+ if self.pos_embed_max_size:
329
+ pos_embed = self.cropped_pos_embed(height, width, ori_height, ori_width)
330
+ else:
331
+ raise NotImplementedError("Not implemented sincos pos embed without sd3 max pos crop")
332
+ if self.height != height or self.width != width:
333
+ pos_embed = get_2d_sincos_pos_embed(
334
+ embed_dim=self.pos_embed.shape[-1],
335
+ grid_size=(height, width),
336
+ base_size=self.base_size,
337
+ interpolation_scale=self.interpolation_scale,
338
+ )
339
+ pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).to(latent.device)
340
+ else:
341
+ pos_embed = self.pos_embed
342
+
343
+ if self.add_temp_pos_embed and self.temp_pos_embed_type == 'sincos':
344
+ latent_dtype = latent.dtype
345
+ latent = latent + pos_embed
346
+ latent = rearrange(latent, '(b t) n c -> (b n) t c', t=temp)
347
+ latent = latent + self.temp_pos_embed[:, time_index:time_index + temp, :]
348
+ latent = latent.to(latent_dtype)
349
+ latent = rearrange(latent, '(b n) t c -> b t n c', b=bs)
350
+ else:
351
+ latent = (latent + pos_embed).to(latent.dtype)
352
+ latent = rearrange(latent, '(b t) n c -> b t n c', b=bs, t=temp)
353
+
354
+ else:
355
+ assert self.pos_embed_type == "rope", "Only supporting the sincos and rope embedding"
356
+ latent = rearrange(latent, '(b t) n c -> b t n c', b=bs, t=temp)
357
+
358
+ return latent
359
+
360
+ def forward(self, latent):
361
+ """
362
+ Arguments:
363
+ latent (`torch.FloatTensor` or `List`): a single video latent of shape (b, c, t, h, w), or a nested list of
+ past-condition latents; each entry is patchified and flattened into a 1-D token sequence
365
+ """
366
+
367
+ if isinstance(latent, list):
368
+ output_list = []
369
+
370
+ for latent_ in latent:
371
+ if not isinstance(latent_, list):
372
+ latent_ = [latent_]
373
+
374
+ output_latent = []
375
+ time_index = 0
376
+ ori_height, ori_width = latent_[-1].shape[-2:]
377
+ for each_latent in latent_:
378
+ hidden_state = self.forward_func(each_latent, time_index=time_index, ori_height=ori_height, ori_width=ori_width)
379
+ time_index += each_latent.shape[2]
380
+ hidden_state = rearrange(hidden_state, "b t n c -> b (t n) c")
381
+ output_latent.append(hidden_state)
382
+
383
+ output_latent = torch.cat(output_latent, dim=1)
384
+ output_list.append(output_latent)
385
+
386
+ return output_list
387
+ else:
388
+ hidden_states = self.forward_func(latent)
389
+ hidden_states = rearrange(hidden_states, "b t n c -> b (t n) c")
390
+ return hidden_states
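A small shape-check sketch for the sinusoidal helpers defined above. This is a hedged example rather than part of the commit; it only assumes the repository root is on `PYTHONPATH` so the file imports as `pyramid_dit.modeling_embedding`, and the grid size and dimensions are illustrative.

```python
import torch
from pyramid_dit.modeling_embedding import (
    get_2d_sincos_pos_embed,
    get_timestep_embedding,
)

# 2-D table for a 16x16 patch grid with 1536-dim tokens: one row per patch,
# half of the channels encoding one spatial axis and half the other.
pos = get_2d_sincos_pos_embed(embed_dim=1536, grid_size=16)
assert pos.shape == (16 * 16, 1536)

# DDPM-style timestep embedding for a batch of four (possibly fractional) timesteps.
t_emb = get_timestep_embedding(torch.tensor([0.0, 0.25, 0.5, 1.0]), embedding_dim=256)
assert t_emb.shape == (4, 256)
```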
pyramid_dit/modeling_mmdit_block.py ADDED
@@ -0,0 +1,672 @@
1
+ from typing import Dict, Optional, Tuple, List
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from einops import rearrange
6
+ from diffusers.models.activations import GEGLU, GELU, ApproximateGELU
+ from diffusers.utils import deprecate
7
+
8
+ try:
9
+ from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
10
+ from flash_attn.bert_padding import pad_input, unpad_input, index_first_axis
11
+ from flash_attn.flash_attn_interface import flash_attn_varlen_func
12
+ except ImportError:
13
+ flash_attn_func = None
14
+ flash_attn_qkvpacked_func = None
15
+ flash_attn_varlen_func = None
16
+ print("Please install flash attention")
17
+
18
+ from trainer_misc import (
19
+ is_sequence_parallel_initialized,
20
+ get_sequence_parallel_group,
21
+ get_sequence_parallel_world_size,
22
+ all_to_all,
23
+ )
24
+
25
+ from .modeling_normalization import AdaLayerNormZero, AdaLayerNormContinuous, RMSNorm
26
+
27
+
28
+ class FeedForward(nn.Module):
29
+ r"""
30
+ A feed-forward layer.
31
+
32
+ Parameters:
33
+ dim (`int`): The number of channels in the input.
34
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
35
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
36
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
37
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
38
+ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
39
+ bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
40
+ """
41
+ def __init__(
42
+ self,
43
+ dim: int,
44
+ dim_out: Optional[int] = None,
45
+ mult: int = 4,
46
+ dropout: float = 0.0,
47
+ activation_fn: str = "geglu",
48
+ final_dropout: bool = False,
49
+ inner_dim=None,
50
+ bias: bool = True,
51
+ ):
52
+ super().__init__()
53
+ if inner_dim is None:
54
+ inner_dim = int(dim * mult)
55
+ dim_out = dim_out if dim_out is not None else dim
56
+
57
+ if activation_fn == "gelu":
58
+ act_fn = GELU(dim, inner_dim, bias=bias)
59
+ if activation_fn == "gelu-approximate":
60
+ act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
61
+ elif activation_fn == "geglu":
62
+ act_fn = GEGLU(dim, inner_dim, bias=bias)
63
+ elif activation_fn == "geglu-approximate":
64
+ act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
65
+
66
+ self.net = nn.ModuleList([])
67
+ # project in
68
+ self.net.append(act_fn)
69
+ # project dropout
70
+ self.net.append(nn.Dropout(dropout))
71
+ # project out
72
+ self.net.append(nn.Linear(inner_dim, dim_out, bias=bias))
73
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
74
+ if final_dropout:
75
+ self.net.append(nn.Dropout(dropout))
76
+
77
+ def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
78
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
79
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
80
+ deprecate("scale", "1.0.0", deprecation_message)
81
+ for module in self.net:
82
+ hidden_states = module(hidden_states)
83
+ return hidden_states
84
+
85
+
86
+ class VarlenFlashSelfAttentionWithT5Mask:
87
+
88
+ def __init__(self):
89
+ pass
90
+
91
+ def apply_rope(self, xq, xk, freqs_cis):
92
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
93
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
94
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
95
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
96
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
97
+
98
+ def __call__(
99
+ self, query, key, value, encoder_query, encoder_key, encoder_value,
100
+ heads, scale, hidden_length=None, image_rotary_emb=None, encoder_attention_mask=None,
101
+ ):
102
+ assert encoder_attention_mask is not None, "The encoder attention mask needs to be set"
103
+
104
+ batch_size = query.shape[0]
105
+ output_hidden = torch.zeros_like(query)
106
+ output_encoder_hidden = torch.zeros_like(encoder_query)
107
+ encoder_length = encoder_query.shape[1]
108
+
109
+ qkv_list = []
110
+ num_stages = len(hidden_length)
111
+
112
+ encoder_qkv = torch.stack([encoder_query, encoder_key, encoder_value], dim=2) # [bs, sub_seq, 3, head, head_dim]
113
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
114
+
115
+ i_sum = 0
116
+ for i_p, length in enumerate(hidden_length):
117
+ encoder_qkv_tokens = encoder_qkv[i_p::num_stages]
118
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
119
+ concat_qkv_tokens = torch.cat([encoder_qkv_tokens, qkv_tokens], dim=1) # [bs, tot_seq, 3, nhead, dim]
120
+
121
+ if image_rotary_emb is not None:
122
+ concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1] = self.apply_rope(concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1], image_rotary_emb[i_p])
123
+
124
+ indices = encoder_attention_mask[i_p]['indices']
125
+ qkv_list.append(index_first_axis(rearrange(concat_qkv_tokens, "b s ... -> (b s) ..."), indices))
126
+ i_sum += length
127
+
128
+ token_lengths = [x_.shape[0] for x_ in qkv_list]
129
+ qkv = torch.cat(qkv_list, dim=0)
130
+ query, key, value = qkv.unbind(1)
131
+
132
+ cu_seqlens = torch.cat([x_['seqlens_in_batch'] for x_ in encoder_attention_mask], dim=0)
133
+ max_seqlen_q = cu_seqlens.max().item()
134
+ max_seqlen_k = max_seqlen_q
135
+ cu_seqlens_q = F.pad(torch.cumsum(cu_seqlens, dim=0, dtype=torch.int32), (1, 0))
136
+ cu_seqlens_k = cu_seqlens_q.clone()
137
+
138
+ output = flash_attn_varlen_func(
139
+ query,
140
+ key,
141
+ value,
142
+ cu_seqlens_q=cu_seqlens_q,
143
+ cu_seqlens_k=cu_seqlens_k,
144
+ max_seqlen_q=max_seqlen_q,
145
+ max_seqlen_k=max_seqlen_k,
146
+ dropout_p=0.0,
147
+ causal=False,
148
+ softmax_scale=scale,
149
+ )
150
+
151
+ # To merge the tokens
152
+ i_sum, token_sum = 0, 0
153
+ for i_p, length in enumerate(hidden_length):
154
+ tot_token_num = token_lengths[i_p]
155
+ stage_output = output[token_sum : token_sum + tot_token_num]
156
+ stage_output = pad_input(stage_output, encoder_attention_mask[i_p]['indices'], batch_size, encoder_length + length)
157
+ stage_encoder_hidden_output = stage_output[:, :encoder_length]
158
+ stage_hidden_output = stage_output[:, encoder_length:]
159
+ output_hidden[:, i_sum:i_sum+length] = stage_hidden_output
160
+ output_encoder_hidden[i_p::num_stages] = stage_encoder_hidden_output
161
+ token_sum += tot_token_num
162
+ i_sum += length
163
+
164
+ output_hidden = output_hidden.flatten(2, 3)
165
+ output_encoder_hidden = output_encoder_hidden.flatten(2, 3)
166
+
167
+ return output_hidden, output_encoder_hidden
168
+
169
+
170
+ class SequenceParallelVarlenFlashSelfAttentionWithT5Mask:
171
+
172
+ def __init__(self):
173
+ pass
174
+
175
+ def apply_rope(self, xq, xk, freqs_cis):
176
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
177
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
178
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
179
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
180
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
181
+
182
+ def __call__(
183
+ self, query, key, value, encoder_query, encoder_key, encoder_value,
184
+ heads, scale, hidden_length=None, image_rotary_emb=None, encoder_attention_mask=None,
185
+ ):
186
+ assert encoder_attention_mask is not None, "The encoder attention mask needs to be set"
187
+
188
+ batch_size = query.shape[0]
189
+ qkv_list = []
190
+ num_stages = len(hidden_length)
191
+
192
+ encoder_qkv = torch.stack([encoder_query, encoder_key, encoder_value], dim=2) # [bs, sub_seq, 3, head, head_dim]
193
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
194
+
195
+ # To sync the encoder query, key and values
196
+ sp_group = get_sequence_parallel_group()
197
+ sp_group_size = get_sequence_parallel_world_size()
198
+ encoder_qkv = all_to_all(encoder_qkv, sp_group, sp_group_size, scatter_dim=3, gather_dim=1) # [bs, seq, 3, sub_head, head_dim]
199
+
200
+ output_hidden = torch.zeros_like(qkv[:,:,0])
201
+ output_encoder_hidden = torch.zeros_like(encoder_qkv[:,:,0])
202
+ encoder_length = encoder_qkv.shape[1]
203
+
204
+ i_sum = 0
205
+ for i_p, length in enumerate(hidden_length):
206
+ # get the query, key, value from padding sequence
207
+ encoder_qkv_tokens = encoder_qkv[i_p::num_stages]
208
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
209
+ qkv_tokens = all_to_all(qkv_tokens, sp_group, sp_group_size, scatter_dim=3, gather_dim=1) # [bs, seq, 3, sub_head, head_dim]
210
+ concat_qkv_tokens = torch.cat([encoder_qkv_tokens, qkv_tokens], dim=1) # [bs, pad_seq, 3, nhead, dim]
211
+
212
+ if image_rotary_emb is not None:
213
+ concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1] = self.apply_rope(concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1], image_rotary_emb[i_p])
214
+
215
+ indices = encoder_attention_mask[i_p]['indices']
216
+ qkv_list.append(index_first_axis(rearrange(concat_qkv_tokens, "b s ... -> (b s) ..."), indices))
217
+ i_sum += length
218
+
219
+ token_lengths = [x_.shape[0] for x_ in qkv_list]
220
+ qkv = torch.cat(qkv_list, dim=0)
221
+ query, key, value = qkv.unbind(1)
222
+
223
+ cu_seqlens = torch.cat([x_['seqlens_in_batch'] for x_ in encoder_attention_mask], dim=0)
224
+ max_seqlen_q = cu_seqlens.max().item()
225
+ max_seqlen_k = max_seqlen_q
226
+ cu_seqlens_q = F.pad(torch.cumsum(cu_seqlens, dim=0, dtype=torch.int32), (1, 0))
227
+ cu_seqlens_k = cu_seqlens_q.clone()
228
+
229
+ output = flash_attn_varlen_func(
230
+ query,
231
+ key,
232
+ value,
233
+ cu_seqlens_q=cu_seqlens_q,
234
+ cu_seqlens_k=cu_seqlens_k,
235
+ max_seqlen_q=max_seqlen_q,
236
+ max_seqlen_k=max_seqlen_k,
237
+ dropout_p=0.0,
238
+ causal=False,
239
+ softmax_scale=scale,
240
+ )
241
+
242
+ # To merge the tokens
243
+ i_sum, token_sum = 0, 0
244
+ for i_p, length in enumerate(hidden_length):
245
+ tot_token_num = token_lengths[i_p]
246
+ stage_output = output[token_sum : token_sum + tot_token_num]
247
+ stage_output = pad_input(stage_output, encoder_attention_mask[i_p]['indices'], batch_size, encoder_length + length * sp_group_size)
248
+ stage_encoder_hidden_output = stage_output[:, :encoder_length]
249
+ stage_hidden_output = stage_output[:, encoder_length:]
250
+ stage_hidden_output = all_to_all(stage_hidden_output, sp_group, sp_group_size, scatter_dim=1, gather_dim=2)
251
+ output_hidden[:, i_sum:i_sum+length] = stage_hidden_output
252
+ output_encoder_hidden[i_p::num_stages] = stage_encoder_hidden_output
253
+ token_sum += tot_token_num
254
+ i_sum += length
255
+
256
+ output_encoder_hidden = all_to_all(output_encoder_hidden, sp_group, sp_group_size, scatter_dim=1, gather_dim=2)
257
+ output_hidden = output_hidden.flatten(2, 3)
258
+ output_encoder_hidden = output_encoder_hidden.flatten(2, 3)
259
+
260
+ return output_hidden, output_encoder_hidden
261
+
262
+
263
+ class VarlenSelfAttentionWithT5Mask:
264
+
265
+ """
266
+ For chunk stage attention without using flash attention
267
+ """
268
+
269
+ def __init__(self):
270
+ pass
271
+
272
+ def apply_rope(self, xq, xk, freqs_cis):
273
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
274
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
275
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
276
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
277
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
278
+
279
+ def __call__(
280
+ self, query, key, value, encoder_query, encoder_key, encoder_value,
281
+ heads, scale, hidden_length=None, image_rotary_emb=None, attention_mask=None,
282
+ ):
283
+ assert attention_mask is not None, "The attention mask needs to be set"
284
+
285
+ encoder_length = encoder_query.shape[1]
286
+ num_stages = len(hidden_length)
287
+
288
+ encoder_qkv = torch.stack([encoder_query, encoder_key, encoder_value], dim=2) # [bs, sub_seq, 3, head, head_dim]
289
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
290
+
291
+ i_sum = 0
292
+ output_encoder_hidden_list = []
293
+ output_hidden_list = []
294
+
295
+ for i_p, length in enumerate(hidden_length):
296
+ encoder_qkv_tokens = encoder_qkv[i_p::num_stages]
297
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
298
+ concat_qkv_tokens = torch.cat([encoder_qkv_tokens, qkv_tokens], dim=1) # [bs, tot_seq, 3, nhead, dim]
299
+
300
+ if image_rotary_emb is not None:
301
+ concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1] = self.apply_rope(concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1], image_rotary_emb[i_p])
302
+
303
+ query, key, value = concat_qkv_tokens.unbind(2) # [bs, tot_seq, nhead, dim]
304
+ query = query.transpose(1, 2)
305
+ key = key.transpose(1, 2)
306
+ value = value.transpose(1, 2)
307
+
308
+ # with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=False, enable_mem_efficient=True):
309
+ stage_hidden_states = F.scaled_dot_product_attention(
310
+ query, key, value, dropout_p=0.0, is_causal=False, attn_mask=attention_mask[i_p],
311
+ )
312
+ stage_hidden_states = stage_hidden_states.transpose(1, 2).flatten(2, 3) # [bs, tot_seq, dim]
313
+
314
+ output_encoder_hidden_list.append(stage_hidden_states[:, :encoder_length])
315
+ output_hidden_list.append(stage_hidden_states[:, encoder_length:])
316
+ i_sum += length
317
+
318
+ output_encoder_hidden = torch.stack(output_encoder_hidden_list, dim=1) # [b n s d]
319
+ output_encoder_hidden = rearrange(output_encoder_hidden, 'b n s d -> (b n) s d')
320
+ output_hidden = torch.cat(output_hidden_list, dim=1)
321
+
322
+ return output_hidden, output_encoder_hidden
323
+
324
+
325
+ class SequenceParallelVarlenSelfAttentionWithT5Mask:
326
+ """
327
+ For chunk stage attention without using flash attention
328
+ """
329
+
330
+ def __init__(self):
331
+ pass
332
+
333
+ def apply_rope(self, xq, xk, freqs_cis):
334
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
335
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
336
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
337
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
338
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
339
+
340
+ def __call__(
341
+ self, query, key, value, encoder_query, encoder_key, encoder_value,
342
+ heads, scale, hidden_length=None, image_rotary_emb=None, attention_mask=None,
343
+ ):
344
+ assert attention_mask is not None, "The attention mask needs to be set"
345
+
346
+ num_stages = len(hidden_length)
347
+
348
+ encoder_qkv = torch.stack([encoder_query, encoder_key, encoder_value], dim=2) # [bs, sub_seq, 3, head, head_dim]
349
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
350
+
351
+ # To sync the encoder query, key and values
352
+ sp_group = get_sequence_parallel_group()
353
+ sp_group_size = get_sequence_parallel_world_size()
354
+ encoder_qkv = all_to_all(encoder_qkv, sp_group, sp_group_size, scatter_dim=3, gather_dim=1) # [bs, seq, 3, sub_head, head_dim]
355
+ encoder_length = encoder_qkv.shape[1]
356
+
357
+ i_sum = 0
358
+ output_encoder_hidden_list = []
359
+ output_hidden_list = []
360
+
361
+ for i_p, length in enumerate(hidden_length):
362
+ encoder_qkv_tokens = encoder_qkv[i_p::num_stages]
363
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
364
+ qkv_tokens = all_to_all(qkv_tokens, sp_group, sp_group_size, scatter_dim=3, gather_dim=1) # [bs, seq, 3, sub_head, head_dim]
365
+ concat_qkv_tokens = torch.cat([encoder_qkv_tokens, qkv_tokens], dim=1) # [bs, tot_seq, 3, nhead, dim]
366
+
367
+ if image_rotary_emb is not None:
368
+ concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1] = self.apply_rope(concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1], image_rotary_emb[i_p])
369
+
370
+ query, key, value = concat_qkv_tokens.unbind(2) # [bs, tot_seq, nhead, dim]
371
+ query = query.transpose(1, 2)
372
+ key = key.transpose(1, 2)
373
+ value = value.transpose(1, 2)
374
+
375
+ stage_hidden_states = F.scaled_dot_product_attention(
376
+ query, key, value, dropout_p=0.0, is_causal=False, attn_mask=attention_mask[i_p],
377
+ )
378
+ stage_hidden_states = stage_hidden_states.transpose(1, 2) # [bs, tot_seq, nhead, dim]
379
+
380
+ output_encoder_hidden_list.append(stage_hidden_states[:, :encoder_length])
381
+
382
+ output_hidden = stage_hidden_states[:, encoder_length:]
383
+ output_hidden = all_to_all(output_hidden, sp_group, sp_group_size, scatter_dim=1, gather_dim=2)
384
+ output_hidden_list.append(output_hidden)
385
+
386
+ i_sum += length
387
+
388
+ output_encoder_hidden = torch.stack(output_encoder_hidden_list, dim=1) # [b n s nhead d]
389
+ output_encoder_hidden = rearrange(output_encoder_hidden, 'b n s h d -> (b n) s h d')
390
+ output_encoder_hidden = all_to_all(output_encoder_hidden, sp_group, sp_group_size, scatter_dim=1, gather_dim=2)
391
+ output_encoder_hidden = output_encoder_hidden.flatten(2, 3)
392
+ output_hidden = torch.cat(output_hidden_list, dim=1).flatten(2, 3)
393
+
394
+ return output_hidden, output_encoder_hidden
395
+
396
+
397
+ class JointAttention(nn.Module):
398
+
399
+ def __init__(
400
+ self,
401
+ query_dim: int,
402
+ cross_attention_dim: Optional[int] = None,
403
+ heads: int = 8,
404
+ dim_head: int = 64,
405
+ dropout: float = 0.0,
406
+ bias: bool = False,
407
+ qk_norm: Optional[str] = None,
408
+ added_kv_proj_dim: Optional[int] = None,
409
+ out_bias: bool = True,
410
+ eps: float = 1e-5,
411
+ out_dim: int = None,
412
+ context_pre_only=None,
413
+ use_flash_attn=True,
414
+ ):
415
+ """
416
+ Fixing the QKNorm, following the flux, norm the head dimension
417
+ """
418
+ super().__init__()
419
+ self.inner_dim = out_dim if out_dim is not None else dim_head * heads
420
+ self.query_dim = query_dim
421
+ self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
422
+ self.use_bias = bias
423
+ self.dropout = dropout
424
+
425
+ self.out_dim = out_dim if out_dim is not None else query_dim
426
+ self.context_pre_only = context_pre_only
427
+
428
+ self.scale = dim_head**-0.5
429
+ self.heads = out_dim // dim_head if out_dim is not None else heads
430
+ self.added_kv_proj_dim = added_kv_proj_dim
431
+
432
+ if qk_norm is None:
433
+ self.norm_q = None
434
+ self.norm_k = None
435
+ elif qk_norm == "layer_norm":
436
+ self.norm_q = nn.LayerNorm(dim_head, eps=eps)
437
+ self.norm_k = nn.LayerNorm(dim_head, eps=eps)
438
+ elif qk_norm == 'rms_norm':
439
+ self.norm_q = RMSNorm(dim_head, eps=eps)
440
+ self.norm_k = RMSNorm(dim_head, eps=eps)
441
+ else:
442
+ raise ValueError(f"unknown qk_norm: {qk_norm}. Should be None or 'layer_norm'")
443
+
444
+ self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)
445
+ self.to_k = nn.Linear(self.cross_attention_dim, self.inner_dim, bias=bias)
446
+ self.to_v = nn.Linear(self.cross_attention_dim, self.inner_dim, bias=bias)
447
+
448
+ if self.added_kv_proj_dim is not None:
449
+ self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_dim)
450
+ self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_dim)
451
+ self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim)
452
+
453
+ if qk_norm is None:
454
+ self.norm_add_q = None
455
+ self.norm_add_k = None
456
+ elif qk_norm == "layer_norm":
457
+ self.norm_add_q = nn.LayerNorm(dim_head, eps=eps)
458
+ self.norm_add_k = nn.LayerNorm(dim_head, eps=eps)
459
+ elif qk_norm == 'rms_norm':
460
+ self.norm_add_q = RMSNorm(dim_head, eps=eps)
461
+ self.norm_add_k = RMSNorm(dim_head, eps=eps)
462
+ else:
463
+ raise ValueError(f"unknown qk_norm: {qk_norm}. Should be None or 'layer_norm'")
464
+
465
+ self.to_out = nn.ModuleList([])
466
+ self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
467
+ self.to_out.append(nn.Dropout(dropout))
468
+
469
+ if not self.context_pre_only:
470
+ self.to_add_out = nn.Linear(self.inner_dim, self.out_dim, bias=out_bias)
471
+
472
+ self.use_flash_attn = use_flash_attn
473
+
474
+ if flash_attn_func is None:
475
+ self.use_flash_attn = False
476
+
477
+ # print(f"Using flash-attention: {self.use_flash_attn}")
478
+ if self.use_flash_attn:
479
+ if is_sequence_parallel_initialized():
480
+ self.var_flash_attn = SequenceParallelVarlenFlashSelfAttentionWithT5Mask()
481
+ else:
482
+ self.var_flash_attn = VarlenFlashSelfAttentionWithT5Mask()
483
+ else:
484
+ if is_sequence_parallel_initialized():
485
+ self.var_len_attn = SequenceParallelVarlenSelfAttentionWithT5Mask()
486
+ else:
487
+ self.var_len_attn = VarlenSelfAttentionWithT5Mask()
488
+
489
+
490
+ def forward(
491
+ self,
492
+ hidden_states: torch.FloatTensor,
493
+ encoder_hidden_states: torch.FloatTensor = None,
494
+ encoder_attention_mask: torch.FloatTensor = None,
495
+ attention_mask: torch.FloatTensor = None, # [B, L, S]
496
+ hidden_length: torch.Tensor = None,
497
+ image_rotary_emb: torch.Tensor = None,
498
+ **kwargs,
499
+ ) -> torch.FloatTensor:
500
+ # This function is only used during training
501
+ # `sample` projections.
502
+ query = self.to_q(hidden_states)
503
+ key = self.to_k(hidden_states)
504
+ value = self.to_v(hidden_states)
505
+
506
+ inner_dim = key.shape[-1]
507
+ head_dim = inner_dim // self.heads
508
+
509
+ query = query.view(query.shape[0], -1, self.heads, head_dim)
510
+ key = key.view(key.shape[0], -1, self.heads, head_dim)
511
+ value = value.view(value.shape[0], -1, self.heads, head_dim)
512
+
513
+ if self.norm_q is not None:
514
+ query = self.norm_q(query)
515
+
516
+ if self.norm_k is not None:
517
+ key = self.norm_k(key)
518
+
519
+ # `context` projections.
520
+ encoder_hidden_states_query_proj = self.add_q_proj(encoder_hidden_states)
521
+ encoder_hidden_states_key_proj = self.add_k_proj(encoder_hidden_states)
522
+ encoder_hidden_states_value_proj = self.add_v_proj(encoder_hidden_states)
523
+
524
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
525
+ encoder_hidden_states_query_proj.shape[0], -1, self.heads, head_dim
526
+ )
527
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
528
+ encoder_hidden_states_key_proj.shape[0], -1, self.heads, head_dim
529
+ )
530
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
531
+ encoder_hidden_states_value_proj.shape[0], -1, self.heads, head_dim
532
+ )
533
+
534
+ if self.norm_add_q is not None:
535
+ encoder_hidden_states_query_proj = self.norm_add_q(encoder_hidden_states_query_proj)
536
+
537
+ if self.norm_add_k is not None:
538
+ encoder_hidden_states_key_proj = self.norm_add_k(encoder_hidden_states_key_proj)
539
+
540
+ # Concatenate the hidden and encoder hidden states, perform the attention computation, and then split them back
541
+ if self.use_flash_attn:
542
+ hidden_states, encoder_hidden_states = self.var_flash_attn(
543
+ query, key, value,
544
+ encoder_hidden_states_query_proj, encoder_hidden_states_key_proj,
545
+ encoder_hidden_states_value_proj, self.heads, self.scale, hidden_length,
546
+ image_rotary_emb, encoder_attention_mask,
547
+ )
548
+ else:
549
+ hidden_states, encoder_hidden_states = self.var_len_attn(
550
+ query, key, value,
551
+ encoder_hidden_states_query_proj, encoder_hidden_states_key_proj,
552
+ encoder_hidden_states_value_proj, self.heads, self.scale, hidden_length,
553
+ image_rotary_emb, attention_mask,
554
+ )
555
+
556
+ # linear proj
557
+ hidden_states = self.to_out[0](hidden_states)
558
+ # dropout
559
+ hidden_states = self.to_out[1](hidden_states)
560
+ if not self.context_pre_only:
561
+ encoder_hidden_states = self.to_add_out(encoder_hidden_states)
562
+
563
+ return hidden_states, encoder_hidden_states
564
+
565
+
566
+ class JointTransformerBlock(nn.Module):
567
+ r"""
568
+ A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
569
+
570
+ Reference: https://arxiv.org/abs/2403.03206
571
+
572
+ Parameters:
573
+ dim (`int`): The number of channels in the input and output.
574
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
575
+ attention_head_dim (`int`): The number of channels in each head.
576
+ context_pre_only (`bool`): Whether the `context` (text) stream is only pre-processed in this block and
577
+ not updated further; set to `True` for the final block.
578
+ """
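+ # Usage sketch (illustrative only; the sizes below are assumptions, not taken from the original code):
+ # block = JointTransformerBlock(dim=1536, num_attention_heads=24, attention_head_dim=1536,
+ # qk_norm='rms_norm', context_pre_only=False, use_flash_attn=False)
+ # enc_out, hid_out = block(hidden_states, encoder_hidden_states, encoder_attention_mask,
+ # temb, hidden_length=[hidden_states.shape[1]])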
579
+
580
+ def __init__(
581
+ self, dim, num_attention_heads, attention_head_dim, qk_norm=None,
582
+ context_pre_only=False, use_flash_attn=True,
583
+ ):
584
+ super().__init__()
585
+
586
+ self.context_pre_only = context_pre_only
587
+ context_norm_type = "ada_norm_continous" if context_pre_only else "ada_norm_zero"
588
+
589
+ self.norm1 = AdaLayerNormZero(dim)
590
+
591
+ if context_norm_type == "ada_norm_continous":
592
+ self.norm1_context = AdaLayerNormContinuous(
593
+ dim, dim, elementwise_affine=False, eps=1e-6, bias=True, norm_type="layer_norm"
594
+ )
595
+ elif context_norm_type == "ada_norm_zero":
596
+ self.norm1_context = AdaLayerNormZero(dim)
597
+ else:
598
+ raise ValueError(
599
+ f"Unknown context_norm_type: {context_norm_type}, currently only supports `ada_norm_continous`, `ada_norm_zero`"
600
+ )
601
+
602
+ self.attn = JointAttention(
603
+ query_dim=dim,
604
+ cross_attention_dim=None,
605
+ added_kv_proj_dim=dim,
606
+ dim_head=attention_head_dim // num_attention_heads,
607
+ heads=num_attention_heads,
608
+ out_dim=attention_head_dim,
609
+ qk_norm=qk_norm,
610
+ context_pre_only=context_pre_only,
611
+ bias=True,
612
+ use_flash_attn=use_flash_attn,
613
+ )
614
+
615
+ self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
616
+ self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
617
+
618
+ if not context_pre_only:
619
+ self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
620
+ self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
621
+ else:
622
+ self.norm2_context = None
623
+ self.ff_context = None
624
+
625
+ def forward(
626
+ self, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor,
627
+ encoder_attention_mask: torch.FloatTensor, temb: torch.FloatTensor,
628
+ attention_mask: torch.FloatTensor = None, hidden_length: List = None,
629
+ image_rotary_emb: torch.FloatTensor = None,
630
+ ):
631
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb, hidden_length=hidden_length)
632
+
633
+ if self.context_pre_only:
634
+ norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states, temb)
635
+ else:
636
+ norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
637
+ encoder_hidden_states, emb=temb,
638
+ )
639
+
640
+ # Attention
641
+ attn_output, context_attn_output = self.attn(
642
+ hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states,
643
+ encoder_attention_mask=encoder_attention_mask, attention_mask=attention_mask,
644
+ hidden_length=hidden_length, image_rotary_emb=image_rotary_emb,
645
+ )
646
+
647
+ # Process attention outputs for the `hidden_states`.
648
+ attn_output = gate_msa * attn_output
649
+ hidden_states = hidden_states + attn_output
650
+
651
+ norm_hidden_states = self.norm2(hidden_states)
652
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
653
+
654
+ ff_output = self.ff(norm_hidden_states)
655
+ ff_output = gate_mlp * ff_output
656
+
657
+ hidden_states = hidden_states + ff_output
658
+
659
+ # Process attention outputs for the `encoder_hidden_states`.
660
+ if self.context_pre_only:
661
+ encoder_hidden_states = None
662
+ else:
663
+ context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
664
+ encoder_hidden_states = encoder_hidden_states + context_attn_output
665
+
666
+ norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
667
+ norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
668
+
669
+ context_ff_output = self.ff_context(norm_encoder_hidden_states)
670
+ encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
671
+
672
+ return encoder_hidden_states, hidden_states
pyramid_dit/modeling_normalization.py ADDED
@@ -0,0 +1,179 @@
1
+ import numbers
2
+ from typing import Dict, Optional, Tuple
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+ from diffusers.utils import is_torch_version
9
+
10
+
11
+ if is_torch_version(">=", "2.1.0"):
12
+ LayerNorm = nn.LayerNorm
13
+ else:
14
+ # Has optional bias parameter compared to torch layer norm
15
+ # TODO: replace with torch layernorm once min required torch version >= 2.1
16
+ class LayerNorm(nn.Module):
17
+ def __init__(self, dim, eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True):
18
+ super().__init__()
19
+
20
+ self.eps = eps
21
+
22
+ if isinstance(dim, numbers.Integral):
23
+ dim = (dim,)
24
+
25
+ self.dim = torch.Size(dim)
26
+
27
+ if elementwise_affine:
28
+ self.weight = nn.Parameter(torch.ones(dim))
29
+ self.bias = nn.Parameter(torch.zeros(dim)) if bias else None
30
+ else:
31
+ self.weight = None
32
+ self.bias = None
33
+
34
+ def forward(self, input):
35
+ return F.layer_norm(input, self.dim, self.weight, self.bias, self.eps)
36
+
37
+
38
+ class RMSNorm(nn.Module):
39
+ def __init__(self, dim, eps: float, elementwise_affine: bool = True):
40
+ super().__init__()
41
+
42
+ self.eps = eps
43
+
44
+ if isinstance(dim, numbers.Integral):
45
+ dim = (dim,)
46
+
47
+ self.dim = torch.Size(dim)
48
+
49
+ if elementwise_affine:
50
+ self.weight = nn.Parameter(torch.ones(dim))
51
+ else:
52
+ self.weight = None
53
+
54
+ def forward(self, hidden_states):
55
+ input_dtype = hidden_states.dtype
56
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
57
+ hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
58
+
59
+ if self.weight is not None:
60
+ # convert into half-precision if necessary
61
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
62
+ hidden_states = hidden_states.to(self.weight.dtype)
63
+ hidden_states = hidden_states * self.weight
64
+
65
+ hidden_states = hidden_states.to(input_dtype)
66
+
67
+ return hidden_states
68
+
69
+
70
+ class AdaLayerNormContinuous(nn.Module):
71
+ def __init__(
72
+ self,
73
+ embedding_dim: int,
74
+ conditioning_embedding_dim: int,
75
+ # NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters
76
+ # because the output is immediately scaled and shifted by the projected conditioning embeddings.
77
+ # Note that AdaLayerNorm does not let the norm layer have scale and shift parameters.
78
+ # However, this is how it was implemented in the original code, and it's rather likely you should
79
+ # set `elementwise_affine` to False.
80
+ elementwise_affine=True,
81
+ eps=1e-5,
82
+ bias=True,
83
+ norm_type="layer_norm",
84
+ ):
85
+ super().__init__()
86
+ self.silu = nn.SiLU()
87
+ self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
88
+ if norm_type == "layer_norm":
89
+ self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
90
+ elif norm_type == "rms_norm":
91
+ self.norm = RMSNorm(embedding_dim, eps, elementwise_affine)
92
+ else:
93
+ raise ValueError(f"unknown norm_type {norm_type}")
94
+
95
+ def forward_with_pad(self, x: torch.Tensor, conditioning_embedding: torch.Tensor, hidden_length=None) -> torch.Tensor:
96
+ assert hidden_length is not None
97
+
98
+ emb = self.linear(self.silu(conditioning_embedding).to(x.dtype))
99
+ batch_emb = torch.zeros_like(x).repeat(1, 1, 2)
100
+
101
+ i_sum = 0
102
+ num_stages = len(hidden_length)
103
+ for i_p, length in enumerate(hidden_length):
104
+ batch_emb[:, i_sum:i_sum+length] = emb[i_p::num_stages][:,None]
105
+ i_sum += length
106
+
107
+ batch_scale, batch_shift = torch.chunk(batch_emb, 2, dim=2)
108
+ x = self.norm(x) * (1 + batch_scale) + batch_shift
109
+ return x
110
+
111
+ def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor, hidden_length=None) -> torch.Tensor:
112
+ # convert back to the original dtype in case `conditioning_embedding` is upcast to float32 (needed for HunyuanDiT)
113
+ if hidden_length is not None:
114
+ return self.forward_with_pad(x, conditioning_embedding, hidden_length)
115
+ emb = self.linear(self.silu(conditioning_embedding).to(x.dtype))
116
+ scale, shift = torch.chunk(emb, 2, dim=1)
117
+ x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
118
+ return x
119
+
120
+
121
+ class AdaLayerNormZero(nn.Module):
122
+ r"""
123
+ Norm layer adaptive layer norm zero (adaLN-Zero).
124
+
125
+ Parameters:
126
+ embedding_dim (`int`): The size of each embedding vector.
127
+ num_embeddings (`int`): The size of the embeddings dictionary.
128
+ """
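+ # Sketch of the adaLN-Zero modulation computed here (for reference; mirrors the forward below):
+ # shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = linear(silu(emb)).chunk(6, dim=1)
+ # x = LayerNorm(x) * (1 + scale_msa) + shift_msa # pre-attention modulation
+ # the gates (gate_msa / gate_mlp) are applied by the caller after attention and feed-forward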
129
+
130
+ def __init__(self, embedding_dim: int, num_embeddings: Optional[int] = None):
131
+ super().__init__()
132
+ self.emb = None
133
+ self.silu = nn.SiLU()
134
+ self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
135
+ self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
136
+
137
+ def forward_with_pad(
138
+ self,
139
+ x: torch.Tensor,
140
+ timestep: Optional[torch.Tensor] = None,
141
+ class_labels: Optional[torch.LongTensor] = None,
142
+ hidden_dtype: Optional[torch.dtype] = None,
143
+ emb: Optional[torch.Tensor] = None,
144
+ hidden_length: Optional[torch.Tensor] = None,
145
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
146
+ # x: [bs, seq_len, dim]
147
+ if self.emb is not None:
148
+ emb = self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)
149
+
150
+ emb = self.linear(self.silu(emb))
151
+ batch_emb = torch.zeros_like(x).repeat(1, 1, 6)
152
+
153
+ i_sum = 0
154
+ num_stages = len(hidden_length)
155
+ for i_p, length in enumerate(hidden_length):
156
+ batch_emb[:, i_sum:i_sum+length] = emb[i_p::num_stages][:,None]
157
+ i_sum += length
158
+
159
+ batch_shift_msa, batch_scale_msa, batch_gate_msa, batch_shift_mlp, batch_scale_mlp, batch_gate_mlp = batch_emb.chunk(6, dim=2)
160
+ x = self.norm(x) * (1 + batch_scale_msa) + batch_shift_msa
161
+ return x, batch_gate_msa, batch_shift_mlp, batch_scale_mlp, batch_gate_mlp
162
+
163
+ def forward(
164
+ self,
165
+ x: torch.Tensor,
166
+ timestep: Optional[torch.Tensor] = None,
167
+ class_labels: Optional[torch.LongTensor] = None,
168
+ hidden_dtype: Optional[torch.dtype] = None,
169
+ emb: Optional[torch.Tensor] = None,
170
+ hidden_length: Optional[torch.Tensor] = None,
171
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
172
+ if hidden_length is not None:
173
+ return self.forward_with_pad(x, timestep, class_labels, hidden_dtype, emb, hidden_length)
174
+ if self.emb is not None:
175
+ emb = self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)
176
+ emb = self.linear(self.silu(emb))
177
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=1)
178
+ x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
179
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
pyramid_dit/modeling_pyramid_mmdit.py ADDED
@@ -0,0 +1,487 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import os
4
+ import torch.nn.functional as F
5
+
6
+ from einops import rearrange
7
+ from diffusers.utils.torch_utils import randn_tensor
8
+ from diffusers.models.modeling_utils import ModelMixin
9
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
10
+ from diffusers.utils import is_torch_version
11
+ from typing import Any, Callable, Dict, List, Optional, Union
12
+ from tqdm import tqdm
13
+
14
+ from .modeling_embedding import PatchEmbed3D, CombinedTimestepConditionEmbeddings
15
+ from .modeling_normalization import AdaLayerNormContinuous
16
+ from .modeling_mmdit_block import JointTransformerBlock
17
+
18
+ from trainer_misc import (
19
+ is_sequence_parallel_initialized,
20
+ get_sequence_parallel_group,
21
+ get_sequence_parallel_world_size,
22
+ get_sequence_parallel_rank,
23
+ all_to_all,
24
+ )
25
+
26
+ from IPython import embed
27
+
28
+
29
+ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
30
+ assert dim % 2 == 0, "The dimension must be even."
31
+
32
+ scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
33
+ omega = 1.0 / (theta**scale)
34
+
35
+ batch_size, seq_length = pos.shape
36
+ out = torch.einsum("...n,d->...nd", pos, omega)
37
+ cos_out = torch.cos(out)
38
+ sin_out = torch.sin(out)
39
+
40
+ stacked_out = torch.stack([cos_out, -sin_out, sin_out, cos_out], dim=-1)
41
+ out = stacked_out.view(batch_size, -1, dim // 2, 2, 2)
42
+ return out.float()
43
+
44
+
45
+ class EmbedNDRoPE(nn.Module):
46
+ def __init__(self, dim: int, theta: int, axes_dim: List[int]):
47
+ super().__init__()
48
+ self.dim = dim
49
+ self.theta = theta
50
+ self.axes_dim = axes_dim
51
+
52
+ def forward(self, ids: torch.Tensor) -> torch.Tensor:
53
+ n_axes = ids.shape[-1]
54
+ emb = torch.cat(
55
+ [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
56
+ dim=-3,
57
+ )
58
+ return emb.unsqueeze(2)
59
+
60
+
61
+ class PyramidDiffusionMMDiT(ModelMixin, ConfigMixin):
62
+ _supports_gradient_checkpointing = True
63
+
64
+ @register_to_config
65
+ def __init__(
66
+ self,
67
+ sample_size: int = 128,
68
+ patch_size: int = 2,
69
+ in_channels: int = 16,
70
+ num_layers: int = 24,
71
+ attention_head_dim: int = 64,
72
+ num_attention_heads: int = 24,
73
+ caption_projection_dim: int = 1152,
74
+ pooled_projection_dim: int = 2048,
75
+ pos_embed_max_size: int = 192,
76
+ max_num_frames: int = 200,
77
+ qk_norm: str = 'rms_norm',
78
+ pos_embed_type: str = 'rope',
79
+ temp_pos_embed_type: str = 'sincos',
80
+ joint_attention_dim: int = 4096,
81
+ use_gradient_checkpointing: bool = False,
82
+ use_flash_attn: bool = True,
83
+ use_temporal_causal: bool = False,
84
+ use_t5_mask: bool = False,
85
+ add_temp_pos_embed: bool = False,
86
+ interp_condition_pos: bool = False,
87
+ ):
88
+ super().__init__()
89
+
90
+ self.out_channels = in_channels
91
+ self.inner_dim = num_attention_heads * attention_head_dim
92
+ assert temp_pos_embed_type in ['rope', 'sincos']
93
+
94
+ # The input latent embedder; keep the name pos_embed to stay consistent with SD3
95
+ self.pos_embed = PatchEmbed3D(
96
+ height=sample_size,
97
+ width=sample_size,
98
+ patch_size=patch_size,
99
+ in_channels=in_channels,
100
+ embed_dim=self.inner_dim,
101
+ pos_embed_max_size=pos_embed_max_size, # hard-code for now.
102
+ max_num_frames=max_num_frames,
103
+ pos_embed_type=pos_embed_type,
104
+ temp_pos_embed_type=temp_pos_embed_type,
105
+ add_temp_pos_embed=add_temp_pos_embed,
106
+ interp_condition_pos=interp_condition_pos,
107
+ )
108
+
109
+ # The RoPE embedding
110
+ if pos_embed_type == 'rope':
111
+ self.rope_embed = EmbedNDRoPE(self.inner_dim, 10000, axes_dim=[16, 24, 24])
112
+ else:
113
+ self.rope_embed = None
114
+
115
+ if temp_pos_embed_type == 'rope':
116
+ self.temp_rope_embed = EmbedNDRoPE(self.inner_dim, 10000, axes_dim=[attention_head_dim])
117
+ else:
118
+ self.temp_rope_embed = None
119
+
120
+ self.time_text_embed = CombinedTimestepConditionEmbeddings(
121
+ embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim,
122
+ )
123
+ self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.config.caption_projection_dim)
124
+
125
+ self.transformer_blocks = nn.ModuleList(
126
+ [
127
+ JointTransformerBlock(
128
+ dim=self.inner_dim,
129
+ num_attention_heads=num_attention_heads,
130
+ attention_head_dim=self.inner_dim,
131
+ qk_norm=qk_norm,
132
+ context_pre_only=i == num_layers - 1,
133
+ use_flash_attn=use_flash_attn,
134
+ )
135
+ for i in range(num_layers)
136
+ ]
137
+ )
138
+
139
+ self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
140
+ self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
141
+ self.gradient_checkpointing = use_gradient_checkpointing
142
+ self.patch_size = patch_size
143
+ self.use_flash_attn = use_flash_attn
144
+ self.use_temporal_causal = use_temporal_causal
145
+ self.pos_embed_type = pos_embed_type
146
+ self.temp_pos_embed_type = temp_pos_embed_type
147
+ self.add_temp_pos_embed = add_temp_pos_embed
148
+
149
+ if self.use_temporal_causal:
150
+ print("Using temporal causal attention")
151
+ assert self.use_flash_attn is False, "Flash attention does not support temporal causal attention"
152
+
153
+ if interp_condition_pos:
154
+ print("We interpolate the position embedding of the condition latents")
155
+
156
+ # init weights
157
+ self.initialize_weights()
158
+
159
+ def initialize_weights(self):
160
+ # Initialize transformer layers:
161
+ def _basic_init(module):
162
+ if isinstance(module, (nn.Linear, nn.Conv2d, nn.Conv3d)):
163
+ torch.nn.init.xavier_uniform_(module.weight)
164
+ if module.bias is not None:
165
+ nn.init.constant_(module.bias, 0)
166
+ self.apply(_basic_init)
167
+
168
+ # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
169
+ w = self.pos_embed.proj.weight.data
170
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
171
+ nn.init.constant_(self.pos_embed.proj.bias, 0)
172
+
173
+ # Initialize all the conditioning to normal init
174
+ nn.init.normal_(self.time_text_embed.timestep_embedder.linear_1.weight, std=0.02)
175
+ nn.init.normal_(self.time_text_embed.timestep_embedder.linear_2.weight, std=0.02)
176
+ nn.init.normal_(self.time_text_embed.text_embedder.linear_1.weight, std=0.02)
177
+ nn.init.normal_(self.time_text_embed.text_embedder.linear_2.weight, std=0.02)
178
+ nn.init.normal_(self.context_embedder.weight, std=0.02)
179
+
180
+ # Zero-out adaLN modulation layers in DiT blocks:
181
+ for block in self.transformer_blocks:
182
+ nn.init.constant_(block.norm1.linear.weight, 0)
183
+ nn.init.constant_(block.norm1.linear.bias, 0)
184
+ nn.init.constant_(block.norm1_context.linear.weight, 0)
185
+ nn.init.constant_(block.norm1_context.linear.bias, 0)
186
+
187
+ # Zero-out output layers:
188
+ nn.init.constant_(self.norm_out.linear.weight, 0)
189
+ nn.init.constant_(self.norm_out.linear.bias, 0)
190
+ nn.init.constant_(self.proj_out.weight, 0)
191
+ nn.init.constant_(self.proj_out.bias, 0)
192
+
193
+ @torch.no_grad()
194
+ def _prepare_latent_image_ids(self, batch_size, temp, height, width, device):
195
+ latent_image_ids = torch.zeros(temp, height, width, 3)
196
+ latent_image_ids[..., 0] = latent_image_ids[..., 0] + torch.arange(temp)[:, None, None]
197
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[None, :, None]
198
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, None, :]
199
+
200
+ latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1, 1)
201
+ latent_image_ids = rearrange(latent_image_ids, 'b t h w c -> b (t h w) c')
202
+ return latent_image_ids.to(device=device)
203
+
204
+ @torch.no_grad()
205
+ def _prepare_pyramid_latent_image_ids(self, batch_size, temp_list, height_list, width_list, device):
206
+ base_width = width_list[-1]; base_height = height_list[-1]
207
+ assert base_width == max(width_list)
208
+ assert base_height == max(height_list)
209
+
210
+ image_ids_list = []
211
+ for temp, height, width in zip(temp_list, height_list, width_list):
212
+ latent_image_ids = torch.zeros(temp, height, width, 3)
213
+
214
+ if height != base_height:
215
+ height_pos = F.interpolate(torch.arange(base_height)[None, None, :].float(), height, mode='linear').squeeze(0, 1)
216
+ else:
217
+ height_pos = torch.arange(base_height).float()
218
+ if width != base_width:
219
+ width_pos = F.interpolate(torch.arange(base_width)[None, None, :].float(), width, mode='linear').squeeze(0, 1)
220
+ else:
221
+ width_pos = torch.arange(base_width).float()
222
+
223
+ latent_image_ids[..., 0] = latent_image_ids[..., 0] + torch.arange(temp)[:, None, None]
224
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + height_pos[None, :, None]
225
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + width_pos[None, None, :]
226
+ latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1, 1)
227
+ latent_image_ids = rearrange(latent_image_ids, 'b t h w c -> b (t h w) c').to(device)
228
+ image_ids_list.append(latent_image_ids)
229
+
230
+ return image_ids_list
231
+
232
+ @torch.no_grad()
233
+ def _prepare_temporal_rope_ids(self, batch_size, temp, height, width, device, start_time_stamp=0):
234
+ latent_image_ids = torch.zeros(temp, height, width, 1)
235
+ latent_image_ids[..., 0] = latent_image_ids[..., 0] + torch.arange(start_time_stamp, start_time_stamp + temp)[:, None, None]
236
+ latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1, 1)
237
+ latent_image_ids = rearrange(latent_image_ids, 'b t h w c -> b (t h w) c')
238
+ return latent_image_ids.to(device=device)
239
+
240
+ @torch.no_grad()
241
+ def _prepare_pyramid_temporal_rope_ids(self, sample, batch_size, device):
242
+ image_ids_list = []
243
+
244
+ for i_b, sample_ in enumerate(sample):
245
+ if not isinstance(sample_, list):
246
+ sample_ = [sample_]
247
+
248
+ cur_image_ids = []
249
+ start_time_stamp = 0
250
+
251
+ for clip_ in sample_:
252
+ _, _, temp, height, width = clip_.shape
253
+ height = height // self.patch_size
254
+ width = width // self.patch_size
255
+ cur_image_ids.append(self._prepare_temporal_rope_ids(batch_size, temp, height, width, device, start_time_stamp=start_time_stamp))
256
+ start_time_stamp += temp
257
+
258
+ cur_image_ids = torch.cat(cur_image_ids, dim=1)
259
+ image_ids_list.append(cur_image_ids)
260
+
261
+ return image_ids_list
262
+
263
+ def merge_input(self, sample, encoder_hidden_length, encoder_attention_mask):
264
+ """
265
+ Merge the input video latents of different resolutions into one sequence.
266
+ `sample` is ordered from low resolution to high resolution.
267
+ """
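+ # Illustrative structure of `sample` (assumed sizes, not from the original code), for stages [1, 2, 4]
+ # and patch_size 2:
+ # sample = [latents_low, # [b, c, t, 8, 8] -> 16 tokens per frame
+ # latents_mid, # [b, c, t, 16, 16] -> 64 tokens per frame
+ # latents_high] # [b, c, t, 32, 32] -> 256 tokens per frame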
268
+ if isinstance(sample[0], list):
269
+ device = sample[0][-1].device
270
+ pad_batch_size = sample[0][-1].shape[0]
271
+ else:
272
+ device = sample[0].device
273
+ pad_batch_size = sample[0].shape[0]
274
+
275
+ num_stages = len(sample)
276
+ height_list = [];width_list = [];temp_list = []
277
+ trainable_token_list = []
278
+
279
+ for i_b, sample_ in enumerate(sample):
280
+ if isinstance(sample_, list):
281
+ sample_ = sample_[-1]
282
+ _, _, temp, height, width = sample_.shape
283
+ height = height // self.patch_size
284
+ width = width // self.patch_size
285
+ temp_list.append(temp)
286
+ height_list.append(height)
287
+ width_list.append(width)
288
+ trainable_token_list.append(height * width * temp)
289
+
290
+ # prepare the RoPE embedding if needed
291
+ if self.pos_embed_type == 'rope':
292
+ # TODO: support the 3D Rope for video
293
+ raise NotImplementedError("3D RoPE is not supported for video generation yet")
294
+ text_ids = torch.zeros(pad_batch_size, encoder_hidden_length, 3).to(device=device)
295
+ image_ids_list = self._prepare_pyramid_latent_image_ids(pad_batch_size, temp_list, height_list, width_list, device)
296
+ input_ids_list = [torch.cat([text_ids, image_ids], dim=1) for image_ids in image_ids_list]
297
+ image_rotary_emb = [self.rope_embed(input_ids) for input_ids in input_ids_list] # [bs, seq_len, 1, head_dim // 2, 2, 2]
298
+ else:
299
+ if self.temp_pos_embed_type == 'rope' and self.add_temp_pos_embed:
300
+ image_ids_list = self._prepare_pyramid_temporal_rope_ids(sample, pad_batch_size, device)
301
+ text_ids = torch.zeros(pad_batch_size, encoder_attention_mask.shape[1], 1).to(device=device)
302
+ input_ids_list = [torch.cat([text_ids, image_ids], dim=1) for image_ids in image_ids_list]
303
+ image_rotary_emb = [self.temp_rope_embed(input_ids) for input_ids in input_ids_list] # [bs, seq_len, 1, head_dim // 2, 2, 2]
304
+
305
+ if is_sequence_parallel_initialized():
306
+ sp_group = get_sequence_parallel_group()
307
+ sp_group_size = get_sequence_parallel_world_size()
308
+ image_rotary_emb = [all_to_all(x_.repeat(1, 1, sp_group_size, 1, 1, 1), sp_group, sp_group_size, scatter_dim=2, gather_dim=0) for x_ in image_rotary_emb]
309
+ input_ids_list = [all_to_all(input_ids.repeat(1, 1, sp_group_size), sp_group, sp_group_size, scatter_dim=2, gather_dim=0) for input_ids in input_ids_list]
310
+
311
+ else:
312
+ image_rotary_emb = None
313
+
314
+ hidden_states = self.pos_embed(sample) # hidden states is a list of [b c t h w] b = real_b // num_stages
315
+ hidden_length = []
316
+
317
+ for i_b in range(num_stages):
318
+ hidden_length.append(hidden_states[i_b].shape[1])
319
+
320
+ # prepare the attention mask
321
+ if self.use_flash_attn:
322
+ attention_mask = None
323
+ indices_list = []
324
+ for i_p, length in enumerate(hidden_length):
325
+ pad_attention_mask = torch.ones((pad_batch_size, length), dtype=encoder_attention_mask.dtype).to(device)
326
+ pad_attention_mask = torch.cat([encoder_attention_mask[i_p::num_stages], pad_attention_mask], dim=1)
327
+
328
+ if is_sequence_parallel_initialized():
329
+ sp_group = get_sequence_parallel_group()
330
+ sp_group_size = get_sequence_parallel_world_size()
331
+ pad_attention_mask = all_to_all(pad_attention_mask.unsqueeze(2).repeat(1, 1, sp_group_size), sp_group, sp_group_size, scatter_dim=2, gather_dim=0)
332
+ pad_attention_mask = pad_attention_mask.squeeze(2)
333
+
334
+ seqlens_in_batch = pad_attention_mask.sum(dim=-1, dtype=torch.int32)
335
+ indices = torch.nonzero(pad_attention_mask.flatten(), as_tuple=False).flatten()
336
+
337
+ indices_list.append(
338
+ {
339
+ 'indices': indices,
340
+ 'seqlens_in_batch': seqlens_in_batch,
341
+ }
342
+ )
343
+ encoder_attention_mask = indices_list
344
+ else:
345
+ assert encoder_attention_mask.shape[1] == encoder_hidden_length
346
+ real_batch_size = encoder_attention_mask.shape[0]
347
+ # prepare text ids
348
+ text_ids = torch.arange(1, real_batch_size + 1, dtype=encoder_attention_mask.dtype).unsqueeze(1).repeat(1, encoder_hidden_length)
349
+ text_ids = text_ids.to(device)
350
+ text_ids[encoder_attention_mask == 0] = 0
351
+
352
+ # prepare image ids
353
+ image_ids = torch.arange(1, real_batch_size + 1, dtype=encoder_attention_mask.dtype).unsqueeze(1).repeat(1, max(hidden_length))
354
+ image_ids = image_ids.to(device)
355
+ image_ids_list = []
356
+ for i_p, length in enumerate(hidden_length):
357
+ image_ids_list.append(image_ids[i_p::num_stages][:, :length])
358
+
359
+ if is_sequence_parallel_initialized():
360
+ sp_group = get_sequence_parallel_group()
361
+ sp_group_size = get_sequence_parallel_world_size()
362
+ text_ids = all_to_all(text_ids.unsqueeze(2).repeat(1, 1, sp_group_size), sp_group, sp_group_size, scatter_dim=2, gather_dim=0).squeeze(2)
363
+ image_ids_list = [all_to_all(image_ids_.unsqueeze(2).repeat(1, 1, sp_group_size), sp_group, sp_group_size, scatter_dim=2, gather_dim=0).squeeze(2) for image_ids_ in image_ids_list]
364
+
365
+ attention_mask = []
366
+ for i_p in range(len(hidden_length)):
367
+ image_ids = image_ids_list[i_p]
368
+ token_ids = torch.cat([text_ids[i_p::num_stages], image_ids], dim=1)
369
+ stage_attention_mask = rearrange(token_ids, 'b i -> b 1 i 1') == rearrange(token_ids, 'b j -> b 1 1 j') # [bs, 1, q_len, k_len]
370
+ if self.use_temporal_causal:
371
+ input_order_ids = input_ids_list[i_p].squeeze(2)
372
+ temporal_causal_mask = rearrange(input_order_ids, 'b i -> b 1 i 1') >= rearrange(input_order_ids, 'b j -> b 1 1 j')
373
+ stage_attention_mask = stage_attention_mask & temporal_causal_mask
374
+ attention_mask.append(stage_attention_mask)
375
+
376
+ return hidden_states, hidden_length, temp_list, height_list, width_list, trainable_token_list, encoder_attention_mask, attention_mask, image_rotary_emb
377
+
378
+ def split_output(self, batch_hidden_states, hidden_length, temps, heights, widths, trainable_token_list):
379
+ # To split the hidden states
380
+ batch_size = batch_hidden_states.shape[0]
381
+ output_hidden_list = []
382
+ batch_hidden_states = torch.split(batch_hidden_states, hidden_length, dim=1)
383
+
384
+ if is_sequence_parallel_initialized():
385
+ sp_group_size = get_sequence_parallel_world_size()
386
+ batch_size = batch_size // sp_group_size
387
+
388
+ for i_p, length in enumerate(hidden_length):
389
+ width, height, temp = widths[i_p], heights[i_p], temps[i_p]
390
+ trainable_token_num = trainable_token_list[i_p]
391
+ hidden_states = batch_hidden_states[i_p]
392
+
393
+ if is_sequence_parallel_initialized():
394
+ sp_group = get_sequence_parallel_group()
395
+ sp_group_size = get_sequence_parallel_world_size()
396
+ hidden_states = all_to_all(hidden_states, sp_group, sp_group_size, scatter_dim=0, gather_dim=1)
397
+
398
+ # only the trainable tokens take part in the loss computation
399
+ hidden_states = hidden_states[:, -trainable_token_num:]
400
+
401
+ # unpatchify
402
+ hidden_states = hidden_states.reshape(
403
+ shape=(batch_size, temp, height, width, self.patch_size, self.patch_size, self.out_channels)
404
+ )
405
+ hidden_states = rearrange(hidden_states, "b t h w p1 p2 c -> b t (h p1) (w p2) c")
406
+ hidden_states = rearrange(hidden_states, "b t h w c -> b c t h w")
407
+ output_hidden_list.append(hidden_states)
408
+
409
+ return output_hidden_list
410
+
411
+ def forward(
412
+ self,
413
+ sample: torch.FloatTensor, # [num_stages]
414
+ encoder_hidden_states: torch.FloatTensor = None,
415
+ encoder_attention_mask: torch.FloatTensor = None,
416
+ pooled_projections: torch.FloatTensor = None,
417
+ timestep_ratio: torch.FloatTensor = None,
418
+ ):
419
+ # Get the timestep embedding
420
+ temb = self.time_text_embed(timestep_ratio, pooled_projections)
421
+ encoder_hidden_states = self.context_embedder(encoder_hidden_states)
422
+ encoder_hidden_length = encoder_hidden_states.shape[1]
423
+
424
+ # Get the input sequence
425
+ hidden_states, hidden_length, temps, heights, widths, trainable_token_list, encoder_attention_mask, \
426
+ attention_mask, image_rotary_emb = self.merge_input(sample, encoder_hidden_length, encoder_attention_mask)
427
+
428
+ # split the long latents if necessary
429
+ if is_sequence_parallel_initialized():
430
+ sp_group = get_sequence_parallel_group()
431
+ sp_group_size = get_sequence_parallel_world_size()
432
+
433
+ # sync the input hidden states
434
+ batch_hidden_states = []
435
+ for i_p, hidden_states_ in enumerate(hidden_states):
436
+ assert hidden_states_.shape[1] % sp_group_size == 0, "The sequence length should be divisible by the sequence parallel size"
437
+ hidden_states_ = all_to_all(hidden_states_, sp_group, sp_group_size, scatter_dim=1, gather_dim=0)
438
+ hidden_length[i_p] = hidden_length[i_p] // sp_group_size
439
+ batch_hidden_states.append(hidden_states_)
440
+
441
+ # sync the encoder hidden states
442
+ hidden_states = torch.cat(batch_hidden_states, dim=1)
443
+ encoder_hidden_states = all_to_all(encoder_hidden_states, sp_group, sp_group_size, scatter_dim=1, gather_dim=0)
444
+ temb = all_to_all(temb.unsqueeze(1).repeat(1, sp_group_size, 1), sp_group, sp_group_size, scatter_dim=1, gather_dim=0)
445
+ temb = temb.squeeze(1)
446
+ else:
447
+ hidden_states = torch.cat(hidden_states, dim=1)
448
+
449
+ # print(hidden_length)
450
+ for i_b, block in enumerate(self.transformer_blocks):
451
+ if self.training and self.gradient_checkpointing and (i_b >= 2):
452
+ def create_custom_forward(module):
453
+ def custom_forward(*inputs):
454
+ return module(*inputs)
455
+
456
+ return custom_forward
457
+
458
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
459
+ encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
460
+ create_custom_forward(block),
461
+ hidden_states,
462
+ encoder_hidden_states,
463
+ encoder_attention_mask,
464
+ temb,
465
+ attention_mask,
466
+ hidden_length,
467
+ image_rotary_emb,
468
+ **ckpt_kwargs,
469
+ )
470
+
471
+ else:
472
+ encoder_hidden_states, hidden_states = block(
473
+ hidden_states=hidden_states,
474
+ encoder_hidden_states=encoder_hidden_states,
475
+ encoder_attention_mask=encoder_attention_mask,
476
+ temb=temb,
477
+ attention_mask=attention_mask,
478
+ hidden_length=hidden_length,
479
+ image_rotary_emb=image_rotary_emb,
480
+ )
481
+
482
+ hidden_states = self.norm_out(hidden_states, temb, hidden_length=hidden_length)
483
+ hidden_states = self.proj_out(hidden_states)
484
+
485
+ output = self.split_output(hidden_states, hidden_length, temps, heights, widths, trainable_token_list)
486
+
487
+ return output
pyramid_dit/modeling_text_encoder.py ADDED
@@ -0,0 +1,140 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import os
4
+
5
+ from transformers import (
6
+ CLIPTextModelWithProjection,
7
+ CLIPTokenizer,
8
+ T5EncoderModel,
9
+ T5TokenizerFast,
10
+ )
11
+
12
+ from typing import Any, Callable, Dict, List, Optional, Union
13
+
14
+
15
+ class SD3TextEncoderWithMask(nn.Module):
16
+ def __init__(self, model_path, torch_dtype):
17
+ super().__init__()
18
+ # CLIP-L
19
+ self.tokenizer = CLIPTokenizer.from_pretrained(os.path.join(model_path, 'tokenizer'))
20
+ self.tokenizer_max_length = self.tokenizer.model_max_length
21
+ self.text_encoder = CLIPTextModelWithProjection.from_pretrained(os.path.join(model_path, 'text_encoder'), torch_dtype=torch_dtype)
22
+
23
+ # CLIP-G
24
+ self.tokenizer_2 = CLIPTokenizer.from_pretrained(os.path.join(model_path, 'tokenizer_2'))
25
+ self.text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(os.path.join(model_path, 'text_encoder_2'), torch_dtype=torch_dtype)
26
+
27
+ # T5
28
+ self.tokenizer_3 = T5TokenizerFast.from_pretrained(os.path.join(model_path, 'tokenizer_3'))
29
+ self.text_encoder_3 = T5EncoderModel.from_pretrained(os.path.join(model_path, 'text_encoder_3'), torch_dtype=torch_dtype)
30
+
31
+ self._freeze()
32
+
33
+ def _freeze(self):
34
+ for param in self.parameters():
35
+ param.requires_grad = False
36
+
37
+ def _get_t5_prompt_embeds(
38
+ self,
39
+ prompt: Union[str, List[str]] = None,
40
+ num_images_per_prompt: int = 1,
41
+ device: Optional[torch.device] = None,
42
+ max_sequence_length: int = 128,
43
+ ):
44
+ prompt = [prompt] if isinstance(prompt, str) else prompt
45
+ batch_size = len(prompt)
46
+
47
+ text_inputs = self.tokenizer_3(
48
+ prompt,
49
+ padding="max_length",
50
+ max_length=max_sequence_length,
51
+ truncation=True,
52
+ add_special_tokens=True,
53
+ return_tensors="pt",
54
+ )
55
+ text_input_ids = text_inputs.input_ids
56
+ prompt_attention_mask = text_inputs.attention_mask
57
+ prompt_attention_mask = prompt_attention_mask.to(device)
58
+ prompt_embeds = self.text_encoder_3(text_input_ids.to(device), attention_mask=prompt_attention_mask)[0]
59
+ dtype = self.text_encoder_3.dtype
60
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
61
+
62
+ _, seq_len, _ = prompt_embeds.shape
63
+
64
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
65
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
66
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
67
+ prompt_attention_mask = prompt_attention_mask.view(batch_size, -1)
68
+ prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
69
+
70
+ return prompt_embeds, prompt_attention_mask
71
+
72
+ def _get_clip_prompt_embeds(
73
+ self,
74
+ prompt: Union[str, List[str]],
75
+ num_images_per_prompt: int = 1,
76
+ device: Optional[torch.device] = None,
77
+ clip_skip: Optional[int] = None,
78
+ clip_model_index: int = 0,
79
+ ):
80
+
81
+ clip_tokenizers = [self.tokenizer, self.tokenizer_2]
82
+ clip_text_encoders = [self.text_encoder, self.text_encoder_2]
83
+
84
+ tokenizer = clip_tokenizers[clip_model_index]
85
+ text_encoder = clip_text_encoders[clip_model_index]
86
+
87
+ batch_size = len(prompt)
88
+
89
+ text_inputs = tokenizer(
90
+ prompt,
91
+ padding="max_length",
92
+ max_length=self.tokenizer_max_length,
93
+ truncation=True,
94
+ return_tensors="pt",
95
+ )
96
+
97
+ text_input_ids = text_inputs.input_ids
98
+ prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
99
+ pooled_prompt_embeds = prompt_embeds[0]
100
+ pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
101
+ pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
102
+
103
+ return pooled_prompt_embeds
104
+
105
+ def encode_prompt(self,
106
+ prompt,
107
+ num_images_per_prompt=1,
108
+ clip_skip: Optional[int] = None,
109
+ device=None,
110
+ ):
111
+ prompt = [prompt] if isinstance(prompt, str) else prompt
112
+
113
+ pooled_prompt_embed = self._get_clip_prompt_embeds(
114
+ prompt=prompt,
115
+ device=device,
116
+ num_images_per_prompt=num_images_per_prompt,
117
+ clip_skip=clip_skip,
118
+ clip_model_index=0,
119
+ )
120
+ pooled_prompt_2_embed = self._get_clip_prompt_embeds(
121
+ prompt=prompt,
122
+ device=device,
123
+ num_images_per_prompt=num_images_per_prompt,
124
+ clip_skip=clip_skip,
125
+ clip_model_index=1,
126
+ )
127
+ pooled_prompt_embeds = torch.cat([pooled_prompt_embed, pooled_prompt_2_embed], dim=-1)
128
+
129
+ prompt_embeds, prompt_attention_mask = self._get_t5_prompt_embeds(
130
+ prompt=prompt,
131
+ num_images_per_prompt=num_images_per_prompt,
132
+ device=device,
133
+ )
134
+ return prompt_embeds, prompt_attention_mask, pooled_prompt_embeds
135
+
136
+ def forward(self, input_prompts, device):
137
+ with torch.no_grad():
138
+ prompt_embeds, prompt_attention_mask, pooled_prompt_embeds = self.encode_prompt(input_prompts, 1, clip_skip=None, device=device)
139
+
140
+ return prompt_embeds, prompt_attention_mask, pooled_prompt_embeds
pyramid_dit/pyramid_dit_for_video_gen_pipeline.py ADDED
@@ -0,0 +1,672 @@
1
+ import torch
2
+ import os
3
+ import sys
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ from collections import OrderedDict
8
+ from einops import rearrange
9
+ from diffusers.utils.torch_utils import randn_tensor
10
+ import numpy as np
11
+ import math
12
+ import random
13
+ import PIL
14
+ from PIL import Image
15
+ from tqdm import tqdm
16
+ from torchvision import transforms
17
+ from copy import deepcopy
18
+ from typing import Any, Callable, Dict, List, Optional, Union
19
+ from accelerate import Accelerator
20
+ from diffusion_schedulers import PyramidFlowMatchEulerDiscreteScheduler
21
+ from video_vae.modeling_causal_vae import CausalVideoVAE
22
+
23
+ from trainer_misc import (
24
+ all_to_all,
25
+ is_sequence_parallel_initialized,
26
+ get_sequence_parallel_group,
27
+ get_sequence_parallel_group_rank,
28
+ get_sequence_parallel_rank,
29
+ get_sequence_parallel_world_size,
30
+ get_rank,
31
+ )
32
+
33
+ from .modeling_pyramid_mmdit import PyramidDiffusionMMDiT
34
+ from .modeling_text_encoder import SD3TextEncoderWithMask
35
+
36
+
37
+ def compute_density_for_timestep_sampling(
38
+ weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None
39
+ ):
40
+ if weighting_scheme == "logit_normal":
41
+ # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
42
+ u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu")
43
+ u = torch.nn.functional.sigmoid(u)
44
+ elif weighting_scheme == "mode":
45
+ u = torch.rand(size=(batch_size,), device="cpu")
46
+ u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
47
+ else:
48
+ u = torch.rand(size=(batch_size,), device="cpu")
49
+ return u
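+ # Example call (illustrative): logit-normal timestep density as in SD3,
+ # u = compute_density_for_timestep_sampling('logit_normal', batch_size=4, logit_mean=0.0, logit_std=1.0)
+ # returns a tensor of values in (0, 1) that are later mapped to discrete timesteps.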
50
+
51
+
52
+ class PyramidDiTForVideoGeneration:
53
+ """
54
+ The pyramid DiT wrapper for both image and video generation (the runnable entry class).
55
+ This class mainly implements the fixed-unit schedule: 1 + n + n + n
56
+ """
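+ # Usage sketch (illustrative; the checkpoint path and frame counts are assumptions, not from the original code):
+ # runner = PyramidDiTForVideoGeneration('/path/to/pyramid-flow', model_dtype='bf16',
+ # model_variant='diffusion_transformer_768p')
+ # frames = runner.generate_i2v(prompt='a boat on a lake', input_image=Image.open('first_frame.png'),
+ # temp=16, num_inference_steps=[20, 20, 20])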
57
+ def __init__(self, model_path, model_dtype='bf16', use_gradient_checkpointing=False, return_log=True,
58
+ model_variant="diffusion_transformer_768p", timestep_shift=1.0, stage_range=[0, 1/3, 2/3, 1],
59
+ sample_ratios=[1, 1, 1], scheduler_gamma=1/3, use_mixed_training=False, use_flash_attn=False,
60
+ load_text_encoder=True, load_vae=True, max_temporal_length=31, frame_per_unit=1, use_temporal_causal=True,
61
+ corrupt_ratio=1/3, interp_condition_pos=True, stages=[1, 2, 4], **kwargs,
62
+ ):
63
+ super().__init__()
64
+
65
+ if model_dtype == 'bf16':
66
+ torch_dtype = torch.bfloat16
67
+ elif model_dtype == 'fp16':
68
+ torch_dtype = torch.float16
69
+ else:
70
+ torch_dtype = torch.float32
71
+
72
+ self.stages = stages
73
+ self.sample_ratios = sample_ratios
74
+ self.corrupt_ratio = corrupt_ratio
75
+
76
+ dit_path = os.path.join(model_path, model_variant)
77
+
78
+ # The dit
79
+ if use_mixed_training:
80
+ print("using mixed precision training, not explicitly casting the models")
81
+ self.dit = PyramidDiffusionMMDiT.from_pretrained(
82
+ dit_path, use_gradient_checkpointing=use_gradient_checkpointing,
83
+ use_flash_attn=use_flash_attn, use_t5_mask=True,
84
+ add_temp_pos_embed=True, temp_pos_embed_type='rope',
85
+ use_temporal_causal=use_temporal_causal, interp_condition_pos=interp_condition_pos,
86
+ )
87
+ else:
88
+ print("using half precision")
89
+ self.dit = PyramidDiffusionMMDiT.from_pretrained(
90
+ dit_path, torch_dtype=torch_dtype,
91
+ use_gradient_checkpointing=use_gradient_checkpointing,
92
+ use_flash_attn=use_flash_attn, use_t5_mask=True,
93
+ add_temp_pos_embed=True, temp_pos_embed_type='rope',
94
+ use_temporal_causal=use_temporal_causal, interp_condition_pos=interp_condition_pos,
95
+ )
96
+
97
+ # The text encoder
98
+ if load_text_encoder:
99
+ self.text_encoder = SD3TextEncoderWithMask(model_path, torch_dtype=torch_dtype)
100
+ else:
101
+ self.text_encoder = None
102
+
103
+ # The base video vae decoder
104
+ if load_vae:
105
+ self.vae = CausalVideoVAE.from_pretrained(os.path.join(model_path, 'causal_video_vae'), torch_dtype=torch_dtype, interpolate=False)
106
+ # Freeze vae
107
+ for parameter in self.vae.parameters():
108
+ parameter.requires_grad = False
109
+ else:
110
+ self.vae = None
111
+
112
+ # For the image latent
113
+ self.vae_shift_factor = 0.1490
114
+ self.vae_scale_factor = 1 / 1.8415
115
+
116
+ # For the video latent
117
+ self.vae_video_shift_factor = -0.2343
118
+ self.vae_video_scale_factor = 1 / 3.0986
119
+
120
+ self.downsample = 8
121
+
122
+ # Configure the video training hyper-parameters
123
+ # The video sequence: one frame + N * unit
124
+ self.frame_per_unit = frame_per_unit
125
+ self.max_temporal_length = max_temporal_length
126
+ assert (max_temporal_length - 1) % frame_per_unit == 0, "(max_temporal_length - 1) should be divisible by frame_per_unit"
127
+ self.num_units_per_video = 1 + ((max_temporal_length - 1) // frame_per_unit) + int(sum(sample_ratios))
128
+
129
+ self.scheduler = PyramidFlowMatchEulerDiscreteScheduler(
130
+ shift=timestep_shift, stages=len(self.stages),
131
+ stage_range=stage_range, gamma=scheduler_gamma,
132
+ )
133
+ print(f"The start and end sigmas of each stage are Start: {self.scheduler.start_sigmas}, End: {self.scheduler.end_sigmas}, Ori_start: {self.scheduler.ori_start_sigmas}")
134
+
135
+ self.cfg_rate = 0.1
136
+ self.return_log = return_log
137
+ self.use_flash_attn = use_flash_attn
138
+
139
+ def load_checkpoint(self, checkpoint_path, model_key='model', **kwargs):
140
+ checkpoint = torch.load(checkpoint_path, map_location='cpu')
141
+ dit_checkpoint = OrderedDict()
142
+ for key in checkpoint:
143
+ if key.startswith('vae') or key.startswith('text_encoder'):
144
+ continue
145
+ if key.startswith('dit'):
146
+ new_key = key.split('.')
147
+ new_key = '.'.join(new_key[1:])
148
+ dit_checkpoint[new_key] = checkpoint[key]
149
+ else:
150
+ dit_checkpoint[key] = checkpoint[key]
151
+
152
+ load_result = self.dit.load_state_dict(dit_checkpoint, strict=True)
153
+ print(f"Load checkpoint from {checkpoint_path}, load result: {load_result}")
154
+
155
+ def load_vae_checkpoint(self, vae_checkpoint_path, model_key='model'):
156
+ checkpoint = torch.load(vae_checkpoint_path, map_location='cpu')
157
+ checkpoint = checkpoint[model_key]
158
+ loaded_checkpoint = OrderedDict()
159
+
160
+ for key in checkpoint.keys():
161
+ if key.startswith('vae.'):
162
+ new_key = key.split('.')
163
+ new_key = '.'.join(new_key[1:])
164
+ loaded_checkpoint[new_key] = checkpoint[key]
165
+
166
+ load_result = self.vae.load_state_dict(loaded_checkpoint)
167
+ print(f"Load the VAE from {vae_checkpoint_path}, load result: {load_result}")
168
+
169
+ @torch.no_grad()
170
+ def get_pyramid_latent(self, x, stage_num):
171
+ # x is the origin vae latent
172
+ vae_latent_list = []
173
+ vae_latent_list.append(x)
174
+
175
+ temp, height, width = x.shape[-3], x.shape[-2], x.shape[-1]
176
+ for _ in range(stage_num):
177
+ height //= 2
178
+ width //= 2
179
+ x = rearrange(x, 'b c t h w -> (b t) c h w')
180
+ x = torch.nn.functional.interpolate(x, size=(height, width), mode='bilinear')
181
+ x = rearrange(x, '(b t) c h w -> b c t h w', t=temp)
182
+ vae_latent_list.append(x)
183
+
184
+ vae_latent_list = list(reversed(vae_latent_list))
185
+ return vae_latent_list
186
+
187
+ def prepare_latents(
188
+ self,
189
+ batch_size,
190
+ num_channels_latents,
191
+ temp,
192
+ height,
193
+ width,
194
+ dtype,
195
+ device,
196
+ generator,
197
+ ):
198
+ shape = (
199
+ batch_size,
200
+ num_channels_latents,
201
+ int(temp),
202
+ int(height) // self.downsample,
203
+ int(width) // self.downsample,
204
+ )
205
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
206
+ return latents
207
+
208
+ def sample_block_noise(self, bs, ch, temp, height, width):
209
+ gamma = self.scheduler.config.gamma
210
+ dist = torch.distributions.multivariate_normal.MultivariateNormal(torch.zeros(4), torch.eye(4) * (1 + gamma) - torch.ones(4, 4) * gamma)
211
+ block_number = bs * ch * temp * (height // 2) * (width // 2)
212
+ noise = torch.stack([dist.sample() for _ in range(block_number)]) # [block number, 4]
213
+ noise = rearrange(noise, '(b c t h w) (p q) -> b c t (h p) (w q)',b=bs,c=ch,t=temp,h=height//2,w=width//2,p=2,q=2)
214
+ return noise
215
+
216
+ @torch.no_grad()
217
+ def generate_one_unit(
218
+ self,
219
+ latents,
220
+ past_conditions, # List of past conditions, containing the conditions for each stage
221
+ prompt_embeds,
222
+ prompt_attention_mask,
223
+ pooled_prompt_embeds,
224
+ num_inference_steps,
225
+ height,
226
+ width,
227
+ temp,
228
+ device,
229
+ dtype,
230
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
231
+ is_first_frame: bool = False,
232
+ ):
233
+ stages = self.stages
234
+ intermed_latents = []
235
+
236
+ for i_s in range(len(stages)):
237
+ self.scheduler.set_timesteps(num_inference_steps[i_s], i_s, device=device)
238
+ timesteps = self.scheduler.timesteps
239
+
240
+ if i_s > 0:
241
+ height *= 2; width *= 2
242
+ latents = rearrange(latents, 'b c t h w -> (b t) c h w')
243
+ latents = F.interpolate(latents, size=(height, width), mode='nearest')
244
+ latents = rearrange(latents, '(b t) c h w -> b c t h w', t=temp)
245
+ # Correct the noise level at the stage transition
246
+ ori_sigma = 1 - self.scheduler.ori_start_sigmas[i_s] # the original coeff of signal
247
+ gamma = self.scheduler.config.gamma
248
+ alpha = 1 / (math.sqrt(1 + (1 / gamma)) * (1 - ori_sigma) + ori_sigma)
249
+ beta = alpha * (1 - ori_sigma) / math.sqrt(gamma)
250
+
251
+ bs, ch, temp, height, width = latents.shape
252
+ noise = self.sample_block_noise(bs, ch, temp, height, width)
253
+ noise = noise.to(device=device, dtype=dtype)
254
+ latents = alpha * latents + beta * noise # To fix the block artifact
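+ # Here alpha rescales the upsampled latents and beta scales the block-correlated noise so that the
+ # renoised latents roughly match the noise level expected at the start of this stage (ori_start_sigmas);
+ # sample_block_noise couples each 2x2 block through the gamma term of its covariance, which suppresses
+ # the blocky artifact introduced by nearest-neighbor upsampling.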
255
+
256
+ for idx, t in enumerate(timesteps):
257
+ # expand the latents if we are doing classifier free guidance
258
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
259
+
260
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
261
+ timestep = t.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)
262
+
263
+ latent_model_input = past_conditions[i_s] + [latent_model_input]
264
+
265
+ noise_pred = self.dit(
266
+ sample=[latent_model_input],
267
+ timestep_ratio=timestep,
268
+ encoder_hidden_states=prompt_embeds,
269
+ encoder_attention_mask=prompt_attention_mask,
270
+ pooled_projections=pooled_prompt_embeds,
271
+ )
272
+
273
+ noise_pred = noise_pred[0]
274
+
275
+ # perform guidance
276
+ if self.do_classifier_free_guidance:
277
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
278
+ if is_first_frame:
279
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
280
+ else:
281
+ noise_pred = noise_pred_uncond + self.video_guidance_scale * (noise_pred_text - noise_pred_uncond)
282
+
283
+ # compute the previous noisy sample x_t -> x_t-1
284
+ latents = self.scheduler.step(
285
+ model_output=noise_pred,
286
+ timestep=timestep,
287
+ sample=latents,
288
+ generator=generator,
289
+ ).prev_sample
290
+
291
+ intermed_latents.append(latents)
292
+
293
+ return intermed_latents
294
+
295
+ @torch.no_grad()
296
+ def generate_i2v(
297
+ self,
298
+ prompt: Union[str, List[str]] = '',
299
+ input_image: PIL.Image = None,
300
+ temp: int = 1,
301
+ num_inference_steps: Optional[Union[int, List[int]]] = 28,
302
+ guidance_scale: float = 7.0,
303
+ video_guidance_scale: float = 4.0,
304
+ min_guidance_scale: float = 2.0,
305
+ use_linear_guidance: bool = False,
306
+ alpha: float = 0.5,
307
+ negative_prompt: Optional[Union[str, List[str]]]="cartoon style, worst quality, low quality, blurry, absolute black, absolute white, low res, extra limbs, extra digits, misplaced objects, mutated anatomy, monochrome, horror",
308
+ num_images_per_prompt: Optional[int] = 1,
309
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
310
+ output_type: Optional[str] = "pil",
311
+ save_memory: bool = True,
312
+ ):
313
+ device = self.device
314
+ dtype = self.dtype
315
+
316
+ width = input_image.width
317
+ height = input_image.height
318
+
319
+ assert temp % self.frame_per_unit == 0, "The number of frames should be divisible by frame_per_unit"
320
+
321
+ if isinstance(prompt, str):
322
+ batch_size = 1
323
+ prompt = prompt + ", hyper quality, Ultra HD, 8K" # adding this prompt to improve aesthetics
324
+ else:
325
+ assert isinstance(prompt, list)
326
+ batch_size = len(prompt)
327
+ prompt = [_ + ", hyper quality, Ultra HD, 8K" for _ in prompt]
328
+
329
+ if isinstance(num_inference_steps, int):
330
+ num_inference_steps = [num_inference_steps] * len(self.stages)
331
+
332
+ negative_prompt = negative_prompt or ""
333
+
334
+ # Get the text embeddings
335
+ prompt_embeds, prompt_attention_mask, pooled_prompt_embeds = self.text_encoder(prompt, device)
336
+ negative_prompt_embeds, negative_prompt_attention_mask, negative_pooled_prompt_embeds = self.text_encoder(negative_prompt, device)
337
+
338
+ if use_linear_guidance:
339
+ max_guidance_scale = guidance_scale
340
+ guidance_scale_list = [max(max_guidance_scale - alpha * t_, min_guidance_scale) for t_ in range(temp+1)]
341
+ print(guidance_scale_list)
342
+
343
+ self._guidance_scale = guidance_scale
344
+ self._video_guidance_scale = video_guidance_scale
345
+
346
+ if self.do_classifier_free_guidance:
347
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
348
+ pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
349
+ prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
350
+
351
+ # Create the initial random noise
352
+ num_channels_latents = self.dit.config.in_channels
353
+ latents = self.prepare_latents(
354
+ batch_size * num_images_per_prompt,
355
+ num_channels_latents,
356
+ temp,
357
+ height,
358
+ width,
359
+ prompt_embeds.dtype,
360
+ device,
361
+ generator,
362
+ )
363
+
364
+ temp, height, width = latents.shape[-3], latents.shape[-2], latents.shape[-1]
365
+
366
+ latents = rearrange(latents, 'b c t h w -> (b t) c h w')
367
+ # by default, we need to start from the block noise
368
+ for _ in range(len(self.stages)-1):
369
+ height //= 2; width //= 2
370
+ latents = F.interpolate(latents, size=(height, width), mode='bilinear') * 2
371
+
372
+ latents = rearrange(latents, '(b t) c h w -> b c t h w', t=temp)
373
+
374
+ num_units = temp // self.frame_per_unit
375
+ stages = self.stages
376
+
377
+ # encode the image latents
378
+ image_transform = transforms.Compose([
379
+ transforms.ToTensor(),
380
+ transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
381
+ ])
382
+ input_image_tensor = image_transform(input_image).unsqueeze(0).unsqueeze(2) # [b c 1 h w]
383
+ input_image_latent = (self.vae.encode(input_image_tensor.to(device)).latent_dist.sample() - self.vae_shift_factor) * self.vae_scale_factor # [b c 1 h w]
384
+
385
+ generated_latents_list = [input_image_latent] # The generated results
386
+ last_generated_latents = input_image_latent
387
+
388
+ for unit_index in tqdm(range(1, num_units + 1)):
389
+ if use_linear_guidance:
390
+ self._guidance_scale = guidance_scale_list[unit_index]
391
+ self._video_guidance_scale = guidance_scale_list[unit_index]
392
+
393
+ # prepare the condition latents
394
+ past_condition_latents = []
395
+ clean_latents_list = self.get_pyramid_latent(torch.cat(generated_latents_list, dim=2), len(stages) - 1)
396
+
397
+ for i_s in range(len(stages)):
398
+ last_cond_latent = clean_latents_list[i_s][:,:,-self.frame_per_unit:]
399
+
400
+ stage_input = [torch.cat([last_cond_latent] * 2) if self.do_classifier_free_guidance else last_cond_latent]
401
+
402
+ # pad the past clean latents
403
+ cur_unit_num = unit_index
404
+ cur_stage = i_s
405
+ cur_unit_ptx = 1
406
+
407
+ while cur_unit_ptx < cur_unit_num:
408
+ cur_stage = max(cur_stage - 1, 0)
409
+ if cur_stage == 0:
410
+ break
411
+ cur_unit_ptx += 1
412
+ cond_latents = clean_latents_list[cur_stage][:, :, -(cur_unit_ptx * self.frame_per_unit) : -((cur_unit_ptx - 1) * self.frame_per_unit)]
413
+ stage_input.append(torch.cat([cond_latents] * 2) if self.do_classifier_free_guidance else cond_latents)
414
+
415
+ if cur_stage == 0 and cur_unit_ptx < cur_unit_num:
416
+ cond_latents = clean_latents_list[0][:, :, :-(cur_unit_ptx * self.frame_per_unit)]
417
+ stage_input.append(torch.cat([cond_latents] * 2) if self.do_classifier_free_guidance else cond_latents)
418
+
419
+ stage_input = list(reversed(stage_input))
420
+ past_condition_latents.append(stage_input)
421
+
422
+ intermed_latents = self.generate_one_unit(
423
+ latents[:,:,(unit_index - 1) * self.frame_per_unit:unit_index * self.frame_per_unit],
424
+ past_condition_latents,
425
+ prompt_embeds,
426
+ prompt_attention_mask,
427
+ pooled_prompt_embeds,
428
+ num_inference_steps,
429
+ height,
430
+ width,
431
+ self.frame_per_unit,
432
+ device,
433
+ dtype,
434
+ generator,
435
+ is_first_frame=False,
436
+ )
437
+
438
+ generated_latents_list.append(intermed_latents[-1])
439
+ last_generated_latents = intermed_latents
440
+
441
+ generated_latents = torch.cat(generated_latents_list, dim=2)
442
+
443
+ if output_type == "latent":
444
+ image = generated_latents
445
+ else:
446
+ image = self.decode_latent(generated_latents, save_memory=save_memory)
447
+
448
+ return image
449
+
450
+ @torch.no_grad()
451
+ def generate(
452
+ self,
453
+ prompt: Union[str, List[str]] = None,
454
+ height: Optional[int] = None,
455
+ width: Optional[int] = None,
456
+ temp: int = 1,
457
+ num_inference_steps: Optional[Union[int, List[int]]] = 28,
458
+ video_num_inference_steps: Optional[Union[int, List[int]]] = 28,
459
+ guidance_scale: float = 7.0,
460
+ video_guidance_scale: float = 7.0,
461
+ min_guidance_scale: float = 2.0,
462
+ use_linear_guidance: bool = False,
463
+ alpha: float = 0.5,
464
+ negative_prompt: Optional[Union[str, List[str]]]="cartoon style, worst quality, low quality, blurry, absolute black, absolute white, low res, extra limbs, extra digits, misplaced objects, mutated anatomy, monochrome, horror",
465
+ num_images_per_prompt: Optional[int] = 1,
466
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
467
+ output_type: Optional[str] = "pil",
468
+ save_memory: bool = True,
469
+ ):
470
+ device = self.device
471
+ dtype = self.dtype
472
+
473
+ assert (temp - 1) % self.frame_per_unit == 0, "The number of frames minus one must be divisible by frame_per_unit"
474
+
475
+ if isinstance(prompt, str):
476
+ batch_size = 1
477
+ prompt = prompt + ", hyper quality, Ultra HD, 8K" # adding this prompt to improve aesthetics
478
+ else:
479
+ assert isinstance(prompt, list)
480
+ batch_size = len(prompt)
481
+ prompt = [_ + ", hyper quality, Ultra HD, 8K" for _ in prompt]
482
+
483
+ if isinstance(num_inference_steps, int):
484
+ num_inference_steps = [num_inference_steps] * len(self.stages)
485
+
486
+ if isinstance(video_num_inference_steps, int):
487
+ video_num_inference_steps = [video_num_inference_steps] * len(self.stages)
488
+
489
+ negative_prompt = negative_prompt or ""
490
+
491
+ # Get the text embeddings
492
+ prompt_embeds, prompt_attention_mask, pooled_prompt_embeds = self.text_encoder(prompt, device)
493
+ negative_prompt_embeds, negative_prompt_attention_mask, negative_pooled_prompt_embeds = self.text_encoder(negative_prompt, device)
494
+
495
+ if use_linear_guidance:
496
+ max_guidance_scale = guidance_scale
497
+ # guidance_scale_list = torch.linspace(max_guidance_scale, min_guidance_scale, temp).tolist()
498
+ guidance_scale_list = [max(max_guidance_scale - alpha * t_, min_guidance_scale) for t_ in range(temp)]
499
+ print(guidance_scale_list)
500
+
501
+ self._guidance_scale = guidance_scale
502
+ self._video_guidance_scale = video_guidance_scale
503
+
504
+ if self.do_classifier_free_guidance:
505
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
506
+ pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
507
+ prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
508
+
509
+ # Create the initial random noise
510
+ num_channels_latents = self.dit.config.in_channels
511
+ latents = self.prepare_latents(
512
+ batch_size * num_images_per_prompt,
513
+ num_channels_latents,
514
+ temp,
515
+ height,
516
+ width,
517
+ prompt_embeds.dtype,
518
+ device,
519
+ generator,
520
+ )
521
+
522
+ temp, height, width = latents.shape[-3], latents.shape[-2], latents.shape[-1]
523
+
524
+ latents = rearrange(latents, 'b c t h w -> (b t) c h w')
525
+ # by default, we need to start from the block noise
526
+ for _ in range(len(self.stages)-1):
527
+ height //= 2; width //= 2
528
+ latents = F.interpolate(latents, size=(height, width), mode='bilinear') * 2
529
+
530
+ latents = rearrange(latents, '(b t) c h w -> b c t h w', t=temp)
531
+
532
+ num_units = 1 + (temp - 1) // self.frame_per_unit
533
+ stages = self.stages
534
+
535
+ generated_latents_list = [] # The generated results
536
+ last_generated_latents = None
537
+
538
+ for unit_index in tqdm(range(num_units)):
539
+ if use_linear_guidance:
540
+ self._guidance_scale = guidance_scale_list[unit_index]
541
+ self._video_guidance_scale = guidance_scale_list[unit_index]
542
+
543
+ if unit_index == 0:
544
+ past_condition_latents = [[] for _ in range(len(stages))]
545
+ intermed_latents = self.generate_one_unit(
546
+ latents[:,:,:1],
547
+ past_condition_latents,
548
+ prompt_embeds,
549
+ prompt_attention_mask,
550
+ pooled_prompt_embeds,
551
+ num_inference_steps,
552
+ height,
553
+ width,
554
+ 1,
555
+ device,
556
+ dtype,
557
+ generator,
558
+ is_first_frame=True,
559
+ )
560
+ else:
561
+ # prepare the condition latents
562
+ past_condition_latents = []
563
+ clean_latents_list = self.get_pyramid_latent(torch.cat(generated_latents_list, dim=2), len(stages) - 1)
564
+
565
+ for i_s in range(len(stages)):
566
+ last_cond_latent = clean_latents_list[i_s][:,:,-(self.frame_per_unit):]
567
+
568
+ stage_input = [torch.cat([last_cond_latent] * 2) if self.do_classifier_free_guidance else last_cond_latent]
569
+
570
+ # pad the past clean latents
571
+ cur_unit_num = unit_index
572
+ cur_stage = i_s
573
+ cur_unit_ptx = 1
574
+
575
+ while cur_unit_ptx < cur_unit_num:
576
+ cur_stage = max(cur_stage - 1, 0)
577
+ if cur_stage == 0:
578
+ break
579
+ cur_unit_ptx += 1
580
+ cond_latents = clean_latents_list[cur_stage][:, :, -(cur_unit_ptx * self.frame_per_unit) : -((cur_unit_ptx - 1) * self.frame_per_unit)]
581
+ stage_input.append(torch.cat([cond_latents] * 2) if self.do_classifier_free_guidance else cond_latents)
582
+
583
+ if cur_stage == 0 and cur_unit_ptx < cur_unit_num:
584
+ cond_latents = clean_latents_list[0][:, :, :-(cur_unit_ptx * self.frame_per_unit)]
585
+ stage_input.append(torch.cat([cond_latents] * 2) if self.do_classifier_free_guidance else cond_latents)
586
+
587
+ stage_input = list(reversed(stage_input))
588
+ past_condition_latents.append(stage_input)
589
+
590
+ intermed_latents = self.generate_one_unit(
591
+ latents[:,:, 1 + (unit_index - 1) * self.frame_per_unit:1 + unit_index * self.frame_per_unit],
592
+ past_condition_latents,
593
+ prompt_embeds,
594
+ prompt_attention_mask,
595
+ pooled_prompt_embeds,
596
+ video_num_inference_steps,
597
+ height,
598
+ width,
599
+ self.frame_per_unit,
600
+ device,
601
+ dtype,
602
+ generator,
603
+ is_first_frame=False,
604
+ )
605
+
606
+ generated_latents_list.append(intermed_latents[-1])
607
+ last_generated_latents = intermed_latents
608
+
609
+ generated_latents = torch.cat(generated_latents_list, dim=2)
610
+
611
+ if output_type == "latent":
612
+ image = generated_latents
613
+ else:
614
+ image = self.decode_latent(generated_latents, save_memory=save_memory)
615
+
616
+ return image
617
+
618
+ def decode_latent(self, latents, save_memory=True):
619
+ if latents.shape[2] == 1:
620
+ latents = (latents / self.vae_scale_factor) + self.vae_shift_factor
621
+ else:
622
+ latents[:, :, :1] = (latents[:, :, :1] / self.vae_scale_factor) + self.vae_shift_factor
623
+ latents[:, :, 1:] = (latents[:, :, 1:] / self.vae_video_scale_factor) + self.vae_video_shift_factor
624
+
625
+ if save_memory:
626
+ # reducing the tile size and temporal chunk window size
627
+ image = self.vae.decode(latents, temporal_chunk=True, window_size=1, tile_sample_min_size=256).sample
628
+ else:
629
+ image = self.vae.decode(latents, temporal_chunk=True, window_size=2, tile_sample_min_size=512).sample
630
+
631
+ image = image.float()
632
+ image = (image / 2 + 0.5).clamp(0, 1)
633
+ image = rearrange(image, "B C T H W -> (B T) C H W")
634
+ image = image.cpu().permute(0, 2, 3, 1).numpy()
635
+ image = self.numpy_to_pil(image)
636
+ return image
637
+
638
+ @staticmethod
639
+ def numpy_to_pil(images):
640
+ """
641
+ Convert a numpy image or a batch of images to a PIL image.
642
+ """
643
+ if images.ndim == 3:
644
+ images = images[None, ...]
645
+ images = (images * 255).round().astype("uint8")
646
+ if images.shape[-1] == 1:
647
+ # special case for grayscale (single channel) images
648
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
649
+ else:
650
+ pil_images = [Image.fromarray(image) for image in images]
651
+
652
+ return pil_images
653
+
654
+ @property
655
+ def device(self):
656
+ return next(self.dit.parameters()).device
657
+
658
+ @property
659
+ def dtype(self):
660
+ return next(self.dit.parameters()).dtype
661
+
662
+ @property
663
+ def guidance_scale(self):
664
+ return self._guidance_scale
665
+
666
+ @property
667
+ def video_guidance_scale(self):
668
+ return self._video_guidance_scale
669
+
670
+ @property
671
+ def do_classifier_free_guidance(self):
672
+ return self._guidance_scale > 0
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sentencepiece
2
+ tiktoken
3
+ jsonlines
4
+ diffusers==0.30.1
5
+ accelerate==0.30.0
6
+ torchvision
7
+ numpy==1.26.4
8
+ imageio
9
+ imageio-ffmpeg
10
+ timm
11
+ transformers
12
+ opencv-python-headless
13
+ einops
14
+ tensorboardX
15
+ ipython
trainer_misc/__init__.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .utils import (
2
+ create_optimizer,
3
+ get_rank,
4
+ get_world_size,
5
+ is_main_process,
6
+ is_dist_avail_and_initialized,
7
+ init_distributed_mode,
8
+ setup_for_distributed,
9
+ cosine_scheduler,
10
+ constant_scheduler,
11
+ )
12
+
13
+ from .sp_utils import (
14
+ is_sequence_parallel_initialized,
15
+ init_sequence_parallel_group,
16
+ get_sequence_parallel_group,
17
+ get_sequence_parallel_world_size,
18
+ get_sequence_parallel_rank,
19
+ get_sequence_parallel_group_rank,
20
+ get_sequence_parallel_proc_num,
21
+ init_sync_input_group,
22
+ get_sync_input_group,
23
+ )
24
+
25
+ from .communicate import all_to_all
trainer_misc/communicate.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+ import torch.distributed as dist
5
+
6
+
7
+ def _all_to_all(
8
+ input_: torch.Tensor,
9
+ world_size: int,
10
+ group: dist.ProcessGroup,
11
+ scatter_dim: int,
12
+ gather_dim: int,
13
+ ):
14
+ if world_size == 1:
15
+ return input_
16
+ input_list = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)]
17
+ output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)]
18
+ dist.all_to_all(output_list, input_list, group=group)
19
+ return torch.cat(output_list, dim=gather_dim).contiguous()
20
+
21
+
22
+ class _AllToAll(torch.autograd.Function):
23
+
24
+ @staticmethod
25
+ def forward(ctx, input_, process_group, world_size, scatter_dim, gather_dim):
26
+ ctx.process_group = process_group
27
+ ctx.scatter_dim = scatter_dim
28
+ ctx.gather_dim = gather_dim
29
+ ctx.world_size = world_size
30
+ output = _all_to_all(input_, ctx.world_size, process_group, scatter_dim, gather_dim)
31
+ return output
32
+
33
+ @staticmethod
34
+ def backward(ctx, grad_output):
35
+ grad_output = _all_to_all(
36
+ grad_output,
37
+ ctx.world_size,
38
+ ctx.process_group,
39
+ ctx.gather_dim,
40
+ ctx.scatter_dim,
41
+ )
42
+ return (
43
+ grad_output,
44
+ None,
45
+ None,
46
+ None,
47
+ None,
48
+ )
49
+
50
+
51
+ def all_to_all(
52
+ input_: torch.Tensor,
53
+ process_group: dist.ProcessGroup,
54
+ world_size: int = 1,
55
+ scatter_dim: int = 2,
56
+ gather_dim: int = 1,
57
+ ):
58
+ return _AllToAll.apply(input_, process_group, world_size, scatter_dim, gather_dim)
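+
+ # --- Editor's illustrative sketch (not part of the original commit) ---
+ # Minimal single-process check of the call pattern; the tensor shape below is
+ # hypothetical. With world_size == 1 the op is an identity, so it also runs
+ # without torch.distributed being initialized.
+ if __name__ == "__main__":
+     x = torch.randn(2, 8, 4, 16)  # hypothetical (batch, seq, heads, head_dim)
+     y = all_to_all(x, process_group=None, world_size=1, scatter_dim=2, gather_dim=1)
+     assert y.shape == x.shape  # identity when there is only a single rank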
trainer_misc/sp_utils.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.distributed as dist
4
+ from .utils import is_dist_avail_and_initialized, get_rank
5
+
6
+
7
+ SEQ_PARALLEL_GROUP = None
8
+ SEQ_PARALLEL_SIZE = None
9
+ SEQ_PARALLEL_PROC_NUM = None # number of processes used for sequence parallelism
10
+
11
+ SYNC_INPUT_GROUP = None
12
+ SYNC_INPUT_SIZE = None
13
+
14
+ def is_sequence_parallel_initialized():
15
+ if SEQ_PARALLEL_GROUP is None:
16
+ return False
17
+ else:
18
+ return True
19
+
20
+
21
+ def init_sequence_parallel_group(args):
22
+ global SEQ_PARALLEL_GROUP
23
+ global SEQ_PARALLEL_SIZE
24
+ global SEQ_PARALLEL_PROC_NUM
25
+
26
+ assert SEQ_PARALLEL_GROUP is None, "sequence parallel group is already initialized"
27
+ assert is_dist_avail_and_initialized(), "The pytorch distributed should be initialized"
28
+ SEQ_PARALLEL_SIZE = args.sp_group_size
29
+
30
+ print(f"Setting the Sequence Parallel Size {SEQ_PARALLEL_SIZE}")
31
+
32
+ rank = torch.distributed.get_rank()
33
+ world_size = torch.distributed.get_world_size()
34
+
35
+ if args.sp_proc_num == -1:
36
+ SEQ_PARALLEL_PROC_NUM = world_size
37
+ else:
38
+ SEQ_PARALLEL_PROC_NUM = args.sp_proc_num
39
+
40
+ assert SEQ_PARALLEL_PROC_NUM % SEQ_PARALLEL_SIZE == 0, "The number of sequence-parallel processes must be evenly divisible by the group size"
41
+
42
+ for i in range(0, SEQ_PARALLEL_PROC_NUM, SEQ_PARALLEL_SIZE):
43
+ ranks = list(range(i, i + SEQ_PARALLEL_SIZE))
44
+ group = torch.distributed.new_group(ranks)
45
+ if rank in ranks:
46
+ SEQ_PARALLEL_GROUP = group
47
+ break
48
+
49
+
50
+ def init_sync_input_group(args):
51
+ global SYNC_INPUT_GROUP
52
+ global SYNC_INPUT_SIZE
53
+
54
+ assert SYNC_INPUT_GROUP is None, "sync input group is already initialized"
55
+ assert is_dist_avail_and_initialized(), "The pytorch distributed should be initialized"
56
+ SYNC_INPUT_SIZE = args.max_frames
57
+
58
+ rank = torch.distributed.get_rank()
59
+ world_size = torch.distributed.get_world_size()
60
+
61
+ for i in range(0, world_size, SYNC_INPUT_SIZE):
62
+ ranks = list(range(i, i + SYNC_INPUT_SIZE))
63
+ group = torch.distributed.new_group(ranks)
64
+ if rank in ranks:
65
+ SYNC_INPUT_GROUP = group
66
+ break
67
+
68
+
69
+ def get_sequence_parallel_group():
70
+ assert SEQ_PARALLEL_GROUP is not None, "sequence parallel group is not initialized"
71
+ return SEQ_PARALLEL_GROUP
72
+
73
+
74
+ def get_sync_input_group():
75
+ return SYNC_INPUT_GROUP
76
+
77
+
78
+ def get_sequence_parallel_world_size():
79
+ assert SEQ_PARALLEL_SIZE is not None, "sequence parallel size is not initialized"
80
+ return SEQ_PARALLEL_SIZE
81
+
82
+
83
+ def get_sequence_parallel_rank():
84
+ assert SEQ_PARALLEL_SIZE is not None, "sequence parallel size is not initialized"
85
+ rank = get_rank()
86
+ cp_rank = rank % SEQ_PARALLEL_SIZE
87
+ return cp_rank
88
+
89
+
90
+ def get_sequence_parallel_group_rank():
91
+ assert SEQ_PARALLEL_SIZE is not None, "sequence parallel size is not initialized"
92
+ rank = get_rank()
93
+ cp_group_rank = rank // SEQ_PARALLEL_SIZE
94
+ return cp_group_rank
95
+
96
+
97
+ def get_sequence_parallel_proc_num():
98
+ return SEQ_PARALLEL_PROC_NUM
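+
+ # --- Editor's worked example (hypothetical layout, not part of the original commit) ---
+ # With 8 ranks and sp_group_size = 4, ranks are partitioned into two groups:
+ #   rank r -> sequence-parallel rank r % 4, sequence-parallel group r // 4
+ # so ranks 0-3 form group 0 and ranks 4-7 form group 1, matching
+ # get_sequence_parallel_rank() and get_sequence_parallel_group_rank() above.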
trainer_misc/utils.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ import math
4
+ import time
5
+ import json
6
+ import glob
7
+ from collections import defaultdict, deque, OrderedDict
8
+ import datetime
9
+ import numpy as np
10
+
11
+
12
+ from pathlib import Path
13
+ import argparse
14
+
15
+ import torch
16
+ from torch import optim as optim
17
+ import torch.distributed as dist
18
+ from tensorboardX import SummaryWriter
19
+
20
+
21
+ def is_dist_avail_and_initialized():
22
+ if not dist.is_available():
23
+ return False
24
+ if not dist.is_initialized():
25
+ return False
26
+ return True
27
+
28
+
29
+ def get_world_size():
30
+ if not is_dist_avail_and_initialized():
31
+ return 1
32
+ return dist.get_world_size()
33
+
34
+
35
+ def get_rank():
36
+ if not is_dist_avail_and_initialized():
37
+ return 0
38
+ return dist.get_rank()
39
+
40
+
41
+ def is_main_process():
42
+ return get_rank() == 0
43
+
44
+
45
+ def save_on_master(*args, **kwargs):
46
+ if is_main_process():
47
+ torch.save(*args, **kwargs)
48
+
49
+
50
+ def setup_for_distributed(is_master):
51
+ """
52
+ This function disables printing when not in the master process
53
+ """
54
+ import builtins as __builtin__
55
+ builtin_print = __builtin__.print
56
+
57
+ def print(*args, **kwargs):
58
+ force = kwargs.pop('force', False)
59
+ if is_master or force:
60
+ builtin_print(*args, **kwargs)
61
+
62
+ __builtin__.print = print
63
+
64
+
65
+ def init_distributed_mode(args):
66
+ if int(os.getenv('OMPI_COMM_WORLD_SIZE', '0')) > 0:
67
+ rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
68
+ local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
69
+ world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
70
+
71
+ os.environ["LOCAL_RANK"] = os.environ['OMPI_COMM_WORLD_LOCAL_RANK']
72
+ os.environ["RANK"] = os.environ['OMPI_COMM_WORLD_RANK']
73
+ os.environ["WORLD_SIZE"] = os.environ['OMPI_COMM_WORLD_SIZE']
74
+
75
+ args.rank = int(os.environ["RANK"])
76
+ args.world_size = int(os.environ["WORLD_SIZE"])
77
+ args.gpu = int(os.environ["LOCAL_RANK"])
78
+
79
+ elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
80
+ args.rank = int(os.environ["RANK"])
81
+ args.world_size = int(os.environ['WORLD_SIZE'])
82
+ args.gpu = int(os.environ['LOCAL_RANK'])
83
+
84
+ else:
85
+ print('Not using distributed mode')
86
+ args.distributed = False
87
+ return
88
+
89
+ args.distributed = True
90
+ args.dist_backend = 'nccl'
91
+ args.dist_url = "env://"
92
+ print('| distributed init (rank {}): {}, gpu {}'.format(
93
+ args.rank, args.dist_url, args.gpu), flush=True)
94
+
95
+
96
+ def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warmup_epochs=0,
97
+ start_warmup_value=0, warmup_steps=-1):
98
+ warmup_schedule = np.array([])
99
+ warmup_iters = warmup_epochs * niter_per_ep
100
+ if warmup_steps > 0:
101
+ warmup_iters = warmup_steps
102
+ print("Set warmup steps = %d" % warmup_iters)
103
+ if warmup_epochs > 0:
104
+ warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters)
105
+
106
+ iters = np.arange(epochs * niter_per_ep - warmup_iters)
107
+ schedule = np.array(
108
+ [final_value + 0.5 * (base_value - final_value) * (1 + math.cos(math.pi * i / (len(iters)))) for i in iters])
109
+
110
+ schedule = np.concatenate((warmup_schedule, schedule))
111
+
112
+ assert len(schedule) == epochs * niter_per_ep
113
+ return schedule
114
+
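+ # --- Editor's worked example (hypothetical values, not part of the original commit) ---
+ # cosine_scheduler(1e-4, 1e-6, epochs=2, niter_per_ep=5, warmup_epochs=1) returns a
+ # per-iteration array of length 10: the first 5 values ramp linearly from 0 to 1e-4,
+ # and the remaining 5 follow a half-cosine decay from 1e-4 toward 1e-6;
+ # len(schedule) == epochs * niter_per_ep == 10.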
115
+
116
+ def constant_scheduler(base_value, epochs, niter_per_ep, warmup_epochs=0,
117
+ start_warmup_value=1e-6, warmup_steps=-1):
118
+ warmup_schedule = np.array([])
119
+ warmup_iters = warmup_epochs * niter_per_ep
120
+ if warmup_steps > 0:
121
+ warmup_iters = warmup_steps
122
+ print("Set warmup steps = %d" % warmup_iters)
123
+ if warmup_iters > 0:
124
+ warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters)
125
+
126
+ iters = epochs * niter_per_ep - warmup_iters
127
+ schedule = np.array([base_value] * iters)
128
+
129
+ schedule = np.concatenate((warmup_schedule, schedule))
130
+
131
+ assert len(schedule) == epochs * niter_per_ep
132
+ return schedule
133
+
134
+
135
+ def get_parameter_groups(model, weight_decay=1e-5, base_lr=1e-4, skip_list=(), get_num_layer=None, get_layer_scale=None, **kwargs):
136
+ parameter_group_names = {}
137
+ parameter_group_vars = {}
138
+
139
+ for name, param in model.named_parameters():
140
+ if not param.requires_grad:
141
+ continue # frozen weights
142
+ if len(kwargs.get('filter_name', [])) > 0:
143
+ flag = False
144
+ for filter_n in kwargs.get('filter_name', []):
145
+ if filter_n in name:
146
+ print(f"filter {name} because of the pattern {filter_n}")
147
+ flag = True
148
+ if flag:
149
+ continue
150
+
151
+ default_scale=1.
152
+
153
+ if param.ndim <= 1 or name.endswith(".bias") or name in skip_list: # param.ndim <= 1 len(param.shape) == 1
154
+ group_name = "no_decay"
155
+ this_weight_decay = 0.
156
+ else:
157
+ group_name = "decay"
158
+ this_weight_decay = weight_decay
159
+
160
+ if get_num_layer is not None:
161
+ layer_id = get_num_layer(name)
162
+ group_name = "layer_%d_%s" % (layer_id, group_name)
163
+ else:
164
+ layer_id = None
165
+
166
+ if group_name not in parameter_group_names:
167
+ if get_layer_scale is not None:
168
+ scale = get_layer_scale(layer_id)
169
+ else:
170
+ scale = default_scale
171
+
172
+ parameter_group_names[group_name] = {
173
+ "weight_decay": this_weight_decay,
174
+ "params": [],
175
+ "lr": base_lr,
176
+ "lr_scale": scale,
177
+ }
178
+
179
+ parameter_group_vars[group_name] = {
180
+ "weight_decay": this_weight_decay,
181
+ "params": [],
182
+ "lr": base_lr,
183
+ "lr_scale": scale,
184
+ }
185
+
186
+ parameter_group_vars[group_name]["params"].append(param)
187
+ parameter_group_names[group_name]["params"].append(name)
188
+
189
+ print("Param groups = %s" % json.dumps(parameter_group_names, indent=2))
190
+ return list(parameter_group_vars.values())
191
+
192
+
193
+ def create_optimizer(args, model, get_num_layer=None, get_layer_scale=None, filter_bias_and_bn=True, skip_list=None, **kwargs):
194
+ opt_lower = args.opt.lower()
195
+ weight_decay = args.weight_decay
196
+
197
+ skip = {}
198
+ if skip_list is not None:
199
+ skip = skip_list
200
+ elif hasattr(model, 'no_weight_decay'):
201
+ skip = model.no_weight_decay()
202
+ print(f"Skip weight decay name marked in model: {skip}")
203
+ parameters = get_parameter_groups(model, weight_decay, args.lr, skip, get_num_layer, get_layer_scale, **kwargs)
204
+ weight_decay = 0.
205
+
206
+ if 'fused' in opt_lower:
207
+ assert torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers'  # note: apex availability is not checked in this file
208
+
209
+ opt_args = dict(lr=args.lr, weight_decay=weight_decay)
210
+ if hasattr(args, 'opt_eps') and args.opt_eps is not None:
211
+ opt_args['eps'] = args.opt_eps
212
+ if hasattr(args, 'opt_beta1') and args.opt_beta1 is not None:
213
+ opt_args['betas'] = (args.opt_beta1, args.opt_beta2)
214
+
215
+ print('Optimizer config:', opt_args)
216
+ opt_split = opt_lower.split('_')
217
+ opt_lower = opt_split[-1]
218
+ if opt_lower == 'sgd' or opt_lower == 'nesterov':
219
+ opt_args.pop('eps', None)
220
+ optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=True, **opt_args)
221
+ elif opt_lower == 'momentum':
222
+ opt_args.pop('eps', None)
223
+ optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=False, **opt_args)
224
+ elif opt_lower == 'adam':
225
+ optimizer = optim.Adam(parameters, **opt_args)
226
+ elif opt_lower == 'adamw':
227
+ optimizer = optim.AdamW(parameters, **opt_args)
228
+ elif opt_lower == 'adadelta':
229
+ optimizer = optim.Adadelta(parameters, **opt_args)
230
+ elif opt_lower == 'rmsprop':
231
+ optimizer = optim.RMSprop(parameters, alpha=0.9, momentum=args.momentum, **opt_args)
232
+ else:
233
+ assert False, "Invalid optimizer"
234
+ raise ValueError
235
+
236
+ return optimizer
237
+
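+ # --- Editor's usage sketch (hypothetical args, not part of the original commit) ---
+ # get_parameter_groups() places biases and other 1-D parameters into a no-weight-decay
+ # group before the optimizer is built, e.g.:
+ #   args = argparse.Namespace(opt='adamw', lr=1e-4, weight_decay=0.05,
+ #                             opt_eps=1e-8, opt_beta1=0.9, opt_beta2=0.95)
+ #   optimizer = create_optimizer(args, model)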
238
+
239
+ class SmoothedValue(object):
240
+ """Track a series of values and provide access to smoothed values over a
241
+ window or the global series average.
242
+ """
243
+
244
+ def __init__(self, window_size=20, fmt=None):
245
+ if fmt is None:
246
+ fmt = "{median:.4f} ({global_avg:.4f})"
247
+ self.deque = deque(maxlen=window_size)
248
+ self.total = 0.0
249
+ self.count = 0
250
+ self.fmt = fmt
251
+
252
+ def update(self, value, n=1):
253
+ self.deque.append(value)
254
+ self.count += n
255
+ self.total += value * n
256
+
257
+ def synchronize_between_processes(self):
258
+ """
259
+ Warning: does not synchronize the deque!
260
+ """
261
+ if not is_dist_avail_and_initialized():
262
+ return
263
+ t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
264
+ dist.barrier()
265
+ dist.all_reduce(t)
266
+ t = t.tolist()
267
+ self.count = int(t[0])
268
+ self.total = t[1]
269
+
270
+ @property
271
+ def median(self):
272
+ d = torch.tensor(list(self.deque))
273
+ return d.median().item()
274
+
275
+ @property
276
+ def avg(self):
277
+ d = torch.tensor(list(self.deque), dtype=torch.float32)
278
+ return d.mean().item()
279
+
280
+ @property
281
+ def global_avg(self):
282
+ return self.total / self.count
283
+
284
+ @property
285
+ def max(self):
286
+ return max(self.deque)
287
+
288
+ @property
289
+ def value(self):
290
+ return self.deque[-1]
291
+
292
+ def __str__(self):
293
+ return self.fmt.format(
294
+ median=self.median,
295
+ avg=self.avg,
296
+ global_avg=self.global_avg,
297
+ max=self.max,
298
+ value=self.value)
299
+
300
+
301
+ class MetricLogger(object):
302
+ def __init__(self, delimiter="\t"):
303
+ self.meters = defaultdict(SmoothedValue)
304
+ self.delimiter = delimiter
305
+
306
+ def update(self, **kwargs):
307
+ for k, v in kwargs.items():
308
+ if v is None:
309
+ continue
310
+ if isinstance(v, torch.Tensor):
311
+ v = v.item()
312
+ assert isinstance(v, (float, int))
313
+ self.meters[k].update(v)
314
+
315
+ def __getattr__(self, attr):
316
+ if attr in self.meters:
317
+ return self.meters[attr]
318
+ if attr in self.__dict__:
319
+ return self.__dict__[attr]
320
+ raise AttributeError("'{}' object has no attribute '{}'".format(
321
+ type(self).__name__, attr))
322
+
323
+ def __str__(self):
324
+ loss_str = []
325
+ for name, meter in self.meters.items():
326
+ loss_str.append(
327
+ "{}: {}".format(name, str(meter))
328
+ )
329
+ return self.delimiter.join(loss_str)
330
+
331
+ def synchronize_between_processes(self):
332
+ for meter in self.meters.values():
333
+ meter.synchronize_between_processes()
334
+
335
+ def add_meter(self, name, meter):
336
+ self.meters[name] = meter
337
+
338
+ def log_every(self, iterable, print_freq, header=None):
339
+ i = 0
340
+ if not header:
341
+ header = ''
342
+ start_time = time.time()
343
+ end = time.time()
344
+ iter_time = SmoothedValue(fmt='{avg:.4f}')
345
+ data_time = SmoothedValue(fmt='{avg:.4f}')
346
+ space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
347
+ log_msg = [
348
+ header,
349
+ '[{0' + space_fmt + '}/{1}]',
350
+ 'eta: {eta}',
351
+ '{meters}',
352
+ 'time: {time}',
353
+ 'data: {data}'
354
+ ]
355
+ if torch.cuda.is_available():
356
+ log_msg.append('max mem: {memory:.0f}')
357
+ log_msg = self.delimiter.join(log_msg)
358
+ MB = 1024.0 * 1024.0
359
+ for obj in iterable:
360
+ data_time.update(time.time() - end)
361
+ yield obj
362
+ iter_time.update(time.time() - end)
363
+ if i % print_freq == 0 or i == len(iterable) - 1:
364
+ eta_seconds = iter_time.global_avg * (len(iterable) - i)
365
+ eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
366
+ if torch.cuda.is_available():
367
+ print(log_msg.format(
368
+ i, len(iterable), eta=eta_string,
369
+ meters=str(self),
370
+ time=str(iter_time), data=str(data_time),
371
+ memory=torch.cuda.max_memory_allocated() / MB))
372
+ else:
373
+ print(log_msg.format(
374
+ i, len(iterable), eta=eta_string,
375
+ meters=str(self),
376
+ time=str(iter_time), data=str(data_time)))
377
+ i += 1
378
+ end = time.time()
379
+ total_time = time.time() - start_time
380
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
381
+ print('{} Total time: {} ({:.4f} s / it)'.format(
382
+ header, total_time_str, total_time / len(iterable)))
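+
+
+ # --- Editor's illustrative sketch (not part of the original commit) ---
+ # Single-process smoke test of MetricLogger/SmoothedValue; the loss values are
+ # hypothetical and no distributed synchronization is involved.
+ if __name__ == "__main__":
+     logger = MetricLogger(delimiter="  ")
+     for step in logger.log_every(range(10), print_freq=5, header='demo'):
+         logger.update(loss=1.0 / (step + 1))
+     print("final:", str(logger))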
utils.py ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import PIL.Image
4
+ import numpy as np
5
+ from torch import nn
6
+ import torch.distributed as dist
7
+ import timm.models.hub as timm_hub
8
+
9
+ """Modified from https://github.com/CompVis/taming-transformers.git"""
10
+
11
+ import hashlib
12
+ import requests
13
+ from tqdm import tqdm
14
+ try:
15
+ import piq
16
+ except ImportError:  # piq is optional
17
+ pass
18
+
19
+ _CONTEXT_PARALLEL_GROUP = None
20
+ _CONTEXT_PARALLEL_SIZE = None
21
+
22
+
23
+ def is_dist_avail_and_initialized():
24
+ if not dist.is_available():
25
+ return False
26
+ if not dist.is_initialized():
27
+ return False
28
+ return True
29
+
30
+
31
+ def get_world_size():
32
+ if not is_dist_avail_and_initialized():
33
+ return 1
34
+ return dist.get_world_size()
35
+
36
+
37
+ def get_rank():
38
+ if not is_dist_avail_and_initialized():
39
+ return 0
40
+ return dist.get_rank()
41
+
42
+
43
+ def is_main_process():
44
+ return get_rank() == 0
45
+
46
+
47
+ def is_context_parallel_initialized():
48
+ if _CONTEXT_PARALLEL_GROUP is None:
49
+ return False
50
+ else:
51
+ return True
52
+
53
+
54
+ def set_context_parallel_group(size, group):
55
+ global _CONTEXT_PARALLEL_GROUP
56
+ global _CONTEXT_PARALLEL_SIZE
57
+ _CONTEXT_PARALLEL_GROUP = group
58
+ _CONTEXT_PARALLEL_SIZE = size
59
+
60
+
61
+ def initialize_context_parallel(context_parallel_size):
62
+ global _CONTEXT_PARALLEL_GROUP
63
+ global _CONTEXT_PARALLEL_SIZE
64
+
65
+ assert _CONTEXT_PARALLEL_GROUP is None, "context parallel group is already initialized"
66
+ _CONTEXT_PARALLEL_SIZE = context_parallel_size
67
+
68
+ rank = torch.distributed.get_rank()
69
+ world_size = torch.distributed.get_world_size()
70
+
71
+ for i in range(0, world_size, context_parallel_size):
72
+ ranks = range(i, i + context_parallel_size)
73
+ group = torch.distributed.new_group(ranks)
74
+ if rank in ranks:
75
+ _CONTEXT_PARALLEL_GROUP = group
76
+ break
77
+
78
+
79
+ def get_context_parallel_group():
80
+ assert _CONTEXT_PARALLEL_GROUP is not None, "context parallel group is not initialized"
81
+
82
+ return _CONTEXT_PARALLEL_GROUP
83
+
84
+
85
+ def get_context_parallel_world_size():
86
+ assert _CONTEXT_PARALLEL_SIZE is not None, "context parallel size is not initialized"
87
+
88
+ return _CONTEXT_PARALLEL_SIZE
89
+
90
+
91
+ def get_context_parallel_rank():
92
+ assert _CONTEXT_PARALLEL_SIZE is not None, "context parallel size is not initialized"
93
+
94
+ rank = get_rank()
95
+ cp_rank = rank % _CONTEXT_PARALLEL_SIZE
96
+ return cp_rank
97
+
98
+
99
+ def get_context_parallel_group_rank():
100
+ assert _CONTEXT_PARALLEL_SIZE is not None, "context parallel size is not initialized"
101
+
102
+ rank = get_rank()
103
+ cp_group_rank = rank // _CONTEXT_PARALLEL_SIZE
104
+
105
+ return cp_group_rank
106
+
107
+
108
+ def download_cached_file(url, check_hash=True, progress=False):
109
+ """
110
+ Download a file from a URL and cache it locally. If the file already exists, it is not downloaded again.
111
+ If distributed, only the main process downloads the file, and the other processes wait for the file to be downloaded.
112
+ """
113
+
114
+ def get_cached_file_path():
115
+ # a hack to sync the file path across processes
116
+ parts = torch.hub.urlparse(url)
117
+ filename = os.path.basename(parts.path)
118
+ cached_file = os.path.join(timm_hub.get_cache_dir(), filename)
119
+
120
+ return cached_file
121
+
122
+ if is_main_process():
123
+ timm_hub.download_cached_file(url, check_hash, progress)
124
+
125
+ if is_dist_avail_and_initialized():
126
+ dist.barrier()
127
+
128
+ return get_cached_file_path()
129
+
130
+
131
+ def convert_weights_to_fp16(model: nn.Module):
132
+ """Convert applicable model parameters to fp16"""
133
+
134
+ def _convert_weights_to_fp16(l):
135
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.Linear)):
136
+ l.weight.data = l.weight.data.to(torch.float16)
137
+ if l.bias is not None:
138
+ l.bias.data = l.bias.data.to(torch.float16)
139
+
140
+ model.apply(_convert_weights_to_fp16)
141
+
142
+
143
+ def convert_weights_to_bf16(model: nn.Module):
144
+ """Convert applicable model parameters to fp16"""
145
+
146
+ def _convert_weights_to_bf16(l):
147
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.Linear)):
148
+ l.weight.data = l.weight.data.to(torch.bfloat16)
149
+ if l.bias is not None:
150
+ l.bias.data = l.bias.data.to(torch.bfloat16)
151
+
152
+ model.apply(_convert_weights_to_bf16)
153
+
154
+
155
+ def save_result(result, result_dir, filename, remove_duplicate="", save_format='json'):
156
+ import json
157
+ import jsonlines
158
+ print("Dump result")
159
+
160
+ # Make the temp dir for saving results
161
+ if not os.path.exists(result_dir):
162
+ if is_main_process():
163
+ os.makedirs(result_dir)
164
+ if is_dist_avail_and_initialized():
165
+ torch.distributed.barrier()
166
+
167
+ result_file = os.path.join(
168
+ result_dir, "%s_rank%d.json" % (filename, get_rank())
169
+ )
170
+
171
+ final_result_file = os.path.join(result_dir, f"{filename}.{save_format}")
172
+
173
+ json.dump(result, open(result_file, "w"))
174
+
175
+ if is_dist_avail_and_initialized():
176
+ torch.distributed.barrier()
177
+
178
+ if is_main_process():
179
+ # print("rank %d starts merging results." % get_rank())
180
+ # combine results from all processes
181
+ result = []
182
+
183
+ for rank in range(get_world_size()):
184
+ result_file = os.path.join(result_dir, "%s_rank%d.json" % (filename, rank))
185
+ res = json.load(open(result_file, "r"))
186
+ result += res
187
+
188
+ # print("Remove duplicate")
189
+ if remove_duplicate:
190
+ result_new = []
191
+ id_set = set()
192
+ for res in result:
193
+ if res[remove_duplicate] not in id_set:
194
+ id_set.add(res[remove_duplicate])
195
+ result_new.append(res)
196
+ result = result_new
197
+
198
+ if save_format == 'json':
199
+ json.dump(result, open(final_result_file, "w"))
200
+ else:
201
+ assert save_format == 'jsonl', "Only json and jsonl formats are supported"
202
+ with jsonlines.open(final_result_file, "w") as writer:
203
+ writer.write_all(result)
204
+
205
+ # print("result file saved to %s" % final_result_file)
206
+
207
+ return final_result_file
208
+
209
+
210
+ # resizing utils
211
+ # TODO: clean up later
212
+ def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True):
213
+ h, w = input.shape[-2:]
214
+ factors = (h / size[0], w / size[1])
215
+
216
+ # First, we have to determine sigma
217
+ # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171
218
+ sigmas = (
219
+ max((factors[0] - 1.0) / 2.0, 0.001),
220
+ max((factors[1] - 1.0) / 2.0, 0.001),
221
+ )
222
+
223
+ # Now kernel size. Good results are for 3 sigma, but that is kind of slow. Pillow uses 1 sigma
224
+ # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206
225
+ # But they do it in two passes, which gives better results. Let's try 2 sigmas for now.
226
+ ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3))
227
+
228
+ # Make sure it is odd
229
+ if (ks[0] % 2) == 0:
230
+ ks = ks[0] + 1, ks[1]
231
+
232
+ if (ks[1] % 2) == 0:
233
+ ks = ks[0], ks[1] + 1
234
+
235
+ input = _gaussian_blur2d(input, ks, sigmas)
236
+
237
+ output = torch.nn.functional.interpolate(input, size=size, mode=interpolation, align_corners=align_corners)
238
+ return output
239
+
240
+
241
+ def _compute_padding(kernel_size):
242
+ """Compute padding tuple."""
243
+ # 4 or 6 ints: (padding_left, padding_right, padding_top, padding_bottom)
244
+ # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad
245
+ if len(kernel_size) < 2:
246
+ raise AssertionError(kernel_size)
247
+ computed = [k - 1 for k in kernel_size]
248
+
249
+ # for even kernels we need to do asymmetric padding :(
250
+ out_padding = 2 * len(kernel_size) * [0]
251
+
252
+ for i in range(len(kernel_size)):
253
+ computed_tmp = computed[-(i + 1)]
254
+
255
+ pad_front = computed_tmp // 2
256
+ pad_rear = computed_tmp - pad_front
257
+
258
+ out_padding[2 * i + 0] = pad_front
259
+ out_padding[2 * i + 1] = pad_rear
260
+
261
+ return out_padding
262
+
263
+
264
+ def _filter2d(input, kernel):
265
+ # prepare kernel
266
+ b, c, h, w = input.shape
267
+ tmp_kernel = kernel[:, None, ...].to(device=input.device, dtype=input.dtype)
268
+
269
+ tmp_kernel = tmp_kernel.expand(-1, c, -1, -1)
270
+
271
+ height, width = tmp_kernel.shape[-2:]
272
+
273
+ padding_shape: list[int] = _compute_padding([height, width])
274
+ input = torch.nn.functional.pad(input, padding_shape, mode="reflect")
275
+
276
+ # kernel and input tensor reshape to align element-wise or batch-wise params
277
+ tmp_kernel = tmp_kernel.reshape(-1, 1, height, width)
278
+ input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1))
279
+
280
+ # convolve the tensor with the kernel.
281
+ output = torch.nn.functional.conv2d(input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1)
282
+
283
+ out = output.view(b, c, h, w)
284
+ return out
285
+
286
+
287
+ def _gaussian(window_size: int, sigma):
288
+ if isinstance(sigma, float):
289
+ sigma = torch.tensor([[sigma]])
290
+
291
+ batch_size = sigma.shape[0]
292
+
293
+ x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1)
294
+
295
+ if window_size % 2 == 0:
296
+ x = x + 0.5
297
+
298
+ gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0)))
299
+
300
+ return gauss / gauss.sum(-1, keepdim=True)
301
+
302
+
303
+ def _gaussian_blur2d(input, kernel_size, sigma):
304
+ if isinstance(sigma, tuple):
305
+ sigma = torch.tensor([sigma], dtype=input.dtype)
306
+ else:
307
+ sigma = sigma.to(dtype=input.dtype)
308
+
309
+ ky, kx = int(kernel_size[0]), int(kernel_size[1])
310
+ bs = sigma.shape[0]
311
+ kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1))
312
+ kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1))
313
+ out_x = _filter2d(input, kernel_x[..., None, :])
314
+ out = _filter2d(out_x, kernel_y[..., None])
315
+
316
+ return out
317
+
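+ # --- Editor's usage sketch (hypothetical shapes, not part of the original commit) ---
+ # Antialiased downsampling of a batch of images: a Gaussian blur with sigma derived
+ # from the scale factor, followed by bicubic interpolation, e.g.
+ #   x = torch.randn(1, 3, 512, 512)
+ #   y = _resize_with_antialiasing(x, (256, 256))   # -> shape (1, 3, 256, 256)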
318
+
319
+ URL_MAP = {
320
+ "vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"
321
+ }
322
+
323
+ CKPT_MAP = {
324
+ "vgg_lpips": "vgg.pth"
325
+ }
326
+
327
+ MD5_MAP = {
328
+ "vgg_lpips": "d507d7349b931f0638a25a48a722f98a"
329
+ }
330
+
331
+
332
+ def download(url, local_path, chunk_size=1024):
333
+ os.makedirs(os.path.split(local_path)[0], exist_ok=True)
334
+ with requests.get(url, stream=True) as r:
335
+ total_size = int(r.headers.get("content-length", 0))
336
+ with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
337
+ with open(local_path, "wb") as f:
338
+ for data in r.iter_content(chunk_size=chunk_size):
339
+ if data:
340
+ f.write(data)
341
+ pbar.update(chunk_size)
342
+
343
+
344
+ def md5_hash(path):
345
+ with open(path, "rb") as f:
346
+ content = f.read()
347
+ return hashlib.md5(content).hexdigest()
348
+
349
+
350
+ def get_ckpt_path(name, root, check=False):
351
+ assert name in URL_MAP
352
+ path = os.path.join(root, CKPT_MAP[name])
353
+ print(md5_hash(path))
354
+ if not os.path.exists(path) or (check and not md5_hash(path) == MD5_MAP[name]):
355
+ print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
356
+ download(URL_MAP[name], path)
357
+ md5 = md5_hash(path)
358
+ assert md5 == MD5_MAP[name], md5
359
+ return path
360
+
361
+
362
+ class KeyNotFoundError(Exception):
363
+ def __init__(self, cause, keys=None, visited=None):
364
+ self.cause = cause
365
+ self.keys = keys
366
+ self.visited = visited
367
+ messages = list()
368
+ if keys is not None:
369
+ messages.append("Key not found: {}".format(keys))
370
+ if visited is not None:
371
+ messages.append("Visited: {}".format(visited))
372
+ messages.append("Cause:\n{}".format(cause))
373
+ message = "\n".join(messages)
374
+ super().__init__(message)
375
+
376
+
377
+ def retrieve(
378
+ list_or_dict, key, splitval="/", default=None, expand=True, pass_success=False
379
+ ):
380
+ """Given a nested list or dict return the desired value at key expanding
381
+ callable nodes if necessary and :attr:`expand` is ``True``. The expansion
382
+ is done in-place.
383
+
384
+ Parameters
385
+ ----------
386
+ list_or_dict : list or dict
387
+ Possibly nested list or dictionary.
388
+ key : str
389
+ key/to/value, path like string describing all keys necessary to
390
+ consider to get to the desired value. List indices can also be
391
+ passed here.
392
+ splitval : str
393
+ String that defines the delimiter between keys of the
394
+ different depth levels in `key`.
395
+ default : obj
396
+ Value returned if :attr:`key` is not found.
397
+ expand : bool
398
+ Whether to expand callable nodes on the path or not.
399
+
400
+ Returns
401
+ -------
402
+ The desired value or if :attr:`default` is not ``None`` and the
403
+ :attr:`key` is not found returns ``default``.
404
+
405
+ Raises
406
+ ------
407
+ Exception if ``key`` not in ``list_or_dict`` and :attr:`default` is
408
+ ``None``.
409
+ """
410
+
411
+ keys = key.split(splitval)
412
+
413
+ success = True
414
+ try:
415
+ visited = []
416
+ parent = None
417
+ last_key = None
418
+ for key in keys:
419
+ if callable(list_or_dict):
420
+ if not expand:
421
+ raise KeyNotFoundError(
422
+ ValueError(
423
+ "Trying to get past callable node with expand=False."
424
+ ),
425
+ keys=keys,
426
+ visited=visited,
427
+ )
428
+ list_or_dict = list_or_dict()
429
+ parent[last_key] = list_or_dict
430
+
431
+ last_key = key
432
+ parent = list_or_dict
433
+
434
+ try:
435
+ if isinstance(list_or_dict, dict):
436
+ list_or_dict = list_or_dict[key]
437
+ else:
438
+ list_or_dict = list_or_dict[int(key)]
439
+ except (KeyError, IndexError, ValueError) as e:
440
+ raise KeyNotFoundError(e, keys=keys, visited=visited)
441
+
442
+ visited += [key]
443
+ # final expansion of retrieved value
444
+ if expand and callable(list_or_dict):
445
+ list_or_dict = list_or_dict()
446
+ parent[last_key] = list_or_dict
447
+ except KeyNotFoundError as e:
448
+ if default is None:
449
+ raise e
450
+ else:
451
+ list_or_dict = default
452
+ success = False
453
+
454
+ if not pass_success:
455
+ return list_or_dict
456
+ else:
457
+ return list_or_dict, success
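+
+
+ # --- Editor's illustrative sketch (hypothetical config, not part of the original commit) ---
+ if __name__ == "__main__":
+     cfg = {"model": {"params": {"lr": 1e-4}}}
+     print(retrieve(cfg, "model/params/lr"))                 # -> 0.0001
+     print(retrieve(cfg, "model/params/beta", default=0.9))  # -> 0.9 (key missing, default used)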
video_generation_demo.ipynb ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import json\n",
11
+ "import torch\n",
12
+ "import numpy as np\n",
13
+ "import PIL\n",
14
+ "from PIL import Image\n",
15
+ "from IPython.display import HTML\n",
16
+ "from pyramid_dit import PyramidDiTForVideoGeneration\n",
17
+ "from IPython.display import Image as ipython_image\n",
18
+ "from diffusers.utils import load_image, export_to_video, export_to_gif"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": [
27
+ "variant='diffusion_transformer_768p' # For high resolution\n",
28
+ "# variant='diffusion_transformer_384p' # For low resolution\n",
29
+ "\n",
30
+ "model_path = \"/home/jinyang06/models/pyramid-flow\" # The downloaded checkpoint dir\n",
31
+ "model_dtype = 'bf16'\n",
32
+ "\n",
33
+ "device_id = 0\n",
34
+ "torch.cuda.set_device(device_id)\n",
35
+ "\n",
36
+ "model = PyramidDiTForVideoGeneration(\n",
37
+ " model_path,\n",
38
+ " model_dtype,\n",
39
+ " model_variant=variant,\n",
40
+ ")\n",
41
+ "\n",
42
+ "model.vae.to(\"cuda\")\n",
43
+ "model.dit.to(\"cuda\")\n",
44
+ "model.text_encoder.to(\"cuda\")\n",
45
+ "\n",
46
+ "if model_dtype == \"bf16\":\n",
47
+ " torch_dtype = torch.bfloat16 \n",
48
+ "elif model_dtype == \"fp16\":\n",
49
+ " torch_dtype = torch.float16\n",
50
+ "else:\n",
51
+ " torch_dtype = torch.float32\n",
52
+ "\n",
53
+ "\n",
54
+ "def show_video(ori_path, rec_path, width=\"100%\"):\n",
55
+ " html = ''\n",
56
+ " if ori_path is not None:\n",
57
+ " html += f\"\"\"<video controls=\"\" name=\"media\" data-fullscreen-container=\"true\" width=\"{width}\">\n",
58
+ " <source src=\"{ori_path}\" type=\"video/mp4\">\n",
59
+ " </video>\n",
60
+ " \"\"\"\n",
61
+ " \n",
62
+ " html += f\"\"\"<video controls=\"\" name=\"media\" data-fullscreen-container=\"true\" width=\"{width}\">\n",
63
+ " <source src=\"{rec_path}\" type=\"video/mp4\">\n",
64
+ " </video>\n",
65
+ " \"\"\"\n",
66
+ " return HTML(html)"
67
+ ]
68
+ },
69
+ {
70
+ "attachments": {},
71
+ "cell_type": "markdown",
72
+ "metadata": {},
73
+ "source": [
74
+ "#### Text-to-Video"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": null,
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "prompt = \"A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors\"\n",
84
+ "\n",
85
+ "# used for 384p model variant\n",
86
+ "# width = 640\n",
87
+ "# height = 384\n",
88
+ "\n",
89
+ "# used for 768p model variant\n",
90
+ "width = 1280\n",
91
+ "height = 768\n",
92
+ "\n",
93
+ "temp = 16 # temp in [1, 31] <=> frame in [1, 241] <=> duration in [0, 10s]\n",
94
+ "\n",
95
+ "model.vae.enable_tiling()\n",
96
+ "\n",
97
+ "with torch.no_grad(), torch.cuda.amp.autocast(enabled=True if model_dtype != 'fp32' else False, dtype=torch_dtype):\n",
98
+ " frames = model.generate(\n",
99
+ " prompt=prompt,\n",
100
+ " num_inference_steps=[20, 20, 20],\n",
101
+ " video_num_inference_steps=[10, 10, 10],\n",
102
+ " height=height,\n",
103
+ " width=width,\n",
104
+ " temp=temp,\n",
105
+ " guidance_scale=9.0, # The guidance for the first frame\n",
106
+ " video_guidance_scale=5.0, # The guidance for the other video latent\n",
107
+ " output_type=\"pil\",\n",
108
+ " save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed\n",
109
+ " )\n",
110
+ "\n",
111
+ "export_to_video(frames, \"./text_to_video_sample.mp4\", fps=24)\n",
112
+ "show_video(None, \"./text_to_video_sample.mp4\", \"70%\")"
113
+ ]
114
+ },
115
+ {
116
+ "attachments": {},
117
+ "cell_type": "markdown",
118
+ "metadata": {},
119
+ "source": [
120
+ "#### Image-to-Video"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "metadata": {},
127
+ "outputs": [],
128
+ "source": [
129
+ "image_path = 'assets/the_great_wall.jpg'\n",
130
+ "image = Image.open(image_path).convert(\"RGB\")\n",
131
+ "\n",
132
+ "width = 1280\n",
133
+ "height = 768\n",
134
+ "temp = 16\n",
135
+ "\n",
136
+ "image = image.resize((width, height))\n",
137
+ "\n",
138
+ "display(image)\n",
139
+ "\n",
140
+ "prompt = \"FPV flying over the Great Wall\"\n",
141
+ "\n",
142
+ "with torch.no_grad(), torch.cuda.amp.autocast(enabled=True if model_dtype != 'fp32' else False, dtype=torch_dtype):\n",
143
+ " frames = model.generate_i2v(\n",
144
+ " prompt=prompt,\n",
145
+ " input_image=image,\n",
146
+ " num_inference_steps=[10, 10, 10],\n",
147
+ " temp=temp,\n",
148
+ " guidance_scale=7.0,\n",
149
+ " video_guidance_scale=4.0,\n",
150
+ " output_type=\"pil\",\n",
151
+ " save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed\n",
152
+ " )\n",
153
+ "\n",
154
+ "export_to_video(frames, \"./image_to_video_sample.mp4\", fps=24)\n",
155
+ "show_video(None, \"./image_to_video_sample.mp4\", \"70%\")"
156
+ ]
157
+ }
158
+ ],
159
+ "metadata": {
160
+ "kernelspec": {
161
+ "display_name": "Python 3",
162
+ "language": "python",
163
+ "name": "python3"
164
+ },
165
+ "language_info": {
166
+ "codemirror_mode": {
167
+ "name": "ipython",
168
+ "version": 3
169
+ },
170
+ "file_extension": ".py",
171
+ "mimetype": "text/x-python",
172
+ "name": "python",
173
+ "nbconvert_exporter": "python",
174
+ "pygments_lexer": "ipython3",
175
+ "version": "3.8.10"
176
+ },
177
+ "orig_nbformat": 4
178
+ },
179
+ "nbformat": 4,
180
+ "nbformat_minor": 2
181
+ }
video_vae/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .modeling_loss import LPIPSWithDiscriminator
2
+ from .modeling_causal_vae import CausalVideoVAE
video_vae/context_parallel_ops.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from CogVideoX
2
+ import torch
3
+ import torch.nn as nn
4
+ import math
5
+
6
+ from utils import (
7
+ get_context_parallel_group,
8
+ get_context_parallel_rank,
9
+ get_context_parallel_world_size,
10
+ get_context_parallel_group_rank,
11
+ )
12
+
13
+
14
+ def _conv_split(input_, dim=2, kernel_size=1):
15
+ cp_world_size = get_context_parallel_world_size()
16
+
17
+ # Bypass the function if context parallel is 1
18
+ if cp_world_size == 1:
19
+ return input_
20
+
21
+ # print('in _conv_split, cp_rank:', cp_rank, 'input_size:', input_.shape)
22
+
23
+ cp_rank = get_context_parallel_rank()
24
+
25
+ dim_size = (input_.size()[dim] - kernel_size) // cp_world_size
26
+
27
+ if cp_rank == 0:
28
+ output = input_.transpose(dim, 0)[: dim_size + kernel_size].transpose(dim, 0)
29
+ else:
30
+ # output = input_.transpose(dim, 0)[cp_rank * dim_size + 1:(cp_rank + 1) * dim_size + kernel_size].transpose(dim, 0)
31
+ output = input_.transpose(dim, 0)[
32
+ cp_rank * dim_size + kernel_size : (cp_rank + 1) * dim_size + kernel_size
33
+ ].transpose(dim, 0)
34
+ output = output.contiguous()
35
+
36
+ # print('out _conv_split, cp_rank:', cp_rank, 'input_size:', output.shape)
37
+
38
+ return output
39
+
40
+
41
+ def _conv_gather(input_, dim=2, kernel_size=1):
42
+ cp_world_size = get_context_parallel_world_size()
43
+
44
+ # Bypass the function if context parallel is 1
45
+ if cp_world_size == 1:
46
+ return input_
47
+
48
+ group = get_context_parallel_group()
49
+ cp_rank = get_context_parallel_rank()
50
+
51
+ # print('in _conv_gather, cp_rank:', cp_rank, 'input_size:', input_.shape)
52
+
53
+ input_first_kernel_ = input_.transpose(0, dim)[:kernel_size].transpose(0, dim).contiguous()
54
+ if cp_rank == 0:
55
+ input_ = input_.transpose(0, dim)[kernel_size:].transpose(0, dim).contiguous()
56
+ else:
57
+ input_ = input_.transpose(0, dim)[max(kernel_size - 1, 0) :].transpose(0, dim).contiguous()
58
+
59
+ tensor_list = [torch.empty_like(torch.cat([input_first_kernel_, input_], dim=dim))] + [
60
+ torch.empty_like(input_) for _ in range(cp_world_size - 1)
61
+ ]
62
+ if cp_rank == 0:
63
+ input_ = torch.cat([input_first_kernel_, input_], dim=dim)
64
+
65
+ tensor_list[cp_rank] = input_
66
+ torch.distributed.all_gather(tensor_list, input_, group=group)
67
+
68
+ # Note: torch.cat already creates a contiguous tensor.
69
+ output = torch.cat(tensor_list, dim=dim).contiguous()
70
+
71
+ # print('out _conv_gather, cp_rank:', cp_rank, 'input_size:', output.shape)
72
+
73
+ return output
74
+
75
+
76
+ def _cp_pass_from_previous_rank(input_, dim, kernel_size):
77
+ # Bypass the function if kernel size is 1
78
+ if kernel_size == 1:
79
+ return input_
80
+
81
+ group = get_context_parallel_group()
82
+ cp_rank = get_context_parallel_rank()
83
+ cp_group_rank = get_context_parallel_group_rank()
84
+ cp_world_size = get_context_parallel_world_size()
85
+
86
+ # print('in _pass_from_previous_rank, cp_rank:', cp_rank, 'input_size:', input_.shape)
87
+
88
+ global_rank = torch.distributed.get_rank()
89
+ global_world_size = torch.distributed.get_world_size()
90
+
91
+ input_ = input_.transpose(0, dim)
92
+
93
+     # pass each rank's trailing frames to the next rank (i.e. receive from the previous rank)
94
+ send_rank = global_rank + 1
95
+ recv_rank = global_rank - 1
96
+ if send_rank % cp_world_size == 0:
97
+ send_rank -= cp_world_size
98
+ if recv_rank % cp_world_size == cp_world_size - 1:
99
+ recv_rank += cp_world_size
100
+
101
+ recv_buffer = torch.empty_like(input_[-kernel_size + 1 :]).contiguous()
102
+ if cp_rank < cp_world_size - 1:
103
+ req_send = torch.distributed.isend(input_[-kernel_size + 1 :].contiguous(), send_rank, group=group)
104
+ if cp_rank > 0:
105
+ req_recv = torch.distributed.irecv(recv_buffer, recv_rank, group=group)
106
+
107
+ if cp_rank == 0:
108
+ input_ = torch.cat([torch.zeros_like(input_[:1])] * (kernel_size - 1) + [input_], dim=0)
109
+ else:
110
+ req_recv.wait()
111
+ input_ = torch.cat([recv_buffer, input_], dim=0)
112
+
113
+ input_ = input_.transpose(0, dim).contiguous()
114
+ return input_
115
+
116
+
117
+ def _drop_from_previous_rank(input_, dim, kernel_size):
118
+ input_ = input_.transpose(0, dim)[kernel_size - 1 :].transpose(0, dim)
119
+ return input_
120
+
121
+
122
+ class _ConvolutionScatterToContextParallelRegion(torch.autograd.Function):
123
+ @staticmethod
124
+ def forward(ctx, input_, dim, kernel_size):
125
+ ctx.dim = dim
126
+ ctx.kernel_size = kernel_size
127
+ return _conv_split(input_, dim, kernel_size)
128
+
129
+ @staticmethod
130
+ def backward(ctx, grad_output):
131
+ return _conv_gather(grad_output, ctx.dim, ctx.kernel_size), None, None
132
+
133
+
134
+ class _ConvolutionGatherFromContextParallelRegion(torch.autograd.Function):
135
+ @staticmethod
136
+ def forward(ctx, input_, dim, kernel_size):
137
+ ctx.dim = dim
138
+ ctx.kernel_size = kernel_size
139
+ return _conv_gather(input_, dim, kernel_size)
140
+
141
+ @staticmethod
142
+ def backward(ctx, grad_output):
143
+ return _conv_split(grad_output, ctx.dim, ctx.kernel_size), None, None
144
+
145
+
146
+ class _CPConvolutionPassFromPreviousRank(torch.autograd.Function):
147
+ @staticmethod
148
+ def forward(ctx, input_, dim, kernel_size):
149
+ ctx.dim = dim
150
+ ctx.kernel_size = kernel_size
151
+ return _cp_pass_from_previous_rank(input_, dim, kernel_size)
152
+
153
+ @staticmethod
154
+ def backward(ctx, grad_output):
155
+ return _drop_from_previous_rank(grad_output, ctx.dim, ctx.kernel_size), None, None
156
+
157
+
158
+ def conv_scatter_to_context_parallel_region(input_, dim, kernel_size):
159
+ return _ConvolutionScatterToContextParallelRegion.apply(input_, dim, kernel_size)
160
+
161
+
162
+ def conv_gather_from_context_parallel_region(input_, dim, kernel_size):
163
+ return _ConvolutionGatherFromContextParallelRegion.apply(input_, dim, kernel_size)
164
+
165
+
166
+ def cp_pass_from_previous_rank(input_, dim, kernel_size):
167
+ return _CPConvolutionPassFromPreviousRank.apply(input_, dim, kernel_size)
168
+
169
+
170
+
171
+
172
+
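These helpers are what the causal convolutions added below rely on when a context-parallel group is active: the scatter op shards the frame axis across ranks (keeping a kernel-sized halo), the gather op reassembles the full clip, and `cp_pass_from_previous_rank` moves the boundary frames a causal temporal kernel needs from the neighbouring rank. A rough usage sketch follows, assuming `torch.distributed` and the context-parallel group exposed by the repo's `utils` module have already been initialized (the initialization code is not part of this diff); `shard_process_merge` and `temporal_module` are hypothetical names.

# Sketch only: requires an initialized context-parallel process group.
import torch
from video_vae.context_parallel_ops import (
    conv_scatter_to_context_parallel_region,
    conv_gather_from_context_parallel_region,
)

def shard_process_merge(video: torch.Tensor, temporal_module, kernel_size: int = 3) -> torch.Tensor:
    # video: (batch, channels, frames, height, width)
    # Each rank keeps its slice of the frame axis plus the kernel_size - 1 halo
    # frames a temporal convolution needs; rank 0 also keeps the causal head.
    shard = conv_scatter_to_context_parallel_region(video, 2, kernel_size)
    shard = temporal_module(shard)  # any module that preserves the halo convention
    # All-gather the per-rank results back into a full-length clip (autograd-aware).
    return conv_gather_from_context_parallel_region(shard, 2, kernel_size)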
video_vae/modeling_block.py ADDED
@@ -0,0 +1,760 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Any, Dict, Optional, Tuple, Union
15
+
16
+ import numpy as np
17
+ import torch
18
+ import torch.nn.functional as F
19
+ from torch import nn
20
+ from einops import rearrange
21
+
22
+ from diffusers.utils import logging
23
+ from diffusers.models.attention_processor import Attention
24
+ from .modeling_resnet import (
25
+ Downsample2D, ResnetBlock2D, CausalResnetBlock3D, Upsample2D,
26
+ TemporalDownsample2x, TemporalUpsample2x,
27
+ CausalDownsample2x, CausalTemporalDownsample2x,
28
+ CausalUpsample2x, CausalTemporalUpsample2x,
29
+ )
30
+
31
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
32
+
33
+
34
+ def get_input_layer(
35
+ in_channels: int,
36
+ out_channels: int,
37
+ norm_num_groups: int,
38
+ layer_type: str,
39
+ norm_type: str = 'group',
40
+ affine: bool = True,
41
+ ):
42
+ if layer_type == 'conv':
43
+ input_layer = nn.Conv3d(
44
+ in_channels,
45
+ out_channels,
46
+ kernel_size=3,
47
+ stride=1,
48
+ padding=1,
49
+ )
50
+
51
+ elif layer_type == 'pixel_shuffle':
52
+ input_layer = nn.Sequential(
53
+ nn.PixelUnshuffle(2),
54
+ nn.Conv2d(in_channels * 4, out_channels, kernel_size=1),
55
+ )
56
+ else:
57
+         raise NotImplementedError(f"Unsupported input layer type: {layer_type}")
58
+
59
+ return input_layer
60
+
61
+
62
+ def get_output_layer(
63
+ in_channels: int,
64
+ out_channels: int,
65
+ norm_num_groups: int,
66
+ layer_type: str,
67
+ norm_type: str = 'group',
68
+ affine: bool = True,
69
+ ):
70
+ if layer_type == 'norm_act_conv':
71
+ output_layer = nn.Sequential(
72
+ nn.GroupNorm(num_channels=in_channels, num_groups=norm_num_groups, eps=1e-6, affine=affine),
73
+ nn.SiLU(),
74
+ nn.Conv3d(in_channels, out_channels, 3, stride=1, padding=1),
75
+ )
76
+
77
+ elif layer_type == 'pixel_shuffle':
78
+ output_layer = nn.Sequential(
79
+ nn.Conv2d(in_channels, out_channels * 4, kernel_size=1),
80
+ nn.PixelShuffle(2),
81
+ )
82
+
83
+ else:
84
+         raise NotImplementedError(f"Unsupported output layer type: {layer_type}")
85
+
86
+ return output_layer
87
+
88
+
89
+ def get_down_block(
90
+ down_block_type: str,
91
+ num_layers: int,
92
+ in_channels: int,
93
+ out_channels: int = None,
94
+ temb_channels: int = None,
95
+ add_spatial_downsample: bool = None,
96
+ add_temporal_downsample: bool = None,
97
+ resnet_eps: float = 1e-6,
98
+ resnet_act_fn: str = 'silu',
99
+ resnet_groups: Optional[int] = None,
100
+ downsample_padding: Optional[int] = None,
101
+ resnet_time_scale_shift: str = "default",
102
+ attention_head_dim: Optional[int] = None,
103
+ dropout: float = 0.0,
104
+ norm_affline: bool = True,
105
+ norm_layer: str = 'layer',
106
+ ):
107
+
108
+ if down_block_type == "DownEncoderBlock2D":
109
+ return DownEncoderBlock2D(
110
+ num_layers=num_layers,
111
+ in_channels=in_channels,
112
+ out_channels=out_channels,
113
+ dropout=dropout,
114
+ add_spatial_downsample=add_spatial_downsample,
115
+ add_temporal_downsample=add_temporal_downsample,
116
+ resnet_eps=resnet_eps,
117
+ resnet_act_fn=resnet_act_fn,
118
+ resnet_groups=resnet_groups,
119
+ downsample_padding=downsample_padding,
120
+ resnet_time_scale_shift=resnet_time_scale_shift,
121
+ )
122
+
123
+ elif down_block_type == "DownEncoderBlockCausal3D":
124
+ return DownEncoderBlockCausal3D(
125
+ num_layers=num_layers,
126
+ in_channels=in_channels,
127
+ out_channels=out_channels,
128
+ dropout=dropout,
129
+ add_spatial_downsample=add_spatial_downsample,
130
+ add_temporal_downsample=add_temporal_downsample,
131
+ resnet_eps=resnet_eps,
132
+ resnet_act_fn=resnet_act_fn,
133
+ resnet_groups=resnet_groups,
134
+ downsample_padding=downsample_padding,
135
+ resnet_time_scale_shift=resnet_time_scale_shift,
136
+ )
137
+
138
+ raise ValueError(f"{down_block_type} does not exist.")
139
+
140
+
141
+ def get_up_block(
142
+ up_block_type: str,
143
+ num_layers: int,
144
+ in_channels: int,
145
+ out_channels: int,
146
+ prev_output_channel: int = None,
147
+ temb_channels: int = None,
148
+ add_spatial_upsample: bool = None,
149
+ add_temporal_upsample: bool = None,
150
+ resnet_eps: float = 1e-6,
151
+ resnet_act_fn: str = 'silu',
152
+ resolution_idx: Optional[int] = None,
153
+ resnet_groups: Optional[int] = None,
154
+ resnet_time_scale_shift: str = "default",
155
+ attention_head_dim: Optional[int] = None,
156
+ dropout: float = 0.0,
157
+ interpolate: bool = True,
158
+ norm_affline: bool = True,
159
+ norm_layer: str = 'layer',
160
+ ) -> nn.Module:
161
+
162
+ if up_block_type == "UpDecoderBlock2D":
163
+ return UpDecoderBlock2D(
164
+ num_layers=num_layers,
165
+ in_channels=in_channels,
166
+ out_channels=out_channels,
167
+ resolution_idx=resolution_idx,
168
+ dropout=dropout,
169
+ add_spatial_upsample=add_spatial_upsample,
170
+ add_temporal_upsample=add_temporal_upsample,
171
+ resnet_eps=resnet_eps,
172
+ resnet_act_fn=resnet_act_fn,
173
+ resnet_groups=resnet_groups,
174
+ resnet_time_scale_shift=resnet_time_scale_shift,
175
+ temb_channels=temb_channels,
176
+ interpolate=interpolate,
177
+ )
178
+
179
+ elif up_block_type == "UpDecoderBlockCausal3D":
180
+ return UpDecoderBlockCausal3D(
181
+ num_layers=num_layers,
182
+ in_channels=in_channels,
183
+ out_channels=out_channels,
184
+ resolution_idx=resolution_idx,
185
+ dropout=dropout,
186
+ add_spatial_upsample=add_spatial_upsample,
187
+ add_temporal_upsample=add_temporal_upsample,
188
+ resnet_eps=resnet_eps,
189
+ resnet_act_fn=resnet_act_fn,
190
+ resnet_groups=resnet_groups,
191
+ resnet_time_scale_shift=resnet_time_scale_shift,
192
+ temb_channels=temb_channels,
193
+ interpolate=interpolate,
194
+ )
195
+
196
+ raise ValueError(f"{up_block_type} does not exist.")
197
+
198
+
199
+
200
+ class UNetMidBlock2D(nn.Module):
201
+ """
202
+ A 2D UNet mid-block [`UNetMidBlock2D`] with multiple residual blocks and optional attention blocks.
203
+
204
+ Args:
205
+ in_channels (`int`): The number of input channels.
206
+ temb_channels (`int`): The number of temporal embedding channels.
207
+ dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
208
+ num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
209
+         resnet_eps (`float`, *optional*, defaults to 1e-6): The epsilon value for the resnet blocks.
210
+ resnet_time_scale_shift (`str`, *optional*, defaults to `default`):
211
+ The type of normalization to apply to the time embeddings. This can help to improve the performance of the
212
+ model on tasks with long-range temporal dependencies.
213
+ resnet_act_fn (`str`, *optional*, defaults to `swish`): The activation function for the resnet blocks.
214
+ resnet_groups (`int`, *optional*, defaults to 32):
215
+ The number of groups to use in the group normalization layers of the resnet blocks.
216
+ attn_groups (`Optional[int]`, *optional*, defaults to None): The number of groups for the attention blocks.
217
+ resnet_pre_norm (`bool`, *optional*, defaults to `True`):
218
+ Whether to use pre-normalization for the resnet blocks.
219
+ add_attention (`bool`, *optional*, defaults to `True`): Whether to add attention blocks.
220
+ attention_head_dim (`int`, *optional*, defaults to 1):
221
+ Dimension of a single attention head. The number of attention heads is determined based on this value and
222
+ the number of input channels.
223
+ output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor.
224
+
225
+ Returns:
226
+ `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
227
+ in_channels, height, width)`.
228
+
229
+ """
230
+
231
+ def __init__(
232
+ self,
233
+ in_channels: int,
234
+ temb_channels: int,
235
+ dropout: float = 0.0,
236
+ num_layers: int = 1,
237
+ resnet_eps: float = 1e-6,
238
+ resnet_time_scale_shift: str = "default", # default, spatial
239
+ resnet_act_fn: str = "swish",
240
+ resnet_groups: int = 32,
241
+ attn_groups: Optional[int] = None,
242
+ resnet_pre_norm: bool = True,
243
+ add_attention: bool = True,
244
+ attention_head_dim: int = 1,
245
+ output_scale_factor: float = 1.0,
246
+ ):
247
+ super().__init__()
248
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
249
+ self.add_attention = add_attention
250
+
251
+ if attn_groups is None:
252
+ attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None
253
+
254
+ # there is always at least one resnet
255
+ resnets = [
256
+ ResnetBlock2D(
257
+ in_channels=in_channels,
258
+ out_channels=in_channels,
259
+ temb_channels=temb_channels,
260
+ eps=resnet_eps,
261
+ groups=resnet_groups,
262
+ dropout=dropout,
263
+ time_embedding_norm=resnet_time_scale_shift,
264
+ non_linearity=resnet_act_fn,
265
+ output_scale_factor=output_scale_factor,
266
+ pre_norm=resnet_pre_norm,
267
+ )
268
+ ]
269
+ attentions = []
270
+
271
+ if attention_head_dim is None:
272
+             logger.warning(
273
+                 f"It is not recommended to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}."
274
+ )
275
+ attention_head_dim = in_channels
276
+
277
+ for _ in range(num_layers):
278
+ if self.add_attention:
279
+ # Spatial attention
280
+ attentions.append(
281
+ Attention(
282
+ in_channels,
283
+ heads=in_channels // attention_head_dim,
284
+ dim_head=attention_head_dim,
285
+ rescale_output_factor=output_scale_factor,
286
+ eps=resnet_eps,
287
+ norm_num_groups=attn_groups,
288
+ spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
289
+ residual_connection=True,
290
+ bias=True,
291
+ upcast_softmax=True,
292
+ _from_deprecated_attn_block=True,
293
+ )
294
+ )
295
+ else:
296
+ attentions.append(None)
297
+
298
+ resnets.append(
299
+ ResnetBlock2D(
300
+ in_channels=in_channels,
301
+ out_channels=in_channels,
302
+ temb_channels=temb_channels,
303
+ eps=resnet_eps,
304
+ groups=resnet_groups,
305
+ dropout=dropout,
306
+ time_embedding_norm=resnet_time_scale_shift,
307
+ non_linearity=resnet_act_fn,
308
+ output_scale_factor=output_scale_factor,
309
+ pre_norm=resnet_pre_norm,
310
+ )
311
+ )
312
+
313
+ self.attentions = nn.ModuleList(attentions)
314
+ self.resnets = nn.ModuleList(resnets)
315
+
316
+ def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
317
+ hidden_states = self.resnets[0](hidden_states, temb)
318
+ t = hidden_states.shape[2]
319
+
320
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
321
+ if attn is not None:
322
+ hidden_states = rearrange(hidden_states, 'b c t h w -> b t c h w')
323
+ hidden_states = rearrange(hidden_states, 'b t c h w -> (b t) c h w')
324
+ hidden_states = attn(hidden_states, temb=temb)
325
+ hidden_states = rearrange(hidden_states, '(b t) c h w -> b t c h w', t=t)
326
+ hidden_states = rearrange(hidden_states, 'b t c h w -> b c t h w')
327
+
328
+ hidden_states = resnet(hidden_states, temb)
329
+
330
+ return hidden_states
331
+
332
+
333
+ class CausalUNetMidBlock2D(nn.Module):
334
+ """
335
+ A 2D UNet mid-block [`UNetMidBlock2D`] with multiple residual blocks and optional attention blocks.
336
+
337
+ Args:
338
+ in_channels (`int`): The number of input channels.
339
+ temb_channels (`int`): The number of temporal embedding channels.
340
+ dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
341
+ num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
342
+         resnet_eps (`float`, *optional*, defaults to 1e-6): The epsilon value for the resnet blocks.
343
+ resnet_time_scale_shift (`str`, *optional*, defaults to `default`):
344
+ The type of normalization to apply to the time embeddings. This can help to improve the performance of the
345
+ model on tasks with long-range temporal dependencies.
346
+ resnet_act_fn (`str`, *optional*, defaults to `swish`): The activation function for the resnet blocks.
347
+ resnet_groups (`int`, *optional*, defaults to 32):
348
+ The number of groups to use in the group normalization layers of the resnet blocks.
349
+ attn_groups (`Optional[int]`, *optional*, defaults to None): The number of groups for the attention blocks.
350
+ resnet_pre_norm (`bool`, *optional*, defaults to `True`):
351
+ Whether to use pre-normalization for the resnet blocks.
352
+ add_attention (`bool`, *optional*, defaults to `True`): Whether to add attention blocks.
353
+ attention_head_dim (`int`, *optional*, defaults to 1):
354
+ Dimension of a single attention head. The number of attention heads is determined based on this value and
355
+ the number of input channels.
356
+ output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor.
357
+
358
+ Returns:
359
+ `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
360
+ in_channels, height, width)`.
361
+
362
+ """
363
+
364
+ def __init__(
365
+ self,
366
+ in_channels: int,
367
+ temb_channels: int,
368
+ dropout: float = 0.0,
369
+ num_layers: int = 1,
370
+ resnet_eps: float = 1e-6,
371
+ resnet_time_scale_shift: str = "default", # default, spatial
372
+ resnet_act_fn: str = "swish",
373
+ resnet_groups: int = 32,
374
+ attn_groups: Optional[int] = None,
375
+ resnet_pre_norm: bool = True,
376
+ add_attention: bool = True,
377
+ attention_head_dim: int = 1,
378
+ output_scale_factor: float = 1.0,
379
+ ):
380
+ super().__init__()
381
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
382
+ self.add_attention = add_attention
383
+
384
+ if attn_groups is None:
385
+ attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None
386
+
387
+ # there is always at least one resnet
388
+ resnets = [
389
+ CausalResnetBlock3D(
390
+ in_channels=in_channels,
391
+ out_channels=in_channels,
392
+ temb_channels=temb_channels,
393
+ eps=resnet_eps,
394
+ groups=resnet_groups,
395
+ dropout=dropout,
396
+ time_embedding_norm=resnet_time_scale_shift,
397
+ non_linearity=resnet_act_fn,
398
+ output_scale_factor=output_scale_factor,
399
+ pre_norm=resnet_pre_norm,
400
+ )
401
+ ]
402
+ attentions = []
403
+
404
+ if attention_head_dim is None:
405
+             logger.warning(
406
+                 f"It is not recommended to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}."
407
+ )
408
+ attention_head_dim = in_channels
409
+
410
+ for _ in range(num_layers):
411
+ if self.add_attention:
412
+ # Spatial attention
413
+ attentions.append(
414
+ Attention(
415
+ in_channels,
416
+ heads=in_channels // attention_head_dim,
417
+ dim_head=attention_head_dim,
418
+ rescale_output_factor=output_scale_factor,
419
+ eps=resnet_eps,
420
+ norm_num_groups=attn_groups,
421
+ spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
422
+ residual_connection=True,
423
+ bias=True,
424
+ upcast_softmax=True,
425
+ _from_deprecated_attn_block=True,
426
+ )
427
+ )
428
+ else:
429
+ attentions.append(None)
430
+
431
+ resnets.append(
432
+ CausalResnetBlock3D(
433
+ in_channels=in_channels,
434
+ out_channels=in_channels,
435
+ temb_channels=temb_channels,
436
+ eps=resnet_eps,
437
+ groups=resnet_groups,
438
+ dropout=dropout,
439
+ time_embedding_norm=resnet_time_scale_shift,
440
+ non_linearity=resnet_act_fn,
441
+ output_scale_factor=output_scale_factor,
442
+ pre_norm=resnet_pre_norm,
443
+ )
444
+ )
445
+
446
+ self.attentions = nn.ModuleList(attentions)
447
+ self.resnets = nn.ModuleList(resnets)
448
+
449
+ def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None,
450
+ is_init_image=True, temporal_chunk=False) -> torch.FloatTensor:
451
+ hidden_states = self.resnets[0](hidden_states, temb, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
452
+ t = hidden_states.shape[2]
453
+
454
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
455
+ if attn is not None:
456
+ hidden_states = rearrange(hidden_states, 'b c t h w -> b t c h w')
457
+ hidden_states = rearrange(hidden_states, 'b t c h w -> (b t) c h w')
458
+ hidden_states = attn(hidden_states, temb=temb)
459
+ hidden_states = rearrange(hidden_states, '(b t) c h w -> b t c h w', t=t)
460
+ hidden_states = rearrange(hidden_states, 'b t c h w -> b c t h w')
461
+
462
+ hidden_states = resnet(hidden_states, temb, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
463
+
464
+ return hidden_states
465
+
466
+
467
+ class DownEncoderBlockCausal3D(nn.Module):
468
+ def __init__(
469
+ self,
470
+ in_channels: int,
471
+ out_channels: int,
472
+ dropout: float = 0.0,
473
+ num_layers: int = 1,
474
+ resnet_eps: float = 1e-6,
475
+ resnet_time_scale_shift: str = "default",
476
+ resnet_act_fn: str = "swish",
477
+ resnet_groups: int = 32,
478
+ resnet_pre_norm: bool = True,
479
+ output_scale_factor: float = 1.0,
480
+ add_spatial_downsample: bool = True,
481
+ add_temporal_downsample: bool = False,
482
+ downsample_padding: int = 1,
483
+ ):
484
+ super().__init__()
485
+ resnets = []
486
+
487
+ for i in range(num_layers):
488
+ in_channels = in_channels if i == 0 else out_channels
489
+ resnets.append(
490
+ CausalResnetBlock3D(
491
+ in_channels=in_channels,
492
+ out_channels=out_channels,
493
+ temb_channels=None,
494
+ eps=resnet_eps,
495
+ groups=resnet_groups,
496
+ dropout=dropout,
497
+ time_embedding_norm=resnet_time_scale_shift,
498
+ non_linearity=resnet_act_fn,
499
+ output_scale_factor=output_scale_factor,
500
+ pre_norm=resnet_pre_norm,
501
+ )
502
+ )
503
+
504
+ self.resnets = nn.ModuleList(resnets)
505
+
506
+ if add_spatial_downsample:
507
+ self.downsamplers = nn.ModuleList(
508
+ [
509
+ CausalDownsample2x(
510
+ out_channels, use_conv=True, out_channels=out_channels,
511
+ )
512
+ ]
513
+ )
514
+ else:
515
+ self.downsamplers = None
516
+
517
+ if add_temporal_downsample:
518
+ self.temporal_downsamplers = nn.ModuleList(
519
+ [
520
+ CausalTemporalDownsample2x(
521
+ out_channels, use_conv=True, out_channels=out_channels,
522
+ )
523
+ ]
524
+ )
525
+ else:
526
+ self.temporal_downsamplers = None
527
+
528
+ def forward(self, hidden_states: torch.FloatTensor, is_init_image=True, temporal_chunk=False) -> torch.FloatTensor:
529
+ for resnet in self.resnets:
530
+ hidden_states = resnet(hidden_states, temb=None, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
531
+
532
+ if self.downsamplers is not None:
533
+ for downsampler in self.downsamplers:
534
+ hidden_states = downsampler(hidden_states, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
535
+
536
+ if self.temporal_downsamplers is not None:
537
+ for temporal_downsampler in self.temporal_downsamplers:
538
+ hidden_states = temporal_downsampler(hidden_states, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
539
+
540
+ return hidden_states
541
+
542
+
543
+ class DownEncoderBlock2D(nn.Module):
544
+ def __init__(
545
+ self,
546
+ in_channels: int,
547
+ out_channels: int,
548
+ dropout: float = 0.0,
549
+ num_layers: int = 1,
550
+ resnet_eps: float = 1e-6,
551
+ resnet_time_scale_shift: str = "default",
552
+ resnet_act_fn: str = "swish",
553
+ resnet_groups: int = 32,
554
+ resnet_pre_norm: bool = True,
555
+ output_scale_factor: float = 1.0,
556
+ add_spatial_downsample: bool = True,
557
+ add_temporal_downsample: bool = False,
558
+ downsample_padding: int = 1,
559
+ ):
560
+ super().__init__()
561
+ resnets = []
562
+
563
+ for i in range(num_layers):
564
+ in_channels = in_channels if i == 0 else out_channels
565
+ resnets.append(
566
+ ResnetBlock2D(
567
+ in_channels=in_channels,
568
+ out_channels=out_channels,
569
+ temb_channels=None,
570
+ eps=resnet_eps,
571
+ groups=resnet_groups,
572
+ dropout=dropout,
573
+ time_embedding_norm=resnet_time_scale_shift,
574
+ non_linearity=resnet_act_fn,
575
+ output_scale_factor=output_scale_factor,
576
+ pre_norm=resnet_pre_norm,
577
+ )
578
+ )
579
+
580
+ self.resnets = nn.ModuleList(resnets)
581
+
582
+ if add_spatial_downsample:
583
+ self.downsamplers = nn.ModuleList(
584
+ [
585
+ Downsample2D(
586
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
587
+ )
588
+ ]
589
+ )
590
+ else:
591
+ self.downsamplers = None
592
+
593
+ if add_temporal_downsample:
594
+ self.temporal_downsamplers = nn.ModuleList(
595
+ [
596
+ TemporalDownsample2x(
597
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding,
598
+ )
599
+ ]
600
+ )
601
+ else:
602
+ self.temporal_downsamplers = None
603
+
604
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
605
+ for resnet in self.resnets:
606
+ hidden_states = resnet(hidden_states, temb=None)
607
+
608
+ if self.downsamplers is not None:
609
+ for downsampler in self.downsamplers:
610
+ hidden_states = downsampler(hidden_states)
611
+
612
+ if self.temporal_downsamplers is not None:
613
+ for temporal_downsampler in self.temporal_downsamplers:
614
+ hidden_states = temporal_downsampler(hidden_states)
615
+
616
+ return hidden_states
617
+
618
+
619
+ class UpDecoderBlock2D(nn.Module):
620
+ def __init__(
621
+ self,
622
+ in_channels: int,
623
+ out_channels: int,
624
+ resolution_idx: Optional[int] = None,
625
+ dropout: float = 0.0,
626
+ num_layers: int = 1,
627
+ resnet_eps: float = 1e-6,
628
+ resnet_time_scale_shift: str = "default", # default, spatial
629
+ resnet_act_fn: str = "swish",
630
+ resnet_groups: int = 32,
631
+ resnet_pre_norm: bool = True,
632
+ output_scale_factor: float = 1.0,
633
+ add_spatial_upsample: bool = True,
634
+ add_temporal_upsample: bool = False,
635
+ temb_channels: Optional[int] = None,
636
+ interpolate: bool = True,
637
+ ):
638
+ super().__init__()
639
+ resnets = []
640
+
641
+ for i in range(num_layers):
642
+ input_channels = in_channels if i == 0 else out_channels
643
+
644
+ resnets.append(
645
+ ResnetBlock2D(
646
+ in_channels=input_channels,
647
+ out_channels=out_channels,
648
+ temb_channels=temb_channels,
649
+ eps=resnet_eps,
650
+ groups=resnet_groups,
651
+ dropout=dropout,
652
+ time_embedding_norm=resnet_time_scale_shift,
653
+ non_linearity=resnet_act_fn,
654
+ output_scale_factor=output_scale_factor,
655
+ pre_norm=resnet_pre_norm,
656
+ )
657
+ )
658
+
659
+ self.resnets = nn.ModuleList(resnets)
660
+
661
+ if add_spatial_upsample:
662
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels, interpolate=interpolate)])
663
+ else:
664
+ self.upsamplers = None
665
+
666
+ if add_temporal_upsample:
667
+ self.temporal_upsamplers = nn.ModuleList([TemporalUpsample2x(out_channels, use_conv=True, out_channels=out_channels, interpolate=interpolate)])
668
+ else:
669
+ self.temporal_upsamplers = None
670
+
671
+ self.resolution_idx = resolution_idx
672
+
673
+ def forward(
674
+ self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0, is_image: bool = False,
675
+ ) -> torch.FloatTensor:
676
+ for resnet in self.resnets:
677
+ hidden_states = resnet(hidden_states, temb=temb, scale=scale)
678
+
679
+ if self.upsamplers is not None:
680
+ for upsampler in self.upsamplers:
681
+ hidden_states = upsampler(hidden_states)
682
+
683
+ if self.temporal_upsamplers is not None:
684
+ for temporal_upsampler in self.temporal_upsamplers:
685
+ hidden_states = temporal_upsampler(hidden_states, is_image=is_image)
686
+
687
+ return hidden_states
688
+
689
+
690
+ class UpDecoderBlockCausal3D(nn.Module):
691
+ def __init__(
692
+ self,
693
+ in_channels: int,
694
+ out_channels: int,
695
+ resolution_idx: Optional[int] = None,
696
+ dropout: float = 0.0,
697
+ num_layers: int = 1,
698
+ resnet_eps: float = 1e-6,
699
+ resnet_time_scale_shift: str = "default", # default, spatial
700
+ resnet_act_fn: str = "swish",
701
+ resnet_groups: int = 32,
702
+ resnet_pre_norm: bool = True,
703
+ output_scale_factor: float = 1.0,
704
+ add_spatial_upsample: bool = True,
705
+ add_temporal_upsample: bool = False,
706
+ temb_channels: Optional[int] = None,
707
+ interpolate: bool = True,
708
+ ):
709
+ super().__init__()
710
+ resnets = []
711
+
712
+ for i in range(num_layers):
713
+ input_channels = in_channels if i == 0 else out_channels
714
+
715
+ resnets.append(
716
+ CausalResnetBlock3D(
717
+ in_channels=input_channels,
718
+ out_channels=out_channels,
719
+ temb_channels=temb_channels,
720
+ eps=resnet_eps,
721
+ groups=resnet_groups,
722
+ dropout=dropout,
723
+ time_embedding_norm=resnet_time_scale_shift,
724
+ non_linearity=resnet_act_fn,
725
+ output_scale_factor=output_scale_factor,
726
+ pre_norm=resnet_pre_norm,
727
+ )
728
+ )
729
+
730
+ self.resnets = nn.ModuleList(resnets)
731
+
732
+ if add_spatial_upsample:
733
+ self.upsamplers = nn.ModuleList([CausalUpsample2x(out_channels, use_conv=True, out_channels=out_channels, interpolate=interpolate)])
734
+ else:
735
+ self.upsamplers = None
736
+
737
+ if add_temporal_upsample:
738
+ self.temporal_upsamplers = nn.ModuleList([CausalTemporalUpsample2x(out_channels, use_conv=True, out_channels=out_channels, interpolate=interpolate)])
739
+ else:
740
+ self.temporal_upsamplers = None
741
+
742
+ self.resolution_idx = resolution_idx
743
+
744
+ def forward(
745
+ self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None,
746
+ is_init_image=True, temporal_chunk=False,
747
+ ) -> torch.FloatTensor:
748
+ for resnet in self.resnets:
749
+ hidden_states = resnet(hidden_states, temb=temb, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
750
+
751
+ if self.upsamplers is not None:
752
+ for upsampler in self.upsamplers:
753
+ hidden_states = upsampler(hidden_states, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
754
+
755
+ if self.temporal_upsamplers is not None:
756
+ for temporal_upsampler in self.temporal_upsamplers:
757
+ hidden_states = temporal_upsampler(hidden_states, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
758
+
759
+ return hidden_states
760
+
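`get_down_block` / `get_up_block` are the factories the encoder and decoder wrappers (added in `modeling_enc_dec.py`, not shown in this excerpt) use to assemble their stages. The sketch below builds and runs one causal encoder stage in isolation; the channel counts, frame count, and resolution are illustrative assumptions rather than the repo's defaults, and `modeling_resnet.py` must be importable for the causal resnet/downsample layers.

# Minimal sketch of one causal encoder stage (single process, no context parallelism).
import torch
from video_vae.modeling_block import get_down_block

block = get_down_block(
    "DownEncoderBlockCausal3D",
    num_layers=2,
    in_channels=128,
    out_channels=256,
    add_spatial_downsample=True,
    add_temporal_downsample=True,
    resnet_eps=1e-6,
    resnet_act_fn="silu",
    resnet_groups=32,
    downsample_padding=1,
    dropout=0.0,
)

x = torch.randn(1, 128, 9, 64, 64)  # (batch, channels, frames, height, width)
y = block(x, is_init_image=True, temporal_chunk=False)
# y should be downsampled 2x spatially and 2x temporally (keeping the leading causal frame).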
video_vae/modeling_causal_conv.py ADDED
@@ -0,0 +1,139 @@
1
+ from typing import Tuple, Union
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.utils.checkpoint import checkpoint
5
+ import torch.nn.functional as F
6
+ from collections import deque
7
+ from einops import rearrange
8
+ from timm.models.layers import trunc_normal_
9
+ from IPython import embed
10
+ from torch import Tensor
11
+
12
+ from utils import (
13
+ is_context_parallel_initialized,
14
+ get_context_parallel_group,
15
+ get_context_parallel_world_size,
16
+ get_context_parallel_rank,
17
+ get_context_parallel_group_rank,
18
+ )
19
+
20
+ from .context_parallel_ops import (
21
+ conv_scatter_to_context_parallel_region,
22
+ conv_gather_from_context_parallel_region,
23
+ cp_pass_from_previous_rank,
24
+ )
25
+
26
+
27
+ def divisible_by(num, den):
28
+ return (num % den) == 0
29
+
30
+ def cast_tuple(t, length = 1):
31
+ return t if isinstance(t, tuple) else ((t,) * length)
32
+
33
+ def is_odd(n):
34
+ return not divisible_by(n, 2)
35
+
36
+
37
+ class CausalGroupNorm(nn.GroupNorm):
38
+
39
+ def forward(self, x: Tensor) -> Tensor:
40
+ t = x.shape[2]
41
+ x = rearrange(x, 'b c t h w -> (b t) c h w')
42
+ x = super().forward(x)
43
+ x = rearrange(x, '(b t) c h w -> b c t h w', t=t)
44
+ return x
45
+
46
+
47
+ class CausalConv3d(nn.Module):
48
+
49
+ def __init__(
50
+ self,
51
+ in_channels,
52
+ out_channels,
53
+ kernel_size: Union[int, Tuple[int, int, int]],
54
+ stride: Union[int, Tuple[int, int, int]] = 1,
55
+ pad_mode: str ='constant',
56
+ **kwargs
57
+ ):
58
+ super().__init__()
59
+ if isinstance(kernel_size, int):
60
+ kernel_size = cast_tuple(kernel_size, 3)
61
+
62
+ time_kernel_size, height_kernel_size, width_kernel_size = kernel_size
63
+ self.time_kernel_size = time_kernel_size
64
+ assert is_odd(height_kernel_size) and is_odd(width_kernel_size)
65
+ dilation = kwargs.pop('dilation', 1)
66
+ self.pad_mode = pad_mode
67
+
68
+ if isinstance(stride, int):
69
+ stride = (stride, 1, 1)
70
+
71
+ time_pad = dilation * (time_kernel_size - 1)
72
+ height_pad = height_kernel_size // 2
73
+ width_pad = width_kernel_size // 2
74
+
75
+ self.temporal_stride = stride[0]
76
+ self.time_pad = time_pad
77
+ self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0)
78
+ self.time_uncausal_padding = (width_pad, width_pad, height_pad, height_pad, 0, 0)
79
+
80
+ self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride, padding=0, dilation=dilation, **kwargs)
81
+ self.cache_front_feat = deque()
82
+
83
+ def _clear_context_parallel_cache(self):
84
+ del self.cache_front_feat
85
+ self.cache_front_feat = deque()
86
+
87
+ def _init_weights(self, m):
88
+ if isinstance(m, (nn.Linear, nn.Conv2d, nn.Conv3d)):
89
+ trunc_normal_(m.weight, std=.02)
90
+ if m.bias is not None:
91
+ nn.init.constant_(m.bias, 0)
92
+ elif isinstance(m, (nn.LayerNorm, nn.GroupNorm)):
93
+ nn.init.constant_(m.bias, 0)
94
+ nn.init.constant_(m.weight, 1.0)
95
+
96
+ def context_parallel_forward(self, x):
97
+ x = cp_pass_from_previous_rank(x, dim=2, kernel_size=self.time_kernel_size)
98
+
99
+ x = F.pad(x, self.time_uncausal_padding, mode='constant')
100
+
101
+ cp_rank = get_context_parallel_rank()
102
+ if cp_rank != 0:
103
+ if self.temporal_stride == 2 and self.time_kernel_size == 3:
104
+ x = x[:,:,1:]
105
+
106
+ x = self.conv(x)
107
+ return x
108
+
109
+ def forward(self, x, is_init_image=True, temporal_chunk=False):
110
+         # temporal_chunk: whether to process the video in temporal chunks, caching boundary frames between calls
111
+
112
+ if is_context_parallel_initialized():
113
+ return self.context_parallel_forward(x)
114
+
115
+ pad_mode = self.pad_mode if self.time_pad < x.shape[2] else 'constant'
116
+
117
+ if not temporal_chunk:
118
+ x = F.pad(x, self.time_causal_padding, mode=pad_mode)
119
+ else:
120
+ assert not self.training, "The feature cache should not be used in training"
121
+ if is_init_image:
122
+ # Encode the first chunk
123
+ x = F.pad(x, self.time_causal_padding, mode=pad_mode)
124
+ self._clear_context_parallel_cache()
125
+ self.cache_front_feat.append(x[:, :, -2:].clone().detach())
126
+ else:
127
+ x = F.pad(x, self.time_uncausal_padding, mode=pad_mode)
128
+ video_front_context = self.cache_front_feat.pop()
129
+ self._clear_context_parallel_cache()
130
+
131
+ if self.temporal_stride == 1 and self.time_kernel_size == 3:
132
+ x = torch.cat([video_front_context, x], dim=2)
133
+ elif self.temporal_stride == 2 and self.time_kernel_size == 3:
134
+ x = torch.cat([video_front_context[:,:,-1:], x], dim=2)
135
+
136
+ self.cache_front_feat.append(x[:, :, -2:].clone().detach())
137
+
138
+ x = self.conv(x)
139
+ return x
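The point of `CausalConv3d` is that the temporal axis is padded only on the left, so an output frame never depends on later input frames, and during chunked inference the left padding of a follow-up chunk is replaced by the last cached frames of the previous chunk. A small single-process sketch (context parallelism disabled; tensor sizes are illustrative assumptions):

# Minimal sketch of causal padding vs. temporal chunking.
import torch
from video_vae.modeling_causal_conv import CausalConv3d

conv = CausalConv3d(in_channels=8, out_channels=8, kernel_size=3, stride=1)
conv.eval()  # the chunked feature cache is inference-only

x = torch.randn(1, 8, 17, 32, 32)  # (batch, channels, frames, height, width)

with torch.no_grad():
    # Full clip: two zero frames are padded on the left of the time axis,
    # so the output keeps all 17 frames and frame t never sees frames > t.
    y_full = conv(x, is_init_image=True, temporal_chunk=False)

    # Chunked: the first call caches the last two frames of its padded input,
    # and the second call prepends them instead of zero padding.
    y0 = conv(x[:, :, :9], is_init_image=True, temporal_chunk=True)
    y1 = conv(x[:, :, 9:], is_init_image=False, temporal_chunk=True)
    y_chunked = torch.cat([y0, y1], dim=2)

# y_chunked should match y_full frame for frame.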
video_vae/modeling_causal_vae.py ADDED
@@ -0,0 +1,625 @@
1
+ from typing import Dict, Optional, Tuple, Union
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
6
+ from diffusers.models.attention_processor import (
7
+ ADDED_KV_ATTENTION_PROCESSORS,
8
+ CROSS_ATTENTION_PROCESSORS,
9
+ Attention,
10
+ AttentionProcessor,
11
+ AttnAddedKVProcessor,
12
+ AttnProcessor,
13
+ )
14
+
15
+ from diffusers.models.modeling_outputs import AutoencoderKLOutput
16
+ from diffusers.models.modeling_utils import ModelMixin
17
+
18
+ from timm.models.layers import drop_path, to_2tuple, trunc_normal_
19
+ from .modeling_enc_dec import (
20
+ DecoderOutput, DiagonalGaussianDistribution,
21
+ CausalVaeDecoder, CausalVaeEncoder,
22
+ )
23
+ from .modeling_causal_conv import CausalConv3d
24
+ from IPython import embed
25
+
26
+ from utils import (
27
+ is_context_parallel_initialized,
28
+ get_context_parallel_group,
29
+ get_context_parallel_world_size,
30
+ get_context_parallel_rank,
31
+ get_context_parallel_group_rank,
32
+ )
33
+
34
+ from .context_parallel_ops import (
35
+ conv_scatter_to_context_parallel_region,
36
+ conv_gather_from_context_parallel_region,
37
+ )
38
+
39
+
40
+ class CausalVideoVAE(ModelMixin, ConfigMixin):
41
+ r"""
42
+ A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
43
+
44
+     This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
45
+ for all models (such as downloading or saving).
46
+
47
+ Parameters:
48
+ in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
49
+ out_channels (int, *optional*, defaults to 3): Number of channels in the output.
50
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
51
+ Tuple of downsample block types.
52
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
53
+ Tuple of upsample block types.
54
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
55
+ Tuple of block output channels.
56
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
57
+ latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
58
+ sample_size (`int`, *optional*, defaults to `32`): Sample input size.
59
+ scaling_factor (`float`, *optional*, defaults to 0.18215):
60
+ The component-wise standard deviation of the trained latent space computed using the first batch of the
61
+ training set. This is used to scale the latent space to have unit variance when training the diffusion
62
+ model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
63
+ diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
64
+ / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
65
+ Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
66
+ force_upcast (`bool`, *optional*, default to `True`):
67
+ If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
68
+         can be fine-tuned / trained to a lower range without losing too much precision, in which case
69
+ `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
70
+ """
71
+
72
+ _supports_gradient_checkpointing = True
73
+
74
+ @register_to_config
75
+ def __init__(
76
+ self,
77
+ # encoder related parameters
78
+ encoder_in_channels: int = 3,
79
+ encoder_out_channels: int = 4,
80
+ encoder_layers_per_block: Tuple[int, ...] = (2, 2, 2, 2),
81
+ encoder_down_block_types: Tuple[str, ...] = (
82
+ "DownEncoderBlockCausal3D",
83
+ "DownEncoderBlockCausal3D",
84
+ "DownEncoderBlockCausal3D",
85
+ "DownEncoderBlockCausal3D",
86
+ ),
87
+ encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
88
+ encoder_spatial_down_sample: Tuple[bool, ...] = (True, True, True, False),
89
+ encoder_temporal_down_sample: Tuple[bool, ...] = (True, True, True, False),
90
+ encoder_block_dropout: Tuple[int, ...] = (0.0, 0.0, 0.0, 0.0),
91
+ encoder_act_fn: str = "silu",
92
+ encoder_norm_num_groups: int = 32,
93
+ encoder_double_z: bool = True,
94
+ encoder_type: str = 'causal_vae_conv',
95
+ # decoder related
96
+ decoder_in_channels: int = 4,
97
+ decoder_out_channels: int = 3,
98
+ decoder_layers_per_block: Tuple[int, ...] = (3, 3, 3, 3),
99
+ decoder_up_block_types: Tuple[str, ...] = (
100
+ "UpDecoderBlockCausal3D",
101
+ "UpDecoderBlockCausal3D",
102
+ "UpDecoderBlockCausal3D",
103
+ "UpDecoderBlockCausal3D",
104
+ ),
105
+ decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
106
+ decoder_spatial_up_sample: Tuple[bool, ...] = (True, True, True, False),
107
+ decoder_temporal_up_sample: Tuple[bool, ...] = (True, True, True, False),
108
+ decoder_block_dropout: Tuple[int, ...] = (0.0, 0.0, 0.0, 0.0),
109
+ decoder_act_fn: str = "silu",
110
+ decoder_norm_num_groups: int = 32,
111
+ decoder_type: str = 'causal_vae_conv',
112
+ sample_size: int = 256,
113
+ scaling_factor: float = 0.18215,
114
+ add_post_quant_conv: bool = True,
115
+ interpolate: bool = False,
116
+ downsample_scale: int = 8,
117
+ ):
118
+ super().__init__()
119
+
120
+         print(f"The latent channel dimension is {encoder_out_channels}")
121
+ # pass init params to Encoder
122
+
123
+ self.encoder = CausalVaeEncoder(
124
+ in_channels=encoder_in_channels,
125
+ out_channels=encoder_out_channels,
126
+ down_block_types=encoder_down_block_types,
127
+ spatial_down_sample=encoder_spatial_down_sample,
128
+ temporal_down_sample=encoder_temporal_down_sample,
129
+ block_out_channels=encoder_block_out_channels,
130
+ layers_per_block=encoder_layers_per_block,
131
+ act_fn=encoder_act_fn,
132
+ norm_num_groups=encoder_norm_num_groups,
133
+ double_z=True,
134
+ block_dropout=encoder_block_dropout,
135
+ )
136
+
137
+ # pass init params to Decoder
138
+ self.decoder = CausalVaeDecoder(
139
+ in_channels=decoder_in_channels,
140
+ out_channels=decoder_out_channels,
141
+ up_block_types=decoder_up_block_types,
142
+ spatial_up_sample=decoder_spatial_up_sample,
143
+ temporal_up_sample=decoder_temporal_up_sample,
144
+ block_out_channels=decoder_block_out_channels,
145
+ layers_per_block=decoder_layers_per_block,
146
+ norm_num_groups=decoder_norm_num_groups,
147
+ act_fn=decoder_act_fn,
148
+ interpolate=interpolate,
149
+ block_dropout=decoder_block_dropout,
150
+ )
151
+
152
+ self.quant_conv = CausalConv3d(2 * encoder_out_channels, 2 * encoder_out_channels, kernel_size=1, stride=1)
153
+ self.post_quant_conv = CausalConv3d(encoder_out_channels, encoder_out_channels, kernel_size=1, stride=1)
154
+ self.use_tiling = False
155
+
156
+ # only relevant if vae tiling is enabled
157
+ self.tile_sample_min_size = self.config.sample_size
158
+
159
+ sample_size = (
160
+ self.config.sample_size[0]
161
+ if isinstance(self.config.sample_size, (list, tuple))
162
+ else self.config.sample_size
163
+ )
164
+ self.tile_latent_min_size = int(sample_size / downsample_scale)
165
+ self.encode_tile_overlap_factor = 1 / 8
166
+ self.decode_tile_overlap_factor = 1 / 8
167
+ self.downsample_scale = downsample_scale
168
+
169
+ self.apply(self._init_weights)
170
+
171
+ def _init_weights(self, m):
172
+ if isinstance(m, (nn.Linear, nn.Conv2d, nn.Conv3d)):
173
+ trunc_normal_(m.weight, std=.02)
174
+ if m.bias is not None:
175
+ nn.init.constant_(m.bias, 0)
176
+ elif isinstance(m, (nn.LayerNorm, nn.GroupNorm)):
177
+ nn.init.constant_(m.bias, 0)
178
+ nn.init.constant_(m.weight, 1.0)
179
+
180
+ def _set_gradient_checkpointing(self, module, value=False):
181
+ if isinstance(module, (Encoder, Decoder)):
182
+ module.gradient_checkpointing = value
183
+
184
+ def enable_tiling(self, use_tiling: bool = True):
185
+ r"""
186
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
187
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
188
+ processing larger images.
189
+ """
190
+ self.use_tiling = use_tiling
191
+
192
+ def disable_tiling(self):
193
+ r"""
194
+ Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
195
+ decoding in one step.
196
+ """
197
+ self.enable_tiling(False)
198
+
199
+ @property
200
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
201
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
202
+ r"""
203
+ Returns:
204
+             `dict` of attention processors: A dictionary containing all attention processors used in the model,
205
+             indexed by their weight names.
206
+ """
207
+ # set recursively
208
+ processors = {}
209
+
210
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
211
+ if hasattr(module, "get_processor"):
212
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
213
+
214
+ for sub_name, child in module.named_children():
215
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
216
+
217
+ return processors
218
+
219
+ for name, module in self.named_children():
220
+ fn_recursive_add_processors(name, module, processors)
221
+
222
+ return processors
223
+
224
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
225
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
226
+ r"""
227
+ Sets the attention processor to use to compute attention.
228
+
229
+ Parameters:
230
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
231
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
232
+ for **all** `Attention` layers.
233
+
234
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
235
+ processor. This is strongly recommended when setting trainable attention processors.
236
+
237
+ """
238
+ count = len(self.attn_processors.keys())
239
+
240
+ if isinstance(processor, dict) and len(processor) != count:
241
+ raise ValueError(
242
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
243
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
244
+ )
245
+
246
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
247
+ if hasattr(module, "set_processor"):
248
+ if not isinstance(processor, dict):
249
+ module.set_processor(processor)
250
+ else:
251
+ module.set_processor(processor.pop(f"{name}.processor"))
252
+
253
+ for sub_name, child in module.named_children():
254
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
255
+
256
+ for name, module in self.named_children():
257
+ fn_recursive_attn_processor(name, module, processor)
258
+
259
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
260
+ def set_default_attn_processor(self):
261
+ """
262
+ Disables custom attention processors and sets the default attention implementation.
263
+ """
264
+ if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
265
+ processor = AttnAddedKVProcessor()
266
+ elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
267
+ processor = AttnProcessor()
268
+ else:
269
+ raise ValueError(
270
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
271
+ )
272
+
273
+ self.set_attn_processor(processor)
274
+
275
+ def encode(
276
+ self, x: torch.FloatTensor, return_dict: bool = True,
277
+ is_init_image=True, temporal_chunk=False, window_size=16, tile_sample_min_size=256,
278
+ ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
279
+ """
280
+ Encode a batch of images into latents.
281
+
282
+ Args:
283
+ x (`torch.FloatTensor`): Input batch of images.
284
+ return_dict (`bool`, *optional*, defaults to `True`):
285
+ Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
286
+
287
+ Returns:
288
+ The latent representations of the encoded images. If `return_dict` is True, a
289
+ [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
290
+ """
291
+ self.tile_sample_min_size = tile_sample_min_size
292
+ self.tile_latent_min_size = int(tile_sample_min_size / self.downsample_scale)
293
+
294
+ if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
295
+ return self.tiled_encode(x, return_dict=return_dict, is_init_image=is_init_image,
296
+ temporal_chunk=temporal_chunk, window_size=window_size)
297
+
298
+ if temporal_chunk:
299
+ moments = self.chunk_encode(x, window_size=window_size)
300
+ else:
301
+ h = self.encoder(x, is_init_image=is_init_image, temporal_chunk=False)
302
+ moments = self.quant_conv(h, is_init_image=is_init_image, temporal_chunk=False)
303
+
304
+ posterior = DiagonalGaussianDistribution(moments)
305
+
306
+ if not return_dict:
307
+ return (posterior,)
308
+
309
+ return AutoencoderKLOutput(latent_dist=posterior)
310
+
311
+ @torch.no_grad()
312
+ def chunk_encode(self, x: torch.FloatTensor, window_size=16):
313
+ # Only used during inference
314
+ # Encode a long video clips through sliding window
315
+ num_frames = x.shape[2]
316
+ assert (num_frames - 1) % self.downsample_scale == 0
317
+ init_window_size = window_size + 1
318
+ frame_list = [x[:,:,:init_window_size]]
319
+
320
+ # To chunk the long video
321
+ full_chunk_size = (num_frames - init_window_size) // window_size
322
+ fid = init_window_size
323
+ for idx in range(full_chunk_size):
324
+ frame_list.append(x[:, :, fid:fid+window_size])
325
+ fid += window_size
326
+
327
+ if fid < num_frames:
328
+ frame_list.append(x[:, :, fid:])
329
+
330
+ latent_list = []
331
+ for idx, frames in enumerate(frame_list):
332
+ if idx == 0:
333
+ h = self.encoder(frames, is_init_image=True, temporal_chunk=True)
334
+ moments = self.quant_conv(h, is_init_image=True, temporal_chunk=True)
335
+ else:
336
+ h = self.encoder(frames, is_init_image=False, temporal_chunk=True)
337
+ moments = self.quant_conv(h, is_init_image=False, temporal_chunk=True)
338
+
339
+ latent_list.append(moments)
340
+
341
+ latent = torch.cat(latent_list, dim=2)
342
+ return latent
343
+
344
+ def get_last_layer(self):
345
+ return self.decoder.conv_out.conv.weight
346
+
347
+ @torch.no_grad()
348
+ def chunk_decode(self, z: torch.FloatTensor, window_size=2):
349
+ num_frames = z.shape[2]
350
+ init_window_size = window_size + 1
351
+ frame_list = [z[:,:,:init_window_size]]
352
+
353
+ # To chunk the long video
354
+ full_chunk_size = (num_frames - init_window_size) // window_size
355
+ fid = init_window_size
356
+ for idx in range(full_chunk_size):
357
+ frame_list.append(z[:, :, fid:fid+window_size])
358
+ fid += window_size
359
+
360
+ if fid < num_frames:
361
+ frame_list.append(z[:, :, fid:])
362
+
363
+ dec_list = []
364
+ for idx, frames in enumerate(frame_list):
365
+ if idx == 0:
366
+ z_h = self.post_quant_conv(frames, is_init_image=True, temporal_chunk=True)
367
+ dec = self.decoder(z_h, is_init_image=True, temporal_chunk=True)
368
+ else:
369
+ z_h = self.post_quant_conv(frames, is_init_image=False, temporal_chunk=True)
370
+ dec = self.decoder(z_h, is_init_image=False, temporal_chunk=True)
371
+
372
+ dec_list.append(dec)
373
+
374
+ dec = torch.cat(dec_list, dim=2)
375
+ return dec
376
+
377
+ def decode(self, z: torch.FloatTensor, is_init_image=True, temporal_chunk=False,
378
+ return_dict: bool = True, window_size: int = 2, tile_sample_min_size: int = 256,) -> Union[DecoderOutput, torch.FloatTensor]:
379
+
380
+ self.tile_sample_min_size = tile_sample_min_size
381
+ self.tile_latent_min_size = int(tile_sample_min_size / self.downsample_scale)
382
+
383
+ if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
384
+ return self.tiled_decode(z, is_init_image=is_init_image,
385
+ temporal_chunk=temporal_chunk, window_size=window_size, return_dict=return_dict)
386
+
387
+ if temporal_chunk:
388
+ dec = self.chunk_decode(z, window_size=window_size)
389
+ else:
390
+ z = self.post_quant_conv(z, is_init_image=is_init_image, temporal_chunk=False)
391
+ dec = self.decoder(z, is_init_image=is_init_image, temporal_chunk=False)
392
+
393
+ if not return_dict:
394
+ return (dec,)
395
+
396
+ return DecoderOutput(sample=dec)
397
+
398
+ def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
399
+ blend_extent = min(a.shape[3], b.shape[3], blend_extent)
400
+ for y in range(blend_extent):
401
+ b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (y / blend_extent)
402
+ return b
403
+
404
+ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
405
+ blend_extent = min(a.shape[4], b.shape[4], blend_extent)
406
+ for x in range(blend_extent):
407
+ b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (x / blend_extent)
408
+ return b
409
+
410
+ def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True,
411
+ is_init_image=True, temporal_chunk=False, window_size=16,) -> AutoencoderKLOutput:
412
+ r"""Encode a batch of images using a tiled encoder.
413
+
414
+ When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
415
+ steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
416
+ different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
417
+ tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
418
+ output, but they should be much less noticeable.
419
+
420
+ Args:
421
+ x (`torch.FloatTensor`): Input batch of images.
422
+ return_dict (`bool`, *optional*, defaults to `True`):
423
+ Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
424
+
425
+ Returns:
426
+ [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
427
+ If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
428
+ `tuple` is returned.
429
+ """
430
+ overlap_size = int(self.tile_sample_min_size * (1 - self.encode_tile_overlap_factor))
431
+ blend_extent = int(self.tile_latent_min_size * self.encode_tile_overlap_factor)
432
+ row_limit = self.tile_latent_min_size - blend_extent
433
+
434
+         # Split the image into overlapping tiles of size `tile_sample_min_size` and encode them separately.
435
+ rows = []
436
+ for i in range(0, x.shape[3], overlap_size):
437
+ row = []
438
+ for j in range(0, x.shape[4], overlap_size):
439
+ tile = x[:, :, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
440
+ if temporal_chunk:
441
+ tile = self.chunk_encode(tile, window_size=window_size)
442
+ else:
443
+ tile = self.encoder(tile, is_init_image=True, temporal_chunk=False)
444
+ tile = self.quant_conv(tile, is_init_image=True, temporal_chunk=False)
445
+ row.append(tile)
446
+ rows.append(row)
447
+ result_rows = []
448
+ for i, row in enumerate(rows):
449
+ result_row = []
450
+ for j, tile in enumerate(row):
451
+ # blend the above tile and the left tile
452
+ # to the current tile and add the current tile to the result row
453
+ if i > 0:
454
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
455
+ if j > 0:
456
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
457
+ result_row.append(tile[:, :, :, :row_limit, :row_limit])
458
+ result_rows.append(torch.cat(result_row, dim=4))
459
+
460
+ moments = torch.cat(result_rows, dim=3)
461
+
462
+ posterior = DiagonalGaussianDistribution(moments)
463
+
464
+ if not return_dict:
465
+ return (posterior,)
466
+
467
+ return AutoencoderKLOutput(latent_dist=posterior)
468
+
469
+ def tiled_decode(self, z: torch.FloatTensor, is_init_image=True,
470
+ temporal_chunk=False, window_size=2, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
471
+ r"""
472
+ Decode a batch of images using a tiled decoder.
473
+
474
+ Args:
475
+ z (`torch.FloatTensor`): Input batch of latent vectors.
476
+ return_dict (`bool`, *optional*, defaults to `True`):
477
+ Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
478
+
479
+ Returns:
480
+ [`~models.vae.DecoderOutput`] or `tuple`:
481
+ If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
482
+ returned.
483
+ """
484
+ overlap_size = int(self.tile_latent_min_size * (1 - self.decode_tile_overlap_factor))
485
+ blend_extent = int(self.tile_sample_min_size * self.decode_tile_overlap_factor)
486
+ row_limit = self.tile_sample_min_size - blend_extent
487
+
488
+ # Split z into tiles of size tile_latent_min_size and decode them separately.
489
+ # The tiles have an overlap to avoid seams between tiles.
490
+ rows = []
491
+ for i in range(0, z.shape[3], overlap_size):
492
+ row = []
493
+ for j in range(0, z.shape[4], overlap_size):
494
+ tile = z[:, :, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
495
+ if temporal_chunk:
496
+ decoded = self.chunk_decode(tile, window_size=window_size)
497
+ else:
498
+ tile = self.post_quant_conv(tile, is_init_image=True, temporal_chunk=False)
499
+ decoded = self.decoder(tile, is_init_image=True, temporal_chunk=False)
500
+ row.append(decoded)
501
+ rows.append(row)
502
+ result_rows = []
503
+
504
+ for i, row in enumerate(rows):
505
+ result_row = []
506
+ for j, tile in enumerate(row):
507
+ # blend the above tile and the left tile
508
+ # to the current tile and add the current tile to the result row
509
+ if i > 0:
510
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
511
+ if j > 0:
512
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
513
+ result_row.append(tile[:, :, :, :row_limit, :row_limit])
514
+ result_rows.append(torch.cat(result_row, dim=4))
515
+
516
+ dec = torch.cat(result_rows, dim=3)
517
+ if not return_dict:
518
+ return (dec,)
519
+
520
+ return DecoderOutput(sample=dec)
521
+
522
+ def forward(
523
+ self,
524
+ sample: torch.FloatTensor,
525
+ sample_posterior: bool = True,
526
+ generator: Optional[torch.Generator] = None,
527
+ freeze_encoder: bool = False,
528
+ is_init_image=True,
529
+ temporal_chunk=False,
530
+ ) -> Union[DecoderOutput, torch.FloatTensor]:
531
+ r"""
532
+ Args:
533
+ sample (`torch.FloatTensor`): Input sample.
534
+ sample_posterior (`bool`, *optional*, defaults to `False`):
535
+ Whether to sample from the posterior.
536
+ return_dict (`bool`, *optional*, defaults to `True`):
537
+ Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
538
+ """
539
+ x = sample
540
+
541
+ if is_context_parallel_initialized():
542
+ assert self.training, "Context-parallel VAE is only supported during training for now"
543
+
544
+ if freeze_encoder:
545
+ with torch.no_grad():
546
+ h = self.encoder(x, is_init_image=True, temporal_chunk=False)
547
+ moments = self.quant_conv(h, is_init_image=True, temporal_chunk=False)
548
+ posterior = DiagonalGaussianDistribution(moments)
549
+ global_posterior = posterior
550
+ else:
551
+ h = self.encoder(x, is_init_image=True, temporal_chunk=False)
552
+ moments = self.quant_conv(h, is_init_image=True, temporal_chunk=False)
553
+ posterior = DiagonalGaussianDistribution(moments)
554
+ global_moments = conv_gather_from_context_parallel_region(moments, dim=2, kernel_size=1)
555
+ global_posterior = DiagonalGaussianDistribution(global_moments)
556
+
557
+ if sample_posterior:
558
+ z = posterior.sample(generator=generator)
559
+ else:
560
+ z = posterior.mode()
561
+
562
+ if get_context_parallel_rank() == 0:
563
+ dec = self.decode(z, is_init_image=True).sample
564
+ else:
565
+ # Do not drop the first upsampled frame
566
+ dec = self.decode(z, is_init_image=False).sample
567
+
568
+ return global_posterior, dec
569
+
570
+ else:
571
+ # The normal training
572
+ if freeze_encoder:
573
+ with torch.no_grad():
574
+ posterior = self.encode(x, is_init_image=is_init_image,
575
+ temporal_chunk=temporal_chunk).latent_dist
576
+ else:
577
+ posterior = self.encode(x, is_init_image=is_init_image,
578
+ temporal_chunk=temporal_chunk).latent_dist
579
+
580
+ if sample_posterior:
581
+ z = posterior.sample(generator=generator)
582
+ else:
583
+ z = posterior.mode()
584
+
585
+ dec = self.decode(z, is_init_image=is_init_image, temporal_chunk=temporal_chunk).sample
586
+
587
+ return posterior, dec
588
+
589
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
590
+ def fuse_qkv_projections(self):
591
+ """
592
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
593
+ key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
594
+
595
+ <Tip warning={true}>
596
+
597
+ This API is 🧪 experimental.
598
+
599
+ </Tip>
600
+ """
601
+ self.original_attn_processors = None
602
+
603
+ for _, attn_processor in self.attn_processors.items():
604
+ if "Added" in str(attn_processor.__class__.__name__):
605
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
606
+
607
+ self.original_attn_processors = self.attn_processors
608
+
609
+ for module in self.modules():
610
+ if isinstance(module, Attention):
611
+ module.fuse_projections(fuse=True)
612
+
613
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
614
+ def unfuse_qkv_projections(self):
615
+ """Disables the fused QKV projection if enabled.
616
+
617
+ <Tip warning={true}>
618
+
619
+ This API is 🧪 experimental.
620
+
621
+ </Tip>
622
+
623
+ """
624
+ if self.original_attn_processors is not None:
625
+ self.set_attn_processor(self.original_attn_processors)
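For orientation, a minimal decode-side sketch of the autoencoder defined above. It is a sketch only: `decode_latents` and its `vae` argument are hypothetical names, `vae` is assumed to be an instance of this causal video VAE, and the keyword arguments simply mirror the `decode` signature shown above.

import torch
from typing import Any


def decode_latents(vae: Any, z: torch.Tensor) -> torch.Tensor:
    # `vae` is assumed to be an instance of the causal video VAE above;
    # z has layout (batch, channels, frames, height, width).
    # Memory-friendly path: temporal chunking decodes `window_size` latents at a
    # time, and (when tiling is enabled on the model) tiled_decode takes over once
    # the latent's spatial size exceeds tile_latent_min_size.
    out = vae.decode(
        z,
        is_init_image=True,
        temporal_chunk=True,
        window_size=2,
        tile_sample_min_size=256,
    )
    return out.sample  # DecoderOutput.sample is the pixel-space video tensor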
video_vae/modeling_discriminator.py ADDED
@@ -0,0 +1,122 @@
1
+ import functools
2
+ import torch.nn as nn
3
+ from einops import rearrange
4
+ import torch
5
+
6
+
7
+ def weights_init(m):
8
+ classname = m.__class__.__name__
9
+ if classname.find('Conv') != -1:
10
+ nn.init.normal_(m.weight.data, 0.0, 0.02)
11
+ nn.init.constant_(m.bias.data, 0)
12
+ elif classname.find('BatchNorm') != -1:
13
+ nn.init.normal_(m.weight.data, 1.0, 0.02)
14
+ nn.init.constant_(m.bias.data, 0)
15
+
16
+
17
+ class NLayerDiscriminator(nn.Module):
18
+ """Defines a PatchGAN discriminator as in Pix2Pix
19
+ --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
20
+ """
21
+ def __init__(self, input_nc=3, ndf=64, n_layers=4):
22
+ """Construct a PatchGAN discriminator
23
+ Parameters:
24
+ input_nc (int) -- the number of channels in input images
25
+ ndf (int) -- the number of filters in the last conv layer
26
+ n_layers (int) -- the number of conv layers in the discriminator
27
+ norm_layer -- normalization layer
28
+ """
29
+ super(NLayerDiscriminator, self).__init__()
30
+
31
+ # norm_layer = nn.BatchNorm2d
32
+ norm_layer = nn.InstanceNorm2d
33
+
34
+ if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters
35
+ use_bias = norm_layer.func != nn.BatchNorm2d
36
+ else:
37
+ use_bias = norm_layer != nn.BatchNorm2d
38
+
39
+ kw = 4
40
+ padw = 1
41
+ sequence = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]
42
+ nf_mult = 1
43
+ nf_mult_prev = 1
44
+ for n in range(1, n_layers): # gradually increase the number of filters
45
+ nf_mult_prev = nf_mult
46
+ nf_mult = min(2 ** n, 8)
47
+ sequence += [
48
+ nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias),
49
+ norm_layer(ndf * nf_mult),
50
+ nn.LeakyReLU(0.2, True)
51
+ ]
52
+
53
+ nf_mult_prev = nf_mult
54
+ nf_mult = min(2 ** n_layers, 8)
55
+ sequence += [
56
+ nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias),
57
+ norm_layer(ndf * nf_mult),
58
+ nn.LeakyReLU(0.2, True)
59
+ ]
60
+
61
+ sequence += [
62
+ nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)] # output 1 channel prediction map
63
+ self.main = nn.Sequential(*sequence)
64
+
65
+ def forward(self, input):
66
+ """Standard forward."""
67
+ return self.main(input)
68
+
69
+
70
+ class NLayerDiscriminator3D(nn.Module):
71
+ """Defines a 3D PatchGAN discriminator as in Pix2Pix but for 3D inputs."""
72
+ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
73
+ """
74
+ Construct a 3D PatchGAN discriminator
75
+
76
+ Parameters:
77
+ input_nc (int) -- the number of channels in input volumes
78
+ ndf (int) -- the number of filters in the last conv layer
79
+ n_layers (int) -- the number of conv layers in the discriminator
80
+ use_actnorm (bool) -- flag to use actnorm instead of batchnorm
81
+ """
82
+ super(NLayerDiscriminator3D, self).__init__()
83
+ # if not use_actnorm:
84
+ # norm_layer = nn.BatchNorm3d
85
+ # else:
86
+ # raise NotImplementedError("Not implemented.")
87
+
88
+ norm_layer = nn.InstanceNorm3d
89
+
90
+ if type(norm_layer) == functools.partial:
91
+ use_bias = norm_layer.func != nn.BatchNorm3d
92
+ else:
93
+ use_bias = norm_layer != nn.BatchNorm3d
94
+
95
+ kw = 4
96
+ padw = 1
97
+ sequence = [nn.Conv3d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]
98
+ nf_mult = 1
99
+ nf_mult_prev = 1
100
+ for n in range(1, n_layers): # gradually increase the number of filters
101
+ nf_mult_prev = nf_mult
102
+ nf_mult = min(2 ** n, 8)
103
+ sequence += [
104
+ nn.Conv3d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=(kw, kw, kw), stride=(1,2,2), padding=padw, bias=use_bias),
105
+ norm_layer(ndf * nf_mult),
106
+ nn.LeakyReLU(0.2, True)
107
+ ]
108
+
109
+ nf_mult_prev = nf_mult
110
+ nf_mult = min(2 ** n_layers, 8)
111
+ sequence += [
112
+ nn.Conv3d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=(kw, kw, kw), stride=1, padding=padw, bias=use_bias),
113
+ norm_layer(ndf * nf_mult),
114
+ nn.LeakyReLU(0.2, True)
115
+ ]
116
+
117
+ sequence += [nn.Conv3d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)] # output 1 channel prediction map
118
+ self.main = nn.Sequential(*sequence)
119
+
120
+ def forward(self, input):
121
+ """Standard forward."""
122
+ return self.main(input)
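A small usage sketch for the two discriminators above, assuming the repository root is on the Python path so that `video_vae` imports as a package; the tensor sizes are arbitrary but large enough for the stacked 4x4 strided convolutions.

import torch
from video_vae.modeling_discriminator import (
    NLayerDiscriminator,
    NLayerDiscriminator3D,
    weights_init,
)

# 2D PatchGAN: scores individual frames, shape (batch, channels, height, width)
frames = torch.randn(4, 3, 64, 64)
disc_2d = NLayerDiscriminator(input_nc=3, ndf=64, n_layers=4).apply(weights_init)
print(disc_2d(frames).shape)   # patch-wise logits, (4, 1, 2, 2) for this input

# 3D PatchGAN: scores whole clips, shape (batch, channels, frames, height, width),
# so it can also penalise temporal artifacts
video = torch.randn(1, 3, 16, 64, 64)
disc_3d = NLayerDiscriminator3D(input_nc=3, ndf=64, n_layers=3).apply(weights_init)
print(disc_3d(video).shape)    # (1, 1, 4, 6, 6) for this input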
video_vae/modeling_enc_dec.py ADDED
@@ -0,0 +1,422 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Optional, Tuple
16
+
17
+ import numpy as np
18
+ import torch
19
+ import torch.nn as nn
20
+ from einops import rearrange
21
+
22
+ from diffusers.utils import BaseOutput, is_torch_version
23
+ from diffusers.utils.torch_utils import randn_tensor
24
+ from diffusers.models.attention_processor import SpatialNorm
25
+ from .modeling_block import (
26
+ UNetMidBlock2D,
27
+ CausalUNetMidBlock2D,
28
+ get_down_block,
29
+ get_up_block,
30
+ get_input_layer,
31
+ get_output_layer,
32
+ )
33
+ from .modeling_resnet import (
34
+ Downsample2D,
35
+ Upsample2D,
36
+ TemporalDownsample2x,
37
+ TemporalUpsample2x,
38
+ )
39
+ from .modeling_causal_conv import CausalConv3d, CausalGroupNorm
40
+
41
+
42
+ @dataclass
43
+ class DecoderOutput(BaseOutput):
44
+ r"""
45
+ Output of decoding method.
46
+
47
+ Args:
48
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
49
+ The decoded output sample from the last layer of the model.
50
+ """
51
+
52
+ sample: torch.FloatTensor
53
+
54
+
55
+ class CausalVaeEncoder(nn.Module):
56
+ r"""
57
+ The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.
58
+
59
+ Args:
60
+ in_channels (`int`, *optional*, defaults to 3):
61
+ The number of input channels.
62
+ out_channels (`int`, *optional*, defaults to 3):
63
+ The number of output channels.
64
+ down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
65
+ The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available
66
+ options.
67
+ block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
68
+ The number of output channels for each block.
69
+ layers_per_block (`int`, *optional*, defaults to 2):
70
+ The number of layers per block.
71
+ norm_num_groups (`int`, *optional*, defaults to 32):
72
+ The number of groups for normalization.
73
+ act_fn (`str`, *optional*, defaults to `"silu"`):
74
+ The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
75
+ double_z (`bool`, *optional*, defaults to `True`):
76
+ Whether to double the number of output channels for the last block.
77
+ """
78
+
79
+ def __init__(
80
+ self,
81
+ in_channels: int = 3,
82
+ out_channels: int = 3,
83
+ down_block_types: Tuple[str, ...] = ("DownEncoderBlockCausal3D",),
84
+ spatial_down_sample: Tuple[bool, ...] = (True,),
85
+ temporal_down_sample: Tuple[bool, ...] = (False,),
86
+ block_out_channels: Tuple[int, ...] = (64,),
87
+ layers_per_block: Tuple[int, ...] = (2,),
88
+ norm_num_groups: int = 32,
89
+ act_fn: str = "silu",
90
+ double_z: bool = True,
91
+ block_dropout: Tuple[int, ...] = (0.0,),
92
+ mid_block_add_attention=True,
93
+ ):
94
+ super().__init__()
95
+ self.layers_per_block = layers_per_block
96
+
97
+ self.conv_in = CausalConv3d(
98
+ in_channels,
99
+ block_out_channels[0],
100
+ kernel_size=3,
101
+ stride=1,
102
+ )
103
+
104
+ self.mid_block = None
105
+ self.down_blocks = nn.ModuleList([])
106
+
107
+ # down
108
+ output_channel = block_out_channels[0]
109
+ for i, down_block_type in enumerate(down_block_types):
110
+ input_channel = output_channel
111
+ output_channel = block_out_channels[i]
112
+
113
+ down_block = get_down_block(
114
+ down_block_type,
115
+ num_layers=self.layers_per_block[i],
116
+ in_channels=input_channel,
117
+ out_channels=output_channel,
118
+ add_spatial_downsample=spatial_down_sample[i],
119
+ add_temporal_downsample=temporal_down_sample[i],
120
+ resnet_eps=1e-6,
121
+ downsample_padding=0,
122
+ resnet_act_fn=act_fn,
123
+ resnet_groups=norm_num_groups,
124
+ attention_head_dim=output_channel,
125
+ temb_channels=None,
126
+ dropout=block_dropout[i],
127
+ )
128
+ self.down_blocks.append(down_block)
129
+
130
+ # mid
131
+ self.mid_block = CausalUNetMidBlock2D(
132
+ in_channels=block_out_channels[-1],
133
+ resnet_eps=1e-6,
134
+ resnet_act_fn=act_fn,
135
+ output_scale_factor=1,
136
+ resnet_time_scale_shift="default",
137
+ attention_head_dim=block_out_channels[-1],
138
+ resnet_groups=norm_num_groups,
139
+ temb_channels=None,
140
+ add_attention=mid_block_add_attention,
141
+ dropout=block_dropout[-1],
142
+ )
143
+
144
+ # out
145
+
146
+ self.conv_norm_out = CausalGroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
147
+ self.conv_act = nn.SiLU()
148
+
149
+ conv_out_channels = 2 * out_channels if double_z else out_channels
150
+ self.conv_out = CausalConv3d(block_out_channels[-1], conv_out_channels, kernel_size=3, stride=1)
151
+
152
+ self.gradient_checkpointing = False
153
+
154
+ def forward(self, sample: torch.FloatTensor, is_init_image=True, temporal_chunk=False) -> torch.FloatTensor:
155
+ r"""The forward method of the `Encoder` class."""
156
+
157
+ sample = self.conv_in(sample, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
158
+
159
+ if self.training and self.gradient_checkpointing:
160
+
161
+ def create_custom_forward(module):
162
+ def custom_forward(*inputs):
163
+ return module(*inputs)
164
+
165
+ return custom_forward
166
+
167
+ # down
168
+ if is_torch_version(">=", "1.11.0"):
169
+ for down_block in self.down_blocks:
170
+ sample = torch.utils.checkpoint.checkpoint(
171
+ create_custom_forward(down_block), sample, is_init_image,
172
+ temporal_chunk, use_reentrant=False
173
+ )
174
+ # middle
175
+ sample = torch.utils.checkpoint.checkpoint(
176
+ create_custom_forward(self.mid_block), sample, is_init_image,
177
+ temporal_chunk, use_reentrant=False
178
+ )
179
+ else:
180
+ for down_block in self.down_blocks:
181
+ sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample, is_init_image, temporal_chunk)
182
+ # middle
183
+ sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample, is_init_image, temporal_chunk)
184
+
185
+ else:
186
+ # down
187
+ for down_block in self.down_blocks:
188
+ sample = down_block(sample, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
189
+
190
+ # middle
191
+ sample = self.mid_block(sample, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
192
+
193
+ # post-process
194
+ sample = self.conv_norm_out(sample)
195
+ sample = self.conv_act(sample)
196
+ sample = self.conv_out(sample, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
197
+
198
+ return sample
199
+
200
+
201
+ class CausalVaeDecoder(nn.Module):
202
+ r"""
203
+ The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.
204
+
205
+ Args:
206
+ in_channels (`int`, *optional*, defaults to 3):
207
+ The number of input channels.
208
+ out_channels (`int`, *optional*, defaults to 3):
209
+ The number of output channels.
210
+ up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
211
+ The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
212
+ block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
213
+ The number of output channels for each block.
214
+ layers_per_block (`int`, *optional*, defaults to 2):
215
+ The number of layers per block.
216
+ norm_num_groups (`int`, *optional*, defaults to 32):
217
+ The number of groups for normalization.
218
+ act_fn (`str`, *optional*, defaults to `"silu"`):
219
+ The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
220
+ norm_type (`str`, *optional*, defaults to `"group"`):
221
+ The normalization type to use. Can be either `"group"` or `"spatial"`.
222
+ """
223
+
224
+ def __init__(
225
+ self,
226
+ in_channels: int = 3,
227
+ out_channels: int = 3,
228
+ up_block_types: Tuple[str, ...] = ("UpDecoderBlockCausal3D",),
229
+ spatial_up_sample: Tuple[bool, ...] = (True,),
230
+ temporal_up_sample: Tuple[bool, ...] = (False,),
231
+ block_out_channels: Tuple[int, ...] = (64,),
232
+ layers_per_block: Tuple[int, ...] = (2,),
233
+ norm_num_groups: int = 32,
234
+ act_fn: str = "silu",
235
+ mid_block_add_attention=True,
236
+ interpolate: bool = True,
237
+ block_dropout: Tuple[int, ...] = (0.0,),
238
+ ):
239
+ super().__init__()
240
+ self.layers_per_block = layers_per_block
241
+
242
+ self.conv_in = CausalConv3d(
243
+ in_channels,
244
+ block_out_channels[-1],
245
+ kernel_size=3,
246
+ stride=1,
247
+ )
248
+
249
+ self.mid_block = None
250
+ self.up_blocks = nn.ModuleList([])
251
+
252
+ # mid
253
+ self.mid_block = CausalUNetMidBlock2D(
254
+ in_channels=block_out_channels[-1],
255
+ resnet_eps=1e-6,
256
+ resnet_act_fn=act_fn,
257
+ output_scale_factor=1,
258
+ resnet_time_scale_shift="default",
259
+ attention_head_dim=block_out_channels[-1],
260
+ resnet_groups=norm_num_groups,
261
+ temb_channels=None,
262
+ add_attention=mid_block_add_attention,
263
+ dropout=block_dropout[-1],
264
+ )
265
+
266
+ # up
267
+ reversed_block_out_channels = list(reversed(block_out_channels))
268
+ output_channel = reversed_block_out_channels[0]
269
+ for i, up_block_type in enumerate(up_block_types):
270
+ prev_output_channel = output_channel
271
+ output_channel = reversed_block_out_channels[i]
272
+
273
+ is_final_block = i == len(block_out_channels) - 1
274
+
275
+ up_block = get_up_block(
276
+ up_block_type,
277
+ num_layers=self.layers_per_block[i],
278
+ in_channels=prev_output_channel,
279
+ out_channels=output_channel,
280
+ prev_output_channel=None,
281
+ add_spatial_upsample=spatial_up_sample[i],
282
+ add_temporal_upsample=temporal_up_sample[i],
283
+ resnet_eps=1e-6,
284
+ resnet_act_fn=act_fn,
285
+ resnet_groups=norm_num_groups,
286
+ attention_head_dim=output_channel,
287
+ temb_channels=None,
288
+ resnet_time_scale_shift='default',
289
+ interpolate=interpolate,
290
+ dropout=block_dropout[i],
291
+ )
292
+ self.up_blocks.append(up_block)
293
+ prev_output_channel = output_channel
294
+
295
+ # out
296
+ self.conv_norm_out = CausalGroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
297
+ self.conv_act = nn.SiLU()
298
+ self.conv_out = CausalConv3d(block_out_channels[0], out_channels, kernel_size=3, stride=1)
299
+
300
+ self.gradient_checkpointing = False
301
+
302
+ def forward(
303
+ self,
304
+ sample: torch.FloatTensor,
305
+ is_init_image=True,
306
+ temporal_chunk=False,
307
+ ) -> torch.FloatTensor:
308
+ r"""The forward method of the `Decoder` class."""
309
+
310
+ sample = self.conv_in(sample, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
311
+
312
+ upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
313
+ if self.training and self.gradient_checkpointing:
314
+
315
+ def create_custom_forward(module):
316
+ def custom_forward(*inputs):
317
+ return module(*inputs)
318
+
319
+ return custom_forward
320
+
321
+ if is_torch_version(">=", "1.11.0"):
322
+ # middle
323
+ sample = torch.utils.checkpoint.checkpoint(
324
+ create_custom_forward(self.mid_block),
325
+ sample,
326
+ is_init_image=is_init_image,
327
+ temporal_chunk=temporal_chunk,
328
+ use_reentrant=False,
329
+ )
330
+ sample = sample.to(upscale_dtype)
331
+
332
+ # up
333
+ for up_block in self.up_blocks:
334
+ sample = torch.utils.checkpoint.checkpoint(
335
+ create_custom_forward(up_block),
336
+ sample,
337
+ is_init_image=is_init_image,
338
+ temporal_chunk=temporal_chunk,
339
+ use_reentrant=False,
340
+ )
341
+ else:
342
+ # middle
343
+ sample = torch.utils.checkpoint.checkpoint(
344
+ create_custom_forward(self.mid_block), sample, is_init_image=is_init_image, temporal_chunk=temporal_chunk,
345
+ )
346
+ sample = sample.to(upscale_dtype)
347
+
348
+ # up
349
+ for up_block in self.up_blocks:
350
+ sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample,
351
+ is_init_image=is_init_image, temporal_chunk=temporal_chunk,)
352
+ else:
353
+ # middle
354
+ sample = self.mid_block(sample, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
355
+ sample = sample.to(upscale_dtype)
356
+
357
+ # up
358
+ for up_block in self.up_blocks:
359
+ sample = up_block(sample, is_init_image=is_init_image, temporal_chunk=temporal_chunk,)
360
+
361
+ # post-process
362
+ sample = self.conv_norm_out(sample)
363
+ sample = self.conv_act(sample)
364
+ sample = self.conv_out(sample, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
365
+
366
+ return sample
367
+
368
+
369
+ class DiagonalGaussianDistribution(object):
370
+ def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
371
+ self.parameters = parameters
372
+ self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
373
+ self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
374
+ self.deterministic = deterministic
375
+ self.std = torch.exp(0.5 * self.logvar)
376
+ self.var = torch.exp(self.logvar)
377
+ if self.deterministic:
378
+ self.var = self.std = torch.zeros_like(
379
+ self.mean, device=self.parameters.device, dtype=self.parameters.dtype
380
+ )
381
+
382
+ def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
383
+ # make sure sample is on the same device as the parameters and has same dtype
384
+ sample = randn_tensor(
385
+ self.mean.shape,
386
+ generator=generator,
387
+ device=self.parameters.device,
388
+ dtype=self.parameters.dtype,
389
+ )
390
+ x = self.mean + self.std * sample
391
+ return x
392
+
393
+ def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor:
394
+ if self.deterministic:
395
+ return torch.Tensor([0.0])
396
+ else:
397
+ if other is None:
398
+ return 0.5 * torch.sum(
399
+ torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
400
+ dim=[2, 3, 4],
401
+ )
402
+ else:
403
+ return 0.5 * torch.sum(
404
+ torch.pow(self.mean - other.mean, 2) / other.var
405
+ + self.var / other.var
406
+ - 1.0
407
+ - self.logvar
408
+ + other.logvar,
409
+ dim=[2, 3, 4],
410
+ )
411
+
412
+ def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor:
413
+ if self.deterministic:
414
+ return torch.Tensor([0.0])
415
+ logtwopi = np.log(2.0 * np.pi)
416
+ return 0.5 * torch.sum(
417
+ logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
418
+ dim=dims,
419
+ )
420
+
421
+ def mode(self) -> torch.Tensor:
422
+ return self.mean
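As a quick reference for the posterior class above, a minimal sketch; the shapes are illustrative and `video_vae` is again assumed to be importable as a package.

import torch
from video_vae.modeling_enc_dec import DiagonalGaussianDistribution

# `moments` is what quant_conv produces: mean and logvar concatenated on dim 1
moments = torch.randn(2, 2 * 16, 4, 8, 8)   # (batch, 2 * latent_channels, frames, h, w)
posterior = DiagonalGaussianDistribution(moments)

z = posterior.sample()    # mean + std * eps, shape (2, 16, 4, 8, 8)
z_det = posterior.mode()  # deterministic: just the mean
kl = posterior.kl()       # KL to N(0, I), summed over (frames, h, w) -> shape (2, 16)
print(z.shape, kl.shape)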
video_vae/modeling_loss.py ADDED
@@ -0,0 +1,192 @@
1
+ import os
2
+ import torch
3
+ from torch import nn
4
+ import torch.nn.functional as F
5
+ from einops import rearrange
6
+ from .modeling_lpips import LPIPS
7
+ from .modeling_discriminator import NLayerDiscriminator, NLayerDiscriminator3D, weights_init
8
+ from IPython import embed
9
+
10
+
11
+ class AdaptiveLossWeight:
12
+ def __init__(self, timestep_range=[0, 1], buckets=300, weight_range=[1e-7, 1e7]):
13
+ self.bucket_ranges = torch.linspace(timestep_range[0], timestep_range[1], buckets-1)
14
+ self.bucket_losses = torch.ones(buckets)
15
+ self.weight_range = weight_range
16
+
17
+ def weight(self, timestep):
18
+ indices = torch.searchsorted(self.bucket_ranges.to(timestep.device), timestep)
19
+ return (1/self.bucket_losses.to(timestep.device)[indices]).clamp(*self.weight_range)
20
+
21
+ def update_buckets(self, timestep, loss, beta=0.99):
22
+ indices = torch.searchsorted(self.bucket_ranges.to(timestep.device), timestep).cpu()
23
+ self.bucket_losses[indices] = self.bucket_losses[indices]*beta + loss.detach().cpu() * (1-beta)
24
+
25
+
26
+ def hinge_d_loss(logits_real, logits_fake):
27
+ loss_real = torch.mean(F.relu(1.0 - logits_real))
28
+ loss_fake = torch.mean(F.relu(1.0 + logits_fake))
29
+ d_loss = 0.5 * (loss_real + loss_fake)
30
+ return d_loss
31
+
32
+
33
+ def vanilla_d_loss(logits_real, logits_fake):
34
+ d_loss = 0.5 * (
35
+ torch.mean(torch.nn.functional.softplus(-logits_real))
36
+ + torch.mean(torch.nn.functional.softplus(logits_fake))
37
+ )
38
+ return d_loss
39
+
40
+
41
+ def adopt_weight(weight, global_step, threshold=0, value=0.0):
42
+ if global_step < threshold:
43
+ weight = value
44
+ return weight
45
+
46
+
47
+ class LPIPSWithDiscriminator(nn.Module):
48
+ def __init__(
49
+ self,
50
+ disc_start,
51
+ logvar_init=0.0,
52
+ kl_weight=1.0,
53
+ pixelloss_weight=1.0,
54
+ perceptual_weight=1.0,
55
+ # --- Discriminator Loss ---
56
+ disc_num_layers=4,
57
+ disc_in_channels=3,
58
+ disc_factor=1.0,
59
+ disc_weight=0.5,
60
+ disc_loss="hinge",
61
+ add_discriminator=True,
62
+ using_3d_discriminator=False,
63
+ ):
64
+
65
+ super().__init__()
66
+ assert disc_loss in ["hinge", "vanilla"]
67
+ self.kl_weight = kl_weight
68
+ self.pixel_weight = pixelloss_weight
69
+ self.perceptual_loss = LPIPS().eval()
70
+ self.perceptual_weight = perceptual_weight
71
+ self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)
72
+
73
+ if add_discriminator:
74
+ disc_cls = NLayerDiscriminator3D if using_3d_discriminator else NLayerDiscriminator
75
+ self.discriminator = disc_cls(
76
+ input_nc=disc_in_channels, n_layers=disc_num_layers,
77
+ ).apply(weights_init)
78
+ else:
79
+ self.discriminator = None
80
+
81
+ self.discriminator_iter_start = disc_start
82
+ self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss
83
+ self.disc_factor = disc_factor
84
+ self.discriminator_weight = disc_weight
85
+ self.using_3d_discriminator = using_3d_discriminator
86
+
87
+ def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
88
+ if last_layer is not None:
89
+ nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
90
+ g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
91
+ else:
92
+ nll_grads = torch.autograd.grad(
93
+ nll_loss, self.last_layer[0], retain_graph=True
94
+ )[0]
95
+ g_grads = torch.autograd.grad(
96
+ g_loss, self.last_layer[0], retain_graph=True
97
+ )[0]
98
+
99
+ d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
100
+ d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
101
+ d_weight = d_weight * self.discriminator_weight
102
+ return d_weight
103
+
104
+ def forward(
105
+ self,
106
+ inputs,
107
+ reconstructions,
108
+ posteriors,
109
+ optimizer_idx,
110
+ global_step,
111
+ split="train",
112
+ last_layer=None,
113
+ ):
114
+ t = reconstructions.shape[2]
115
+ inputs = rearrange(inputs, "b c t h w -> (b t) c h w").contiguous()
116
+ reconstructions = rearrange(reconstructions, "b c t h w -> (b t) c h w").contiguous()
117
+
118
+ if optimizer_idx == 0:
119
+ # rec_loss = torch.mean(torch.abs(inputs - reconstructions), dim=(1,2,3), keepdim=True)
120
+ rec_loss = torch.mean(F.mse_loss(inputs, reconstructions, reduction='none'), dim=(1,2,3), keepdim=True)
121
+
122
+ if self.perceptual_weight > 0:
123
+ p_loss = self.perceptual_loss(inputs, reconstructions)
124
+ nll_loss = self.pixel_weight * rec_loss + self.perceptual_weight * p_loss
+ else:
+ p_loss = torch.zeros_like(rec_loss)
+ nll_loss = self.pixel_weight * rec_loss
125
+
126
+ nll_loss = nll_loss / torch.exp(self.logvar) + self.logvar
127
+ weighted_nll_loss = nll_loss
128
+ weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
129
+ nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
130
+
131
+ kl_loss = posteriors.kl()
132
+ kl_loss = torch.mean(kl_loss)
133
+
134
+ disc_factor = adopt_weight(
135
+ self.disc_factor, global_step, threshold=self.discriminator_iter_start
136
+ )
137
+
138
+ if disc_factor > 0.0:
139
+ if self.using_3d_discriminator:
140
+ reconstructions = rearrange(reconstructions, '(b t) c h w -> b c t h w', t=t)
141
+
142
+ logits_fake = self.discriminator(reconstructions.contiguous())
143
+ g_loss = -torch.mean(logits_fake)
144
+ try:
145
+ d_weight = self.calculate_adaptive_weight(
146
+ nll_loss, g_loss, last_layer=last_layer
147
+ )
148
+ except RuntimeError:
149
+ assert not self.training
150
+ d_weight = torch.tensor(0.0)
151
+ else:
152
+ d_weight = torch.tensor(0.0)
153
+ g_loss = torch.tensor(0.0)
154
+
155
+
156
+ loss = (
157
+ weighted_nll_loss
158
+ + self.kl_weight * kl_loss
159
+ + d_weight * disc_factor * g_loss
160
+ )
161
+ log = {
162
+ "{}/total_loss".format(split): loss.clone().detach().mean(),
163
+ "{}/logvar".format(split): self.logvar.detach(),
164
+ "{}/kl_loss".format(split): kl_loss.detach().mean(),
165
+ "{}/nll_loss".format(split): nll_loss.detach().mean(),
166
+ "{}/rec_loss".format(split): rec_loss.detach().mean(),
167
+ "{}/perception_loss".format(split): p_loss.detach().mean(),
168
+ "{}/d_weight".format(split): d_weight.detach(),
169
+ "{}/disc_factor".format(split): torch.tensor(disc_factor),
170
+ "{}/g_loss".format(split): g_loss.detach().mean(),
171
+ }
172
+ return loss, log
173
+
174
+ if optimizer_idx == 1:
175
+ if self.using_3d_discriminator:
176
+ inputs = rearrange(inputs, '(b t) c h w -> b c t h w', t=t)
177
+ reconstructions = rearrange(reconstructions, '(b t) c h w -> b c t h w', t=t)
178
+
179
+ logits_real = self.discriminator(inputs.contiguous().detach())
180
+ logits_fake = self.discriminator(reconstructions.contiguous().detach())
181
+
182
+ disc_factor = adopt_weight(
183
+ self.disc_factor, global_step, threshold=self.discriminator_iter_start
184
+ )
185
+ d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
186
+
187
+ log = {
188
+ "{}/disc_loss".format(split): d_loss.clone().detach().mean(),
189
+ "{}/logits_real".format(split): logits_real.detach().mean(),
190
+ "{}/logits_fake".format(split): logits_fake.detach().mean(),
191
+ }
192
+ return d_loss, log
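The adversarial schedule above reduces to two small helpers: `adopt_weight` keeps the GAN term at zero until `disc_start` steps have passed, and `hinge_d_loss` / `vanilla_d_loss` score the discriminator. A minimal sketch, under the same package-import assumption as before:

import torch
from video_vae.modeling_loss import adopt_weight, hinge_d_loss, vanilla_d_loss

logits_real = torch.randn(4, 1, 6, 6) + 1.0   # discriminator logits on real frames
logits_fake = torch.randn(4, 1, 6, 6) - 1.0   # discriminator logits on reconstructions

d_hinge = hinge_d_loss(logits_real, logits_fake)      # 0.5 * (mean(relu(1 - real)) + mean(relu(1 + fake)))
d_vanilla = vanilla_d_loss(logits_real, logits_fake)  # softplus-based alternative

# the GAN terms stay disabled until global_step reaches the threshold step
print(adopt_weight(1.0, global_step=100, threshold=50001))    # 0.0
print(adopt_weight(1.0, global_step=60000, threshold=50001))  # 1.0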
video_vae/modeling_lpips.py ADDED
@@ -0,0 +1,120 @@
1
+ """Stripped version of https://github.com/richzhang/PerceptualSimilarity/tree/master/models"""
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torchvision import models
6
+ from collections import namedtuple
7
+
8
+
9
+ class LPIPS(nn.Module):
10
+ # Learned perceptual metric
11
+ def __init__(self, use_dropout=True):
12
+ super().__init__()
13
+ self.scaling_layer = ScalingLayer()
14
+ self.chns = [64, 128, 256, 512, 512] # vgg16 features
15
+ self.net = vgg16(pretrained=False, requires_grad=False)
16
+ self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
17
+ self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
18
+ self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
19
+ self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
20
+ self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
21
+ self.load_from_pretrained()
22
+ for param in self.parameters():
23
+ param.requires_grad = False
24
+
25
+ def load_from_pretrained(self):
26
+ ckpt = "/home/jinyang/models/vae/video_vae_baseline/vgg_lpips.pth" # replace with your lpips
27
+ self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=True)
28
+ print("loaded pretrained LPIPS loss from {}".format(ckpt))
29
+
30
+ def forward(self, input, target):
31
+ in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target))
32
+ outs0, outs1 = self.net(in0_input), self.net(in1_input)
33
+ feats0, feats1, diffs = {}, {}, {}
34
+ lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
35
+ for kk in range(len(self.chns)):
36
+ feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
37
+ diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
38
+
39
+ res = [spatial_average(lins[kk].model(diffs[kk]), keepdim=True) for kk in range(len(self.chns))]
40
+ val = res[0]
41
+ for l in range(1, len(self.chns)):
42
+ val += res[l]
43
+ return val
44
+
45
+
46
+ class ScalingLayer(nn.Module):
47
+ def __init__(self):
48
+ super(ScalingLayer, self).__init__()
49
+ self.register_buffer('shift', torch.Tensor([-.030, -.088, -.188])[None, :, None, None])
50
+ self.register_buffer('scale', torch.Tensor([.458, .448, .450])[None, :, None, None])
51
+
52
+ def forward(self, inp):
53
+ return (inp - self.shift) / self.scale
54
+
55
+
56
+ class NetLinLayer(nn.Module):
57
+ """ A single linear layer which does a 1x1 conv """
58
+ def __init__(self, chn_in, chn_out=1, use_dropout=False):
59
+ super(NetLinLayer, self).__init__()
60
+ layers = [nn.Dropout(), ] if (use_dropout) else []
61
+ layers += [nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False), ]
62
+ self.model = nn.Sequential(*layers)
63
+
64
+
65
+ class vgg16(torch.nn.Module):
66
+ def __init__(self, requires_grad=False, pretrained=True):
67
+ super(vgg16, self).__init__()
68
+ vgg_pretrained_features = models.vgg16(pretrained=pretrained).features
69
+ self.slice1 = torch.nn.Sequential()
70
+ self.slice2 = torch.nn.Sequential()
71
+ self.slice3 = torch.nn.Sequential()
72
+ self.slice4 = torch.nn.Sequential()
73
+ self.slice5 = torch.nn.Sequential()
74
+ self.N_slices = 5
75
+ for x in range(4):
76
+ self.slice1.add_module(str(x), vgg_pretrained_features[x])
77
+ for x in range(4, 9):
78
+ self.slice2.add_module(str(x), vgg_pretrained_features[x])
79
+ for x in range(9, 16):
80
+ self.slice3.add_module(str(x), vgg_pretrained_features[x])
81
+ for x in range(16, 23):
82
+ self.slice4.add_module(str(x), vgg_pretrained_features[x])
83
+ for x in range(23, 30):
84
+ self.slice5.add_module(str(x), vgg_pretrained_features[x])
85
+ if not requires_grad:
86
+ for param in self.parameters():
87
+ param.requires_grad = False
88
+
89
+ def forward(self, X):
90
+ h = self.slice1(X)
91
+ h_relu1_2 = h
92
+ h = self.slice2(h)
93
+ h_relu2_2 = h
94
+ h = self.slice3(h)
95
+ h_relu3_3 = h
96
+ h = self.slice4(h)
97
+ h_relu4_3 = h
98
+ h = self.slice5(h)
99
+ h_relu5_3 = h
100
+ vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3', 'relu5_3'])
101
+ out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
102
+ return out
103
+
104
+
105
+ def normalize_tensor(x,eps=1e-10):
106
+ norm_factor = torch.sqrt(torch.sum(x**2,dim=1,keepdim=True))
107
+ return x/(norm_factor+eps)
108
+
109
+
110
+ def spatial_average(x, keepdim=True):
111
+ return x.mean([2,3],keepdim=keepdim)
112
+
113
+
114
+ if __name__ == "__main__":
115
+ model = LPIPS().eval()
116
+ _ = torch.manual_seed(123)
117
+ img1 = (torch.rand(10, 3, 100, 100) * 2) - 1
118
+ img2 = (torch.rand(10, 3, 100, 100) * 2) - 1
119
+ print(model(img1, img2).shape)
120
+ # embed()
video_vae/modeling_resnet.py ADDED
@@ -0,0 +1,729 @@
1
+ from functools import partial
2
+ from typing import Optional, Tuple, Union
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+ from diffusers.models.activations import get_activation
9
+ from diffusers.models.attention_processor import SpatialNorm
10
+ from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
11
+ from diffusers.models.normalization import AdaGroupNorm
12
+ from timm.models.layers import drop_path, to_2tuple, trunc_normal_
13
+ from .modeling_causal_conv import CausalConv3d, CausalGroupNorm
14
+
15
+
16
+ class CausalResnetBlock3D(nn.Module):
17
+ r"""
18
+ A Resnet block.
19
+
20
+ Parameters:
21
+ in_channels (`int`): The number of channels in the input.
22
+ out_channels (`int`, *optional*, default to be `None`):
23
+ The number of output channels for the first conv2d layer. If None, same as `in_channels`.
24
+ dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
25
+ temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
26
+ groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
27
+ groups_out (`int`, *optional*, default to None):
28
+ The number of groups to use for the second normalization layer. if set to None, same as `groups`.
29
+ eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
30
+ non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
31
+ time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
32
+ By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" or
33
+ "ada_group" for a stronger conditioning with scale and shift.
34
+ kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see
35
+ [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
36
+ output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
37
+ use_in_shortcut (`bool`, *optional*, default to `True`):
38
+ If `True`, add a 1x1 nn.conv2d layer for skip-connection.
39
+ up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
40
+ down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
41
+ conv_shortcut_bias (`bool`, *optional*, default to `True`): If `True`, adds a learnable bias to the
42
+ `conv_shortcut` output.
43
+ conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
44
+ If None, same as `out_channels`.
45
+ """
46
+
47
+ def __init__(
48
+ self,
49
+ *,
50
+ in_channels: int,
51
+ out_channels: Optional[int] = None,
52
+ conv_shortcut: bool = False,
53
+ dropout: float = 0.0,
54
+ temb_channels: int = 512,
55
+ groups: int = 32,
56
+ groups_out: Optional[int] = None,
57
+ pre_norm: bool = True,
58
+ eps: float = 1e-6,
59
+ non_linearity: str = "swish",
60
+ time_embedding_norm: str = "default", # default, scale_shift, ada_group, spatial
61
+ output_scale_factor: float = 1.0,
62
+ use_in_shortcut: Optional[bool] = None,
63
+ conv_shortcut_bias: bool = True,
64
+ conv_2d_out_channels: Optional[int] = None,
65
+ ):
66
+ super().__init__()
67
+ self.pre_norm = pre_norm
68
+ self.pre_norm = True
69
+ self.in_channels = in_channels
70
+ out_channels = in_channels if out_channels is None else out_channels
71
+ self.out_channels = out_channels
72
+ self.use_conv_shortcut = conv_shortcut
73
+ self.output_scale_factor = output_scale_factor
74
+ self.time_embedding_norm = time_embedding_norm
75
+
76
+ linear_cls = nn.Linear
77
+
78
+ if groups_out is None:
79
+ groups_out = groups
80
+
81
+ if self.time_embedding_norm == "ada_group":
82
+ self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps)
83
+ elif self.time_embedding_norm == "spatial":
84
+ self.norm1 = SpatialNorm(in_channels, temb_channels)
85
+ else:
86
+ self.norm1 = CausalGroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
87
+
88
+ self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, stride=1)
89
+
90
+ if self.time_embedding_norm == "ada_group":
91
+ self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps)
92
+ elif self.time_embedding_norm == "spatial":
93
+ self.norm2 = SpatialNorm(out_channels, temb_channels)
94
+ else:
95
+ self.norm2 = CausalGroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
96
+
97
+ self.dropout = torch.nn.Dropout(dropout)
98
+ conv_2d_out_channels = conv_2d_out_channels or out_channels
99
+ self.conv2 = CausalConv3d(out_channels, conv_2d_out_channels, kernel_size=3, stride=1)
100
+
101
+ self.nonlinearity = get_activation(non_linearity)
102
+ self.upsample = self.downsample = None
103
+ self.use_in_shortcut = self.in_channels != conv_2d_out_channels if use_in_shortcut is None else use_in_shortcut
104
+
105
+ self.conv_shortcut = None
106
+ if self.use_in_shortcut:
107
+ self.conv_shortcut = CausalConv3d(
108
+ in_channels,
109
+ conv_2d_out_channels,
110
+ kernel_size=1,
111
+ stride=1,
112
+ bias=conv_shortcut_bias,
113
+ )
114
+
115
+ def forward(
116
+ self,
117
+ input_tensor: torch.FloatTensor,
118
+ temb: torch.FloatTensor = None,
119
+ is_init_image=True,
120
+ temporal_chunk=False,
121
+ ) -> torch.FloatTensor:
122
+ hidden_states = input_tensor
123
+
124
+ if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
125
+ hidden_states = self.norm1(hidden_states, temb)
126
+ else:
127
+ hidden_states = self.norm1(hidden_states)
128
+
129
+ hidden_states = self.nonlinearity(hidden_states)
130
+
131
+ hidden_states = self.conv1(hidden_states, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
132
+
133
+ if temb is not None and self.time_embedding_norm == "default":
134
+ hidden_states = hidden_states + temb
135
+
136
+ if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
137
+ hidden_states = self.norm2(hidden_states, temb)
138
+ else:
139
+ hidden_states = self.norm2(hidden_states)
140
+
141
+ hidden_states = self.nonlinearity(hidden_states)
142
+ hidden_states = self.dropout(hidden_states)
143
+ hidden_states = self.conv2(hidden_states, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
144
+
145
+ if self.conv_shortcut is not None:
146
+ input_tensor = self.conv_shortcut(input_tensor, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
147
+
148
+ output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
149
+
150
+ return output_tensor
151
+
152
+
153
+ class ResnetBlock2D(nn.Module):
154
+ r"""
155
+ A Resnet block.
156
+
157
+ Parameters:
158
+ in_channels (`int`): The number of channels in the input.
159
+ out_channels (`int`, *optional*, default to be `None`):
160
+ The number of output channels for the first conv2d layer. If None, same as `in_channels`.
161
+ dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
162
+ temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
163
+ groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
164
+ groups_out (`int`, *optional*, default to None):
165
+ The number of groups to use for the second normalization layer. if set to None, same as `groups`.
166
+ eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
167
+ non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
168
+ time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
169
+ By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" or
170
+ "ada_group" for a stronger conditioning with scale and shift.
171
+ kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see
172
+ [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
173
+ output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
174
+ use_in_shortcut (`bool`, *optional*, default to `True`):
175
+ If `True`, add a 1x1 nn.conv2d layer for skip-connection.
176
+ up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
177
+ down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
178
+ conv_shortcut_bias (`bool`, *optional*, default to `True`): If `True`, adds a learnable bias to the
179
+ `conv_shortcut` output.
180
+ conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
181
+ If None, same as `out_channels`.
182
+ """
183
+
184
+ def __init__(
185
+ self,
186
+ *,
187
+ in_channels: int,
188
+ out_channels: Optional[int] = None,
189
+ conv_shortcut: bool = False,
190
+ dropout: float = 0.0,
191
+ temb_channels: int = 512,
192
+ groups: int = 32,
193
+ groups_out: Optional[int] = None,
194
+ pre_norm: bool = True,
195
+ eps: float = 1e-6,
196
+ non_linearity: str = "swish",
197
+ time_embedding_norm: str = "default", # default, scale_shift, ada_group, spatial
198
+ output_scale_factor: float = 1.0,
199
+ use_in_shortcut: Optional[bool] = None,
200
+ conv_shortcut_bias: bool = True,
201
+ conv_2d_out_channels: Optional[int] = None,
202
+ ):
203
+ super().__init__()
204
+ self.pre_norm = pre_norm
205
+ self.pre_norm = True
206
+ self.in_channels = in_channels
207
+ out_channels = in_channels if out_channels is None else out_channels
208
+ self.out_channels = out_channels
209
+ self.use_conv_shortcut = conv_shortcut
210
+ self.output_scale_factor = output_scale_factor
211
+ self.time_embedding_norm = time_embedding_norm
212
+
213
+ linear_cls = nn.Linear
214
+ conv_cls = nn.Conv3d
215
+
216
+ if groups_out is None:
217
+ groups_out = groups
218
+
219
+ if self.time_embedding_norm == "ada_group":
220
+ self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps)
221
+ elif self.time_embedding_norm == "spatial":
222
+ self.norm1 = SpatialNorm(in_channels, temb_channels)
223
+ else:
224
+ self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
225
+
226
+ self.conv1 = conv_cls(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
227
+
228
+ if self.time_embedding_norm == "ada_group":
229
+ self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps)
230
+ elif self.time_embedding_norm == "spatial":
231
+ self.norm2 = SpatialNorm(out_channels, temb_channels)
232
+ else:
233
+ self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
234
+
235
+ self.dropout = torch.nn.Dropout(dropout)
236
+ conv_2d_out_channels = conv_2d_out_channels or out_channels
237
+ self.conv2 = conv_cls(out_channels, conv_2d_out_channels, kernel_size=3, stride=1, padding=1)
238
+
239
+ self.nonlinearity = get_activation(non_linearity)
240
+ self.upsample = self.downsample = None
241
+ self.use_in_shortcut = self.in_channels != conv_2d_out_channels if use_in_shortcut is None else use_in_shortcut
242
+
243
+ self.conv_shortcut = None
244
+ if self.use_in_shortcut:
245
+ self.conv_shortcut = conv_cls(
246
+ in_channels,
247
+ conv_2d_out_channels,
248
+ kernel_size=1,
249
+ stride=1,
250
+ padding=0,
251
+ bias=conv_shortcut_bias,
252
+ )
253
+
254
+ def forward(
255
+ self,
256
+ input_tensor: torch.FloatTensor,
257
+ temb: torch.FloatTensor = None,
258
+ scale: float = 1.0,
259
+ ) -> torch.FloatTensor:
260
+ hidden_states = input_tensor
261
+
262
+ if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
263
+ hidden_states = self.norm1(hidden_states, temb)
264
+ else:
265
+ hidden_states = self.norm1(hidden_states)
266
+
267
+ hidden_states = self.nonlinearity(hidden_states)
268
+
269
+ hidden_states = self.conv1(hidden_states)
270
+
271
+ if temb is not None and self.time_embedding_norm == "default":
272
+ hidden_states = hidden_states + temb
273
+
274
+ if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
275
+ hidden_states = self.norm2(hidden_states, temb)
276
+ else:
277
+ hidden_states = self.norm2(hidden_states)
278
+
279
+ hidden_states = self.nonlinearity(hidden_states)
280
+ hidden_states = self.dropout(hidden_states)
281
+ hidden_states = self.conv2(hidden_states)
282
+
283
+ if self.conv_shortcut is not None:
284
+ input_tensor = self.conv_shortcut(input_tensor)
285
+
286
+ output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
287
+
288
+ return output_tensor
289
+
290
+
291
+ class CausalDownsample2x(nn.Module):
292
+ """A 2D downsampling layer with an optional convolution.
293
+
294
+ Parameters:
295
+ channels (`int`):
296
+ number of channels in the inputs and outputs.
297
+ use_conv (`bool`, default `False`):
298
+ option to use a convolution.
299
+ out_channels (`int`, optional):
300
+ number of output channels. Defaults to `channels`.
301
+ padding (`int`, default `1`):
302
+ padding for the convolution.
303
+ name (`str`, default `conv`):
304
+ name of the downsampling 2D layer.
305
+ """
306
+
307
+ def __init__(
308
+ self,
309
+ channels: int,
310
+ use_conv: bool = True,
311
+ out_channels: Optional[int] = None,
312
+ name: str = "conv",
313
+ kernel_size=3,
314
+ bias=True,
315
+ ):
316
+ super().__init__()
317
+ self.channels = channels
318
+ self.out_channels = out_channels or channels
319
+ self.use_conv = use_conv
320
+ stride = (1, 2, 2)
321
+ self.name = name
322
+
323
+ if use_conv:
324
+ conv = CausalConv3d(
325
+ self.channels, self.out_channels, kernel_size=kernel_size, stride=stride, bias=bias
326
+ )
327
+ else:
328
+ assert self.channels == self.out_channels
329
+ conv = nn.AvgPool3d(kernel_size=stride, stride=stride)
330
+
331
+ self.conv = conv
332
+
333
+ def forward(self, hidden_states: torch.FloatTensor, is_init_image=True, temporal_chunk=False) -> torch.FloatTensor:
334
+ assert hidden_states.shape[1] == self.channels
335
+ hidden_states = self.conv(hidden_states, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
336
+ return hidden_states
337
+
338
+
339
+ class Downsample2D(nn.Module):
+     """A spatial 2x downsampling layer with an optional convolution.
+
+     Parameters:
+         channels (`int`):
+             number of channels in the inputs and outputs.
+         use_conv (`bool`, default `True`):
+             option to use a strided convolution instead of average pooling.
+         out_channels (`int`, optional):
+             number of output channels. Defaults to `channels`.
+         padding (`int`, default `0`):
+             padding for the convolution.
+         name (`str`, default `"conv"`):
+             name of the downsampling layer.
+     """
+
+     def __init__(
+         self,
+         channels: int,
+         use_conv: bool = True,
+         out_channels: Optional[int] = None,
+         padding: int = 0,
+         name: str = "conv",
+         kernel_size=3,
+         bias=True,
+     ):
+         super().__init__()
+         self.channels = channels
+         self.out_channels = out_channels or channels
+         self.use_conv = use_conv
+         self.padding = padding
+         stride = (1, 2, 2)
+         self.name = name
+         conv_cls = nn.Conv3d
+
+         if use_conv:
+             conv = conv_cls(
+                 self.channels, self.out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=bias
+             )
+         else:
+             assert self.channels == self.out_channels
+             conv = nn.AvgPool3d(kernel_size=stride, stride=stride)
+
+         self.conv = conv
+
+     def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+         assert hidden_states.shape[1] == self.channels
+
+         if self.use_conv and self.padding == 0:
+             pad = (0, 1, 0, 1, 1, 1)
+             hidden_states = F.pad(hidden_states, pad, mode="constant", value=0)
+
+         hidden_states = self.conv(hidden_states)
+
+         return hidden_states
+
+
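+ # TemporalDownsample2x halves the temporal length with a stride-(2, 1, 1) Conv3d;
+ # single-frame inputs (T == 1) are padded on both temporal sides so they remain a
+ # single frame. It must be constructed with use_conv=True, since the non-conv
+ # branch is not implemented. Sketch:
+ #
+ #     down_t = TemporalDownsample2x(channels=128, use_conv=True)
+ #     y = down_t(torch.randn(1, 128, 8, 32, 32))  # expected (1, 128, 4, 32, 32)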
+ class TemporalDownsample2x(nn.Module):
+     """A temporal 2x downsampling layer with an optional convolution.
+
+     Parameters:
+         channels (`int`):
+             number of channels in the inputs and outputs.
+         use_conv (`bool`, default `False`):
+             option to use a strided convolution.
+         out_channels (`int`, optional):
+             number of output channels. Defaults to `channels`.
+         padding (`int`, default `0`):
+             padding for the convolution.
+         kernel_size (`int`, default `3`):
+             kernel size of the convolution.
+     """
+
+     def __init__(
+         self,
+         channels: int,
+         use_conv: bool = False,
+         out_channels: Optional[int] = None,
+         padding: int = 0,
+         kernel_size=3,
+         bias=True,
+     ):
+         super().__init__()
+         self.channels = channels
+         self.out_channels = out_channels or channels
+         self.use_conv = use_conv
+         self.padding = padding
+         stride = (2, 1, 1)
+
+         conv_cls = nn.Conv3d
+
+         if use_conv:
+             conv = conv_cls(
+                 self.channels, self.out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=bias
+             )
+         else:
+             raise NotImplementedError("Temporal downsampling without a convolution is not implemented.")
+
+         self.conv = conv
+
+     def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+         assert hidden_states.shape[1] == self.channels
+
+         if self.use_conv and self.padding == 0:
+             if hidden_states.shape[2] == 1:
+                 # image
+                 pad = (1, 1, 1, 1, 1, 1)
+             else:
+                 # video
+                 pad = (1, 1, 1, 1, 0, 1)
+
+             hidden_states = F.pad(hidden_states, pad, mode="constant", value=0)
+
+         hidden_states = self.conv(hidden_states)
+         return hidden_states
+
+
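+ # CausalTemporalDownsample2x is the causal counterpart: the stride-(2, 1, 1)
+ # convolution is a CausalConv3d, so the temporal padding is presumably one-sided
+ # toward past frames (CausalConv3d is defined elsewhere and not shown here).
+ # Like TemporalDownsample2x, it must be constructed with use_conv=True.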
+ class CausalTemporalDownsample2x(nn.Module):
+     """A causal temporal 2x downsampling layer with an optional convolution.
+
+     Parameters:
+         channels (`int`):
+             number of channels in the inputs and outputs.
+         use_conv (`bool`, default `False`):
+             option to use a strided causal convolution.
+         out_channels (`int`, optional):
+             number of output channels. Defaults to `channels`.
+         kernel_size (`int`, default `3`):
+             kernel size of the convolution.
+         bias (`bool`, default `True`):
+             whether the convolution uses a bias term.
+     """
+
+     def __init__(
+         self,
+         channels: int,
+         use_conv: bool = False,
+         out_channels: Optional[int] = None,
+         kernel_size=3,
+         bias=True,
+     ):
+         super().__init__()
+         self.channels = channels
+         self.out_channels = out_channels or channels
+         self.use_conv = use_conv
+         stride = (2, 1, 1)
+
+         if use_conv:
+             conv = CausalConv3d(
+                 self.channels, self.out_channels, kernel_size=kernel_size, stride=stride, bias=bias
+             )
+         else:
+             raise NotImplementedError("Temporal downsampling without a convolution is not implemented.")
+
+         self.conv = conv
+
+     def forward(self, hidden_states: torch.FloatTensor, is_init_image=True, temporal_chunk=False) -> torch.FloatTensor:
+         assert hidden_states.shape[1] == self.channels
+         hidden_states = self.conv(hidden_states, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
+         return hidden_states
+
+
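+ # Upsample2D doubles H and W by depth-to-space: the Conv3d expands the channels
+ # 4x and the einops rearrange folds that factor back into a 2x2 spatial block
+ # per position (a pixel shuffle). Sketch:
+ #
+ #     up = Upsample2D(channels=128)
+ #     y = up(torch.randn(1, 128, 9, 32, 32))  # expected (1, 128, 9, 64, 64)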
+ class Upsample2D(nn.Module):
+     """A spatial 2x upsampling layer (depth-to-space) with a convolution.
+
+     Parameters:
+         channels (`int`):
+             number of channels in the inputs and outputs.
+         use_conv (`bool`, default `False`):
+             option to use a convolution.
+         out_channels (`int`, optional):
+             number of output channels. Defaults to `channels`.
+         name (`str`, default `"conv"`):
+             name of the upsampling layer.
+         kernel_size (`int`, optional, default `3`):
+             kernel size of the convolution.
+         interpolate (`bool`, default `False`):
+             interpolation-based upsampling is not implemented; only the
+             depth-to-space path is supported.
+     """
+
+     def __init__(
+         self,
+         channels: int,
+         use_conv: bool = False,
+         out_channels: Optional[int] = None,
+         name: str = "conv",
+         kernel_size: Optional[int] = None,
+         padding=1,
+         bias=True,
+         interpolate=False,
+     ):
+         super().__init__()
+         self.channels = channels
+         self.out_channels = out_channels or channels
+         self.use_conv = use_conv
+         self.name = name
+         self.interpolate = interpolate
+         conv_cls = nn.Conv3d
+         conv = None
+
+         if interpolate:
+             raise NotImplementedError("Interpolation-based spatial upsampling is not implemented.")
+         else:
+             if kernel_size is None:
+                 kernel_size = 3
+             conv = conv_cls(self.channels, self.out_channels * 4, kernel_size=kernel_size, padding=padding, bias=bias)
+
+         self.conv = conv
+         self.conv.apply(self._init_weights)
+
+     def _init_weights(self, m):
+         if isinstance(m, (nn.Linear, nn.Conv2d, nn.Conv3d)):
+             trunc_normal_(m.weight, std=0.02)
+             if m.bias is not None:
+                 nn.init.constant_(m.bias, 0)
+         elif isinstance(m, nn.LayerNorm):
+             nn.init.constant_(m.bias, 0)
+             nn.init.constant_(m.weight, 1.0)
+
+     def forward(
+         self,
+         hidden_states: torch.FloatTensor,
+     ) -> torch.FloatTensor:
+         assert hidden_states.shape[1] == self.channels
+
+         hidden_states = self.conv(hidden_states)
+         hidden_states = rearrange(hidden_states, 'b (c p1 p2) t h w -> b c t (h p1) (w p2)', p1=2, p2=2)
+
+         return hidden_states
+
+
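+ # CausalUpsample2x performs the same 2x spatial pixel shuffle, but through a
+ # CausalConv3d, with is_init_image / temporal_chunk forwarded to the convolution
+ # so chunked temporal decoding stays causal.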
+ class CausalUpsample2x(nn.Module):
+     """A causal spatial 2x upsampling layer (depth-to-space) with a convolution.
+
+     Parameters:
+         channels (`int`):
+             number of channels in the inputs and outputs.
+         use_conv (`bool`, default `False`):
+             option to use a convolution.
+         out_channels (`int`, optional):
+             number of output channels. Defaults to `channels`.
+         name (`str`, default `"conv"`):
+             name of the upsampling layer.
+         kernel_size (`int`, optional, default `3`):
+             kernel size of the convolution.
+         interpolate (`bool`, default `False`):
+             interpolation-based upsampling is not implemented.
+     """
+
+     def __init__(
+         self,
+         channels: int,
+         use_conv: bool = False,
+         out_channels: Optional[int] = None,
+         name: str = "conv",
+         kernel_size: Optional[int] = 3,
+         bias=True,
+         interpolate=False,
+     ):
+         super().__init__()
+         self.channels = channels
+         self.out_channels = out_channels or channels
+         self.use_conv = use_conv
+         self.name = name
+         self.interpolate = interpolate
+         conv = None
+
+         if interpolate:
+             raise NotImplementedError("Interpolation-based spatial upsampling is not implemented.")
+         else:
+             conv = CausalConv3d(self.channels, self.out_channels * 4, kernel_size=kernel_size, stride=1, bias=bias)
+
+         self.conv = conv
+
+     def forward(
+         self,
+         hidden_states: torch.FloatTensor,
+         is_init_image=True, temporal_chunk=False,
+     ) -> torch.FloatTensor:
+         assert hidden_states.shape[1] == self.channels
+         hidden_states = self.conv(hidden_states, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
+         hidden_states = rearrange(hidden_states, 'b (c p1 p2) t h w -> b c t (h p1) (w p2)', p1=2, p2=2)
+         return hidden_states
+
+
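+ # TemporalUpsample2x doubles the temporal length: the convolution expands the
+ # channels 2x and the rearrange folds that factor into the time axis; for a
+ # single-frame image input the extra leading frame is dropped again. Sketch:
+ #
+ #     up_t = TemporalUpsample2x(channels=128)
+ #     y = up_t(torch.randn(1, 128, 4, 32, 32))  # expected (1, 128, 8, 32, 32)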
+ class TemporalUpsample2x(nn.Module):
+     """A temporal 2x upsampling layer (depth-to-space along time) with a convolution.
+
+     Parameters:
+         channels (`int`):
+             number of channels in the inputs and outputs.
+         use_conv (`bool`, default `True`):
+             option to use a convolution.
+         out_channels (`int`, optional):
+             number of output channels. Defaults to `channels`.
+         kernel_size (`int`, optional, default `3`):
+             kernel size of the convolution.
+         interpolate (`bool`, default `False`):
+             interpolation-based upsampling is not implemented.
+     """
+
+     def __init__(
+         self,
+         channels: int,
+         use_conv: bool = True,
+         out_channels: Optional[int] = None,
+         kernel_size: Optional[int] = None,
+         padding=1,
+         bias=True,
+         interpolate=False,
+     ):
+         super().__init__()
+         self.channels = channels
+         self.out_channels = out_channels or channels
+         self.use_conv = use_conv
+         self.interpolate = interpolate
+         conv_cls = nn.Conv3d
+
+         conv = None
+         if interpolate:
+             raise NotImplementedError("Interpolation-based temporal upsampling is not implemented.")
+         else:
+             # depth-to-space operator along the temporal axis
+             if kernel_size is None:
+                 kernel_size = 3
+             conv = conv_cls(self.channels, self.out_channels * 2, kernel_size=kernel_size, padding=padding, bias=bias)
+
+         self.conv = conv
+
+     def forward(
+         self,
+         hidden_states: torch.FloatTensor,
+         is_image: bool = False,
+     ) -> torch.FloatTensor:
+         assert hidden_states.shape[1] == self.channels
+         t = hidden_states.shape[2]
+         hidden_states = self.conv(hidden_states)
+         hidden_states = rearrange(hidden_states, 'b (c p) t h w -> b c (p t) h w', p=2)
+
+         if t == 1 and is_image:
+             hidden_states = hidden_states[:, :, 1:]
+
+         return hidden_states
+
+
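+ # CausalTemporalUpsample2x is the causal temporal upsampler: each input frame is
+ # expanded into two consecutive output frames ('(t p)'), and when the chunk starts
+ # a sequence (is_init_image=True) the first output frame is dropped, mapping T
+ # input frames to 2T - 1 output frames.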
+ class CausalTemporalUpsample2x(nn.Module):
+     """A causal temporal 2x upsampling layer (depth-to-space along time) with a convolution.
+
+     Parameters:
+         channels (`int`):
+             number of channels in the inputs and outputs.
+         use_conv (`bool`, default `True`):
+             option to use a convolution.
+         out_channels (`int`, optional):
+             number of output channels. Defaults to `channels`.
+         kernel_size (`int`, optional, default `3`):
+             kernel size of the convolution.
+         interpolate (`bool`, default `False`):
+             interpolation-based upsampling is not implemented.
+     """
+
+     def __init__(
+         self,
+         channels: int,
+         use_conv: bool = True,
+         out_channels: Optional[int] = None,
+         kernel_size: Optional[int] = 3,
+         bias=True,
+         interpolate=False,
+     ):
+         super().__init__()
+         self.channels = channels
+         self.out_channels = out_channels or channels
+         self.use_conv = use_conv
+         self.interpolate = interpolate
+
+         conv = None
+         if interpolate:
+             raise NotImplementedError("Interpolation-based temporal upsampling is not implemented.")
+         else:
+             # depth-to-space operator along the temporal axis
+             conv = CausalConv3d(self.channels, self.out_channels * 2, kernel_size=kernel_size, stride=1, bias=bias)
+
+         self.conv = conv
+
+     def forward(
+         self,
+         hidden_states: torch.FloatTensor,
+         is_init_image=True, temporal_chunk=False,
+     ) -> torch.FloatTensor:
+         assert hidden_states.shape[1] == self.channels
+         t = hidden_states.shape[2]
+         hidden_states = self.conv(hidden_states, is_init_image=is_init_image, temporal_chunk=temporal_chunk)
+         hidden_states = rearrange(hidden_states, 'b (c p) t h w -> b c (t p) h w', p=2)
+
+         if is_init_image:
+             hidden_states = hidden_states[:, :, 1:]
+
+         return hidden_states