MaxMilan1 committed
Commit 09339b5 · 1 Parent(s): 63f29cf

possible working changes for V3D?

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50)
  1. app.py +60 -1
  2. requirements.txt +44 -7
  3. scripts/__init__.py +0 -0
  4. scripts/pub/V3D_512.py +317 -0
  5. scripts/pub/configs/V3D_512.yaml +161 -0
  6. scripts/tests/attention.py +319 -0
  7. scripts/util/__init__.py +0 -0
  8. scripts/util/detection/__init__.py +0 -0
  9. scripts/util/detection/nsfw_and_watermark_dectection.py +110 -0
  10. scripts/util/detection/p_head_v1.npz +3 -0
  11. scripts/util/detection/w_head_v1.npz +3 -0
  12. sgm/__init__.py +4 -0
  13. sgm/data/__init__.py +1 -0
  14. sgm/data/cam_utils.py +1253 -0
  15. sgm/data/cifar10.py +67 -0
  16. sgm/data/co3d.py +1367 -0
  17. sgm/data/colmap.py +605 -0
  18. sgm/data/dataset.py +80 -0
  19. sgm/data/joint3d.py +10 -0
  20. sgm/data/json_index_dataset.py +1080 -0
  21. sgm/data/latent_objaverse.py +52 -0
  22. sgm/data/mnist.py +85 -0
  23. sgm/data/mvimagenet.py +408 -0
  24. sgm/data/objaverse.py +882 -0
  25. sgm/inference/api.py +385 -0
  26. sgm/inference/helpers.py +305 -0
  27. sgm/lr_scheduler.py +135 -0
  28. sgm/models/__init__.py +2 -0
  29. sgm/models/autoencoder.py +615 -0
  30. sgm/models/diffusion.py +358 -0
  31. sgm/models/video3d_diffusion.py +524 -0
  32. sgm/models/video_diffusion.py +503 -0
  33. sgm/modules/__init__.py +6 -0
  34. sgm/modules/attention.py +764 -0
  35. sgm/modules/autoencoding/__init__.py +0 -0
  36. sgm/modules/autoencoding/losses/__init__.py +7 -0
  37. sgm/modules/autoencoding/losses/discriminator_loss.py +306 -0
  38. sgm/modules/autoencoding/losses/lpips.py +73 -0
  39. sgm/modules/autoencoding/lpips/__init__.py +0 -0
  40. sgm/modules/autoencoding/lpips/loss/.gitignore +1 -0
  41. sgm/modules/autoencoding/lpips/loss/LICENSE +23 -0
  42. sgm/modules/autoencoding/lpips/loss/__init__.py +0 -0
  43. sgm/modules/autoencoding/lpips/loss/lpips.py +147 -0
  44. sgm/modules/autoencoding/lpips/model/LICENSE +58 -0
  45. sgm/modules/autoencoding/lpips/model/__init__.py +0 -0
  46. sgm/modules/autoencoding/lpips/model/model.py +88 -0
  47. sgm/modules/autoencoding/lpips/util.py +128 -0
  48. sgm/modules/autoencoding/lpips/vqperceptual.py +17 -0
  49. sgm/modules/autoencoding/regularizers/__init__.py +31 -0
  50. sgm/modules/autoencoding/regularizers/base.py +40 -0
app.py CHANGED
@@ -1,5 +1,9 @@
import gradio as gr
from util.text_img import generate_image
+ from util.v3d import generate_v3d, prep
+
+ # Prepare the V3D model
+ model, clip_model, ae_model, device, num_frames, num_steps, rembg_session, output_folder = prep()

_TITLE = "Shoe Generator"
with gr.Blocks(_TITLE) as ShoeGen:
@@ -18,6 +22,61 @@ with gr.Blocks(_TITLE) as ShoeGen:
    button_gen.click(generate_image, inputs=[prompt], outputs=[image, image_nobg])

    with gr.Tab("Image to Video Generator (V3D)"):
-         pass
+         with gr.Row(equal_height=True):
+             with gr.Column():
+                 input_image = gr.Image(value=None, label="Input Image")
+
+                 border_ratio_slider = gr.Slider(
+                     value=0.3,
+                     label="Border Ratio",
+                     minimum=0.05,
+                     maximum=0.5,
+                     step=0.05,
+                 )
+                 decoding_t_slider = gr.Slider(
+                     value=1,
+                     label="Number of Decoding frames",
+                     minimum=1,
+                     maximum=num_frames,
+                     step=1,
+                 )
+                 min_guidance_slider = gr.Slider(
+                     value=3.5,
+                     label="Min CFG Value",
+                     minimum=0.05,
+                     maximum=0.5,
+                     step=0.05,
+                 )
+                 max_guidance_slider = gr.Slider(
+                     value=3.5,
+                     label="Max CFG Value",
+                     minimum=0.05,
+                     maximum=0.5,
+                     step=0.05,
+                 )
+                 run_button = gr.Button(value="Run V3D")
+
+             with gr.Column():
+                 output_video = gr.Video(value=None, label="Output Orbit Video")
+
+         run_button.click(generate_v3d,
+             inputs=[
+                 input_image,
+                 model,
+                 clip_model,
+                 ae_model,
+                 num_frames,
+                 num_steps,
+                 int(decoding_t_slider),
+                 border_ratio_slider,
+                 False,
+                 rembg_session,
+                 output_folder,
+                 min_guidance_slider,
+                 max_guidance_slider,
+                 device,
+             ],
+             outputs=[output_video],
+         )

ShoeGen.launch()
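The new tab wires the UI to `generate_v3d` and `prep` from `util.v3d`, a module that is not among the 50 files shown in this view. The sketch below is a hypothetical reading of that interface, inferred purely from the call sites in app.py above; the names, argument order, and return tuple are assumptions, not code from this commit.

```python
# util/v3d.py -- hypothetical interface sketch, inferred from app.py's call sites.
import rembg


def prep():
    """Load the V3D pieces once at startup.

    Returns the tuple app.py unpacks:
    (model, clip_model, ae_model, device, num_frames, num_steps, rembg_session, output_folder)
    """
    device = "cuda"
    num_frames, num_steps = 18, 25        # plausible defaults; V3D_512.yaml uses num_frames: 18
    rembg_session = rembg.new_session()   # reused across requests for background removal
    output_folder = "outputs/V3D_512"
    model = clip_model = ae_model = None  # real code would build these from the configs / checkpoints
    return model, clip_model, ae_model, device, num_frames, num_steps, rembg_session, output_folder


def generate_v3d(input_image, model, clip_model, ae_model, num_frames, num_steps,
                 decoding_t, border_ratio, ignore_alpha, rembg_session, output_folder,
                 min_guidance_scale, max_guidance_scale, device):
    """Run V3D on one input image and return the rendered orbit video."""
    raise NotImplementedError  # sketch only
```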
requirements.txt CHANGED
@@ -1,12 +1,49 @@
- torch
gradio
diffusers==0.26.3
- transformers==4.38.1
accelerate==0.27.2
- xformers
rembg
- Pillow
Python-IO
- numpy
- opencv-python
- huggingface-hub
+ huggingface-hub
+ black==23.7.0
+ chardet==5.1.0
+ clip @ git+https://github.com/openai/CLIP.git
+ einops>=0.6.1
+ fairscale>=0.4.13
+ fire>=0.5.0
+ fsspec>=2023.6.0
+ invisible-watermark>=0.2.0
+ kornia==0.6.9
+ matplotlib>=3.7.2
+ natsort>=8.4.0
+ ninja>=1.11.1
+ numpy>=1.24.4
+ omegaconf>=2.3.0
+ open-clip-torch>=2.20.0
+ opencv-python==4.6.0.66
+ pandas>=2.0.3
+ pillow>=9.5.0
+ pudb>=2022.1.3
+ pytorch-lightning==2.0.1
+ pyyaml>=6.0.1
+ scipy>=1.10.1
+ streamlit>=0.73.1
+ tensorboardx==2.6
+ timm>=0.9.2
+ tokenizers==0.12.1
+ torch>=2.0.1
+ torchaudio>=2.0.2
+ torchdata==0.6.1
+ torchmetrics>=1.0.1
+ torchvision>=0.15.2
+ tqdm>=4.65.0
+ transformers==4.19.1
+ triton==2.0.0
+ urllib3<1.27,>=1.25.4
+ wandb>=0.15.6
+ webdataset>=0.2.33
+ wheel>=0.41.0
+ xformers>=0.0.20
+ streamlit-keyup==0.2.0
+ mediapy
+ tyro
+ wget
scripts/__init__.py ADDED
File without changes
scripts/pub/V3D_512.py ADDED
@@ -0,0 +1,317 @@
1
+ import math
2
+ import os
3
+ from glob import glob
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ import cv2
8
+ import numpy as np
9
+ import torch
10
+ from einops import rearrange, repeat
11
+ from fire import Fire
12
+ import tyro
13
+ from omegaconf import OmegaConf
14
+ from PIL import Image
15
+ from torchvision.transforms import ToTensor
16
+ from mediapy import write_video
17
+ import rembg
18
+ from kiui.op import recenter
19
+ from safetensors.torch import load_file as load_safetensors
20
+ from typing import Any
21
+
22
+ from scripts.util.detection.nsfw_and_watermark_dectection import DeepFloydDataFiltering
23
+ from sgm.inference.helpers import embed_watermark
24
+ from sgm.util import default, instantiate_from_config
25
+
26
+
27
+ def get_unique_embedder_keys_from_conditioner(conditioner):
28
+ return list(set([x.input_key for x in conditioner.embedders]))
29
+
30
+
31
+ def get_batch(keys, value_dict, N, T, device):
32
+ batch = {}
33
+ batch_uc = {}
34
+
35
+ for key in keys:
36
+ if key == "fps_id":
37
+ batch[key] = (
38
+ torch.tensor([value_dict["fps_id"]])
39
+ .to(device)
40
+ .repeat(int(math.prod(N)))
41
+ )
42
+ elif key == "motion_bucket_id":
43
+ batch[key] = (
44
+ torch.tensor([value_dict["motion_bucket_id"]])
45
+ .to(device)
46
+ .repeat(int(math.prod(N)))
47
+ )
48
+ elif key == "cond_aug":
49
+ batch[key] = repeat(
50
+ torch.tensor([value_dict["cond_aug"]]).to(device),
51
+ "1 -> b",
52
+ b=math.prod(N),
53
+ )
54
+ elif key == "cond_frames":
55
+ batch[key] = repeat(value_dict["cond_frames"], "1 ... -> b ...", b=N[0])
56
+ elif key == "cond_frames_without_noise":
57
+ batch[key] = repeat(
58
+ value_dict["cond_frames_without_noise"], "1 ... -> b ...", b=N[0]
59
+ )
60
+ else:
61
+ batch[key] = value_dict[key]
62
+
63
+ if T is not None:
64
+ batch["num_video_frames"] = T
65
+
66
+ for key in batch.keys():
67
+ if key not in batch_uc and isinstance(batch[key], torch.Tensor):
68
+ batch_uc[key] = torch.clone(batch[key])
69
+ return batch, batch_uc
70
+
71
+
72
+ def load_model(
73
+ config: str,
74
+ device: str,
75
+ num_frames: int,
76
+ num_steps: int,
77
+ ckpt_path: Optional[str] = None,
78
+ min_cfg: Optional[float] = None,
79
+ max_cfg: Optional[float] = None,
80
+ sigma_max: Optional[float] = None,
81
+ ):
82
+ config = OmegaConf.load(config)
83
+
84
+ config.model.params.sampler_config.params.num_steps = num_steps
85
+ config.model.params.sampler_config.params.guider_config.params.num_frames = (
86
+ num_frames
87
+ )
88
+ if max_cfg is not None:
89
+ config.model.params.sampler_config.params.guider_config.params.max_scale = (
90
+ max_cfg
91
+ )
92
+ if min_cfg is not None:
93
+ config.model.params.sampler_config.params.guider_config.params.min_scale = (
94
+ min_cfg
95
+ )
96
+ if sigma_max is not None:
97
+ print("Overriding sigma_max to ", sigma_max)
98
+ config.model.params.sampler_config.params.discretization_config.params.sigma_max = (
99
+ sigma_max
100
+ )
101
+
102
+ config.model.params.from_scratch = False
103
+
104
+ if ckpt_path is not None:
105
+ config.model.params.ckpt_path = str(ckpt_path)
106
+ if device == "cuda":
107
+ with torch.device(device):
108
+ model = instantiate_from_config(config.model).to(device).eval()
109
+ else:
110
+ model = instantiate_from_config(config.model).to(device).eval()
111
+
112
+ return model, None
113
+
114
+
115
+ def sample_one(
116
+ input_path: str = "assets/test_image.png", # Can either be image file or folder with image files
117
+ checkpoint_path: Optional[str] = None,
118
+ num_frames: Optional[int] = None,
119
+ num_steps: Optional[int] = None,
120
+ fps_id: int = 1,
121
+ motion_bucket_id: int = 300,
122
+ cond_aug: float = 0.02,
123
+ seed: int = 23,
124
+ decoding_t: int = 24, # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
125
+ device: str = "cuda",
126
+ output_folder: Optional[str] = None,
127
+ noise: torch.Tensor = None,
128
+ save: bool = False,
129
+ cached_model: Any = None,
130
+ border_ratio: float = 0.3,
131
+ min_guidance_scale: float = 3.5,
132
+ max_guidance_scale: float = 3.5,
133
+ sigma_max: float = None,
134
+ ignore_alpha: bool = False,
135
+ ):
136
+ model_config = "scripts/pub/configs/V3D_512.yaml"
137
+ num_frames = OmegaConf.load(
138
+ model_config
139
+ ).model.params.sampler_config.params.guider_config.params.num_frames
140
+ print("Detected num_frames:", num_frames)
141
+ num_steps = default(num_steps, 25)
142
+ output_folder = default(output_folder, f"outputs/V3D_512")
143
+ decoding_t = min(decoding_t, num_frames)
144
+
145
+ sd = load_safetensors("./ckpts/svd_xt.safetensors")
146
+ clip_model_config = OmegaConf.load("configs/embedder/clip_image.yaml")
147
+ clip_model = instantiate_from_config(clip_model_config).eval()
148
+ clip_sd = dict()
149
+ for k, v in sd.items():
150
+ if "conditioner.embedders.0" in k:
151
+ clip_sd[k.replace("conditioner.embedders.0.", "")] = v
152
+ clip_model.load_state_dict(clip_sd)
153
+ clip_model = clip_model.to(device)
154
+
155
+ ae_model_config = OmegaConf.load("configs/ae/video.yaml")
156
+ ae_model = instantiate_from_config(ae_model_config).eval()
157
+ encoder_sd = dict()
158
+ for k, v in sd.items():
159
+ if "first_stage_model" in k:
160
+ encoder_sd[k.replace("first_stage_model.", "")] = v
161
+ ae_model.load_state_dict(encoder_sd)
162
+ ae_model = ae_model.to(device)
163
+
164
+ if cached_model is None:
165
+ model, filter = load_model(
166
+ model_config,
167
+ device,
168
+ num_frames,
169
+ num_steps,
170
+ ckpt_path=checkpoint_path,
171
+ min_cfg=min_guidance_scale,
172
+ max_cfg=max_guidance_scale,
173
+ sigma_max=sigma_max,
174
+ )
175
+ else:
176
+ model = cached_model
177
+ torch.manual_seed(seed)
178
+
179
+ need_return = True
180
+ path = Path(input_path)
181
+ if path.is_file():
182
+ if any([input_path.endswith(x) for x in ["jpg", "jpeg", "png"]]):
183
+ all_img_paths = [input_path]
184
+ else:
185
+ raise ValueError("Path is not valid image file.")
186
+ elif path.is_dir():
187
+ all_img_paths = sorted(
188
+ [
189
+ f
190
+ for f in path.iterdir()
191
+ if f.is_file() and f.suffix.lower() in [".jpg", ".jpeg", ".png"]
192
+ ]
193
+ )
194
+ need_return = False
195
+ if len(all_img_paths) == 0:
196
+ raise ValueError("Folder does not contain any images.")
197
+ else:
198
+ raise ValueError
199
+
200
+ for input_path in all_img_paths:
201
+ with Image.open(input_path) as image:
202
+ # if image.mode == "RGBA":
203
+ # image = image.convert("RGB")
204
+ w, h = image.size
205
+
206
+ if border_ratio > 0:
207
+ if image.mode != "RGBA" or ignore_alpha:
208
+ image = image.convert("RGB")
209
+ image = np.asarray(image)
210
+ carved_image = rembg.remove(image) # [H, W, 4]
211
+ else:
212
+ image = np.asarray(image)
213
+ carved_image = image
214
+ mask = carved_image[..., -1] > 0
215
+ image = recenter(carved_image, mask, border_ratio=border_ratio)
216
+ image = image.astype(np.float32) / 255.0
217
+ if image.shape[-1] == 4:
218
+ image = image[..., :3] * image[..., 3:4] + (1 - image[..., 3:4])
219
+ image = Image.fromarray((image * 255).astype(np.uint8))
220
+ else:
221
+ print("Ignore border ratio")
222
+ image = image.resize((512, 512))
223
+
224
+ image = ToTensor()(image)
225
+ image = image * 2.0 - 1.0
226
+
227
+ image = image.unsqueeze(0).to(device)
228
+ H, W = image.shape[2:]
229
+ assert image.shape[1] == 3
230
+ F = 8
231
+ C = 4
232
+ shape = (num_frames, C, H // F, W // F)
233
+
234
+ value_dict = {}
235
+ value_dict["motion_bucket_id"] = motion_bucket_id
236
+ value_dict["fps_id"] = fps_id
237
+ value_dict["cond_aug"] = cond_aug
238
+ value_dict["cond_frames_without_noise"] = clip_model(image)
239
+ value_dict["cond_frames"] = ae_model.encode(image)
240
+ value_dict["cond_frames"] += cond_aug * torch.randn_like(
241
+ value_dict["cond_frames"]
242
+ )
243
+ value_dict["cond_aug"] = cond_aug
244
+
245
+ with torch.no_grad():
246
+ with torch.autocast(device):
247
+ batch, batch_uc = get_batch(
248
+ get_unique_embedder_keys_from_conditioner(model.conditioner),
249
+ value_dict,
250
+ [1, num_frames],
251
+ T=num_frames,
252
+ device=device,
253
+ )
254
+ c, uc = model.conditioner.get_unconditional_conditioning(
255
+ batch,
256
+ batch_uc=batch_uc,
257
+ force_uc_zero_embeddings=[
258
+ "cond_frames",
259
+ "cond_frames_without_noise",
260
+ ],
261
+ )
262
+
263
+ for k in ["crossattn", "concat"]:
264
+ uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
265
+ uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
266
+ c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
267
+ c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
268
+
269
+ randn = torch.randn(shape, device=device) if noise is None else noise
270
+ randn = randn.to(device)
271
+
272
+ additional_model_inputs = {}
273
+ additional_model_inputs["image_only_indicator"] = torch.zeros(
274
+ 2, num_frames
275
+ ).to(device)
276
+ additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
277
+
278
+ def denoiser(input, sigma, c):
279
+ return model.denoiser(
280
+ model.model, input, sigma, c, **additional_model_inputs
281
+ )
282
+
283
+ samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
284
+ model.en_and_decode_n_samples_a_time = decoding_t
285
+ samples_x = model.decode_first_stage(samples_z)
286
+ samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
287
+
288
+ os.makedirs(output_folder, exist_ok=True)
289
+ base_count = len(glob(os.path.join(output_folder, "*.mp4")))
290
+ video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
291
+ # writer = cv2.VideoWriter(
292
+ # video_path,
293
+ # cv2.VideoWriter_fourcc(*"MP4V"),
294
+ # fps_id + 1,
295
+ # (samples.shape[-1], samples.shape[-2]),
296
+ # )
297
+
298
+ frames = (
299
+ (rearrange(samples, "t c h w -> t h w c") * 255)
300
+ .cpu()
301
+ .numpy()
302
+ .astype(np.uint8)
303
+ )
304
+
305
+ if save:
306
+ write_video(video_path, frames, fps=3)
307
+
308
+ images = []
309
+ for frame in frames:
310
+ images.append(Image.fromarray(frame))
311
+
312
+ if need_return:
313
+ return images, model
314
+
315
+
316
+ if __name__ == "__main__":
317
+ tyro.cli(sample_one)
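Because the script hands `sample_one` to `tyro.cli`, each keyword argument above doubles as a command-line flag; it can also be driven directly from Python. Below is a minimal, hypothetical driver for a direct call, assuming the checkpoints and configs the function hard-codes (`ckpts/svd_xt.safetensors`, `scripts/pub/configs/V3D_512.yaml`, `configs/embedder/clip_image.yaml`, `configs/ae/video.yaml`) are already in place.

```python
# Hypothetical driver for sample_one(); argument names come from the signature above,
# and the paths are the ones the script itself hard-codes.
from scripts.pub.V3D_512 import sample_one

frames, model = sample_one(
    input_path="assets/test_image.png",  # a single image file or a folder of images
    num_steps=25,
    decoding_t=4,                        # frames decoded per VAE pass; lower if VRAM is tight
    border_ratio=0.3,
    min_guidance_scale=3.5,
    max_guidance_scale=3.5,
    device="cuda",
    save=True,                           # also writes an .mp4 into outputs/V3D_512 via mediapy
)
# frames is a list of PIL images; pass model back in as cached_model=model
# on later calls to skip re-loading the UNet weights.
```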
scripts/pub/configs/V3D_512.yaml ADDED
@@ -0,0 +1,161 @@
1
+ model:
2
+ base_learning_rate: 1.0e-04
3
+ target: sgm.models.video_diffusion.DiffusionEngine
4
+ params:
5
+ ckpt_path: ckpts/V3D_512.ckpt
6
+ scale_factor: 0.18215
7
+ disable_first_stage_autocast: true
8
+ input_key: latents
9
+ log_keys: []
10
+ scheduler_config:
11
+ target: sgm.lr_scheduler.LambdaLinearScheduler
12
+ params:
13
+ warm_up_steps:
14
+ - 1
15
+ cycle_lengths:
16
+ - 10000000000000
17
+ f_start:
18
+ - 1.0e-06
19
+ f_max:
20
+ - 1.0
21
+ f_min:
22
+ - 1.0
23
+ denoiser_config:
24
+ target: sgm.modules.diffusionmodules.denoiser.Denoiser
25
+ params:
26
+ scaling_config:
27
+ target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
28
+ network_config:
29
+ target: sgm.modules.diffusionmodules.video_model.VideoUNet
30
+ params:
31
+ adm_in_channels: 768
32
+ num_classes: sequential
33
+ use_checkpoint: true
34
+ in_channels: 8
35
+ out_channels: 4
36
+ model_channels: 320
37
+ attention_resolutions:
38
+ - 4
39
+ - 2
40
+ - 1
41
+ num_res_blocks: 2
42
+ channel_mult:
43
+ - 1
44
+ - 2
45
+ - 4
46
+ - 4
47
+ num_head_channels: 64
48
+ use_linear_in_transformer: true
49
+ transformer_depth: 1
50
+ context_dim: 1024
51
+ spatial_transformer_attn_type: softmax-xformers
52
+ extra_ff_mix_layer: true
53
+ use_spatial_context: true
54
+ merge_strategy: learned_with_images
55
+ video_kernel_size:
56
+ - 3
57
+ - 1
58
+ - 1
59
+ conditioner_config:
60
+ target: sgm.modules.GeneralConditioner
61
+ params:
62
+ emb_models:
63
+ - is_trainable: false
64
+ ucg_rate: 0.2
65
+ input_key: cond_frames_without_noise
66
+ target: sgm.modules.encoders.modules.IdentityEncoder
67
+ - input_key: fps_id
68
+ is_trainable: true
69
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
70
+ params:
71
+ outdim: 256
72
+ - input_key: motion_bucket_id
73
+ is_trainable: true
74
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
75
+ params:
76
+ outdim: 256
77
+ - input_key: cond_frames
78
+ is_trainable: false
79
+ ucg_rate: 0.2
80
+ target: sgm.modules.encoders.modules.IdentityEncoder
81
+ - input_key: cond_aug
82
+ is_trainable: true
83
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
84
+ params:
85
+ outdim: 256
86
+ first_stage_config:
87
+ target: sgm.models.autoencoder.AutoencodingEngine
88
+ params:
89
+ loss_config:
90
+ target: torch.nn.Identity
91
+ regularizer_config:
92
+ target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
93
+ encoder_config:
94
+ target: sgm.modules.diffusionmodules.model.Encoder
95
+ params:
96
+ attn_type: vanilla
97
+ double_z: true
98
+ z_channels: 4
99
+ resolution: 256
100
+ in_channels: 3
101
+ out_ch: 3
102
+ ch: 128
103
+ ch_mult:
104
+ - 1
105
+ - 2
106
+ - 4
107
+ - 4
108
+ num_res_blocks: 2
109
+ attn_resolutions: []
110
+ dropout: 0.0
111
+ decoder_config:
112
+ target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
113
+ params:
114
+ attn_type: vanilla
115
+ double_z: true
116
+ z_channels: 4
117
+ resolution: 256
118
+ in_channels: 3
119
+ out_ch: 3
120
+ ch: 128
121
+ ch_mult:
122
+ - 1
123
+ - 2
124
+ - 4
125
+ - 4
126
+ num_res_blocks: 2
127
+ attn_resolutions: []
128
+ dropout: 0.0
129
+ video_kernel_size:
130
+ - 3
131
+ - 1
132
+ - 1
133
+ sampler_config:
134
+ target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
135
+ params:
136
+ num_steps: 30
137
+ discretization_config:
138
+ target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
139
+ params:
140
+ sigma_max: 700.0
141
+ guider_config:
142
+ target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
143
+ params:
144
+ max_scale: 3.5
145
+ min_scale: 3.5
146
+ num_frames: 18
147
+ loss_fn_config:
148
+ target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
149
+ params:
150
+ batch2model_keys:
151
+ - num_video_frames
152
+ - image_only_indicator
153
+ loss_weighting_config:
154
+ target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
155
+ params:
156
+ sigma_data: 1.0
157
+ sigma_sampler_config:
158
+ target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
159
+ params:
160
+ p_mean: 1.5
161
+ p_std: 2.0
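This config is what `load_model` in `scripts/pub/V3D_512.py` consumes: the YAML is read with OmegaConf, a few sampler fields are overridden, and every `target`/`params` pair is resolved by `sgm.util.instantiate_from_config`. The following is a condensed sketch of that pattern, mirroring the script rather than adding new behavior.

```python
# Minimal sketch of turning V3D_512.yaml into a model, following load_model()
# in scripts/pub/V3D_512.py.
import torch
from omegaconf import OmegaConf
from sgm.util import instantiate_from_config

config = OmegaConf.load("scripts/pub/configs/V3D_512.yaml")

# Overrides the script applies before instantiation.
config.model.params.sampler_config.params.num_steps = 25
config.model.params.sampler_config.params.guider_config.params.num_frames = 18
config.model.params.from_scratch = False
config.model.params.ckpt_path = "ckpts/V3D_512.ckpt"  # same path the config ships with

with torch.device("cuda"):
    model = instantiate_from_config(config.model).to("cuda").eval()
```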
scripts/tests/attention.py ADDED
@@ -0,0 +1,319 @@
1
+ import einops
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import torch.utils.benchmark as benchmark
5
+ from torch.backends.cuda import SDPBackend
6
+
7
+ from sgm.modules.attention import BasicTransformerBlock, SpatialTransformer
8
+
9
+
10
+ def benchmark_attn():
11
+ # Let's define a helpful benchmarking function:
12
+ # https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html
13
+ device = "cuda" if torch.cuda.is_available() else "cpu"
14
+
15
+ def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
16
+ t0 = benchmark.Timer(
17
+ stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
18
+ )
19
+ return t0.blocked_autorange().mean * 1e6
20
+
21
+ # Let's define the hyper-parameters of our input
22
+ batch_size = 32
23
+ max_sequence_len = 1024
24
+ num_heads = 32
25
+ embed_dimension = 32
26
+
27
+ dtype = torch.float16
28
+
29
+ query = torch.rand(
30
+ batch_size,
31
+ num_heads,
32
+ max_sequence_len,
33
+ embed_dimension,
34
+ device=device,
35
+ dtype=dtype,
36
+ )
37
+ key = torch.rand(
38
+ batch_size,
39
+ num_heads,
40
+ max_sequence_len,
41
+ embed_dimension,
42
+ device=device,
43
+ dtype=dtype,
44
+ )
45
+ value = torch.rand(
46
+ batch_size,
47
+ num_heads,
48
+ max_sequence_len,
49
+ embed_dimension,
50
+ device=device,
51
+ dtype=dtype,
52
+ )
53
+
54
+ print(f"q/k/v shape:", query.shape, key.shape, value.shape)
55
+
56
+ # Let's explore the speed of each of the 3 implementations
57
+ from torch.backends.cuda import SDPBackend, sdp_kernel
58
+
59
+ # Helpful arguments mapper
60
+ backend_map = {
61
+ SDPBackend.MATH: {
62
+ "enable_math": True,
63
+ "enable_flash": False,
64
+ "enable_mem_efficient": False,
65
+ },
66
+ SDPBackend.FLASH_ATTENTION: {
67
+ "enable_math": False,
68
+ "enable_flash": True,
69
+ "enable_mem_efficient": False,
70
+ },
71
+ SDPBackend.EFFICIENT_ATTENTION: {
72
+ "enable_math": False,
73
+ "enable_flash": False,
74
+ "enable_mem_efficient": True,
75
+ },
76
+ }
77
+
78
+ from torch.profiler import ProfilerActivity, profile, record_function
79
+
80
+ activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
81
+
82
+ print(
83
+ f"The default implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
84
+ )
85
+ with profile(
86
+ activities=activities, record_shapes=False, profile_memory=True
87
+ ) as prof:
88
+ with record_function("Default detailed stats"):
89
+ for _ in range(25):
90
+ o = F.scaled_dot_product_attention(query, key, value)
91
+ print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
92
+
93
+ print(
94
+ f"The math implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
95
+ )
96
+ with sdp_kernel(**backend_map[SDPBackend.MATH]):
97
+ with profile(
98
+ activities=activities, record_shapes=False, profile_memory=True
99
+ ) as prof:
100
+ with record_function("Math implementation stats"):
101
+ for _ in range(25):
102
+ o = F.scaled_dot_product_attention(query, key, value)
103
+ print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
104
+
105
+ with sdp_kernel(**backend_map[SDPBackend.FLASH_ATTENTION]):
106
+ try:
107
+ print(
108
+ f"The flash attention implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
109
+ )
110
+ except RuntimeError:
111
+ print("FlashAttention is not supported. See warnings for reasons.")
112
+ with profile(
113
+ activities=activities, record_shapes=False, profile_memory=True
114
+ ) as prof:
115
+ with record_function("FlashAttention stats"):
116
+ for _ in range(25):
117
+ o = F.scaled_dot_product_attention(query, key, value)
118
+ print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
119
+
120
+ with sdp_kernel(**backend_map[SDPBackend.EFFICIENT_ATTENTION]):
121
+ try:
122
+ print(
123
+ f"The memory efficient implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
124
+ )
125
+ except RuntimeError:
126
+ print("EfficientAttention is not supported. See warnings for reasons.")
127
+ with profile(
128
+ activities=activities, record_shapes=False, profile_memory=True
129
+ ) as prof:
130
+ with record_function("EfficientAttention stats"):
131
+ for _ in range(25):
132
+ o = F.scaled_dot_product_attention(query, key, value)
133
+ print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
134
+
135
+
136
+ def run_model(model, x, context):
137
+ return model(x, context)
138
+
139
+
140
+ def benchmark_transformer_blocks():
141
+ device = "cuda" if torch.cuda.is_available() else "cpu"
142
+ import torch.utils.benchmark as benchmark
143
+
144
+ def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
145
+ t0 = benchmark.Timer(
146
+ stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
147
+ )
148
+ return t0.blocked_autorange().mean * 1e6
149
+
150
+ checkpoint = True
151
+ compile = False
152
+
153
+ batch_size = 32
154
+ h, w = 64, 64
155
+ context_len = 77
156
+ embed_dimension = 1024
157
+ context_dim = 1024
158
+ d_head = 64
159
+
160
+ transformer_depth = 4
161
+
162
+ n_heads = embed_dimension // d_head
163
+
164
+ dtype = torch.float16
165
+
166
+ model_native = SpatialTransformer(
167
+ embed_dimension,
168
+ n_heads,
169
+ d_head,
170
+ context_dim=context_dim,
171
+ use_linear=True,
172
+ use_checkpoint=checkpoint,
173
+ attn_type="softmax",
174
+ depth=transformer_depth,
175
+ sdp_backend=SDPBackend.FLASH_ATTENTION,
176
+ ).to(device)
177
+ model_efficient_attn = SpatialTransformer(
178
+ embed_dimension,
179
+ n_heads,
180
+ d_head,
181
+ context_dim=context_dim,
182
+ use_linear=True,
183
+ depth=transformer_depth,
184
+ use_checkpoint=checkpoint,
185
+ attn_type="softmax-xformers",
186
+ ).to(device)
187
+ if not checkpoint and compile:
188
+ print("compiling models")
189
+ model_native = torch.compile(model_native)
190
+ model_efficient_attn = torch.compile(model_efficient_attn)
191
+
192
+ x = torch.rand(batch_size, embed_dimension, h, w, device=device, dtype=dtype)
193
+ c = torch.rand(batch_size, context_len, context_dim, device=device, dtype=dtype)
194
+
195
+ from torch.profiler import ProfilerActivity, profile, record_function
196
+
197
+ activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
198
+
199
+ with torch.autocast("cuda"):
200
+ print(
201
+ f"The native model runs in {benchmark_torch_function_in_microseconds(model_native.forward, x, c):.3f} microseconds"
202
+ )
203
+ print(
204
+ f"The efficientattn model runs in {benchmark_torch_function_in_microseconds(model_efficient_attn.forward, x, c):.3f} microseconds"
205
+ )
206
+
207
+ print(75 * "+")
208
+ print("NATIVE")
209
+ print(75 * "+")
210
+ torch.cuda.reset_peak_memory_stats()
211
+ with profile(
212
+ activities=activities, record_shapes=False, profile_memory=True
213
+ ) as prof:
214
+ with record_function("NativeAttention stats"):
215
+ for _ in range(25):
216
+ model_native(x, c)
217
+ print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
218
+ print(torch.cuda.max_memory_allocated() * 1e-9, "GB used by native block")
219
+
220
+ print(75 * "+")
221
+ print("Xformers")
222
+ print(75 * "+")
223
+ torch.cuda.reset_peak_memory_stats()
224
+ with profile(
225
+ activities=activities, record_shapes=False, profile_memory=True
226
+ ) as prof:
227
+ with record_function("xformers stats"):
228
+ for _ in range(25):
229
+ model_efficient_attn(x, c)
230
+ print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
231
+ print(torch.cuda.max_memory_allocated() * 1e-9, "GB used by xformers block")
232
+
233
+
234
+ def test01():
235
+ # conv1x1 vs linear
236
+ from sgm.util import count_params
237
+
238
+ conv = torch.nn.Conv2d(3, 32, kernel_size=1).cuda()
239
+ print(count_params(conv))
240
+ linear = torch.nn.Linear(3, 32).cuda()
241
+ print(count_params(linear))
242
+
243
+ print(conv.weight.shape)
244
+
245
+ # use same initialization
246
+ linear.weight = torch.nn.Parameter(conv.weight.squeeze(-1).squeeze(-1))
247
+ linear.bias = torch.nn.Parameter(conv.bias)
248
+
249
+ print(linear.weight.shape)
250
+
251
+ x = torch.randn(11, 3, 64, 64).cuda()
252
+
253
+ xr = einops.rearrange(x, "b c h w -> b (h w) c").contiguous()
254
+ print(xr.shape)
255
+ out_linear = linear(xr)
256
+ print(out_linear.mean(), out_linear.shape)
257
+
258
+ out_conv = conv(x)
259
+ print(out_conv.mean(), out_conv.shape)
260
+ print("done with test01.\n")
261
+
262
+
263
+ def test02():
264
+ # try cosine flash attention
265
+ import time
266
+
267
+ torch.backends.cuda.matmul.allow_tf32 = True
268
+ torch.backends.cudnn.allow_tf32 = True
269
+ torch.backends.cudnn.benchmark = True
270
+ print("testing cosine flash attention...")
271
+ DIM = 1024
272
+ SEQLEN = 4096
273
+ BS = 16
274
+
275
+ print(" softmax (vanilla) first...")
276
+ model = BasicTransformerBlock(
277
+ dim=DIM,
278
+ n_heads=16,
279
+ d_head=64,
280
+ dropout=0.0,
281
+ context_dim=None,
282
+ attn_mode="softmax",
283
+ ).cuda()
284
+ try:
285
+ x = torch.randn(BS, SEQLEN, DIM).cuda()
286
+ tic = time.time()
287
+ y = model(x)
288
+ toc = time.time()
289
+ print(y.shape, toc - tic)
290
+ except RuntimeError as e:
291
+ # likely oom
292
+ print(str(e))
293
+
294
+ print("\n now flash-cosine...")
295
+ model = BasicTransformerBlock(
296
+ dim=DIM,
297
+ n_heads=16,
298
+ d_head=64,
299
+ dropout=0.0,
300
+ context_dim=None,
301
+ attn_mode="flash-cosine",
302
+ ).cuda()
303
+ x = torch.randn(BS, SEQLEN, DIM).cuda()
304
+ tic = time.time()
305
+ y = model(x)
306
+ toc = time.time()
307
+ print(y.shape, toc - tic)
308
+ print("done with test02.\n")
309
+
310
+
311
+ if __name__ == "__main__":
312
+ # test01()
313
+ # test02()
314
+ # test03()
315
+
316
+ # benchmark_attn()
317
+ benchmark_transformer_blocks()
318
+
319
+ print("done.")
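The core pattern in `benchmark_attn` is selecting a scaled-dot-product-attention backend with `torch.backends.cuda.sdp_kernel` and timing `F.scaled_dot_product_attention` under it. The sketch below condenses that pattern into a standalone form (PyTorch 2.0-era API, matching the `torch>=2.0.1` pin in requirements.txt); shapes are arbitrary example values.

```python
# Standalone sketch of the backend-selection pattern used in benchmark_attn().
import torch
import torch.nn.functional as F
import torch.utils.benchmark as benchmark
from torch.backends.cuda import sdp_kernel

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32
q, k, v = (torch.rand(8, 16, 1024, 64, device=device, dtype=dtype) for _ in range(3))


def time_us(f, *args):
    # Same Timer-based helper the script defines inline.
    timer = benchmark.Timer(stmt="f(*args)", globals={"f": f, "args": args})
    return timer.blocked_autorange().mean * 1e6


with sdp_kernel(enable_math=True, enable_flash=False, enable_mem_efficient=False):
    print(f"math backend:  {time_us(F.scaled_dot_product_attention, q, k, v):.1f} us")

if device == "cuda":
    with sdp_kernel(enable_math=False, enable_flash=True, enable_mem_efficient=False):
        print(f"flash backend: {time_us(F.scaled_dot_product_attention, q, k, v):.1f} us")
```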
scripts/util/__init__.py ADDED
File without changes
scripts/util/detection/__init__.py ADDED
File without changes
scripts/util/detection/nsfw_and_watermark_dectection.py ADDED
@@ -0,0 +1,110 @@
1
+ import os
2
+
3
+ import clip
4
+ import numpy as np
5
+ import torch
6
+ import torchvision.transforms as T
7
+ from PIL import Image
8
+
9
+ RESOURCES_ROOT = "scripts/util/detection/"
10
+
11
+
12
+ def predict_proba(X, weights, biases):
13
+ logits = X @ weights.T + biases
14
+ proba = np.where(
15
+ logits >= 0, 1 / (1 + np.exp(-logits)), np.exp(logits) / (1 + np.exp(logits))
16
+ )
17
+ return proba.T
18
+
19
+
20
+ def load_model_weights(path: str):
21
+ model_weights = np.load(path)
22
+ return model_weights["weights"], model_weights["biases"]
23
+
24
+
25
+ def clip_process_images(images: torch.Tensor) -> torch.Tensor:
26
+ min_size = min(images.shape[-2:])
27
+ return T.Compose(
28
+ [
29
+ T.CenterCrop(min_size), # TODO: this might affect the watermark, check this
30
+ T.Resize(224, interpolation=T.InterpolationMode.BICUBIC, antialias=True),
31
+ T.Normalize(
32
+ (0.48145466, 0.4578275, 0.40821073),
33
+ (0.26862954, 0.26130258, 0.27577711),
34
+ ),
35
+ ]
36
+ )(images)
37
+
38
+
39
+ class DeepFloydDataFiltering(object):
40
+ def __init__(
41
+ self, verbose: bool = False, device: torch.device = torch.device("cpu")
42
+ ):
43
+ super().__init__()
44
+ self.verbose = verbose
45
+ self._device = None
46
+ self.clip_model, _ = clip.load("ViT-L/14", device=device)
47
+ self.clip_model.eval()
48
+
49
+ self.cpu_w_weights, self.cpu_w_biases = load_model_weights(
50
+ os.path.join(RESOURCES_ROOT, "w_head_v1.npz")
51
+ )
52
+ self.cpu_p_weights, self.cpu_p_biases = load_model_weights(
53
+ os.path.join(RESOURCES_ROOT, "p_head_v1.npz")
54
+ )
55
+ self.w_threshold, self.p_threshold = 0.5, 0.5
56
+
57
+ @torch.inference_mode()
58
+ def __call__(self, images: torch.Tensor) -> torch.Tensor:
59
+ imgs = clip_process_images(images)
60
+ if self._device is None:
61
+ self._device = next(p for p in self.clip_model.parameters()).device
62
+ image_features = self.clip_model.encode_image(imgs.to(self._device))
63
+ image_features = image_features.detach().cpu().numpy().astype(np.float16)
64
+ p_pred = predict_proba(image_features, self.cpu_p_weights, self.cpu_p_biases)
65
+ w_pred = predict_proba(image_features, self.cpu_w_weights, self.cpu_w_biases)
66
+ print(f"p_pred = {p_pred}, w_pred = {w_pred}") if self.verbose else None
67
+ query = p_pred > self.p_threshold
68
+ if query.sum() > 0:
69
+ print(f"Hit for p_threshold: {p_pred}") if self.verbose else None
70
+ images[query] = T.GaussianBlur(99, sigma=(100.0, 100.0))(images[query])
71
+ query = w_pred > self.w_threshold
72
+ if query.sum() > 0:
73
+ print(f"Hit for w_threshold: {w_pred}") if self.verbose else None
74
+ images[query] = T.GaussianBlur(99, sigma=(100.0, 100.0))(images[query])
75
+ return images
76
+
77
+
78
+ def load_img(path: str) -> torch.Tensor:
79
+ image = Image.open(path)
80
+ if not image.mode == "RGB":
81
+ image = image.convert("RGB")
82
+ image_transforms = T.Compose(
83
+ [
84
+ T.ToTensor(),
85
+ ]
86
+ )
87
+ return image_transforms(image)[None, ...]
88
+
89
+
90
+ def test(root):
91
+ from einops import rearrange
92
+
93
+ filter = DeepFloydDataFiltering(verbose=True)
94
+ for p in os.listdir((root)):
95
+ print(f"running on {p}...")
96
+ img = load_img(os.path.join(root, p))
97
+ filtered_img = filter(img)
98
+ filtered_img = rearrange(
99
+ 255.0 * (filtered_img.numpy())[0], "c h w -> h w c"
100
+ ).astype(np.uint8)
101
+ Image.fromarray(filtered_img).save(
102
+ os.path.join(root, f"{os.path.splitext(p)[0]}-filtered.jpg")
103
+ )
104
+
105
+
106
+ if __name__ == "__main__":
107
+ import fire
108
+
109
+ fire.Fire(test)
110
+ print("done.")
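`DeepFloydDataFiltering` is the filter class that `scripts/pub/V3D_512.py` imports from this module. A short usage sketch that mirrors the `test()` helper above; the input path is a placeholder.

```python
# Sketch: blur NSFW / watermarked content in one image, following test() above.
import numpy as np
from PIL import Image
from einops import rearrange

from scripts.util.detection.nsfw_and_watermark_dectection import (
    DeepFloydDataFiltering,
    load_img,
)

data_filter = DeepFloydDataFiltering(verbose=True)  # loads ViT-L/14 CLIP plus the two .npz heads
img = load_img("assets/test_image.png")             # placeholder path; tensor of shape (1, 3, H, W)
filtered = data_filter(img)                         # flagged images come back Gaussian-blurred
out = rearrange(255.0 * filtered.numpy()[0], "c h w -> h w c").astype(np.uint8)
Image.fromarray(out).save("filtered.jpg")
```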
scripts/util/detection/p_head_v1.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4653a64d5f85d8d4c5f6c5ec175f1c5c5e37db8f38d39b2ed8b5979da7fdc76
+ size 3588
scripts/util/detection/w_head_v1.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6af23687aa347073e692025f405ccc48c14aadc5dbe775b3312041006d496d1
+ size 3588
sgm/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .models import AutoencodingEngine, DiffusionEngine
+ from .util import get_configs_path, instantiate_from_config
+
+ __version__ = "0.1.0"
sgm/data/__init__.py ADDED
@@ -0,0 +1 @@
+ from .dataset import StableDataModuleFromConfig
sgm/data/cam_utils.py ADDED
@@ -0,0 +1,1253 @@
1
+ '''
2
+ Common camera utilities
3
+ '''
4
+
5
+ import math
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ from pytorch3d.renderer import PerspectiveCameras
10
+ from pytorch3d.renderer.cameras import look_at_view_transform
11
+ from pytorch3d.renderer.implicit.raysampling import _xy_to_ray_bundle
12
+
13
+ class RelativeCameraLoader(nn.Module):
14
+ def __init__(self,
15
+ query_batch_size=1,
16
+ rand_query=True,
17
+ relative=True,
18
+ center_at_origin=False,
19
+ ):
20
+ super().__init__()
21
+
22
+ self.query_batch_size = query_batch_size
23
+ self.rand_query = rand_query
24
+ self.relative = relative
25
+ self.center_at_origin = center_at_origin
26
+
27
+ def plot_cameras(self, cameras_1, cameras_2):
28
+ '''
29
+ Helper function to plot cameras
30
+
31
+ Args:
32
+ cameras_1 (PyTorch3D camera): cameras object to plot
33
+ cameras_2 (PyTorch3D camera): cameras object to plot
34
+ '''
35
+ from pytorch3d.vis.plotly_vis import AxisArgs, plot_batch_individually, plot_scene
36
+ import plotly.graph_objects as go
37
+ plotlyplot = plot_scene(
38
+ {
39
+ 'scene_batch': {
40
+ 'cameras': cameras_1.to('cpu'),
41
+ 'rel_cameras': cameras_2.to('cpu'),
42
+ }
43
+ },
44
+ camera_scale=.5,#0.05,
45
+ pointcloud_max_points=10000,
46
+ pointcloud_marker_size=1.0,
47
+ raybundle_max_rays=100
48
+ )
49
+ plotlyplot.show()
50
+
51
+ def concat_cameras(self, camera_list):
52
+ '''
53
+ Returns a concatenation of a list of cameras
54
+
55
+ Args:
56
+ camera_list (List[PyTorch3D camera]): a list of PyTorch3D cameras
57
+ '''
58
+ R_list, T_list, f_list, c_list, size_list = [], [], [], [], []
59
+ for cameras in camera_list:
60
+ R_list.append(cameras.R)
61
+ T_list.append(cameras.T)
62
+ f_list.append(cameras.focal_length)
63
+ c_list.append(cameras.principal_point)
64
+ size_list.append(cameras.image_size)
65
+
66
+ camera_slice = PerspectiveCameras(
67
+ R = torch.cat(R_list),
68
+ T = torch.cat(T_list),
69
+ focal_length = torch.cat(f_list),
70
+ principal_point = torch.cat(c_list),
71
+ image_size = torch.cat(size_list),
72
+ device = camera_list[0].device,
73
+ )
74
+ return camera_slice
75
+
76
+ def get_camera_slice(self, scene_cameras, indices):
77
+ '''
78
+ Return a subset of cameras from a super set given indices
79
+
80
+ Args:
81
+ scene_cameras (PyTorch3D Camera): cameras object
82
+ indices (tensor or List): a flat list or tensor of indices
83
+
84
+ Returns:
85
+ camera_slice (PyTorch3D Camera) - cameras subset
86
+ '''
87
+ camera_slice = PerspectiveCameras(
88
+ R = scene_cameras.R[indices],
89
+ T = scene_cameras.T[indices],
90
+ focal_length = scene_cameras.focal_length[indices],
91
+ principal_point = scene_cameras.principal_point[indices],
92
+ image_size = scene_cameras.image_size[indices],
93
+ device = scene_cameras.device,
94
+ )
95
+ return camera_slice
96
+
97
+
98
+ def get_relative_camera(self, scene_cameras:PerspectiveCameras, query_idx, center_at_origin=False):
99
+ """
100
+ Transform context cameras relative to a base query camera
101
+
102
+ Args:
103
+ scene_cameras (PyTorch3D Camera): cameras object
104
+ query_idx (tensor or List): a length 1 list defining query idx
105
+
106
+ Returns:
107
+ cams_relative (PyTorch3D Camera): cameras object relative to query camera
108
+ """
109
+
110
+ query_camera = self.get_camera_slice(scene_cameras, query_idx)
111
+ query_world2view = query_camera.get_world_to_view_transform()
112
+ all_world2view = scene_cameras.get_world_to_view_transform()
113
+
114
+ if center_at_origin:
115
+ identity_cam = PerspectiveCameras(device=scene_cameras.device, R=query_camera.R, T=query_camera.T)
116
+ else:
117
+ T = torch.zeros((1, 3))
118
+ identity_cam = PerspectiveCameras(device=scene_cameras.device, R=query_camera.R, T=T)
119
+
120
+ identity_world2view = identity_cam.get_world_to_view_transform()
121
+
122
+ # compose the relative transformation as g_i^{-1} g_j
123
+ relative_world2view = identity_world2view.inverse().compose(all_world2view)
124
+
125
+ # generate a camera from the relative transform
126
+ relative_matrix = relative_world2view.get_matrix()
127
+ cams_relative = PerspectiveCameras(
128
+ R = relative_matrix[:, :3, :3],
129
+ T = relative_matrix[:, 3, :3],
130
+ focal_length = scene_cameras.focal_length,
131
+ principal_point = scene_cameras.principal_point,
132
+ image_size = scene_cameras.image_size,
133
+ device = scene_cameras.device,
134
+ )
135
+ return cams_relative
136
+
137
+ def forward(self, scene_cameras, scene_rgb=None, scene_masks=None, query_idx=None, context_size=3, context_idx=None, return_context=False):
138
+ '''
139
+ Return a sampled batch of query and context cameras (used in training)
140
+
141
+ Args:
142
+ scene_cameras (PyTorch3D Camera): a batch of PyTorch3D cameras
143
+ scene_rgb (Tensor): a batch of rgb
144
+ scene_masks (Tensor): a batch of masks (optional)
145
+ query_idx (List or Tensor): desired query idx (optional)
146
+ context_size (int): number of views for context
147
+
148
+ Returns:
149
+ query_cameras, query_rgb, query_masks: random query view
150
+ context_cameras, context_rgb, context_masks: context views
151
+ '''
152
+
153
+ if query_idx is None:
154
+ query_idx = [0]
155
+ if self.rand_query:
156
+ rand = torch.randperm(len(scene_cameras))
157
+ query_idx = rand[:1]
158
+
159
+ if context_idx is None:
160
+ rand = torch.randperm(len(scene_cameras))
161
+ context_idx = rand[:context_size]
162
+
163
+
164
+ if self.relative:
165
+ rel_cameras = self.get_relative_camera(scene_cameras, query_idx, center_at_origin=self.center_at_origin)
166
+ else:
167
+ rel_cameras = scene_cameras
168
+
169
+ query_cameras = self.get_camera_slice(rel_cameras, query_idx)
170
+ query_rgb = None
171
+ if scene_rgb is not None:
172
+ query_rgb = scene_rgb[query_idx]
173
+ query_masks = None
174
+ if scene_masks is not None:
175
+ query_masks = scene_masks[query_idx]
176
+
177
+ context_cameras = self.get_camera_slice(rel_cameras, context_idx)
178
+ context_rgb = None
179
+ if scene_rgb is not None:
180
+ context_rgb = scene_rgb[context_idx]
181
+ context_masks = None
182
+ if scene_masks is not None:
183
+ context_masks = scene_masks[context_idx]
184
+
185
+ if return_context:
186
+ return query_cameras, query_rgb, query_masks, context_cameras, context_rgb, context_masks, context_idx
187
+ return query_cameras, query_rgb, query_masks, context_cameras, context_rgb, context_masks
188
+
189
+
190
+ def get_interpolated_path(cameras: PerspectiveCameras, n=50, method='circle', theta_offset_max=0.0):
191
+ '''
192
+ Given a camera object containing a set of cameras, fit a circle and get
193
+ interpolated cameras
194
+
195
+ Args:
196
+ cameras (PyTorch3D Camera): input camera object
197
+ n (int): length of cameras in new path
198
+ method (str): 'circle'
199
+ theta_offset_max (int): max camera jitter in radians
200
+
201
+ Returns:
202
+ path_cameras (PyTorch3D Camera): interpolated cameras
203
+ '''
204
+ device = cameras.device
205
+ cameras = cameras.cpu()
206
+
207
+ if method == 'circle':
208
+
209
+ #@ https://meshlogic.github.io/posts/jupyter/curve-fitting/fitting-a-circle-to-cluster-of-3d-points/
210
+ #@ Fit plane
211
+ P = cameras.get_camera_center().cpu()
212
+ P_mean = P.mean(axis=0)
213
+ P_centered = P - P_mean
214
+ U,s,V = torch.linalg.svd(P_centered)
215
+ normal = V[2,:]
216
+ if (normal*2 - P_mean).norm() < (normal - P_mean).norm():
217
+ normal = - normal
218
+ d = -torch.dot(P_mean, normal) # d = -<p,n>
219
+
220
+ #@ Project pts to plane
221
+ P_xy = rodrigues_rot(P_centered, normal, torch.tensor([0.0,0.0,1.0]))
222
+
223
+ #@ Fit circle in 2D
224
+ xc, yc, r = fit_circle_2d(P_xy[:,0], P_xy[:,1])
225
+ t = torch.linspace(0, 2*math.pi, 100)
226
+ xx = xc + r*torch.cos(t)
227
+ yy = yc + r*torch.sin(t)
228
+
229
+ #@ Project circle to 3D
230
+ C = rodrigues_rot(torch.tensor([xc,yc,0.0]), torch.tensor([0.0,0.0,1.0]), normal) + P_mean
231
+ C = C.flatten()
232
+
233
+ #@ Get pts in 3D
234
+ t = torch.linspace(0, 2*math.pi, n)
235
+ u = P[0] - C
236
+ new_camera_centers = generate_circle_by_vectors(t, C, r, normal, u)
237
+
238
+ #@ OPTIONAL THETA OFFSET
239
+ if theta_offset_max > 0.0:
240
+ aug_theta = (torch.rand((new_camera_centers.shape[0])) * (2*theta_offset_max)) - theta_offset_max
241
+ new_camera_centers = rodrigues_rot2(new_camera_centers, normal, aug_theta)
242
+
243
+ #@ Get camera look at
244
+ new_camera_look_at = get_nearest_centroid(cameras)
245
+
246
+ #@ Get R T
247
+ up_vec = -normal
248
+ R, T = look_at_view_transform(eye=new_camera_centers, at=new_camera_look_at.unsqueeze(0), up=up_vec.unsqueeze(0), device=cameras.device)
249
+ else:
250
+ raise NotImplementedError
251
+
252
+ c = (cameras.principal_point).mean(dim=0, keepdim=True).expand(R.shape[0],-1)
253
+ f = (cameras.focal_length).mean(dim=0, keepdim=True).expand(R.shape[0],-1)
254
+ image_size = cameras.image_size[:1].expand(R.shape[0],-1)
255
+
256
+
257
+ path_cameras = PerspectiveCameras(R=R,T=T,focal_length=f,principal_point=c,image_size=image_size, device=device)
258
+ cameras = cameras.to(device)
259
+ return path_cameras
260
+
261
+ def np_normalize(vec, axis=-1):
262
+ vec = vec / (np.linalg.norm(vec, axis=axis, keepdims=True) + 1e-9)
263
+ return vec
264
+
265
+
266
+ #@ https://meshlogic.github.io/posts/jupyter/curve-fitting/fitting-a-circle-to-cluster-of-3d-points/
267
+ #-------------------------------------------------------------------------------
268
+ # Generate points on circle
269
+ # P(t) = r*cos(t)*u + r*sin(t)*(n x u) + C
270
+ #-------------------------------------------------------------------------------
271
+ def generate_circle_by_vectors(t, C, r, n, u):
272
+ n = n/torch.linalg.norm(n)
273
+ u = u/torch.linalg.norm(u)
274
+ P_circle = r*torch.cos(t)[:,None]*u + r*torch.sin(t)[:,None]*torch.cross(n,u) + C
275
+ return P_circle
276
+
277
+ #@ https://meshlogic.github.io/posts/jupyter/curve-fitting/fitting-a-circle-to-cluster-of-3d-points/
278
+ #-------------------------------------------------------------------------------
279
+ # FIT CIRCLE 2D
280
+ # - Find center [xc, yc] and radius r of circle fitting to set of 2D points
281
+ # - Optionally specify weights for points
282
+ #
283
+ # - Implicit circle function:
284
+ # (x-xc)^2 + (y-yc)^2 = r^2
285
+ # (2*xc)*x + (2*yc)*y + (r^2-xc^2-yc^2) = x^2+y^2
286
+ # c[0]*x + c[1]*y + c[2] = x^2+y^2
287
+ #
288
+ # - Solution by method of least squares:
289
+ # A*c = b, c' = argmin(||A*c - b||^2)
290
+ # A = [x y 1], b = [x^2+y^2]
291
+ #-------------------------------------------------------------------------------
292
+ def fit_circle_2d(x, y, w=[]):
293
+
294
+ A = torch.stack([x, y, torch.ones(len(x))]).T
295
+ b = x**2 + y**2
296
+
297
+ # Modify A,b for weighted least squares
298
+ if len(w) == len(x):
299
+ W = torch.diag(w)
300
+ A = torch.dot(W,A)
301
+ b = torch.dot(W,b)
302
+
303
+ # Solve by method of least squares
304
+ c = torch.linalg.lstsq(A,b,rcond=None)[0]
305
+
306
+ # Get circle parameters from solution c
307
+ xc = c[0]/2
308
+ yc = c[1]/2
309
+ r = torch.sqrt(c[2] + xc**2 + yc**2)
310
+ return xc, yc, r
311
+
312
+ #@ https://meshlogic.github.io/posts/jupyter/curve-fitting/fitting-a-circle-to-cluster-of-3d-points/
313
+ #-------------------------------------------------------------------------------
314
+ # RODRIGUES ROTATION
315
+ # - Rotate given points based on a starting and ending vector
316
+ # - Axis k and angle of rotation theta given by vectors n0,n1
317
+ # P_rot = P*cos(theta) + (k x P)*sin(theta) + k*<k,P>*(1-cos(theta))
318
+ #-------------------------------------------------------------------------------
319
+ def rodrigues_rot(P, n0, n1):
320
+
321
+ # If P is only 1d array (coords of single point), fix it to be matrix
322
+ if P.ndim == 1:
323
+ P = P[None,...]
324
+
325
+ # Get vector of rotation k and angle theta
326
+ n0 = n0/torch.linalg.norm(n0)
327
+ n1 = n1/torch.linalg.norm(n1)
328
+ k = torch.cross(n0,n1)
329
+ k = k/torch.linalg.norm(k)
330
+ theta = torch.arccos(torch.dot(n0,n1))
331
+
332
+ # Compute rotated points
333
+ P_rot = torch.zeros((len(P),3))
334
+ for i in range(len(P)):
335
+ P_rot[i] = P[i]*torch.cos(theta) + torch.cross(k,P[i])*torch.sin(theta) + k*torch.dot(k,P[i])*(1-torch.cos(theta))
336
+
337
+ return P_rot
338
+
339
+ def rodrigues_rot2(P, n1, theta):
340
+ '''
341
+ Rotate points P wrt axis k by theta radians
342
+ '''
343
+
344
+ # If P is only 1d array (coords of single point), fix it to be matrix
345
+ if P.ndim == 1:
346
+ P = P[None,...]
347
+
348
+ k = torch.cross(P, n1.unsqueeze(0))
349
+ k = k/torch.linalg.norm(k)
350
+
351
+ # Compute rotated points
352
+ P_rot = torch.zeros((len(P),3))
353
+ for i in range(len(P)):
354
+ P_rot[i] = P[i]*torch.cos(theta[i]) + torch.cross(k[i],P[i])*torch.sin(theta[i]) + k[i]*torch.dot(k[i],P[i])*(1-torch.cos(theta[i]))
355
+
356
+ return P_rot
357
+
358
+ #@ https://meshlogic.github.io/posts/jupyter/curve-fitting/fitting-a-circle-to-cluster-of-3d-points/
359
+ #-------------------------------------------------------------------------------
360
+ # ANGLE BETWEEN
361
+ # - Get angle between vectors u,v with sign based on plane with unit normal n
362
+ #-------------------------------------------------------------------------------
363
+ def angle_between(u, v, n=None):
364
+ if n is None:
365
+ return torch.arctan2(torch.linalg.norm(torch.cross(u,v)), torch.dot(u,v))
366
+ else:
367
+ return torch.arctan2(torch.dot(n,torch.cross(u,v)), torch.dot(u,v))
368
+
369
+ #@ https://www.crewes.org/Documents/ResearchReports/2010/CRR201032.pdf
370
+ def get_nearest_centroid(cameras: PerspectiveCameras):
371
+ '''
372
+ Given PyTorch3D cameras, find the nearest point along their principal ray
373
+ '''
374
+
375
+ #@ GET CAMERA CENTERS AND DIRECTIONS
376
+ camera_centers = cameras.get_camera_center()
377
+
378
+ c_mean = (cameras.principal_point).mean(dim=0)
379
+ xy_grid = c_mean.unsqueeze(0).unsqueeze(0)
380
+ ray_vis = _xy_to_ray_bundle(cameras, xy_grid.expand(len(cameras),-1,-1), 1.0, 15.0, 20, True)
381
+ camera_directions = ray_vis.directions
382
+
383
+ #@ CONSTRUCT MATRICES
384
+ A = torch.zeros((3*len(cameras)), len(cameras)+3)
385
+ b = torch.zeros((3*len(cameras), 1))
386
+ A[:,:3] = torch.eye(3).repeat(len(cameras),1)
387
+ for ci in range(len(camera_directions)):
388
+ A[3*ci:3*ci+3, ci+3] = -camera_directions[ci]
389
+ b[3*ci:3*ci+3, 0] = camera_centers[ci]
390
+ #' A (3*N, 3*N+3) b (3*N, 1)
391
+
392
+ #@ SVD
393
+ U, s, VT = torch.linalg.svd(A)
394
+ Sinv = torch.diag(1/s)
395
+ if len(s) < 3*len(cameras):
396
+ Sinv = torch.cat((Sinv, torch.zeros((Sinv.shape[0], 3*len(cameras) - Sinv.shape[1]), device=Sinv.device)), dim=1)
397
+ x = torch.matmul(VT.T, torch.matmul(Sinv,torch.matmul(U.T, b)))
398
+
399
+ centroid = x[:3,0]
400
+ return centroid
401
+
402
+
403
+ def get_angles(target_camera: PerspectiveCameras, context_cameras: PerspectiveCameras, centroid=None):
404
+ '''
405
+ Get angles between cameras wrt a centroid
406
+
407
+ Args:
408
+ target_camera (Pytorch3D Camera): a camera object with a single camera
409
+ context_cameras (PyTorch3D Camera): a camera object
410
+
411
+ Returns:
412
+ theta_deg (Tensor): a tensor containing angles in degrees
413
+ '''
414
+ a1 = target_camera.get_camera_center()
415
+ b1 = context_cameras.get_camera_center()
416
+
417
+ a = a1 - centroid.unsqueeze(0)
418
+ a = a.expand(len(context_cameras), -1)
419
+ b = b1 - centroid.unsqueeze(0)
420
+
421
+ ab_dot = (a*b).sum(dim=-1)
422
+ theta = torch.acos((ab_dot)/(torch.linalg.norm(a, dim=-1) * torch.linalg.norm(b, dim=-1)))
423
+ theta_deg = theta * 180 / math.pi
424
+
425
+ return theta_deg
426
+
427
+
428
+ import math
429
+ from typing import List, Literal, Optional, Tuple
430
+
431
+ import numpy as np
432
+ import torch
433
+ from jaxtyping import Float
434
+ from numpy.typing import NDArray
435
+ from torch import Tensor
436
+
437
+ _EPS = np.finfo(float).eps * 4.0
438
+
439
+
440
+ def unit_vector(data: NDArray, axis: Optional[int] = None) -> np.ndarray:
441
+ """Return ndarray normalized by length, i.e. Euclidean norm, along axis.
442
+
443
+ Args:
444
+ axis: the axis along which to normalize into unit vector
445
+ out: where to write out the data to. If None, returns a new np ndarray
446
+ """
447
+ data = np.array(data, dtype=np.float64, copy=True)
448
+ if data.ndim == 1:
449
+ data /= math.sqrt(np.dot(data, data))
450
+ return data
451
+ length = np.atleast_1d(np.sum(data * data, axis))
452
+ np.sqrt(length, length)
453
+ if axis is not None:
454
+ length = np.expand_dims(length, axis)
455
+ data /= length
456
+ return data
457
+
458
+
459
+ def quaternion_from_matrix(matrix: NDArray, isprecise: bool = False) -> np.ndarray:
460
+ """Return quaternion from rotation matrix.
461
+
462
+ Args:
463
+ matrix: rotation matrix to obtain quaternion
464
+ isprecise: if True, input matrix is assumed to be precise rotation matrix and a faster algorithm is used.
465
+ """
466
+ M = np.array(matrix, dtype=np.float64, copy=False)[:4, :4]
467
+ if isprecise:
468
+ q = np.empty((4,))
469
+ t = np.trace(M)
470
+ if t > M[3, 3]:
471
+ q[0] = t
472
+ q[3] = M[1, 0] - M[0, 1]
473
+ q[2] = M[0, 2] - M[2, 0]
474
+ q[1] = M[2, 1] - M[1, 2]
475
+ else:
476
+ i, j, k = 1, 2, 3
477
+ if M[1, 1] > M[0, 0]:
478
+ i, j, k = 2, 3, 1
479
+ if M[2, 2] > M[i, i]:
480
+ i, j, k = 3, 1, 2
481
+ t = M[i, i] - (M[j, j] + M[k, k]) + M[3, 3]
482
+ q[i] = t
483
+ q[j] = M[i, j] + M[j, i]
484
+ q[k] = M[k, i] + M[i, k]
485
+ q[3] = M[k, j] - M[j, k]
486
+ q *= 0.5 / math.sqrt(t * M[3, 3])
487
+ else:
488
+ m00 = M[0, 0]
489
+ m01 = M[0, 1]
490
+ m02 = M[0, 2]
491
+ m10 = M[1, 0]
492
+ m11 = M[1, 1]
493
+ m12 = M[1, 2]
494
+ m20 = M[2, 0]
495
+ m21 = M[2, 1]
496
+ m22 = M[2, 2]
497
+ # symmetric matrix K
498
+ K = [
499
+ [m00 - m11 - m22, 0.0, 0.0, 0.0],
500
+ [m01 + m10, m11 - m00 - m22, 0.0, 0.0],
501
+ [m02 + m20, m12 + m21, m22 - m00 - m11, 0.0],
502
+ [m21 - m12, m02 - m20, m10 - m01, m00 + m11 + m22],
503
+ ]
504
+ K = np.array(K)
505
+ K /= 3.0
506
+ # quaternion is eigenvector of K that corresponds to largest eigenvalue
507
+ w, V = np.linalg.eigh(K)
508
+ q = V[np.array([3, 0, 1, 2]), np.argmax(w)]
509
+ if q[0] < 0.0:
510
+ np.negative(q, q)
511
+ return q
512
+
513
+
514
+ def quaternion_slerp(
515
+ quat0: NDArray, quat1: NDArray, fraction: float, spin: int = 0, shortestpath: bool = True
516
+ ) -> np.ndarray:
517
+ """Return spherical linear interpolation between two quaternions.
518
+ Args:
519
+ quat0: first quaternion
520
+ quat1: second quaternion
521
+ fraction: how much to interpolate between quat0 vs quat1 (if 0, closer to quat0; if 1, closer to quat1)
522
+ spin: how much of an additional spin to place on the interpolation
523
+ shortestpath: whether to return the short or long path to rotation
524
+ """
525
+ q0 = unit_vector(quat0[:4])
526
+ q1 = unit_vector(quat1[:4])
527
+ if q0 is None or q1 is None:
528
+ raise ValueError("Input quaternions invalid.")
529
+ if fraction == 0.0:
530
+ return q0
531
+ if fraction == 1.0:
532
+ return q1
533
+ d = np.dot(q0, q1)
534
+ if abs(abs(d) - 1.0) < _EPS:
535
+ return q0
536
+ if shortestpath and d < 0.0:
537
+ # invert rotation
538
+ d = -d
539
+ np.negative(q1, q1)
540
+ angle = math.acos(d) + spin * math.pi
541
+ if abs(angle) < _EPS:
542
+ return q0
543
+ isin = 1.0 / math.sin(angle)
544
+ q0 *= math.sin((1.0 - fraction) * angle) * isin
545
+ q1 *= math.sin(fraction * angle) * isin
546
+ q0 += q1
547
+ return q0
548
+
549
+
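A small numeric check of quaternion_slerp (not part of the commit), assuming the [w, x, y, z] ordering returned by quaternion_from_matrix above: halfway between the identity and a 90-degree z-rotation should be a 45-degree z-rotation.

import math
import numpy as np

q_id = np.array([1.0, 0.0, 0.0, 0.0])                                        # identity, [w, x, y, z]
q_z90 = np.array([math.cos(math.pi / 4), 0.0, 0.0, math.sin(math.pi / 4)])   # 90 deg about z
q_mid = quaternion_slerp(q_id, q_z90, 0.5)
expected = np.array([math.cos(math.pi / 8), 0.0, 0.0, math.sin(math.pi / 8)])
assert np.allclose(q_mid, expected, atol=1e-6)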
550
+ def quaternion_matrix(quaternion: NDArray) -> np.ndarray:
551
+ """Return homogeneous rotation matrix from quaternion.
552
+
553
+ Args:
554
+ quaternion: value to convert to matrix
555
+ """
556
+ q = np.array(quaternion, dtype=np.float64, copy=True)
557
+ n = np.dot(q, q)
558
+ if n < _EPS:
559
+ return np.identity(4)
560
+ q *= math.sqrt(2.0 / n)
561
+ q = np.outer(q, q)
562
+ return np.array(
563
+ [
564
+ [1.0 - q[2, 2] - q[3, 3], q[1, 2] - q[3, 0], q[1, 3] + q[2, 0], 0.0],
565
+ [q[1, 2] + q[3, 0], 1.0 - q[1, 1] - q[3, 3], q[2, 3] - q[1, 0], 0.0],
566
+ [q[1, 3] - q[2, 0], q[2, 3] + q[1, 0], 1.0 - q[1, 1] - q[2, 2], 0.0],
567
+ [0.0, 0.0, 0.0, 1.0],
568
+ ]
569
+ )
570
+
571
+
572
+ def get_interpolated_poses(pose_a: NDArray, pose_b: NDArray, steps: int = 10) -> List[np.ndarray]:
573
+ """Return interpolation of poses with specified number of steps.
574
+ Args:
575
+ pose_a: first pose
576
+ pose_b: second pose
577
+ steps: number of steps the interpolated pose path should contain
578
+ """
579
+
580
+ quat_a = quaternion_from_matrix(pose_a[:3, :3])
581
+ quat_b = quaternion_from_matrix(pose_b[:3, :3])
582
+
583
+ ts = np.linspace(0, 1, steps)
584
+ quats = [quaternion_slerp(quat_a, quat_b, t) for t in ts]
585
+ trans = [(1 - t) * pose_a[:3, 3] + t * pose_b[:3, 3] for t in ts]
586
+
587
+ poses_ab = []
588
+ for quat, tran in zip(quats, trans):
589
+ pose = np.identity(4)
590
+ pose[:3, :3] = quaternion_matrix(quat)[:3, :3]
591
+ pose[:3, 3] = tran
592
+ poses_ab.append(pose[:3])
593
+ return poses_ab
594
+
595
+
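Hypothetical usage of get_interpolated_poses (not part of the commit; the poses are made up): interpolate five poses between the identity and a pose rotated 90 degrees about z and shifted one unit along x.

import math
import numpy as np

pose_a = np.eye(4)[:3]                          # 3x4 [R | t]
pose_b = np.eye(4)[:3].copy()
c, s = math.cos(math.pi / 2), math.sin(math.pi / 2)
pose_b[:3, :3] = np.array([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])
pose_b[:3, 3] = np.array([1.0, 0.0, 0.0])
path = get_interpolated_poses(pose_a, pose_b, steps=5)
assert len(path) == 5 and path[0].shape == (3, 4)   # each entry is a 3x4 pose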
596
+ def get_interpolated_k(
597
+ k_a: Float[Tensor, "3 3"], k_b: Float[Tensor, "3 3"], steps: int = 10
598
+ ) -> List[Float[Tensor, "3 4"]]:
599
+ """
600
+ Returns interpolated intrinsic matrices between two cameras over the specified number of steps.
601
+
602
+ Args:
603
+ k_a: camera matrix 1
604
+ k_b: camera matrix 2
605
+ steps: number of steps the interpolated pose path should contain
606
+
607
+ Returns:
608
+ List of interpolated camera intrinsics
609
+ """
610
+ Ks: List[Float[Tensor, "3 3"]] = []
611
+ ts = np.linspace(0, 1, steps)
612
+ for t in ts:
613
+ new_k = k_a * (1.0 - t) + k_b * t
614
+ Ks.append(new_k)
615
+ return Ks
616
+
617
+
618
+ def get_ordered_poses_and_k(
619
+ poses: Float[Tensor, "num_poses 3 4"],
620
+ Ks: Float[Tensor, "num_poses 3 3"],
621
+ ) -> Tuple[Float[Tensor, "num_poses 3 4"], Float[Tensor, "num_poses 3 3"]]:
622
+ """
623
+ Returns ordered poses and intrinsics by Euclidean distance between poses.
624
+
625
+ Args:
626
+ poses: list of camera poses
627
+ Ks: list of camera intrinsics
628
+
629
+ Returns:
630
+ tuple of ordered poses and intrinsics
631
+
632
+ """
633
+
634
+ poses_num = len(poses)
635
+
636
+ ordered_poses = torch.unsqueeze(poses[0], 0)
637
+ ordered_ks = torch.unsqueeze(Ks[0], 0)
638
+
639
+ # remove the first pose from poses
640
+ poses = poses[1:]
641
+ Ks = Ks[1:]
642
+
643
+ for _ in range(poses_num - 1):
644
+ distances = torch.norm(ordered_poses[-1][:, 3] - poses[:, :, 3], dim=1)
645
+ idx = torch.argmin(distances)
646
+ ordered_poses = torch.cat((ordered_poses, torch.unsqueeze(poses[idx], 0)), dim=0)
647
+ ordered_ks = torch.cat((ordered_ks, torch.unsqueeze(Ks[idx], 0)), dim=0)
648
+ poses = torch.cat((poses[0:idx], poses[idx + 1 :]), dim=0)
649
+ Ks = torch.cat((Ks[0:idx], Ks[idx + 1 :]), dim=0)
650
+
651
+ return ordered_poses, ordered_ks
652
+
653
+
654
+ def get_interpolated_poses_many(
655
+ poses: Float[Tensor, "num_poses 3 4"],
656
+ Ks: Float[Tensor, "num_poses 3 3"],
657
+ steps_per_transition: int = 10,
658
+ order_poses: bool = False,
659
+ ) -> Tuple[Float[Tensor, "num_poses 3 4"], Float[Tensor, "num_poses 3 3"]]:
660
+ """Return interpolated poses for many camera poses.
661
+
662
+ Args:
663
+ poses: list of camera poses
664
+ Ks: list of camera intrinsics
665
+ steps_per_transition: number of steps per transition
666
+ order_poses: whether to order poses by Euclidean distance
667
+
668
+ Returns:
669
+ tuple of new poses and intrinsics
670
+ """
671
+ traj = []
672
+ k_interp = []
673
+
674
+ if order_poses:
675
+ poses, Ks = get_ordered_poses_and_k(poses, Ks)
676
+
677
+ for idx in range(poses.shape[0] - 1):
678
+ pose_a = poses[idx].cpu().numpy()
679
+ pose_b = poses[idx + 1].cpu().numpy()
680
+ poses_ab = get_interpolated_poses(pose_a, pose_b, steps=steps_per_transition)
681
+ traj += poses_ab
682
+ k_interp += get_interpolated_k(Ks[idx], Ks[idx + 1], steps=steps_per_transition)
683
+
684
+ traj = np.stack(traj, axis=0)
685
+ k_interp = torch.stack(k_interp, dim=0)
686
+
687
+ return torch.tensor(traj, dtype=torch.float32), torch.tensor(k_interp, dtype=torch.float32)
688
+
689
+
690
+ def normalize(x: torch.Tensor) -> Float[Tensor, "*batch"]:
691
+ """Returns a normalized vector."""
692
+ return x / torch.linalg.norm(x)
693
+
694
+
695
+ def normalize_with_norm(x: torch.Tensor, dim: int) -> Tuple[torch.Tensor, torch.Tensor]:
696
+ """Normalize tensor along axis and return normalized value with norms.
697
+
698
+ Args:
699
+ x: tensor to normalize.
700
+ dim: axis along which to normalize.
701
+
702
+ Returns:
703
+ Tuple of normalized tensor and corresponding norm.
704
+ """
705
+
706
+ norm = torch.maximum(torch.linalg.vector_norm(x, dim=dim, keepdims=True), torch.tensor([_EPS]).to(x))
707
+ return x / norm, norm
708
+
709
+
710
+ def viewmatrix(lookat: torch.Tensor, up: torch.Tensor, pos: torch.Tensor) -> Float[Tensor, "*batch"]:
711
+ """Returns a camera transformation matrix.
712
+
713
+ Args:
714
+ lookat: The direction the camera is looking.
715
+ up: The upward direction of the camera.
716
+ pos: The position of the camera.
717
+
718
+ Returns:
719
+ A camera transformation matrix.
720
+ """
721
+ vec2 = normalize(lookat)
722
+ vec1_avg = normalize(up)
723
+ vec0 = normalize(torch.cross(vec1_avg, vec2))
724
+ vec1 = normalize(torch.cross(vec2, vec0))
725
+ m = torch.stack([vec0, vec1, vec2, pos], 1)
726
+ return m
727
+
728
+
729
+ def get_distortion_params(
730
+ k1: float = 0.0,
731
+ k2: float = 0.0,
732
+ k3: float = 0.0,
733
+ k4: float = 0.0,
734
+ p1: float = 0.0,
735
+ p2: float = 0.0,
736
+ ) -> Float[Tensor, "*batch"]:
737
+ """Returns a distortion parameters matrix.
738
+
739
+ Args:
740
+ k1: The first radial distortion parameter.
741
+ k2: The second radial distortion parameter.
742
+ k3: The third radial distortion parameter.
743
+ k4: The fourth radial distortion parameter.
744
+ p1: The first tangential distortion parameter.
745
+ p2: The second tangential distortion parameter.
746
+ Returns:
747
+ torch.Tensor: A distortion parameters matrix.
748
+ """
749
+ return torch.Tensor([k1, k2, k3, k4, p1, p2])
750
+
751
+
752
+ def _compute_residual_and_jacobian(
753
+ x: torch.Tensor,
754
+ y: torch.Tensor,
755
+ xd: torch.Tensor,
756
+ yd: torch.Tensor,
757
+ distortion_params: torch.Tensor,
758
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
759
+ """Auxiliary function of radial_and_tangential_undistort() that computes residuals and jacobians.
760
+ Adapted from MultiNeRF:
761
+ https://github.com/google-research/multinerf/blob/b02228160d3179300c7d499dca28cb9ca3677f32/internal/camera_utils.py#L427-L474
762
+
763
+ Args:
764
+ x: The updated x coordinates.
765
+ y: The updated y coordinates.
766
+ xd: The distorted x coordinates.
767
+ yd: The distorted y coordinates.
768
+ distortion_params: The distortion parameters [k1, k2, k3, k4, p1, p2].
769
+
770
+ Returns:
771
+ The residuals (fx, fy) and jacobians (fx_x, fx_y, fy_x, fy_y).
772
+ """
773
+
774
+ k1 = distortion_params[..., 0]
775
+ k2 = distortion_params[..., 1]
776
+ k3 = distortion_params[..., 2]
777
+ k4 = distortion_params[..., 3]
778
+ p1 = distortion_params[..., 4]
779
+ p2 = distortion_params[..., 5]
780
+
781
+ # let r(x, y) = x^2 + y^2;
782
+ # d(x, y) = 1 + k1 * r(x, y) + k2 * r(x, y) ^2 + k3 * r(x, y)^3 +
783
+ # k4 * r(x, y)^4;
784
+ r = x * x + y * y
785
+ d = 1.0 + r * (k1 + r * (k2 + r * (k3 + r * k4)))
786
+
787
+ # The perfect projection is:
788
+ # xd = x * d(x, y) + 2 * p1 * x * y + p2 * (r(x, y) + 2 * x^2);
789
+ # yd = y * d(x, y) + 2 * p2 * x * y + p1 * (r(x, y) + 2 * y^2);
790
+ #
791
+ # Let's define
792
+ #
793
+ # fx(x, y) = x * d(x, y) + 2 * p1 * x * y + p2 * (r(x, y) + 2 * x^2) - xd;
794
+ # fy(x, y) = y * d(x, y) + 2 * p2 * x * y + p1 * (r(x, y) + 2 * y^2) - yd;
795
+ #
796
+ # We are looking for a solution that satisfies
797
+ # fx(x, y) = fy(x, y) = 0;
798
+ fx = d * x + 2 * p1 * x * y + p2 * (r + 2 * x * x) - xd
799
+ fy = d * y + 2 * p2 * x * y + p1 * (r + 2 * y * y) - yd
800
+
801
+ # Compute derivative of d over [x, y]
802
+ d_r = k1 + r * (2.0 * k2 + r * (3.0 * k3 + r * 4.0 * k4))
803
+ d_x = 2.0 * x * d_r
804
+ d_y = 2.0 * y * d_r
805
+
806
+ # Compute derivative of fx over x and y.
807
+ fx_x = d + d_x * x + 2.0 * p1 * y + 6.0 * p2 * x
808
+ fx_y = d_y * x + 2.0 * p1 * x + 2.0 * p2 * y
809
+
810
+ # Compute derivative of fy over x and y.
811
+ fy_x = d_x * y + 2.0 * p2 * y + 2.0 * p1 * x
812
+ fy_y = d + d_y * y + 2.0 * p2 * x + 6.0 * p1 * y
813
+
814
+ return fx, fy, fx_x, fx_y, fy_x, fy_y
815
+
816
+
817
+ # @torch_compile(dynamic=True, mode="reduce-overhead", backend="eager")
818
+ def radial_and_tangential_undistort(
819
+ coords: torch.Tensor,
820
+ distortion_params: torch.Tensor,
821
+ eps: float = 1e-3,
822
+ max_iterations: int = 10,
823
+ ) -> torch.Tensor:
824
+ """Computes undistorted coords given opencv distortion parameters.
825
+ Adapted from MultiNeRF
826
+ https://github.com/google-research/multinerf/blob/b02228160d3179300c7d499dca28cb9ca3677f32/internal/camera_utils.py#L477-L509
827
+
828
+ Args:
829
+ coords: The distorted coordinates.
830
+ distortion_params: The distortion parameters [k1, k2, k3, k4, p1, p2].
831
+ eps: The epsilon for the convergence.
832
+ max_iterations: The maximum number of iterations to perform.
833
+
834
+ Returns:
835
+ The undistorted coordinates.
836
+ """
837
+
838
+ # Initialize from the distorted point.
839
+ x = coords[..., 0]
840
+ y = coords[..., 1]
841
+
842
+ for _ in range(max_iterations):
843
+ fx, fy, fx_x, fx_y, fy_x, fy_y = _compute_residual_and_jacobian(
844
+ x=x, y=y, xd=coords[..., 0], yd=coords[..., 1], distortion_params=distortion_params
845
+ )
846
+ denominator = fy_x * fx_y - fx_x * fy_y
847
+ x_numerator = fx * fy_y - fy * fx_y
848
+ y_numerator = fy * fx_x - fx * fy_x
849
+ step_x = torch.where(torch.abs(denominator) > eps, x_numerator / denominator, torch.zeros_like(denominator))
850
+ step_y = torch.where(torch.abs(denominator) > eps, y_numerator / denominator, torch.zeros_like(denominator))
851
+
852
+ x = x + step_x
853
+ y = y + step_y
854
+
855
+ return torch.stack([x, y], dim=-1)
856
+
857
+
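A round-trip sketch for radial_and_tangential_undistort (not part of the commit; the distortion coefficients are made up): apply the forward model written out in _compute_residual_and_jacobian, then recover the original point with the Newton solver above.

import torch

params = torch.tensor([0.1, 0.01, 0.0, 0.0, 0.001, 0.001])   # [k1, k2, k3, k4, p1, p2]
x, y = torch.tensor(0.2), torch.tensor(-0.1)
r = x * x + y * y
d = 1.0 + r * (params[0] + r * (params[1] + r * (params[2] + r * params[3])))
xd = x * d + 2 * params[4] * x * y + params[5] * (r + 2 * x * x)
yd = y * d + 2 * params[5] * x * y + params[4] * (r + 2 * y * y)
undistorted = radial_and_tangential_undistort(torch.stack([xd, yd])[None], params)
assert torch.allclose(undistorted[0], torch.stack([x, y]), atol=1e-5)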
858
+ def rotation_matrix(a: Float[Tensor, "3"], b: Float[Tensor, "3"]) -> Float[Tensor, "3 3"]:
859
+ """Compute the rotation matrix that rotates vector a to vector b.
860
+
861
+ Args:
862
+ a: The vector to rotate.
863
+ b: The vector to rotate to.
864
+ Returns:
865
+ The rotation matrix.
866
+ """
867
+ a = a / torch.linalg.norm(a)
868
+ b = b / torch.linalg.norm(b)
869
+ v = torch.cross(a, b)
870
+ c = torch.dot(a, b)
871
+ # If vectors are exactly opposite, we add a little noise to one of them
872
+ if c < -1 + 1e-8:
873
+ eps = (torch.rand(3) - 0.5) * 0.01
874
+ return rotation_matrix(a + eps, b)
875
+ s = torch.linalg.norm(v)
876
+ skew_sym_mat = torch.Tensor(
877
+ [
878
+ [0, -v[2], v[1]],
879
+ [v[2], 0, -v[0]],
880
+ [-v[1], v[0], 0],
881
+ ]
882
+ )
883
+ return torch.eye(3) + skew_sym_mat + skew_sym_mat @ skew_sym_mat * ((1 - c) / (s**2 + 1e-8))
884
+
885
+
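A quick check that rotation_matrix really maps a onto b (not part of the commit):

import torch

a = torch.tensor([1.0, 0.0, 0.0])
b = torch.tensor([0.0, 1.0, 0.0])
R = rotation_matrix(a, b)
assert torch.allclose(R @ a, b, atol=1e-6)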
886
+ def focus_of_attention(poses: Float[Tensor, "*num_poses 4 4"], initial_focus: Float[Tensor, "3"]) -> Float[Tensor, "3"]:
887
+ """Compute the focus of attention of a set of cameras. Only cameras
888
+ that have the focus of attention in front of them are considered.
889
+
890
+ Args:
891
+ poses: The poses to orient.
892
+ initial_focus: The 3D point views to decide which cameras are initially activated.
893
+
894
+ Returns:
895
+ The 3D position of the focus of attention.
896
+ """
897
+ # References to the same method in third-party code:
898
+ # https://github.com/google-research/multinerf/blob/1c8b1c552133cdb2de1c1f3c871b2813f6662265/internal/camera_utils.py#L145
899
+ # https://github.com/bmild/nerf/blob/18b8aebda6700ed659cb27a0c348b737a5f6ab60/load_llff.py#L197
900
+ active_directions = -poses[:, :3, 2:3]
901
+ active_origins = poses[:, :3, 3:4]
902
+ # initial value for testing if the focus_pt is in front or behind
903
+ focus_pt = initial_focus
904
+ # Prune cameras that currently have the focus_pt behind them.
905
+ active = torch.sum(active_directions.squeeze(-1) * (focus_pt - active_origins.squeeze(-1)), dim=-1) > 0
906
+ done = False
907
+ # We need at least two active cameras, else fallback on the previous solution.
908
+ # This may be the "poses" solution if no cameras are active on first iteration, e.g.
909
+ # they are in an outward-looking configuration.
910
+ while torch.sum(active.int()) > 1 and not done:
911
+ active_directions = active_directions[active]
912
+ active_origins = active_origins[active]
913
+ # https://en.wikipedia.org/wiki/Line–line_intersection#In_more_than_two_dimensions
914
+ m = torch.eye(3) - active_directions * torch.transpose(active_directions, -2, -1)
915
+ mt_m = torch.transpose(m, -2, -1) @ m
916
+ focus_pt = torch.linalg.inv(mt_m.mean(0)) @ (mt_m @ active_origins).mean(0)[:, 0]
917
+ active = torch.sum(active_directions.squeeze(-1) * (focus_pt - active_origins.squeeze(-1)), dim=-1) > 0
918
+ if active.all():
919
+ # the set of active cameras did not change, so we're done.
920
+ done = True
921
+ return focus_pt
922
+
923
+
924
+ def auto_orient_and_center_poses(
925
+ poses: Float[Tensor, "*num_poses 4 4"],
926
+ method: Literal["pca", "up", "vertical", "none"] = "up",
927
+ center_method: Literal["poses", "focus", "none"] = "poses",
928
+ ) -> Tuple[Float[Tensor, "*num_poses 3 4"], Float[Tensor, "3 4"]]:
929
+ """Orients and centers the poses.
930
+
931
+ We provide three methods for orientation:
932
+
933
+ - pca: Orient the poses so that the principal directions of the camera centers are aligned
934
+ with the axes, Z corresponding to the smallest principal component.
935
+ This method works well when all of the cameras are in the same plane, for example when
936
+ images are taken using a mobile robot.
937
+ - up: Orient the poses so that the average up vector is aligned with the z axis.
938
+ This method works well when images are not at arbitrary angles.
939
+ - vertical: Orient the poses so that the Z 3D direction projects close to the
940
+ y axis in images. This method works better if cameras are not all
941
+ looking in the same 3D direction, which may happen in camera arrays or in LLFF.
942
+
943
+ There are two centering methods:
944
+
945
+ - poses: The poses are centered around the origin.
946
+ - focus: The origin is set to the focus of attention of all cameras (the
947
+ closest point to cameras optical axes). Recommended for inward-looking
948
+ camera configurations.
949
+
950
+ Args:
951
+ poses: The poses to orient.
952
+ method: The method to use for orientation.
953
+ center_method: The method to use to center the poses.
954
+
955
+ Returns:
956
+ Tuple of the oriented poses and the transform matrix.
957
+ """
958
+
959
+ origins = poses[..., :3, 3]
960
+
961
+ mean_origin = torch.mean(origins, dim=0)
962
+ translation_diff = origins - mean_origin
963
+
964
+ if center_method == "poses":
965
+ translation = mean_origin
966
+ elif center_method == "focus":
967
+ translation = focus_of_attention(poses, mean_origin)
968
+ elif center_method == "none":
969
+ translation = torch.zeros_like(mean_origin)
970
+ else:
971
+ raise ValueError(f"Unknown value for center_method: {center_method}")
972
+
973
+ if method == "pca":
974
+ _, eigvec = torch.linalg.eigh(translation_diff.T @ translation_diff)
975
+ eigvec = torch.flip(eigvec, dims=(-1,))
976
+
977
+ if torch.linalg.det(eigvec) < 0:
978
+ eigvec[:, 2] = -eigvec[:, 2]
979
+
980
+ transform = torch.cat([eigvec, eigvec @ -translation[..., None]], dim=-1)
981
+ oriented_poses = transform @ poses
982
+
983
+ if oriented_poses.mean(dim=0)[2, 1] < 0:
984
+ oriented_poses[:, 1:3] = -1 * oriented_poses[:, 1:3]
985
+ elif method in ("up", "vertical"):
986
+ up = torch.mean(poses[:, :3, 1], dim=0)
987
+ up = up / torch.linalg.norm(up)
988
+ if method == "vertical":
989
+ # If cameras are not all parallel (e.g. not in an LLFF configuration),
990
+ # we can find the 3D direction that most projects vertically in all
991
+ # cameras by minimizing ||Xu|| s.t. ||u||=1. This total least squares
992
+ # problem is solved by SVD.
993
+ x_axis_matrix = poses[:, :3, 0]
994
+ _, S, Vh = torch.linalg.svd(x_axis_matrix, full_matrices=False)
995
+ # Singular values are S_i=||Xv_i|| for each right singular vector v_i.
996
+ # ||S|| = sqrt(n) because lines of X are all unit vectors and the v_i
997
+ # are an orthonormal basis.
998
+ # ||Xv_i|| = sqrt(sum(dot(x_axis_j,v_i)^2)), thus S_i/sqrt(n) is the
999
+ # RMS of cosines between x axes and v_i. If the second smallest singular
1000
+ # value corresponds to an angle error less than 10° (cos(80°)=0.17),
1001
+ # this is probably a degenerate camera configuration (typical values
1002
+ # are around 5° average error for the true vertical). In this case,
1003
+ # rather than taking the vector corresponding to the smallest singular
1004
+ # value, we project the "up" vector on the plane spanned by the two
1005
+ # best singular vectors. We could also just fallback to the "up"
1006
+ # solution.
1007
+ if S[1] > 0.17 * math.sqrt(poses.shape[0]):
1008
+ # regular non-degenerate configuration
1009
+ up_vertical = Vh[2, :]
1010
+ # It may be pointing up or down. Use "up" to disambiguate the sign.
1011
+ up = up_vertical if torch.dot(up_vertical, up) > 0 else -up_vertical
1012
+ else:
1013
+ # Degenerate configuration: project "up" on the plane spanned by
1014
+ # the last two right singular vectors (which are orthogonal to the
1015
+ # first). v_0 is a unit vector, no need to divide by its norm when
1016
+ # projecting.
1017
+ up = up - Vh[0, :] * torch.dot(up, Vh[0, :])
1018
+ # re-normalize
1019
+ up = up / torch.linalg.norm(up)
1020
+
1021
+ rotation = rotation_matrix(up, torch.Tensor([0, 0, 1]))
1022
+ transform = torch.cat([rotation, rotation @ -translation[..., None]], dim=-1)
1023
+ oriented_poses = transform @ poses
1024
+ elif method == "none":
1025
+ transform = torch.eye(4)
1026
+ transform[:3, 3] = -translation
1027
+ transform = transform[:3, :]
1028
+ oriented_poses = transform @ poses
1029
+ else:
1030
+ raise ValueError(f"Unknown value for method: {method}")
1031
+
1032
+ return oriented_poses, transform
1033
+
1034
+
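Hypothetical usage of auto_orient_and_center_poses (not part of the commit; the poses are random): center a handful of camera-to-world matrices and align their average up vector with +z.

import torch

c2w = torch.eye(4)[None].repeat(8, 1, 1)
c2w[:, :3, 3] = torch.randn(8, 3) + torch.tensor([0.0, 0.0, 5.0])   # scatter the centers around (0, 0, 5)
oriented, transform = auto_orient_and_center_poses(c2w, method="up", center_method="poses")
# oriented: (8, 3, 4) poses whose centers now average to the origin; transform: the applied (3, 4) map
assert torch.allclose(oriented[:, :3, 3].mean(dim=0), torch.zeros(3), atol=1e-5)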
1035
+ @torch.jit.script
1036
+ def fisheye624_project(xyz, params):
1037
+ """
1038
+ Batched implementation of the FisheyeRadTanThinPrism (aka Fisheye624) camera
1039
+ model project() function.
1040
+ Inputs:
1041
+ xyz: BxNx3 tensor of 3D points to be projected
1042
+ params: Bx16 tensor of Fisheye624 parameters formatted like this:
1043
+ [f_u f_v c_u c_v {k_0 ... k_5} {p_0 p_1} {s_0 s_1 s_2 s_3}]
1044
+ or Bx15 tensor of Fisheye624 parameters formatted like this:
1045
+ [f c_u c_v {k_0 ... k_5} {p_0 p_1} {s_0 s_1 s_2 s_3}]
1046
+ Outputs:
1047
+ uv: BxNx2 tensor of 2D projections of xyz in image plane
1048
+ Model for fisheye cameras with radial, tangential, and thin-prism distortion.
1049
+ This model allows fu != fv.
1050
+ Specifically, the model is:
1051
+ uvDistorted = [x_r] + tangentialDistortion + thinPrismDistortion
1052
+ [y_r]
1053
+ proj = diag(fu,fv) * uvDistorted + [cu;cv];
1054
+ where:
1055
+ a = x/z, b = y/z, r = (a^2+b^2)^(1/2)
1056
+ th = atan(r)
1057
+ cosPhi = a/r, sinPhi = b/r
1058
+ [x_r] = (th+ k0 * th^3 + k1* th^5 + ...) [cosPhi]
1059
+ [y_r] [sinPhi]
1060
+ the number of terms in the series is determined by the template parameter numK.
1061
+ tangentialDistortion = [(2 x_r^2 + rd^2)*p_0 + 2*x_r*y_r*p_1]
1062
+ [(2 y_r^2 + rd^2)*p_1 + 2*x_r*y_r*p_0]
1063
+ where rd^2 = x_r^2 + y_r^2
1064
+ thinPrismDistortion = [s0 * rd^2 + s1 rd^4]
1065
+ [s2 * rd^2 + s3 rd^4]
1066
+ Author: Daniel DeTone ([email protected])
1067
+ """
1068
+
1069
+ assert xyz.ndim == 3
1070
+ assert params.ndim == 2
1071
+ assert params.shape[-1] == 16 or params.shape[-1] == 15, "This model allows fx != fy"
1072
+ eps = 1e-9
1073
+ B, N = xyz.shape[0], xyz.shape[1]
1074
+
1075
+ # Radial correction.
1076
+ z = xyz[:, :, 2].reshape(B, N, 1)
1077
+ z = torch.where(torch.abs(z) < eps, eps * torch.sign(z), z)
1078
+ ab = xyz[:, :, :2] / z
1079
+ r = torch.norm(ab, dim=-1, p=2, keepdim=True)
1080
+ th = torch.atan(r)
1081
+ th_divr = torch.where(r < eps, torch.ones_like(ab), ab / r)
1082
+ th_k = th.reshape(B, N, 1).clone()
1083
+ for i in range(6):
1084
+ th_k = th_k + params[:, -12 + i].reshape(B, 1, 1) * torch.pow(th, 3 + i * 2)
1085
+ xr_yr = th_k * th_divr
1086
+ uv_dist = xr_yr
1087
+
1088
+ # Tangential correction.
1089
+ p0 = params[:, -6].reshape(B, 1)
1090
+ p1 = params[:, -5].reshape(B, 1)
1091
+ xr = xr_yr[:, :, 0].reshape(B, N)
1092
+ yr = xr_yr[:, :, 1].reshape(B, N)
1093
+ xr_yr_sq = torch.square(xr_yr)
1094
+ xr_sq = xr_yr_sq[:, :, 0].reshape(B, N)
1095
+ yr_sq = xr_yr_sq[:, :, 1].reshape(B, N)
1096
+ rd_sq = xr_sq + yr_sq
1097
+ uv_dist_tu = uv_dist[:, :, 0] + ((2.0 * xr_sq + rd_sq) * p0 + 2.0 * xr * yr * p1)
1098
+ uv_dist_tv = uv_dist[:, :, 1] + ((2.0 * yr_sq + rd_sq) * p1 + 2.0 * xr * yr * p0)
1099
+ uv_dist = torch.stack([uv_dist_tu, uv_dist_tv], dim=-1) # Avoids in-place complaint.
1100
+
1101
+ # Thin Prism correction.
1102
+ s0 = params[:, -4].reshape(B, 1)
1103
+ s1 = params[:, -3].reshape(B, 1)
1104
+ s2 = params[:, -2].reshape(B, 1)
1105
+ s3 = params[:, -1].reshape(B, 1)
1106
+ rd_4 = torch.square(rd_sq)
1107
+ uv_dist[:, :, 0] = uv_dist[:, :, 0] + (s0 * rd_sq + s1 * rd_4)
1108
+ uv_dist[:, :, 1] = uv_dist[:, :, 1] + (s2 * rd_sq + s3 * rd_4)
1109
+
1110
+ # Finally, apply standard terms: focal length and camera centers.
1111
+ if params.shape[-1] == 15:
1112
+ fx_fy = params[:, 0].reshape(B, 1, 1)
1113
+ cx_cy = params[:, 1:3].reshape(B, 1, 2)
1114
+ else:
1115
+ fx_fy = params[:, 0:2].reshape(B, 1, 2)
1116
+ cx_cy = params[:, 2:4].reshape(B, 1, 2)
1117
+ result = uv_dist * fx_fy + cx_cy
1118
+
1119
+ return result
1120
+
1121
+
1122
+ # Core implementation of fisheye 624 unprojection. More details are documented here:
1123
+ # https://facebookresearch.github.io/projectaria_tools/docs/tech_insights/camera_intrinsic_models#the-fisheye62-model
1124
+ @torch.jit.script
1125
+ def fisheye624_unproject_helper(uv, params, max_iters: int = 5):
1126
+ """
1127
+ Batched implementation of the FisheyeRadTanThinPrism (aka Fisheye624) camera
1128
+ model. There is no analytical solution for the inverse of the project()
1129
+ function so this solves an optimization problem using Newton's method to get
1130
+ the inverse.
1131
+ Inputs:
1132
+ uv: BxNx2 tensor of 2D pixels to be unprojected
1133
+ params: Bx16 tensor of Fisheye624 parameters formatted like this:
1134
+ [f_u f_v c_u c_v {k_0 ... k_5} {p_0 p_1} {s_0 s_1 s_2 s_3}]
1135
+ or Bx15 tensor of Fisheye624 parameters formatted like this:
1136
+ [f c_u c_v {k_0 ... k_5} {p_0 p_1} {s_0 s_1 s_2 s_3}]
1137
+ Outputs:
1138
+ xyz: BxNx3 tensor of 3D rays of uv points with z = 1.
1139
+ Model for fisheye cameras with radial, tangential, and thin-prism distortion.
1140
+ This model assumes fu=fv. This unproject function holds that:
1141
+ X = unproject(project(X)) [for X=(x,y,z) in R^3, z>0]
1142
+ and
1143
+ x = project(unproject(s*x)) [for s!=0 and x=(u,v) in R^2]
1144
+ Author: Daniel DeTone ([email protected])
1145
+ """
1146
+
1147
+ assert uv.ndim == 3, "Expected batched input shaped BxNx3"
1148
+ assert params.ndim == 2
1149
+ assert params.shape[-1] == 16 or params.shape[-1] == 15, "This model allows fx != fy"
1150
+ eps = 1e-6
1151
+ B, N = uv.shape[0], uv.shape[1]
1152
+
1153
+ if params.shape[-1] == 15:
1154
+ fx_fy = params[:, 0].reshape(B, 1, 1)
1155
+ cx_cy = params[:, 1:3].reshape(B, 1, 2)
1156
+ else:
1157
+ fx_fy = params[:, 0:2].reshape(B, 1, 2)
1158
+ cx_cy = params[:, 2:4].reshape(B, 1, 2)
1159
+
1160
+ uv_dist = (uv - cx_cy) / fx_fy
1161
+
1162
+ # Compute xr_yr using Newton's method.
1163
+ xr_yr = uv_dist.clone() # Initial guess.
1164
+ for _ in range(max_iters):
1165
+ uv_dist_est = xr_yr.clone()
1166
+ # Tangential terms.
1167
+ p0 = params[:, -6].reshape(B, 1)
1168
+ p1 = params[:, -5].reshape(B, 1)
1169
+ xr = xr_yr[:, :, 0].reshape(B, N)
1170
+ yr = xr_yr[:, :, 1].reshape(B, N)
1171
+ xr_yr_sq = torch.square(xr_yr)
1172
+ xr_sq = xr_yr_sq[:, :, 0].reshape(B, N)
1173
+ yr_sq = xr_yr_sq[:, :, 1].reshape(B, N)
1174
+ rd_sq = xr_sq + yr_sq
1175
+ uv_dist_est[:, :, 0] = uv_dist_est[:, :, 0] + ((2.0 * xr_sq + rd_sq) * p0 + 2.0 * xr * yr * p1)
1176
+ uv_dist_est[:, :, 1] = uv_dist_est[:, :, 1] + ((2.0 * yr_sq + rd_sq) * p1 + 2.0 * xr * yr * p0)
1177
+ # Thin Prism terms.
1178
+ s0 = params[:, -4].reshape(B, 1)
1179
+ s1 = params[:, -3].reshape(B, 1)
1180
+ s2 = params[:, -2].reshape(B, 1)
1181
+ s3 = params[:, -1].reshape(B, 1)
1182
+ rd_4 = torch.square(rd_sq)
1183
+ uv_dist_est[:, :, 0] = uv_dist_est[:, :, 0] + (s0 * rd_sq + s1 * rd_4)
1184
+ uv_dist_est[:, :, 1] = uv_dist_est[:, :, 1] + (s2 * rd_sq + s3 * rd_4)
1185
+ # Compute the derivative of uv_dist w.r.t. xr_yr.
1186
+ duv_dist_dxr_yr = uv.new_ones(B, N, 2, 2)
1187
+ duv_dist_dxr_yr[:, :, 0, 0] = 1.0 + 6.0 * xr_yr[:, :, 0] * p0 + 2.0 * xr_yr[:, :, 1] * p1
1188
+ offdiag = 2.0 * (xr_yr[:, :, 0] * p1 + xr_yr[:, :, 1] * p0)
1189
+ duv_dist_dxr_yr[:, :, 0, 1] = offdiag
1190
+ duv_dist_dxr_yr[:, :, 1, 0] = offdiag
1191
+ duv_dist_dxr_yr[:, :, 1, 1] = 1.0 + 6.0 * xr_yr[:, :, 1] * p1 + 2.0 * xr_yr[:, :, 0] * p0
1192
+ xr_yr_sq_norm = xr_yr_sq[:, :, 0] + xr_yr_sq[:, :, 1]
1193
+ temp1 = 2.0 * (s0 + 2.0 * s1 * xr_yr_sq_norm)
1194
+ duv_dist_dxr_yr[:, :, 0, 0] = duv_dist_dxr_yr[:, :, 0, 0] + (xr_yr[:, :, 0] * temp1)
1195
+ duv_dist_dxr_yr[:, :, 0, 1] = duv_dist_dxr_yr[:, :, 0, 1] + (xr_yr[:, :, 1] * temp1)
1196
+ temp2 = 2.0 * (s2 + 2.0 * s3 * xr_yr_sq_norm)
1197
+ duv_dist_dxr_yr[:, :, 1, 0] = duv_dist_dxr_yr[:, :, 1, 0] + (xr_yr[:, :, 0] * temp2)
1198
+ duv_dist_dxr_yr[:, :, 1, 1] = duv_dist_dxr_yr[:, :, 1, 1] + (xr_yr[:, :, 1] * temp2)
1199
+ # Compute 2x2 inverse manually here since torch.inverse() is very slow.
1200
+ # Because this is slow: inv = duv_dist_dxr_yr.inverse()
1201
+ # About a 10x reduction in speed with above line.
1202
+ mat = duv_dist_dxr_yr.reshape(-1, 2, 2)
1203
+ a = mat[:, 0, 0].reshape(-1, 1, 1)
1204
+ b = mat[:, 0, 1].reshape(-1, 1, 1)
1205
+ c = mat[:, 1, 0].reshape(-1, 1, 1)
1206
+ d = mat[:, 1, 1].reshape(-1, 1, 1)
1207
+ det = 1.0 / ((a * d) - (b * c))
1208
+ top = torch.cat([d, -b], dim=2)
1209
+ bot = torch.cat([-c, a], dim=2)
1210
+ inv = det * torch.cat([top, bot], dim=1)
1211
+ inv = inv.reshape(B, N, 2, 2)
1212
+ # Manually compute 2x2 @ 2x1 matrix multiply.
1213
+ # Because this is slow: step = (inv @ (uv_dist - uv_dist_est)[..., None])[..., 0]
1214
+ diff = uv_dist - uv_dist_est
1215
+ a = inv[:, :, 0, 0]
1216
+ b = inv[:, :, 0, 1]
1217
+ c = inv[:, :, 1, 0]
1218
+ d = inv[:, :, 1, 1]
1219
+ e = diff[:, :, 0]
1220
+ f = diff[:, :, 1]
1221
+ step = torch.stack([a * e + b * f, c * e + d * f], dim=-1)
1222
+ # Newton step.
1223
+ xr_yr = xr_yr + step
1224
+
1225
+ # Compute theta using Newton's method.
1226
+ xr_yr_norm = xr_yr.norm(p=2, dim=2).reshape(B, N, 1)
1227
+ th = xr_yr_norm.clone()
1228
+ for _ in range(max_iters):
1229
+ th_radial = uv.new_ones(B, N, 1)
1230
+ dthd_th = uv.new_ones(B, N, 1)
1231
+ for k in range(6):
1232
+ r_k = params[:, -12 + k].reshape(B, 1, 1)
1233
+ th_radial = th_radial + (r_k * torch.pow(th, 2 + k * 2))
1234
+ dthd_th = dthd_th + ((3.0 + 2.0 * k) * r_k * torch.pow(th, 2 + k * 2))
1235
+ th_radial = th_radial * th
1236
+ step = (xr_yr_norm - th_radial) / dthd_th
1237
+ # handle dthd_th close to 0.
1238
+ step = torch.where(dthd_th.abs() > eps, step, torch.sign(step) * eps * 10.0)
1239
+ th = th + step
1240
+ # Compute the ray direction using theta and xr_yr.
1241
+ close_to_zero = torch.logical_and(th.abs() < eps, xr_yr_norm.abs() < eps)
1242
+ ray_dir = torch.where(close_to_zero, xr_yr, torch.tan(th) / xr_yr_norm * xr_yr)
1243
+ ray = torch.cat([ray_dir, uv.new_ones(B, N, 1)], dim=2)
1244
+ return ray
1245
+
1246
+
1247
+ # unproject 2D point to 3D with fisheye624 model
1248
+ def fisheye624_unproject(coords: torch.Tensor, distortion_params: torch.Tensor) -> torch.Tensor:
1249
+ dirs = fisheye624_unproject_helper(coords.unsqueeze(0), distortion_params[0].unsqueeze(0))
1250
+ # correct for camera space differences:
1251
+ dirs[..., 1] = -dirs[..., 1]
1252
+ dirs[..., 2] = -dirs[..., 2]
1253
+ return dirs
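A round-trip sketch for the two scripted fisheye functions above (not part of the commit; the intrinsics are made up and all distortion terms are zero). The helper is used directly because fisheye624_unproject additionally flips the y and z axes for the dataset's camera convention.

import torch

params = torch.zeros(1, 16)                  # [f_u f_v c_u c_v k_0..k_5 p_0 p_1 s_0..s_3]
params[0, 0] = params[0, 1] = 300.0          # focal lengths
params[0, 2] = params[0, 3] = 256.0          # principal point
uv = torch.tensor([[[256.0, 256.0], [300.0, 220.0], [180.0, 310.0]]])   # 1x3x2 pixel coordinates
rays = fisheye624_unproject_helper(uv, params)                          # 1x3x3 rays with z = 1
uv_reprojected = fisheye624_project(rays, params)
assert torch.allclose(uv_reprojected, uv, atol=1e-3)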
sgm/data/cifar10.py ADDED
@@ -0,0 +1,67 @@
1
+ import pytorch_lightning as pl
2
+ import torchvision
3
+ from torch.utils.data import DataLoader, Dataset
4
+ from torchvision import transforms
5
+
6
+
7
+ class CIFAR10DataDictWrapper(Dataset):
8
+ def __init__(self, dset):
9
+ super().__init__()
10
+ self.dset = dset
11
+
12
+ def __getitem__(self, i):
13
+ x, y = self.dset[i]
14
+ return {"jpg": x, "cls": y}
15
+
16
+ def __len__(self):
17
+ return len(self.dset)
18
+
19
+
20
+ class CIFAR10Loader(pl.LightningDataModule):
21
+ def __init__(self, batch_size, num_workers=0, shuffle=True):
22
+ super().__init__()
23
+
24
+ transform = transforms.Compose(
25
+ [transforms.ToTensor(), transforms.Lambda(lambda x: x * 2.0 - 1.0)]
26
+ )
27
+
28
+ self.batch_size = batch_size
29
+ self.num_workers = num_workers
30
+ self.shuffle = shuffle
31
+ self.train_dataset = CIFAR10DataDictWrapper(
32
+ torchvision.datasets.CIFAR10(
33
+ root=".data/", train=True, download=True, transform=transform
34
+ )
35
+ )
36
+ self.test_dataset = CIFAR10DataDictWrapper(
37
+ torchvision.datasets.CIFAR10(
38
+ root=".data/", train=False, download=True, transform=transform
39
+ )
40
+ )
41
+
42
+ def prepare_data(self):
43
+ pass
44
+
45
+ def train_dataloader(self):
46
+ return DataLoader(
47
+ self.train_dataset,
48
+ batch_size=self.batch_size,
49
+ shuffle=self.shuffle,
50
+ num_workers=self.num_workers,
51
+ )
52
+
53
+ def test_dataloader(self):
54
+ return DataLoader(
55
+ self.test_dataset,
56
+ batch_size=self.batch_size,
57
+ shuffle=self.shuffle,
58
+ num_workers=self.num_workers,
59
+ )
60
+
61
+ def val_dataloader(self):
62
+ return DataLoader(
63
+ self.test_dataset,
64
+ batch_size=self.batch_size,
65
+ shuffle=self.shuffle,
66
+ num_workers=self.num_workers,
67
+ )
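Hypothetical usage of the loader above (not part of the commit); the first run downloads CIFAR-10 into .data/.

loader = CIFAR10Loader(batch_size=16, num_workers=0, shuffle=True)
batch = next(iter(loader.train_dataloader()))
print(batch["jpg"].shape)   # torch.Size([16, 3, 32, 32]), values rescaled to roughly [-1, 1]
print(batch["cls"][:4])     # integer class labels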
sgm/data/co3d.py ADDED
@@ -0,0 +1,1367 @@
1
+ """
2
+ adapted from SparseFusion
3
+ Wrapper for the full CO3Dv2 dataset
4
+ #@ Modified from https://github.com/facebookresearch/pytorch3d
5
+ """
6
+
7
+ import json
8
+ import logging
9
+ import math
10
+ import os
11
+ import random
12
+ import time
13
+ import warnings
14
+ from collections import defaultdict
15
+ from itertools import islice
16
+ from typing import (
17
+ Any,
18
+ ClassVar,
19
+ List,
20
+ Mapping,
21
+ Optional,
22
+ Sequence,
23
+ Tuple,
24
+ Type,
25
+ TypedDict,
26
+ Union,
27
+ )
28
+ from einops import rearrange, repeat
29
+
30
+ import numpy as np
31
+ import torch
32
+ import torch.nn.functional as F
33
+ import torchvision.transforms.functional as TF
34
+ from pytorch3d.utils import opencv_from_cameras_projection
35
+ from pytorch3d.implicitron.dataset import types
36
+ from pytorch3d.implicitron.dataset.dataset_base import DatasetBase
37
+ from sgm.data.json_index_dataset import (
38
+ FrameAnnotsEntry,
39
+ _bbox_xywh_to_xyxy,
40
+ _bbox_xyxy_to_xywh,
41
+ _clamp_box_to_image_bounds_and_round,
42
+ _crop_around_box,
43
+ _get_1d_bounds,
44
+ _get_bbox_from_mask,
45
+ _get_clamp_bbox,
46
+ _load_1bit_png_mask,
47
+ _load_16big_png_depth,
48
+ _load_depth,
49
+ _load_depth_mask,
50
+ _load_image,
51
+ _load_mask,
52
+ _load_pointcloud,
53
+ _rescale_bbox,
54
+ _safe_as_tensor,
55
+ _seq_name_to_seed,
56
+ )
57
+ from sgm.data.objaverse import video_collate_fn
58
+ from pytorch3d.implicitron.dataset.json_index_dataset_map_provider_v2 import (
59
+ get_available_subset_names,
60
+ )
61
+ from pytorch3d.renderer.cameras import PerspectiveCameras
62
+
63
+ logger = logging.getLogger(__name__)
64
+
65
+
66
+ from dataclasses import dataclass, field, fields
67
+
68
+ from pytorch3d.renderer.camera_utils import join_cameras_as_batch
69
+ from pytorch3d.renderer.cameras import CamerasBase, PerspectiveCameras
70
+ from pytorch3d.structures.pointclouds import Pointclouds, join_pointclouds_as_batch
71
+ from pytorch_lightning import LightningDataModule
72
+ from torch.utils.data import DataLoader
73
+
74
+ CO3D_ALL_CATEGORIES = list(
75
+ reversed(
76
+ [
77
+ "baseballbat",
78
+ "banana",
79
+ "bicycle",
80
+ "microwave",
81
+ "tv",
82
+ "cellphone",
83
+ "toilet",
84
+ "hairdryer",
85
+ "couch",
86
+ "kite",
87
+ "pizza",
88
+ "umbrella",
89
+ "wineglass",
90
+ "laptop",
91
+ "hotdog",
92
+ "stopsign",
93
+ "frisbee",
94
+ "baseballglove",
95
+ "cup",
96
+ "parkingmeter",
97
+ "backpack",
98
+ "toyplane",
99
+ "toybus",
100
+ "handbag",
101
+ "chair",
102
+ "keyboard",
103
+ "car",
104
+ "motorcycle",
105
+ "carrot",
106
+ "bottle",
107
+ "sandwich",
108
+ "remote",
109
+ "bowl",
110
+ "skateboard",
111
+ "toaster",
112
+ "mouse",
113
+ "toytrain",
114
+ "book",
115
+ "toytruck",
116
+ "orange",
117
+ "broccoli",
118
+ "plant",
119
+ "teddybear",
120
+ "suitcase",
121
+ "bench",
122
+ "ball",
123
+ "cake",
124
+ "vase",
125
+ "hydrant",
126
+ "apple",
127
+ "donut",
128
+ ]
129
+ )
130
+ )
131
+
132
+ CO3D_ALL_TEN = [
133
+ "donut",
134
+ "apple",
135
+ "hydrant",
136
+ "vase",
137
+ "cake",
138
+ "ball",
139
+ "bench",
140
+ "suitcase",
141
+ "teddybear",
142
+ "plant",
143
+ ]
144
+
145
+
146
+ # @ FROM https://github.com/facebookresearch/pytorch3d
147
+ @dataclass
148
+ class FrameData(Mapping[str, Any]):
149
+ """
150
+ A type of the elements returned by indexing the dataset object.
151
+ It can represent both individual frames and batches thereof;
152
+ in this documentation, the sizes of tensors refer to single frames;
153
+ add the first batch dimension for the collation result.
154
+ Args:
155
+ frame_number: The number of the frame within its sequence.
156
+ 0-based continuous integers.
157
+ sequence_name: The unique name of the frame's sequence.
158
+ sequence_category: The object category of the sequence.
159
+ frame_timestamp: The time elapsed since the start of a sequence in sec.
160
+ image_size_hw: The size of the image in pixels; (height, width) tensor
161
+ of shape (2,).
162
+ image_path: The qualified path to the loaded image (with dataset_root).
163
+ image_rgb: A Tensor of shape `(3, H, W)` holding the RGB image
164
+ of the frame; elements are floats in [0, 1].
165
+ mask_crop: A binary mask of shape `(1, H, W)` denoting the valid image
166
+ regions. Regions can be invalid (mask_crop[i,j]=0) in case they
167
+ are a result of zero-padding of the image after cropping around
168
+ the object bounding box; elements are floats in {0.0, 1.0}.
169
+ depth_path: The qualified path to the frame's depth map.
170
+ depth_map: A float Tensor of shape `(1, H, W)` holding the depth map
171
+ of the frame; values correspond to distances from the camera;
172
+ use `depth_mask` and `mask_crop` to filter for valid pixels.
173
+ depth_mask: A binary mask of shape `(1, H, W)` denoting pixels of the
174
+ depth map that are valid for evaluation, they have been checked for
175
+ consistency across views; elements are floats in {0.0, 1.0}.
176
+ mask_path: A qualified path to the foreground probability mask.
177
+ fg_probability: A Tensor of `(1, H, W)` denoting the probability of the
178
+ pixels belonging to the captured object; elements are floats
179
+ in [0, 1].
180
+ bbox_xywh: The bounding box tightly enclosing the foreground object in the
181
+ format (x0, y0, width, height). The convention assumes that
182
+ `x0+width` and `y0+height` includes the boundary of the box.
183
+ I.e., to slice out the corresponding crop from an image tensor `I`
184
+ we execute `crop = I[..., y0:y0+height, x0:x0+width]`
185
+ crop_bbox_xywh: The bounding box denoting the boundaries of `image_rgb`
186
+ in the original image coordinates in the format (x0, y0, width, height).
187
+ The convention is the same as for `bbox_xywh`. `crop_bbox_xywh` differs
188
+ from `bbox_xywh` due to padding (which can happen e.g. due to
189
+ setting `JsonIndexDataset.box_crop_context > 0`)
190
+ camera: A PyTorch3D camera object corresponding the frame's viewpoint,
191
+ corrected for cropping if it happened.
192
+ camera_quality_score: The score proportional to the confidence of the
193
+ frame's camera estimation (the higher the more accurate).
194
+ point_cloud_quality_score: The score proportional to the accuracy of the
195
+ frame's sequence point cloud (the higher the more accurate).
196
+ sequence_point_cloud_path: The path to the sequence's point cloud.
197
+ sequence_point_cloud: A PyTorch3D Pointclouds object holding the
198
+ point cloud corresponding to the frame's sequence. When the object
199
+ represents a batch of frames, point clouds may be deduplicated;
200
+ see `sequence_point_cloud_idx`.
201
+ sequence_point_cloud_idx: Integer indices mapping frame indices to the
202
+ corresponding point clouds in `sequence_point_cloud`; to get the
203
+ corresponding point cloud to `image_rgb[i]`, use
204
+ `sequence_point_cloud[sequence_point_cloud_idx[i]]`.
205
+ frame_type: The type of the loaded frame specified in
206
+ `subset_lists_file`, if provided.
207
+ meta: A dict for storing additional frame information.
208
+ """
209
+
210
+ frame_number: Optional[torch.LongTensor]
211
+ sequence_name: Union[str, List[str]]
212
+ sequence_category: Union[str, List[str]]
213
+ frame_timestamp: Optional[torch.Tensor] = None
214
+ image_size_hw: Optional[torch.Tensor] = None
215
+ image_path: Union[str, List[str], None] = None
216
+ image_rgb: Optional[torch.Tensor] = None
217
+ # masks out padding added due to cropping the square bit
218
+ mask_crop: Optional[torch.Tensor] = None
219
+ depth_path: Union[str, List[str], None] = ""
220
+ depth_map: Optional[torch.Tensor] = torch.zeros(1)
221
+ depth_mask: Optional[torch.Tensor] = torch.zeros(1)
222
+ mask_path: Union[str, List[str], None] = None
223
+ fg_probability: Optional[torch.Tensor] = None
224
+ bbox_xywh: Optional[torch.Tensor] = None
225
+ crop_bbox_xywh: Optional[torch.Tensor] = None
226
+ camera: Optional[PerspectiveCameras] = None
227
+ camera_quality_score: Optional[torch.Tensor] = None
228
+ point_cloud_quality_score: Optional[torch.Tensor] = None
229
+ sequence_point_cloud_path: Union[str, List[str], None] = ""
230
+ sequence_point_cloud: Optional[Pointclouds] = torch.zeros(1)
231
+ sequence_point_cloud_idx: Optional[torch.Tensor] = torch.zeros(1)
232
+ frame_type: Union[str, List[str], None] = "" # known | unseen
233
+ meta: dict = field(default_factory=lambda: {})
234
+ valid_region: Optional[torch.Tensor] = None
235
+ category_one_hot: Optional[torch.Tensor] = None
236
+
237
+ def to(self, *args, **kwargs):
238
+ new_params = {}
239
+ for f in fields(self):
240
+ value = getattr(self, f.name)
241
+ if isinstance(value, (torch.Tensor, Pointclouds, CamerasBase)):
242
+ new_params[f.name] = value.to(*args, **kwargs)
243
+ else:
244
+ new_params[f.name] = value
245
+ return type(self)(**new_params)
246
+
247
+ def cpu(self):
248
+ return self.to(device=torch.device("cpu"))
249
+
250
+ def cuda(self):
251
+ return self.to(device=torch.device("cuda"))
252
+
253
+ # the following functions make sure **frame_data can be passed to functions
254
+ def __iter__(self):
255
+ for f in fields(self):
256
+ yield f.name
257
+
258
+ def __getitem__(self, key):
259
+ return getattr(self, key)
260
+
261
+ def __len__(self):
262
+ return len(fields(self))
263
+
264
+ @classmethod
265
+ def collate(cls, batch):
266
+ """
267
+ Given a list objects `batch` of class `cls`, collates them into a batched
268
+ representation suitable for processing with deep networks.
269
+ """
270
+
271
+ elem = batch[0]
272
+
273
+ if isinstance(elem, cls):
274
+ pointcloud_ids = [id(el.sequence_point_cloud) for el in batch]
275
+ id_to_idx = defaultdict(list)
276
+ for i, pc_id in enumerate(pointcloud_ids):
277
+ id_to_idx[pc_id].append(i)
278
+
279
+ sequence_point_cloud = []
280
+ sequence_point_cloud_idx = -np.ones((len(batch),))
281
+ for i, ind in enumerate(id_to_idx.values()):
282
+ sequence_point_cloud_idx[ind] = i
283
+ sequence_point_cloud.append(batch[ind[0]].sequence_point_cloud)
284
+ assert (sequence_point_cloud_idx >= 0).all()
285
+
286
+ override_fields = {
287
+ "sequence_point_cloud": sequence_point_cloud,
288
+ "sequence_point_cloud_idx": sequence_point_cloud_idx.tolist(),
289
+ }
290
+ # note that the pre-collate value of sequence_point_cloud_idx is unused
291
+
292
+ collated = {}
293
+ for f in fields(elem):
294
+ list_values = override_fields.get(
295
+ f.name, [getattr(d, f.name) for d in batch]
296
+ )
297
+ collated[f.name] = (
298
+ cls.collate(list_values)
299
+ if all(list_value is not None for list_value in list_values)
300
+ else None
301
+ )
302
+ return cls(**collated)
303
+
304
+ elif isinstance(elem, Pointclouds):
305
+ return join_pointclouds_as_batch(batch)
306
+
307
+ elif isinstance(elem, CamerasBase):
308
+ # TODO: don't store K; enforce working in NDC space
309
+ return join_cameras_as_batch(batch)
310
+ else:
311
+ return torch.utils.data._utils.collate.default_collate(batch)
312
+
313
+
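A rough sketch of how FrameData batches frames (not part of the commit; every field value below is made up): tensors are stacked, cameras are joined into one batched camera, and strings become lists.

import torch
from pytorch3d.renderer.cameras import PerspectiveCameras

def _toy_frame(i):
    return FrameData(
        frame_number=torch.tensor(i, dtype=torch.long),
        sequence_name=f"toy_seq_{i}",
        sequence_category="hydrant",
        image_rgb=torch.rand(3, 64, 64),
        camera=PerspectiveCameras(R=torch.eye(3)[None], T=torch.zeros(1, 3)),
    )

batch = FrameData.collate([_toy_frame(0), _toy_frame(1)])
# batch.image_rgb: (2, 3, 64, 64); batch.frame_number: (2,); batch.camera: a 2-element camera batch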
314
+ # @ MODIFIED FROM https://github.com/facebookresearch/pytorch3d
315
+ class CO3Dv2Wrapper(torch.utils.data.Dataset):
316
+ def __init__(
317
+ self,
318
+ root_dir="/drive/datasets/co3d/",
319
+ category="hydrant",
320
+ subset="fewview_train",
321
+ stage="train",
322
+ sample_batch_size=20,
323
+ image_size=256,
324
+ masked=False,
325
+ deprecated_val_region=False,
326
+ return_frame_data_list=False,
327
+ reso: int = 256,
328
+ mask_type: str = "random",
329
+ cond_aug_mean=-3.0,
330
+ cond_aug_std=0.5,
331
+ condition_on_elevation=False,
332
+ fps_id=0.0,
333
+ motion_bucket_id=300.0,
334
+ num_frames: int = 20,
335
+ use_mask: bool = True,
336
+ load_pixelnerf: bool = True,
337
+ scale_pose: bool = True,
338
+ max_n_cond: int = 5,
339
+ min_n_cond: int = 2,
340
+ cond_on_multi: bool = False,
341
+ ):
342
+ root = root_dir
343
+ from typing import List
344
+
345
+ from co3d.dataset.data_types import (
346
+ FrameAnnotation,
347
+ SequenceAnnotation,
348
+ load_dataclass_jgzip,
349
+ )
350
+
351
+ self.dataset_root = root
352
+ self.path_manager = None
353
+ self.subset = subset
354
+ self.stage = stage
355
+ self.subset_lists_file: List[str] = [
356
+ f"{self.dataset_root}/{category}/set_lists/set_lists_{subset}.json"
357
+ ]
358
+ self.subsets: Optional[List[str]] = [subset]
359
+ self.sample_batch_size = sample_batch_size
360
+ self.limit_to: int = 0
361
+ self.limit_sequences_to: int = 0
362
+ self.pick_sequence: Tuple[str, ...] = ()
363
+ self.exclude_sequence: Tuple[str, ...] = ()
364
+ self.limit_category_to: Tuple[int, ...] = ()
365
+ self.load_images: bool = True
366
+ self.load_depths: bool = False
367
+ self.load_depth_masks: bool = False
368
+ self.load_masks: bool = True
369
+ self.load_point_clouds: bool = False
370
+ self.max_points: int = 0
371
+ self.mask_images: bool = False
372
+ self.mask_depths: bool = False
373
+ self.image_height: Optional[int] = image_size
374
+ self.image_width: Optional[int] = image_size
375
+ self.box_crop: bool = True
376
+ self.box_crop_mask_thr: float = 0.4
377
+ self.box_crop_context: float = 0.3
378
+ self.remove_empty_masks: bool = True
379
+ self.n_frames_per_sequence: int = -1
380
+ self.seed: int = 0
381
+ self.sort_frames: bool = False
382
+ self.eval_batches: Any = None
383
+
384
+ self.img_h = self.image_height
385
+ self.img_w = self.image_width
386
+ self.masked = masked
387
+ self.deprecated_val_region = deprecated_val_region
388
+ self.return_frame_data_list = return_frame_data_list
389
+
390
+ self.reso = reso
391
+ self.num_frames = num_frames
392
+ self.cond_aug_mean = cond_aug_mean
393
+ self.cond_aug_std = cond_aug_std
394
+ self.condition_on_elevation = condition_on_elevation
395
+ self.fps_id = fps_id
396
+ self.motion_bucket_id = motion_bucket_id
397
+ self.mask_type = mask_type
398
+ self.use_mask = use_mask
399
+ self.load_pixelnerf = load_pixelnerf
400
+ self.scale_pose = scale_pose
401
+ self.max_n_cond = max_n_cond
402
+ self.min_n_cond = min_n_cond
403
+ self.cond_on_multi = cond_on_multi
404
+
405
+ if self.cond_on_multi:
406
+ assert self.min_n_cond == self.max_n_cond
407
+
408
+ start_time = time.time()
409
+ if "all_" in category or category == "all":
410
+ self.category_frame_annotations = []
411
+ self.category_sequence_annotations = []
412
+ self.subset_lists_file = []
413
+
414
+ if category == "all":
415
+ cats = CO3D_ALL_CATEGORIES
416
+ elif category == "all_four":
417
+ cats = ["hydrant", "teddybear", "motorcycle", "bench"]
418
+ elif category == "all_ten":
419
+ cats = [
420
+ "donut",
421
+ "apple",
422
+ "hydrant",
423
+ "vase",
424
+ "cake",
425
+ "ball",
426
+ "bench",
427
+ "suitcase",
428
+ "teddybear",
429
+ "plant",
430
+ ]
431
+ elif category == "all_15":
432
+ cats = [
433
+ "hydrant",
434
+ "teddybear",
435
+ "motorcycle",
436
+ "bench",
437
+ "hotdog",
438
+ "remote",
439
+ "suitcase",
440
+ "donut",
441
+ "plant",
442
+ "toaster",
443
+ "keyboard",
444
+ "handbag",
445
+ "toyplane",
446
+ "tv",
447
+ "orange",
448
+ ]
449
+ else:
450
+ print("UNSPECIFIED CATEGORY SUBSET")
451
+ cats = ["hydrant", "teddybear"]
452
+ print("loading", cats)
453
+ for cat in cats:
454
+ self.category_frame_annotations.extend(
455
+ load_dataclass_jgzip(
456
+ f"{self.dataset_root}/{cat}/frame_annotations.jgz",
457
+ List[FrameAnnotation],
458
+ )
459
+ )
460
+ self.category_sequence_annotations.extend(
461
+ load_dataclass_jgzip(
462
+ f"{self.dataset_root}/{cat}/sequence_annotations.jgz",
463
+ List[SequenceAnnotation],
464
+ )
465
+ )
466
+ self.subset_lists_file.append(
467
+ f"{self.dataset_root}/{cat}/set_lists/set_lists_{subset}.json"
468
+ )
469
+
470
+ else:
471
+ self.category_frame_annotations = load_dataclass_jgzip(
472
+ f"{self.dataset_root}/{category}/frame_annotations.jgz",
473
+ List[FrameAnnotation],
474
+ )
475
+ self.category_sequence_annotations = load_dataclass_jgzip(
476
+ f"{self.dataset_root}/{category}/sequence_annotations.jgz",
477
+ List[SequenceAnnotation],
478
+ )
479
+
480
+ self.subset_to_image_path = None
481
+ self._load_frames()
482
+ self._load_sequences()
483
+ self._sort_frames()
484
+ self._load_subset_lists()
485
+ self._filter_db() # also computes sequence indices
486
+ # self._extract_and_set_eval_batches()
487
+ # print(self.eval_batches)
488
+ logger.info(str(self))
489
+
490
+ self.seq_to_frames = {}
491
+ for fi, item in enumerate(self.frame_annots):
492
+ if item["frame_annotation"].sequence_name in self.seq_to_frames:
493
+ self.seq_to_frames[item["frame_annotation"].sequence_name].append(fi)
494
+ else:
495
+ self.seq_to_frames[item["frame_annotation"].sequence_name] = [fi]
496
+
497
+ if self.stage != "test" or self.subset != "fewview_test":
498
+ count = 0
499
+ new_seq_to_frames = {}
500
+ for item in self.seq_to_frames:
501
+ if len(self.seq_to_frames[item]) > 10:
502
+ count += 1
503
+ new_seq_to_frames[item] = self.seq_to_frames[item]
504
+ self.seq_to_frames = new_seq_to_frames
505
+
506
+ self.seq_list = list(self.seq_to_frames.keys())
507
+
508
+ # @ REMOVE A FEW TRAINING SEQS THAT CAUSE BUGS
509
+ remove_list = ["411_55952_107659", "376_42884_85882"]
510
+ for remove_idx in remove_list:
511
+ if remove_idx in self.seq_to_frames:
512
+ self.seq_list.remove(remove_idx)
513
+ print("removing", remove_idx)
514
+
515
+ print("total training seq", len(self.seq_to_frames))
516
+ print("data loading took", time.time() - start_time, "seconds")
517
+
518
+ self.all_category_list = list(CO3D_ALL_CATEGORIES)
519
+ self.all_category_list.sort()
520
+ self.cat_to_idx = {}
521
+ for ci, cname in enumerate(self.all_category_list):
522
+ self.cat_to_idx[cname] = ci
523
+
524
+ def __len__(self):
525
+ return len(self.seq_list)
526
+
527
+ def __getitem__(self, index):
528
+ seq_index = self.seq_list[index]
529
+
530
+ if self.subset == "fewview_test" and self.stage == "test":
531
+ batch_idx = torch.arange(len(self.seq_to_frames[seq_index]))
532
+
533
+ elif self.stage == "test":
534
+ batch_idx = (
535
+ torch.linspace(
536
+ 0, len(self.seq_to_frames[seq_index]) - 1, self.sample_batch_size
537
+ )
538
+ .long()
539
+ .tolist()
540
+ )
541
+ else:
542
+ rand = torch.randperm(len(self.seq_to_frames[seq_index]))
543
+ batch_idx = rand[: min(len(rand), self.sample_batch_size)]
544
+
545
+ frame_data_list = []
546
+ idx_list = []
547
+ timestamp_list = []
548
+ for idx in batch_idx:
549
+ idx_list.append(self.seq_to_frames[seq_index][idx])
550
+ timestamp_list.append(
551
+ self.frame_annots[self.seq_to_frames[seq_index][idx]][
552
+ "frame_annotation"
553
+ ].frame_timestamp
554
+ )
555
+ frame_data_list.append(
556
+ self._get_frame(int(self.seq_to_frames[seq_index][idx]))
557
+ )
558
+
559
+ time_order = torch.argsort(torch.tensor(timestamp_list))
560
+ frame_data_list = [frame_data_list[i] for i in time_order]
561
+
562
+ frame_data = FrameData.collate(frame_data_list)
563
+ image_size = torch.Tensor([self.image_height]).repeat(
564
+ frame_data.camera.R.shape[0], 2
565
+ )
566
+ frame_dict = {
567
+ "R": frame_data.camera.R,
568
+ "T": frame_data.camera.T,
569
+ "f": frame_data.camera.focal_length,
570
+ "c": frame_data.camera.principal_point,
571
+ "images": frame_data.image_rgb * frame_data.fg_probability
572
+ + (1 - frame_data.fg_probability),
573
+ "valid_region": frame_data.mask_crop,
574
+ "bbox": frame_data.valid_region,
575
+ "image_size": image_size,
576
+ "frame_type": frame_data.frame_type,
577
+ "idx": seq_index,
578
+ "category": frame_data.category_one_hot,
579
+ }
580
+ if not self.masked:
581
+ frame_dict["images_full"] = frame_data.image_rgb
582
+ frame_dict["masks"] = frame_data.fg_probability
583
+ frame_dict["mask_crop"] = frame_data.mask_crop
584
+
585
+ cond_aug = np.exp(
586
+ np.random.randn(1)[0] * self.cond_aug_std + self.cond_aug_mean
587
+ )
588
+
589
+ def _pad(input):
590
+ return torch.cat([input, torch.flip(input, dims=[0])], dim=0)[
591
+ : self.num_frames
592
+ ]
593
+
594
+ if len(frame_dict["images"]) < self.num_frames:
595
+ for k in frame_dict:
596
+ if isinstance(frame_dict[k], torch.Tensor):
597
+ frame_dict[k] = _pad(frame_dict[k])
598
+
599
+ data = dict()
600
+ if "images_full" in frame_dict:
601
+ frames = frame_dict["images_full"] * 2 - 1
602
+ else:
603
+ frames = frame_dict["images"] * 2 - 1
604
+ data["frames"] = frames
605
+ cond = frames[0]
606
+ data["cond_frames_without_noise"] = cond
607
+ data["cond_aug"] = torch.as_tensor([cond_aug] * self.num_frames)
608
+ data["cond_frames"] = cond + cond_aug * torch.randn_like(cond)
609
+ data["fps_id"] = torch.as_tensor([self.fps_id] * self.num_frames)
610
+ data["motion_bucket_id"] = torch.as_tensor(
611
+ [self.motion_bucket_id] * self.num_frames
612
+ )
613
+ data["num_video_frames"] = self.num_frames
614
+ data["image_only_indicator"] = torch.as_tensor([0.0] * self.num_frames)
615
+
616
+ if self.load_pixelnerf:
617
+ data["pixelnerf_input"] = dict()
618
+ # Rs = frame_dict["R"].transpose(-1, -2)
619
+ # Ts = frame_dict["T"]
620
+ # Rs[:, :, 2] *= -1
621
+ # Rs[:, :, 0] *= -1
622
+ # Ts[:, 2] *= -1
623
+ # Ts[:, 0] *= -1
624
+ # c2ws = torch.zeros(Rs.shape[0], 4, 4)
625
+ # c2ws[:, :3, :3] = Rs
626
+ # c2ws[:, :3, 3] = Ts
627
+ # c2ws[:, 3, 3] = 1
628
+ # c2ws = c2ws.inverse()
629
+ # # c2ws[..., 0] *= -1
630
+ # # c2ws[..., 2] *= -1
631
+ # cx = frame_dict["c"][:, 0]
632
+ # cy = frame_dict["c"][:, 1]
633
+ # fx = frame_dict["f"][:, 0]
634
+ # fy = frame_dict["f"][:, 1]
635
+ # intrinsics = torch.zeros(cx.shape[0], 3, 3)
636
+ # intrinsics[:, 2, 2] = 1
637
+ # intrinsics[:, 0, 0] = fx
638
+ # intrinsics[:, 1, 1] = fy
639
+ # intrinsics[:, 0, 2] = cx
640
+ # intrinsics[:, 1, 2] = cy
641
+
642
+ scene_cameras = PerspectiveCameras(
643
+ R=frame_dict["R"],
644
+ T=frame_dict["T"],
645
+ focal_length=frame_dict["f"],
646
+ principal_point=frame_dict["c"],
647
+ image_size=frame_dict["image_size"],
648
+ )
649
+ R, T, intrinsics = opencv_from_cameras_projection(
650
+ scene_cameras, frame_dict["image_size"]
651
+ )
652
+ c2ws = torch.zeros(R.shape[0], 4, 4)
653
+ c2ws[:, :3, :3] = R
654
+ c2ws[:, :3, 3] = T
655
+ c2ws[:, 3, 3] = 1.0
656
+ c2ws = c2ws.inverse()
657
+ c2ws[..., 1:3] *= -1
658
+ intrinsics[:, :2] /= 256
659
+
660
+ cameras = torch.zeros(c2ws.shape[0], 25)
661
+ cameras[..., :16] = c2ws.reshape(-1, 16)
662
+ cameras[..., 16:] = intrinsics.reshape(-1, 9)
663
+ if self.scale_pose:
664
+ c2ws = cameras[..., :16].reshape(-1, 4, 4)
665
+ center = c2ws[:, :3, 3].mean(0)
666
+ radius = (c2ws[:, :3, 3] - center).norm(dim=-1).max()
667
+ scale = 1.5 / radius
668
+ c2ws[..., :3, 3] = (c2ws[..., :3, 3] - center) * scale
669
+ cameras[..., :16] = c2ws.reshape(-1, 16)
670
+
671
+ data["pixelnerf_input"]["frames"] = frames
672
+ data["pixelnerf_input"]["cameras"] = cameras
673
+ data["pixelnerf_input"]["rgb"] = (
674
+ F.interpolate(
675
+ frames,
676
+ (self.image_width // 8, self.image_height // 8),
677
+ mode="bilinear",
678
+ align_corners=False,
679
+ )
680
+ + 1
681
+ ) * 0.5
682
+
683
+ return data
684
+ # if self.return_frame_data_list:
685
+ # return (frame_dict, frame_data_list)
686
+ # return frame_dict
687
+
688
+ def collate_fn(self, batch):
689
+ # a hack to add a source index and keep it consistent within a batch
690
+ if self.max_n_cond > 1:
691
+ # TODO implement this
692
+ n_cond = np.random.randint(self.min_n_cond, self.max_n_cond + 1)
693
+ # debug
694
+ # source_index = [0]
695
+ if n_cond > 1:
696
+ for b in batch:
697
+ source_index = [0] + np.random.choice(
698
+ np.arange(1, self.num_frames),
699
+ self.max_n_cond - 1,
700
+ replace=False,
701
+ ).tolist()
702
+ b["pixelnerf_input"]["source_index"] = torch.as_tensor(source_index)
703
+ b["pixelnerf_input"]["n_cond"] = n_cond
704
+ b["pixelnerf_input"]["source_images"] = b["frames"][source_index]
705
+ b["pixelnerf_input"]["source_cameras"] = b["pixelnerf_input"][
706
+ "cameras"
707
+ ][source_index]
708
+
709
+ if self.cond_on_multi:
710
+ b["cond_frames_without_noise"] = b["frames"][source_index]
711
+
712
+ ret = video_collate_fn(batch)
713
+
714
+ if self.cond_on_multi:
715
+ ret["cond_frames_without_noise"] = rearrange(
716
+ ret["cond_frames_without_noise"], "b t ... -> (b t) ..."
717
+ )
718
+
719
+ return ret
720
+
721
+ def _get_frame(self, index):
722
+ # if index >= len(self.frame_annots):
723
+ # raise IndexError(f"index {index} out of range {len(self.frame_annots)}")
724
+
725
+ entry = self.frame_annots[index]["frame_annotation"]
726
+ # pyre-ignore[16]
727
+ point_cloud = self.seq_annots[entry.sequence_name].point_cloud
728
+ frame_data = FrameData(
729
+ frame_number=_safe_as_tensor(entry.frame_number, torch.long),
730
+ frame_timestamp=_safe_as_tensor(entry.frame_timestamp, torch.float),
731
+ sequence_name=entry.sequence_name,
732
+ sequence_category=self.seq_annots[entry.sequence_name].category,
733
+ camera_quality_score=_safe_as_tensor(
734
+ self.seq_annots[entry.sequence_name].viewpoint_quality_score,
735
+ torch.float,
736
+ ),
737
+ point_cloud_quality_score=_safe_as_tensor(
738
+ point_cloud.quality_score, torch.float
739
+ )
740
+ if point_cloud is not None
741
+ else None,
742
+ )
743
+
744
+ # The rest of the fields are optional
745
+ frame_data.frame_type = self._get_frame_type(self.frame_annots[index])
746
+
747
+ (
748
+ frame_data.fg_probability,
749
+ frame_data.mask_path,
750
+ frame_data.bbox_xywh,
751
+ clamp_bbox_xyxy,
752
+ frame_data.crop_bbox_xywh,
753
+ ) = self._load_crop_fg_probability(entry)
754
+
755
+ scale = 1.0
756
+ if self.load_images and entry.image is not None:
757
+ # original image size
758
+ frame_data.image_size_hw = _safe_as_tensor(entry.image.size, torch.long)
759
+
760
+ (
761
+ frame_data.image_rgb,
762
+ frame_data.image_path,
763
+ frame_data.mask_crop,
764
+ scale,
765
+ ) = self._load_crop_images(
766
+ entry, frame_data.fg_probability, clamp_bbox_xyxy
767
+ )
768
+ # print(frame_data.fg_probability.sum())
769
+ # print('scale', scale)
770
+
771
+ #! INSERT
772
+ if self.deprecated_val_region:
773
+ # print(frame_data.crop_bbox_xywh)
774
+ valid_bbox = _bbox_xywh_to_xyxy(frame_data.crop_bbox_xywh).float()
775
+ # print(valid_bbox, frame_data.image_size_hw)
776
+ valid_bbox[0] = torch.clip(
777
+ (
778
+ valid_bbox[0]
779
+ - torch.div(frame_data.image_size_hw[1], 2, rounding_mode="floor")
780
+ )
781
+ / torch.div(frame_data.image_size_hw[1], 2, rounding_mode="floor"),
782
+ -1.0,
783
+ 1.0,
784
+ )
785
+ valid_bbox[1] = torch.clip(
786
+ (
787
+ valid_bbox[1]
788
+ - torch.div(frame_data.image_size_hw[0], 2, rounding_mode="floor")
789
+ )
790
+ / torch.div(frame_data.image_size_hw[0], 2, rounding_mode="floor"),
791
+ -1.0,
792
+ 1.0,
793
+ )
794
+ valid_bbox[2] = torch.clip(
795
+ (
796
+ valid_bbox[2]
797
+ - torch.div(frame_data.image_size_hw[1], 2, rounding_mode="floor")
798
+ )
799
+ / torch.div(frame_data.image_size_hw[1], 2, rounding_mode="floor"),
800
+ -1.0,
801
+ 1.0,
802
+ )
803
+ valid_bbox[3] = torch.clip(
804
+ (
805
+ valid_bbox[3]
806
+ - torch.div(frame_data.image_size_hw[0], 2, rounding_mode="floor")
807
+ )
808
+ / torch.div(frame_data.image_size_hw[0], 2, rounding_mode="floor"),
809
+ -1.0,
810
+ 1.0,
811
+ )
812
+ # print(valid_bbox)
813
+ frame_data.valid_region = valid_bbox
814
+ else:
815
+ #! UPDATED VALID BBOX
816
+ if self.stage == "train":
817
+ assert self.image_height == 256 and self.image_width == 256
818
+ valid = torch.nonzero(frame_data.mask_crop[0])
819
+ min_y = valid[:, 0].min()
820
+ min_x = valid[:, 1].min()
821
+ max_y = valid[:, 0].max()
822
+ max_x = valid[:, 1].max()
823
+ valid_bbox = torch.tensor(
824
+ [min_y, min_x, max_y, max_x], device=frame_data.image_rgb.device
825
+ ).unsqueeze(0)
826
+ valid_bbox = torch.clip(
827
+ (valid_bbox - (256 // 2)) / (256 // 2), -1.0, 1.0
828
+ )
829
+ frame_data.valid_region = valid_bbox[0]
830
+ else:
831
+ valid = torch.nonzero(frame_data.mask_crop[0])
832
+ min_y = valid[:, 0].min()
833
+ min_x = valid[:, 1].min()
834
+ max_y = valid[:, 0].max()
835
+ max_x = valid[:, 1].max()
836
+ valid_bbox = torch.tensor(
837
+ [min_y, min_x, max_y, max_x], device=frame_data.image_rgb.device
838
+ ).unsqueeze(0)
839
+ valid_bbox = torch.clip(
840
+ (valid_bbox - (self.image_height // 2)) / (self.image_height // 2),
841
+ -1.0,
842
+ 1.0,
843
+ )
844
+ frame_data.valid_region = valid_bbox[0]
845
+
846
+ #! SET CLASS ONEHOT
847
+ frame_data.category_one_hot = torch.zeros(
848
+ (len(self.all_category_list)), device=frame_data.image_rgb.device
849
+ )
850
+ frame_data.category_one_hot[self.cat_to_idx[frame_data.sequence_category]] = 1
851
+
852
+ if self.load_depths and entry.depth is not None:
853
+ (
854
+ frame_data.depth_map,
855
+ frame_data.depth_path,
856
+ frame_data.depth_mask,
857
+ ) = self._load_mask_depth(entry, clamp_bbox_xyxy, frame_data.fg_probability)
858
+
859
+ if entry.viewpoint is not None:
860
+ frame_data.camera = self._get_pytorch3d_camera(
861
+ entry,
862
+ scale,
863
+ clamp_bbox_xyxy,
864
+ )
865
+
866
+ if self.load_point_clouds and point_cloud is not None:
867
+ frame_data.sequence_point_cloud_path = pcl_path = os.path.join(
868
+ self.dataset_root, point_cloud.path
869
+ )
870
+ frame_data.sequence_point_cloud = _load_pointcloud(
871
+ self._local_path(pcl_path), max_points=self.max_points
872
+ )
873
+
874
+ # for key in frame_data:
875
+ # if frame_data[key] == None:
876
+ # print(key)
877
+ return frame_data
878
+
879
+ def _extract_and_set_eval_batches(self):
880
+ """
881
+ Sets eval_batches based on input eval_batch_index.
882
+ """
883
+ if self.eval_batch_index is not None:
884
+ if self.eval_batches is not None:
885
+ raise ValueError(
886
+ "Cannot define both eval_batch_index and eval_batches."
887
+ )
888
+ self.eval_batches = self.seq_frame_index_to_dataset_index(
889
+ self.eval_batch_index
890
+ )
891
+
892
+ def _load_crop_fg_probability(
893
+ self, entry: types.FrameAnnotation
894
+ ) -> Tuple[
895
+ Optional[torch.Tensor],
896
+ Optional[str],
897
+ Optional[torch.Tensor],
898
+ Optional[torch.Tensor],
899
+ Optional[torch.Tensor],
900
+ ]:
901
+ fg_probability = None
902
+ full_path = None
903
+ bbox_xywh = None
904
+ clamp_bbox_xyxy = None
905
+ crop_box_xywh = None
906
+
907
+ if (self.load_masks or self.box_crop) and entry.mask is not None:
908
+ full_path = os.path.join(self.dataset_root, entry.mask.path)
909
+ mask = _load_mask(self._local_path(full_path))
910
+
911
+ if mask.shape[-2:] != entry.image.size:
912
+ raise ValueError(
913
+ f"bad mask size: {mask.shape[-2:]} vs {entry.image.size}!"
914
+ )
915
+
916
+ bbox_xywh = torch.tensor(_get_bbox_from_mask(mask, self.box_crop_mask_thr))
917
+
918
+ if self.box_crop:
919
+ clamp_bbox_xyxy = _clamp_box_to_image_bounds_and_round(
920
+ _get_clamp_bbox(
921
+ bbox_xywh,
922
+ image_path=entry.image.path,
923
+ box_crop_context=self.box_crop_context,
924
+ ),
925
+ image_size_hw=tuple(mask.shape[-2:]),
926
+ )
927
+ crop_box_xywh = _bbox_xyxy_to_xywh(clamp_bbox_xyxy)
928
+
929
+ mask = _crop_around_box(mask, clamp_bbox_xyxy, full_path)
930
+
931
+ fg_probability, _, _ = self._resize_image(mask, mode="nearest")
932
+
933
+ return fg_probability, full_path, bbox_xywh, clamp_bbox_xyxy, crop_box_xywh
934
+
935
+ def _load_crop_images(
936
+ self,
937
+ entry: types.FrameAnnotation,
938
+ fg_probability: Optional[torch.Tensor],
939
+ clamp_bbox_xyxy: Optional[torch.Tensor],
940
+ ) -> Tuple[torch.Tensor, str, torch.Tensor, float]:
941
+ assert self.dataset_root is not None and entry.image is not None
942
+ path = os.path.join(self.dataset_root, entry.image.path)
943
+ image_rgb = _load_image(self._local_path(path))
944
+
945
+ if image_rgb.shape[-2:] != entry.image.size:
946
+ raise ValueError(
947
+ f"bad image size: {image_rgb.shape[-2:]} vs {entry.image.size}!"
948
+ )
949
+
950
+ if self.box_crop:
951
+ assert clamp_bbox_xyxy is not None
952
+ image_rgb = _crop_around_box(image_rgb, clamp_bbox_xyxy, path)
953
+
954
+ image_rgb, scale, mask_crop = self._resize_image(image_rgb)
955
+
956
+ if self.mask_images:
957
+ assert fg_probability is not None
958
+ image_rgb *= fg_probability
959
+
960
+ return image_rgb, path, mask_crop, scale
961
+
962
+ def _load_mask_depth(
963
+ self,
964
+ entry: types.FrameAnnotation,
965
+ clamp_bbox_xyxy: Optional[torch.Tensor],
966
+ fg_probability: Optional[torch.Tensor],
967
+ ) -> Tuple[torch.Tensor, str, torch.Tensor]:
968
+ entry_depth = entry.depth
969
+ assert entry_depth is not None
970
+ path = os.path.join(self.dataset_root, entry_depth.path)
971
+ depth_map = _load_depth(self._local_path(path), entry_depth.scale_adjustment)
972
+
973
+ if self.box_crop:
974
+ assert clamp_bbox_xyxy is not None
975
+ depth_bbox_xyxy = _rescale_bbox(
976
+ clamp_bbox_xyxy, entry.image.size, depth_map.shape[-2:]
977
+ )
978
+ depth_map = _crop_around_box(depth_map, depth_bbox_xyxy, path)
979
+
980
+ depth_map, _, _ = self._resize_image(depth_map, mode="nearest")
981
+
982
+ if self.mask_depths:
983
+ assert fg_probability is not None
984
+ depth_map *= fg_probability
985
+
986
+ if self.load_depth_masks:
987
+ assert entry_depth.mask_path is not None
988
+ mask_path = os.path.join(self.dataset_root, entry_depth.mask_path)
989
+ depth_mask = _load_depth_mask(self._local_path(mask_path))
990
+
991
+ if self.box_crop:
992
+ assert clamp_bbox_xyxy is not None
993
+ depth_mask_bbox_xyxy = _rescale_bbox(
994
+ clamp_bbox_xyxy, entry.image.size, depth_mask.shape[-2:]
995
+ )
996
+ depth_mask = _crop_around_box(
997
+ depth_mask, depth_mask_bbox_xyxy, mask_path
998
+ )
999
+
1000
+ depth_mask, _, _ = self._resize_image(depth_mask, mode="nearest")
1001
+ else:
1002
+ depth_mask = torch.ones_like(depth_map)
1003
+
1004
+ return depth_map, path, depth_mask
1005
+
1006
+ def _get_pytorch3d_camera(
1007
+ self,
1008
+ entry: types.FrameAnnotation,
1009
+ scale: float,
1010
+ clamp_bbox_xyxy: Optional[torch.Tensor],
1011
+ ) -> PerspectiveCameras:
1012
+ entry_viewpoint = entry.viewpoint
1013
+ assert entry_viewpoint is not None
1014
+ # principal point and focal length
1015
+ principal_point = torch.tensor(
1016
+ entry_viewpoint.principal_point, dtype=torch.float
1017
+ )
1018
+ focal_length = torch.tensor(entry_viewpoint.focal_length, dtype=torch.float)
1019
+
1020
+ half_image_size_wh_orig = (
1021
+ torch.tensor(list(reversed(entry.image.size)), dtype=torch.float) / 2.0
1022
+ )
1023
+
1024
+ # first, we convert from the dataset's NDC convention to pixels
1025
+ format = entry_viewpoint.intrinsics_format
1026
+ if format.lower() == "ndc_norm_image_bounds":
1027
+ # this is e.g. currently used in CO3D for storing intrinsics
1028
+ rescale = half_image_size_wh_orig
1029
+ elif format.lower() == "ndc_isotropic":
1030
+ rescale = half_image_size_wh_orig.min()
1031
+ else:
1032
+ raise ValueError(f"Unknown intrinsics format: {format}")
1033
+
1034
+ # principal point and focal length in pixels
1035
+ principal_point_px = half_image_size_wh_orig - principal_point * rescale
1036
+ focal_length_px = focal_length * rescale
1037
+ if self.box_crop:
1038
+ assert clamp_bbox_xyxy is not None
1039
+ principal_point_px -= clamp_bbox_xyxy[:2]
1040
+
1041
+ # now, convert from pixels to PyTorch3D v0.5+ NDC convention
1042
+ if self.image_height is None or self.image_width is None:
1043
+ out_size = list(reversed(entry.image.size))
1044
+ else:
1045
+ out_size = [self.image_width, self.image_height]
1046
+
1047
+ half_image_size_output = torch.tensor(out_size, dtype=torch.float) / 2.0
1048
+ half_min_image_size_output = half_image_size_output.min()
1049
+
1050
+ # rescaled principal point and focal length in ndc
1051
+ principal_point = (
1052
+ half_image_size_output - principal_point_px * scale
1053
+ ) / half_min_image_size_output
1054
+ focal_length = focal_length_px * scale / half_min_image_size_output
1055
+
1056
+ return PerspectiveCameras(
1057
+ focal_length=focal_length[None],
1058
+ principal_point=principal_point[None],
1059
+ R=torch.tensor(entry_viewpoint.R, dtype=torch.float)[None],
1060
+ T=torch.tensor(entry_viewpoint.T, dtype=torch.float)[None],
1061
+ )
1062
+
1063
+ def _load_frames(self) -> None:
1064
+ self.frame_annots = [
1065
+ FrameAnnotsEntry(frame_annotation=a, subset=None)
1066
+ for a in self.category_frame_annotations
1067
+ ]
1068
+
1069
+ def _load_sequences(self) -> None:
1070
+ self.seq_annots = {
1071
+ entry.sequence_name: entry for entry in self.category_sequence_annotations
1072
+ }
1073
+
1074
+ def _load_subset_lists(self) -> None:
1075
+ logger.info(f"Loading Co3D subset lists from {self.subset_lists_file}.")
1076
+ if not self.subset_lists_file:
1077
+ return
1078
+
1079
+ frame_path_to_subset = {}
1080
+
1081
+ for subset_list_file in self.subset_lists_file:
1082
+ with open(self._local_path(subset_list_file), "r") as f:
1083
+ subset_to_seq_frame = json.load(f)
1084
+
1085
+ #! PRINT SUBSET_LIST STATS
1086
+ # if len(self.subset_lists_file) == 1:
1087
+ # print('train frames', len(subset_to_seq_frame['train']))
1088
+ # print('val frames', len(subset_to_seq_frame['val']))
1089
+ # print('test frames', len(subset_to_seq_frame['test']))
1090
+
1091
+ for set_ in subset_to_seq_frame:
1092
+ for _, _, path in subset_to_seq_frame[set_]:
1093
+ if path in frame_path_to_subset:
1094
+ frame_path_to_subset[path].add(set_)
1095
+ else:
1096
+ frame_path_to_subset[path] = {set_}
1097
+
1098
+ # pyre-ignore[16]
1099
+ for frame in self.frame_annots:
1100
+ frame["subset"] = frame_path_to_subset.get(
1101
+ frame["frame_annotation"].image.path, None
1102
+ )
1103
+
1104
+ if frame["subset"] is None:
1105
+ continue
1106
+ warnings.warn(
1107
+ "Subset lists are given but don't include "
1108
+ + frame["frame_annotation"].image.path
1109
+ )
1110
+
1111
+ def _sort_frames(self) -> None:
1112
+ # Sort frames to have them grouped by sequence, ordered by timestamp
1113
+ # pyre-ignore[16]
1114
+ self.frame_annots = sorted(
1115
+ self.frame_annots,
1116
+ key=lambda f: (
1117
+ f["frame_annotation"].sequence_name,
1118
+ f["frame_annotation"].frame_timestamp or 0,
1119
+ ),
1120
+ )
1121
+
1122
+ def _filter_db(self) -> None:
1123
+ if self.remove_empty_masks:
1124
+ logger.info("Removing images with empty masks.")
1125
+ # pyre-ignore[16]
1126
+ old_len = len(self.frame_annots)
1127
+
1128
+ msg = "remove_empty_masks needs every MaskAnnotation.mass to be set."
1129
+
1130
+ def positive_mass(frame_annot: types.FrameAnnotation) -> bool:
1131
+ mask = frame_annot.mask
1132
+ if mask is None:
1133
+ return False
1134
+ if mask.mass is None:
1135
+ raise ValueError(msg)
1136
+ return mask.mass > 1
1137
+
1138
+ self.frame_annots = [
1139
+ frame
1140
+ for frame in self.frame_annots
1141
+ if positive_mass(frame["frame_annotation"])
1142
+ ]
1143
+ logger.info("... filtered %d -> %d" % (old_len, len(self.frame_annots)))
1144
+
1145
+ # this has to be called after joining with categories!!
1146
+ subsets = self.subsets
1147
+ if subsets:
1148
+ if not self.subset_lists_file:
1149
+ raise ValueError(
1150
+ "Subset filter is on but subset_lists_file was not given"
1151
+ )
1152
+
1153
+ logger.info(f"Limiting Co3D dataset to the '{subsets}' subsets.")
1154
+
1155
+ # truncate the list of subsets to the valid one
1156
+ self.frame_annots = [
1157
+ entry
1158
+ for entry in self.frame_annots
1159
+ if (entry["subset"] is not None and self.stage in entry["subset"])
1160
+ ]
1161
+
1162
+ if len(self.frame_annots) == 0:
1163
+ raise ValueError(f"There are no frames in the '{subsets}' subsets!")
1164
+
1165
+ self._invalidate_indexes(filter_seq_annots=True)
1166
+
1167
+ if len(self.limit_category_to) > 0:
1168
+ logger.info(f"Limiting dataset to categories: {self.limit_category_to}")
1169
+ # pyre-ignore[16]
1170
+ self.seq_annots = {
1171
+ name: entry
1172
+ for name, entry in self.seq_annots.items()
1173
+ if entry.category in self.limit_category_to
1174
+ }
1175
+
1176
+ # sequence filters
1177
+ for prefix in ("pick", "exclude"):
1178
+ orig_len = len(self.seq_annots)
1179
+ attr = f"{prefix}_sequence"
1180
+ arr = getattr(self, attr)
1181
+ if len(arr) > 0:
1182
+ logger.info(f"{attr}: {str(arr)}")
1183
+ self.seq_annots = {
1184
+ name: entry
1185
+ for name, entry in self.seq_annots.items()
1186
+ if (name in arr) == (prefix == "pick")
1187
+ }
1188
+ logger.info("... filtered %d -> %d" % (orig_len, len(self.seq_annots)))
1189
+
1190
+ if self.limit_sequences_to > 0:
1191
+ self.seq_annots = dict(
1192
+ islice(self.seq_annots.items(), self.limit_sequences_to)
1193
+ )
1194
+
1195
+ # retain only frames from retained sequences
1196
+ self.frame_annots = [
1197
+ f
1198
+ for f in self.frame_annots
1199
+ if f["frame_annotation"].sequence_name in self.seq_annots
1200
+ ]
1201
+
1202
+ self._invalidate_indexes()
1203
+
1204
+ if self.n_frames_per_sequence > 0:
1205
+ logger.info(f"Taking max {self.n_frames_per_sequence} per sequence.")
1206
+ keep_idx = []
1207
+ # pyre-ignore[16]
1208
+ for seq, seq_indices in self._seq_to_idx.items():
1209
+ # infer the seed from the sequence name, this is reproducible
1210
+ # and makes the selection differ for different sequences
1211
+ seed = _seq_name_to_seed(seq) + self.seed
1212
+ seq_idx_shuffled = random.Random(seed).sample(
1213
+ sorted(seq_indices), len(seq_indices)
1214
+ )
1215
+ keep_idx.extend(seq_idx_shuffled[: self.n_frames_per_sequence])
1216
+
1217
+ logger.info(
1218
+ "... filtered %d -> %d" % (len(self.frame_annots), len(keep_idx))
1219
+ )
1220
+ self.frame_annots = [self.frame_annots[i] for i in keep_idx]
1221
+ self._invalidate_indexes(filter_seq_annots=False)
1222
+ # sequences are not decimated, so self.seq_annots is valid
1223
+
1224
+ if self.limit_to > 0 and self.limit_to < len(self.frame_annots):
1225
+ logger.info(
1226
+ "limit_to: filtered %d -> %d" % (len(self.frame_annots), self.limit_to)
1227
+ )
1228
+ self.frame_annots = self.frame_annots[: self.limit_to]
1229
+ self._invalidate_indexes(filter_seq_annots=True)
1230
+
1231
+ def _invalidate_indexes(self, filter_seq_annots: bool = False) -> None:
1232
+ # update _seq_to_idx and filter seq_meta according to frame_annots change
1233
+ # if filter_seq_annots, also updates seq_annots based on the changed _seq_to_idx
1234
+ self._invalidate_seq_to_idx()
1235
+
1236
+ if filter_seq_annots:
1237
+ # pyre-ignore[16]
1238
+ self.seq_annots = {
1239
+ k: v
1240
+ for k, v in self.seq_annots.items()
1241
+ # pyre-ignore[16]
1242
+ if k in self._seq_to_idx
1243
+ }
1244
+
1245
+ def _invalidate_seq_to_idx(self) -> None:
1246
+ seq_to_idx = defaultdict(list)
1247
+ # pyre-ignore[16]
1248
+ for idx, entry in enumerate(self.frame_annots):
1249
+ seq_to_idx[entry["frame_annotation"].sequence_name].append(idx)
1250
+ # pyre-ignore[16]
1251
+ self._seq_to_idx = seq_to_idx
1252
+
1253
+ def _resize_image(
1254
+ self, image, mode="bilinear"
1255
+ ) -> Tuple[torch.Tensor, float, torch.Tensor]:
1256
+ image_height, image_width = self.image_height, self.image_width
1257
+ if image_height is None or image_width is None:
1258
+ # skip the resizing
1259
+ imre_ = torch.from_numpy(image)
1260
+ return imre_, 1.0, torch.ones_like(imre_[:1])
1261
+ # takes numpy array, returns pytorch tensor
1262
+ minscale = min(
1263
+ image_height / image.shape[-2],
1264
+ image_width / image.shape[-1],
1265
+ )
1266
+ imre = torch.nn.functional.interpolate(
1267
+ torch.from_numpy(image)[None],
1268
+ scale_factor=minscale,
1269
+ mode=mode,
1270
+ align_corners=False if mode == "bilinear" else None,
1271
+ recompute_scale_factor=True,
1272
+ )[0]
1273
+ # pyre-fixme[19]: Expected 1 positional argument.
1274
+ imre_ = torch.zeros(image.shape[0], self.image_height, self.image_width)
1275
+ imre_[:, 0 : imre.shape[1], 0 : imre.shape[2]] = imre
1276
+ # pyre-fixme[6]: For 2nd param expected `int` but got `Optional[int]`.
1277
+ # pyre-fixme[6]: For 3rd param expected `int` but got `Optional[int]`.
1278
+ mask = torch.zeros(1, self.image_height, self.image_width)
1279
+ mask[:, 0 : imre.shape[1], 0 : imre.shape[2]] = 1.0
1280
+ return imre_, minscale, mask
1281
+
1282
+ def _local_path(self, path: str) -> str:
1283
+ if self.path_manager is None:
1284
+ return path
1285
+ return self.path_manager.get_local_path(path)
1286
+
1287
+ def get_frame_numbers_and_timestamps(
1288
+ self, idxs: Sequence[int]
1289
+ ) -> List[Tuple[int, float]]:
1290
+ out: List[Tuple[int, float]] = []
1291
+ for idx in idxs:
1292
+ # pyre-ignore[16]
1293
+ frame_annotation = self.frame_annots[idx]["frame_annotation"]
1294
+ out.append(
1295
+ (frame_annotation.frame_number, frame_annotation.frame_timestamp)
1296
+ )
1297
+ return out
1298
+
1299
+ def get_eval_batches(self) -> Optional[List[List[int]]]:
1300
+ return self.eval_batches
1301
+
1302
+ def _get_frame_type(self, entry: FrameAnnotsEntry) -> Optional[str]:
1303
+ return entry["frame_annotation"].meta["frame_type"]
1304
+
1305
+
1306
+ class CO3DDataset(LightningDataModule):
1307
+ def __init__(
1308
+ self,
1309
+ root_dir,
1310
+ batch_size=2,
1311
+ shuffle=True,
1312
+ num_workers=10,
1313
+ prefetch_factor=2,
1314
+ category="hydrant",
1315
+ **kwargs,
1316
+ ):
1317
+ super().__init__()
1318
+
1319
+ self.batch_size = batch_size
1320
+ self.num_workers = num_workers
1321
+ self.prefetch_factor = prefetch_factor
1322
+ self.shuffle = shuffle
1323
+
1324
+ self.train_dataset = CO3Dv2Wrapper(
1325
+ root_dir=root_dir,
1326
+ stage="train",
1327
+ category=category,
1328
+ **kwargs,
1329
+ )
1330
+
1331
+ self.test_dataset = CO3Dv2Wrapper(
1332
+ root_dir=root_dir,
1333
+ stage="test",
1334
+ subset="fewview_dev",
1335
+ category=category,
1336
+ **kwargs,
1337
+ )
1338
+
1339
+ def train_dataloader(self):
1340
+ return DataLoader(
1341
+ self.train_dataset,
1342
+ batch_size=self.batch_size,
1343
+ shuffle=self.shuffle,
1344
+ num_workers=self.num_workers,
1345
+ prefetch_factor=self.prefetch_factor,
1346
+ collate_fn=self.train_dataset.collate_fn,
1347
+ )
1348
+
1349
+ def test_dataloader(self):
1350
+ return DataLoader(
1351
+ self.test_dataset,
1352
+ batch_size=self.batch_size,
1353
+ shuffle=self.shuffle,
1354
+ num_workers=self.num_workers,
1355
+ prefetch_factor=self.prefetch_factor,
1356
+ collate_fn=self.test_dataset.collate_fn,
1357
+ )
1358
+
1359
+ def val_dataloader(self):
1360
+ return DataLoader(
1361
+ self.test_dataset,
1362
+ batch_size=self.batch_size,
1363
+ shuffle=self.shuffle,
1364
+ num_workers=self.num_workers,
1365
+ prefetch_factor=self.prefetch_factor,
1366
+ collate_fn=video_collate_fn,
1367
+ )
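A minimal usage sketch for the CO3DDataset module added above (not part of the commit): it assumes a local CO3D-v2 download and an installed pytorch3d; the root path, category, and batch size are placeholder values, and the `pixelnerf_input` key only appears when the wrapper is built with `load_pixelnerf` enabled.

# Hypothetical driver code, assuming sgm is importable and /data/co3d holds CO3D-v2.
from sgm.data.co3d import CO3DDataset

datamodule = CO3DDataset(
    root_dir="/data/co3d",   # placeholder dataset root
    category="hydrant",
    batch_size=1,
    num_workers=4,
)
train_loader = datamodule.train_dataloader()
batch = next(iter(train_loader))
# Keys produced by CO3Dv2Wrapper.__getitem__ (exact tensor shapes depend on video_collate_fn):
print(batch.keys())
print(batch["frames"].shape, batch["cond_frames"].shape)
# Present only when load_pixelnerf is enabled: flattened 4x4 c2w (16) + 3x3 intrinsics (9) per frame.
print(batch["pixelnerf_input"]["cameras"].shape)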
sgm/data/colmap.py ADDED
@@ -0,0 +1,605 @@
1
+ # Copyright (c) 2023, ETH Zurich and UNC Chapel Hill.
2
+ # All rights reserved.
3
+ #
4
+ # Redistribution and use in source and binary forms, with or without
5
+ # modification, are permitted provided that the following conditions are met:
6
+ #
7
+ # * Redistributions of source code must retain the above copyright
8
+ # notice, this list of conditions and the following disclaimer.
9
+ #
10
+ # * Redistributions in binary form must reproduce the above copyright
11
+ # notice, this list of conditions and the following disclaimer in the
12
+ # documentation and/or other materials provided with the distribution.
13
+ #
14
+ # * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of
15
+ # its contributors may be used to endorse or promote products derived
16
+ # from this software without specific prior written permission.
17
+ #
18
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
22
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28
+ # POSSIBILITY OF SUCH DAMAGE.
29
+
30
+
31
+ import os
32
+ import collections
33
+ import numpy as np
34
+ import struct
35
+ import argparse
36
+
37
+
38
+ CameraModel = collections.namedtuple(
39
+ "CameraModel", ["model_id", "model_name", "num_params"]
40
+ )
41
+ Camera = collections.namedtuple(
42
+ "Camera", ["id", "model", "width", "height", "params"]
43
+ )
44
+ BaseImage = collections.namedtuple(
45
+ "Image", ["id", "qvec", "tvec", "camera_id", "name", "xys", "point3D_ids"]
46
+ )
47
+ Point3D = collections.namedtuple(
48
+ "Point3D", ["id", "xyz", "rgb", "error", "image_ids", "point2D_idxs"]
49
+ )
50
+
51
+
52
+ class Image(BaseImage):
53
+ def qvec2rotmat(self):
54
+ return qvec2rotmat(self.qvec)
55
+
56
+
57
+ CAMERA_MODELS = {
58
+ CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3),
59
+ CameraModel(model_id=1, model_name="PINHOLE", num_params=4),
60
+ CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4),
61
+ CameraModel(model_id=3, model_name="RADIAL", num_params=5),
62
+ CameraModel(model_id=4, model_name="OPENCV", num_params=8),
63
+ CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8),
64
+ CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12),
65
+ CameraModel(model_id=7, model_name="FOV", num_params=5),
66
+ CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4),
67
+ CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5),
68
+ CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12),
69
+ }
70
+ CAMERA_MODEL_IDS = dict(
71
+ [(camera_model.model_id, camera_model) for camera_model in CAMERA_MODELS]
72
+ )
73
+ CAMERA_MODEL_NAMES = dict(
74
+ [(camera_model.model_name, camera_model) for camera_model in CAMERA_MODELS]
75
+ )
76
+
77
+
78
+ def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"):
79
+ """Read and unpack the next bytes from a binary file.
80
+ :param fid:
81
+ :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc.
82
+ :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
83
+ :param endian_character: Any of {@, =, <, >, !}
84
+ :return: Tuple of read and unpacked values.
85
+ """
86
+ data = fid.read(num_bytes)
87
+ return struct.unpack(endian_character + format_char_sequence, data)
88
+
89
+
90
+ def write_next_bytes(fid, data, format_char_sequence, endian_character="<"):
91
+ """pack and write to a binary file.
92
+ :param fid:
93
+ :param data: data to send, if multiple elements are sent at the same time,
94
+ they should be encapsulated either in a list or a tuple
95
+ :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
96
+ should be the same length as the data list or tuple
97
+ :param endian_character: Any of {@, =, <, >, !}
98
+ """
99
+ if isinstance(data, (list, tuple)):
100
+ bytes = struct.pack(endian_character + format_char_sequence, *data)
101
+ else:
102
+ bytes = struct.pack(endian_character + format_char_sequence, data)
103
+ fid.write(bytes)
104
+
105
+
106
+ def read_cameras_text(path):
107
+ """
108
+ see: src/colmap/scene/reconstruction.cc
109
+ void Reconstruction::WriteCamerasText(const std::string& path)
110
+ void Reconstruction::ReadCamerasText(const std::string& path)
111
+ """
112
+ cameras = {}
113
+ with open(path, "r") as fid:
114
+ while True:
115
+ line = fid.readline()
116
+ if not line:
117
+ break
118
+ line = line.strip()
119
+ if len(line) > 0 and line[0] != "#":
120
+ elems = line.split()
121
+ camera_id = int(elems[0])
122
+ model = elems[1]
123
+ width = int(elems[2])
124
+ height = int(elems[3])
125
+ params = np.array(tuple(map(float, elems[4:])))
126
+ cameras[camera_id] = Camera(
127
+ id=camera_id,
128
+ model=model,
129
+ width=width,
130
+ height=height,
131
+ params=params,
132
+ )
133
+ return cameras
134
+
135
+
136
+ def read_cameras_binary(path_to_model_file):
137
+ """
138
+ see: src/colmap/scene/reconstruction.cc
139
+ void Reconstruction::WriteCamerasBinary(const std::string& path)
140
+ void Reconstruction::ReadCamerasBinary(const std::string& path)
141
+ """
142
+ cameras = {}
143
+ with open(path_to_model_file, "rb") as fid:
144
+ num_cameras = read_next_bytes(fid, 8, "Q")[0]
145
+ for _ in range(num_cameras):
146
+ camera_properties = read_next_bytes(
147
+ fid, num_bytes=24, format_char_sequence="iiQQ"
148
+ )
149
+ camera_id = camera_properties[0]
150
+ model_id = camera_properties[1]
151
+ model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name
152
+ width = camera_properties[2]
153
+ height = camera_properties[3]
154
+ num_params = CAMERA_MODEL_IDS[model_id].num_params
155
+ params = read_next_bytes(
156
+ fid,
157
+ num_bytes=8 * num_params,
158
+ format_char_sequence="d" * num_params,
159
+ )
160
+ cameras[camera_id] = Camera(
161
+ id=camera_id,
162
+ model=model_name,
163
+ width=width,
164
+ height=height,
165
+ params=np.array(params),
166
+ )
167
+ assert len(cameras) == num_cameras
168
+ return cameras
169
+
170
+
171
+ def write_cameras_text(cameras, path):
172
+ """
173
+ see: src/colmap/scene/reconstruction.cc
174
+ void Reconstruction::WriteCamerasText(const std::string& path)
175
+ void Reconstruction::ReadCamerasText(const std::string& path)
176
+ """
177
+ HEADER = (
178
+ "# Camera list with one line of data per camera:\n"
179
+ + "# CAMERA_ID, MODEL, WIDTH, HEIGHT, PARAMS[]\n"
180
+ + "# Number of cameras: {}\n".format(len(cameras))
181
+ )
182
+ with open(path, "w") as fid:
183
+ fid.write(HEADER)
184
+ for _, cam in cameras.items():
185
+ to_write = [cam.id, cam.model, cam.width, cam.height, *cam.params]
186
+ line = " ".join([str(elem) for elem in to_write])
187
+ fid.write(line + "\n")
188
+
189
+
190
+ def write_cameras_binary(cameras, path_to_model_file):
191
+ """
192
+ see: src/colmap/scene/reconstruction.cc
193
+ void Reconstruction::WriteCamerasBinary(const std::string& path)
194
+ void Reconstruction::ReadCamerasBinary(const std::string& path)
195
+ """
196
+ with open(path_to_model_file, "wb") as fid:
197
+ write_next_bytes(fid, len(cameras), "Q")
198
+ for _, cam in cameras.items():
199
+ model_id = CAMERA_MODEL_NAMES[cam.model].model_id
200
+ camera_properties = [cam.id, model_id, cam.width, cam.height]
201
+ write_next_bytes(fid, camera_properties, "iiQQ")
202
+ for p in cam.params:
203
+ write_next_bytes(fid, float(p), "d")
204
+ return cameras
205
+
206
+
207
+ def read_images_text(path):
208
+ """
209
+ see: src/colmap/scene/reconstruction.cc
210
+ void Reconstruction::ReadImagesText(const std::string& path)
211
+ void Reconstruction::WriteImagesText(const std::string& path)
212
+ """
213
+ images = {}
214
+ with open(path, "r") as fid:
215
+ while True:
216
+ line = fid.readline()
217
+ if not line:
218
+ break
219
+ line = line.strip()
220
+ if len(line) > 0 and line[0] != "#":
221
+ elems = line.split()
222
+ image_id = int(elems[0])
223
+ qvec = np.array(tuple(map(float, elems[1:5])))
224
+ tvec = np.array(tuple(map(float, elems[5:8])))
225
+ camera_id = int(elems[8])
226
+ image_name = elems[9]
227
+ elems = fid.readline().split()
228
+ xys = np.column_stack(
229
+ [
230
+ tuple(map(float, elems[0::3])),
231
+ tuple(map(float, elems[1::3])),
232
+ ]
233
+ )
234
+ point3D_ids = np.array(tuple(map(int, elems[2::3])))
235
+ images[image_id] = Image(
236
+ id=image_id,
237
+ qvec=qvec,
238
+ tvec=tvec,
239
+ camera_id=camera_id,
240
+ name=image_name,
241
+ xys=xys,
242
+ point3D_ids=point3D_ids,
243
+ )
244
+ return images
245
+
246
+
247
+ def read_images_binary(path_to_model_file):
248
+ """
249
+ see: src/colmap/scene/reconstruction.cc
250
+ void Reconstruction::ReadImagesBinary(const std::string& path)
251
+ void Reconstruction::WriteImagesBinary(const std::string& path)
252
+ """
253
+ images = {}
254
+ with open(path_to_model_file, "rb") as fid:
255
+ num_reg_images = read_next_bytes(fid, 8, "Q")[0]
256
+ for _ in range(num_reg_images):
257
+ binary_image_properties = read_next_bytes(
258
+ fid, num_bytes=64, format_char_sequence="idddddddi"
259
+ )
260
+ image_id = binary_image_properties[0]
261
+ qvec = np.array(binary_image_properties[1:5])
262
+ tvec = np.array(binary_image_properties[5:8])
263
+ camera_id = binary_image_properties[8]
264
+ binary_image_name = b""
265
+ current_char = read_next_bytes(fid, 1, "c")[0]
266
+ while current_char != b"\x00": # look for the ASCII 0 entry
267
+ binary_image_name += current_char
268
+ current_char = read_next_bytes(fid, 1, "c")[0]
269
+ image_name = binary_image_name.decode("utf-8")
270
+ num_points2D = read_next_bytes(
271
+ fid, num_bytes=8, format_char_sequence="Q"
272
+ )[0]
273
+ x_y_id_s = read_next_bytes(
274
+ fid,
275
+ num_bytes=24 * num_points2D,
276
+ format_char_sequence="ddq" * num_points2D,
277
+ )
278
+ xys = np.column_stack(
279
+ [
280
+ tuple(map(float, x_y_id_s[0::3])),
281
+ tuple(map(float, x_y_id_s[1::3])),
282
+ ]
283
+ )
284
+ point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3])))
285
+ images[image_id] = Image(
286
+ id=image_id,
287
+ qvec=qvec,
288
+ tvec=tvec,
289
+ camera_id=camera_id,
290
+ name=image_name,
291
+ xys=xys,
292
+ point3D_ids=point3D_ids,
293
+ )
294
+ return images
295
+
296
+
297
+ def write_images_text(images, path):
298
+ """
299
+ see: src/colmap/scene/reconstruction.cc
300
+ void Reconstruction::ReadImagesText(const std::string& path)
301
+ void Reconstruction::WriteImagesText(const std::string& path)
302
+ """
303
+ if len(images) == 0:
304
+ mean_observations = 0
305
+ else:
306
+ mean_observations = sum(
307
+ (len(img.point3D_ids) for _, img in images.items())
308
+ ) / len(images)
309
+ HEADER = (
310
+ "# Image list with two lines of data per image:\n"
311
+ + "# IMAGE_ID, QW, QX, QY, QZ, TX, TY, TZ, CAMERA_ID, NAME\n"
312
+ + "# POINTS2D[] as (X, Y, POINT3D_ID)\n"
313
+ + "# Number of images: {}, mean observations per image: {}\n".format(
314
+ len(images), mean_observations
315
+ )
316
+ )
317
+
318
+ with open(path, "w") as fid:
319
+ fid.write(HEADER)
320
+ for _, img in images.items():
321
+ image_header = [
322
+ img.id,
323
+ *img.qvec,
324
+ *img.tvec,
325
+ img.camera_id,
326
+ img.name,
327
+ ]
328
+ first_line = " ".join(map(str, image_header))
329
+ fid.write(first_line + "\n")
330
+
331
+ points_strings = []
332
+ for xy, point3D_id in zip(img.xys, img.point3D_ids):
333
+ points_strings.append(" ".join(map(str, [*xy, point3D_id])))
334
+ fid.write(" ".join(points_strings) + "\n")
335
+
336
+
337
+ def write_images_binary(images, path_to_model_file):
338
+ """
339
+ see: src/colmap/scene/reconstruction.cc
340
+ void Reconstruction::ReadImagesBinary(const std::string& path)
341
+ void Reconstruction::WriteImagesBinary(const std::string& path)
342
+ """
343
+ with open(path_to_model_file, "wb") as fid:
344
+ write_next_bytes(fid, len(images), "Q")
345
+ for _, img in images.items():
346
+ write_next_bytes(fid, img.id, "i")
347
+ write_next_bytes(fid, img.qvec.tolist(), "dddd")
348
+ write_next_bytes(fid, img.tvec.tolist(), "ddd")
349
+ write_next_bytes(fid, img.camera_id, "i")
350
+ for char in img.name:
351
+ write_next_bytes(fid, char.encode("utf-8"), "c")
352
+ write_next_bytes(fid, b"\x00", "c")
353
+ write_next_bytes(fid, len(img.point3D_ids), "Q")
354
+ for xy, p3d_id in zip(img.xys, img.point3D_ids):
355
+ write_next_bytes(fid, [*xy, p3d_id], "ddq")
356
+
357
+
358
+ def read_points3D_text(path):
359
+ """
360
+ see: src/colmap/scene/reconstruction.cc
361
+ void Reconstruction::ReadPoints3DText(const std::string& path)
362
+ void Reconstruction::WritePoints3DText(const std::string& path)
363
+ """
364
+ points3D = {}
365
+ with open(path, "r") as fid:
366
+ while True:
367
+ line = fid.readline()
368
+ if not line:
369
+ break
370
+ line = line.strip()
371
+ if len(line) > 0 and line[0] != "#":
372
+ elems = line.split()
373
+ point3D_id = int(elems[0])
374
+ xyz = np.array(tuple(map(float, elems[1:4])))
375
+ rgb = np.array(tuple(map(int, elems[4:7])))
376
+ error = float(elems[7])
377
+ image_ids = np.array(tuple(map(int, elems[8::2])))
378
+ point2D_idxs = np.array(tuple(map(int, elems[9::2])))
379
+ points3D[point3D_id] = Point3D(
380
+ id=point3D_id,
381
+ xyz=xyz,
382
+ rgb=rgb,
383
+ error=error,
384
+ image_ids=image_ids,
385
+ point2D_idxs=point2D_idxs,
386
+ )
387
+ return points3D
388
+
389
+
390
+ def read_points3D_binary(path_to_model_file):
391
+ """
392
+ see: src/colmap/scene/reconstruction.cc
393
+ void Reconstruction::ReadPoints3DBinary(const std::string& path)
394
+ void Reconstruction::WritePoints3DBinary(const std::string& path)
395
+ """
396
+ points3D = {}
397
+ with open(path_to_model_file, "rb") as fid:
398
+ num_points = read_next_bytes(fid, 8, "Q")[0]
399
+ for _ in range(num_points):
400
+ binary_point_line_properties = read_next_bytes(
401
+ fid, num_bytes=43, format_char_sequence="QdddBBBd"
402
+ )
403
+ point3D_id = binary_point_line_properties[0]
404
+ xyz = np.array(binary_point_line_properties[1:4])
405
+ rgb = np.array(binary_point_line_properties[4:7])
406
+ error = np.array(binary_point_line_properties[7])
407
+ track_length = read_next_bytes(
408
+ fid, num_bytes=8, format_char_sequence="Q"
409
+ )[0]
410
+ track_elems = read_next_bytes(
411
+ fid,
412
+ num_bytes=8 * track_length,
413
+ format_char_sequence="ii" * track_length,
414
+ )
415
+ image_ids = np.array(tuple(map(int, track_elems[0::2])))
416
+ point2D_idxs = np.array(tuple(map(int, track_elems[1::2])))
417
+ points3D[point3D_id] = Point3D(
418
+ id=point3D_id,
419
+ xyz=xyz,
420
+ rgb=rgb,
421
+ error=error,
422
+ image_ids=image_ids,
423
+ point2D_idxs=point2D_idxs,
424
+ )
425
+ return points3D
426
+
427
+
428
+ def write_points3D_text(points3D, path):
429
+ """
430
+ see: src/colmap/scene/reconstruction.cc
431
+ void Reconstruction::ReadPoints3DText(const std::string& path)
432
+ void Reconstruction::WritePoints3DText(const std::string& path)
433
+ """
434
+ if len(points3D) == 0:
435
+ mean_track_length = 0
436
+ else:
437
+ mean_track_length = sum(
438
+ (len(pt.image_ids) for _, pt in points3D.items())
439
+ ) / len(points3D)
440
+ HEADER = (
441
+ "# 3D point list with one line of data per point:\n"
442
+ + "# POINT3D_ID, X, Y, Z, R, G, B, ERROR, TRACK[] as (IMAGE_ID, POINT2D_IDX)\n"
443
+ + "# Number of points: {}, mean track length: {}\n".format(
444
+ len(points3D), mean_track_length
445
+ )
446
+ )
447
+
448
+ with open(path, "w") as fid:
449
+ fid.write(HEADER)
450
+ for _, pt in points3D.items():
451
+ point_header = [pt.id, *pt.xyz, *pt.rgb, pt.error]
452
+ fid.write(" ".join(map(str, point_header)) + " ")
453
+ track_strings = []
454
+ for image_id, point2D in zip(pt.image_ids, pt.point2D_idxs):
455
+ track_strings.append(" ".join(map(str, [image_id, point2D])))
456
+ fid.write(" ".join(track_strings) + "\n")
457
+
458
+
459
+ def write_points3D_binary(points3D, path_to_model_file):
460
+ """
461
+ see: src/colmap/scene/reconstruction.cc
462
+ void Reconstruction::ReadPoints3DBinary(const std::string& path)
463
+ void Reconstruction::WritePoints3DBinary(const std::string& path)
464
+ """
465
+ with open(path_to_model_file, "wb") as fid:
466
+ write_next_bytes(fid, len(points3D), "Q")
467
+ for _, pt in points3D.items():
468
+ write_next_bytes(fid, pt.id, "Q")
469
+ write_next_bytes(fid, pt.xyz.tolist(), "ddd")
470
+ write_next_bytes(fid, pt.rgb.tolist(), "BBB")
471
+ write_next_bytes(fid, pt.error, "d")
472
+ track_length = pt.image_ids.shape[0]
473
+ write_next_bytes(fid, track_length, "Q")
474
+ for image_id, point2D_id in zip(pt.image_ids, pt.point2D_idxs):
475
+ write_next_bytes(fid, [image_id, point2D_id], "ii")
476
+
477
+
478
+ def detect_model_format(path, ext):
479
+ if (
480
+ os.path.isfile(os.path.join(path, "cameras" + ext))
481
+ and os.path.isfile(os.path.join(path, "images" + ext))
482
+ and os.path.isfile(os.path.join(path, "points3D" + ext))
483
+ ):
484
+ print("Detected model format: '" + ext + "'")
485
+ return True
486
+
487
+ return False
488
+
489
+
490
+ def read_model(path, ext=""):
491
+ # try to detect the extension automatically
492
+ if ext == "":
493
+ if detect_model_format(path, ".bin"):
494
+ ext = ".bin"
495
+ elif detect_model_format(path, ".txt"):
496
+ ext = ".txt"
497
+ else:
498
+ print("Provide model format: '.bin' or '.txt'")
499
+ return
500
+
501
+ if ext == ".txt":
502
+ cameras = read_cameras_text(os.path.join(path, "cameras" + ext))
503
+ images = read_images_text(os.path.join(path, "images" + ext))
504
+ points3D = read_points3D_text(os.path.join(path, "points3D") + ext)
505
+ else:
506
+ cameras = read_cameras_binary(os.path.join(path, "cameras" + ext))
507
+ images = read_images_binary(os.path.join(path, "images" + ext))
508
+ points3D = read_points3D_binary(os.path.join(path, "points3D") + ext)
509
+ return cameras, images, points3D
510
+
511
+
512
+ def write_model(cameras, images, points3D, path, ext=".bin"):
513
+ if ext == ".txt":
514
+ write_cameras_text(cameras, os.path.join(path, "cameras" + ext))
515
+ write_images_text(images, os.path.join(path, "images" + ext))
516
+ write_points3D_text(points3D, os.path.join(path, "points3D") + ext)
517
+ else:
518
+ write_cameras_binary(cameras, os.path.join(path, "cameras" + ext))
519
+ write_images_binary(images, os.path.join(path, "images" + ext))
520
+ write_points3D_binary(points3D, os.path.join(path, "points3D") + ext)
521
+ return cameras, images, points3D
522
+
523
+
524
+ def qvec2rotmat(qvec):
525
+ return np.array(
526
+ [
527
+ [
528
+ 1 - 2 * qvec[2] ** 2 - 2 * qvec[3] ** 2,
529
+ 2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3],
530
+ 2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2],
531
+ ],
532
+ [
533
+ 2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3],
534
+ 1 - 2 * qvec[1] ** 2 - 2 * qvec[3] ** 2,
535
+ 2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1],
536
+ ],
537
+ [
538
+ 2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2],
539
+ 2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1],
540
+ 1 - 2 * qvec[1] ** 2 - 2 * qvec[2] ** 2,
541
+ ],
542
+ ]
543
+ )
544
+
545
+
546
+ def rotmat2qvec(R):
547
+ Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat
548
+ K = (
549
+ np.array(
550
+ [
551
+ [Rxx - Ryy - Rzz, 0, 0, 0],
552
+ [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0],
553
+ [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0],
554
+ [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz],
555
+ ]
556
+ )
557
+ / 3.0
558
+ )
559
+ eigvals, eigvecs = np.linalg.eigh(K)
560
+ qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)]
561
+ if qvec[0] < 0:
562
+ qvec *= -1
563
+ return qvec
564
+
565
+
566
+ def main():
567
+ parser = argparse.ArgumentParser(
568
+ description="Read and write COLMAP binary and text models"
569
+ )
570
+ parser.add_argument("--input_model", help="path to input model folder")
571
+ parser.add_argument(
572
+ "--input_format",
573
+ choices=[".bin", ".txt"],
574
+ help="input model format",
575
+ default="",
576
+ )
577
+ parser.add_argument("--output_model", help="path to output model folder")
578
+ parser.add_argument(
579
+ "--output_format",
580
+ choices=[".bin", ".txt"],
581
+ help="output model format",
582
+ default=".txt",
583
+ )
584
+ args = parser.parse_args()
585
+
586
+ cameras, images, points3D = read_model(
587
+ path=args.input_model, ext=args.input_format
588
+ )
589
+
590
+ print("num_cameras:", len(cameras))
591
+ print("num_images:", len(images))
592
+ print("num_points3D:", len(points3D))
593
+
594
+ if args.output_model is not None:
595
+ write_model(
596
+ cameras,
597
+ images,
598
+ points3D,
599
+ path=args.output_model,
600
+ ext=args.output_format,
601
+ )
602
+
603
+
604
+ if __name__ == "__main__":
605
+ main()
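For reference, a short sketch (not in the commit) of how the COLMAP readers above are typically combined to recover camera-to-world poses; "sparse/0" is a placeholder reconstruction path, and the inversion follows the standard COLMAP convention in which qvec/tvec store the world-to-camera transform.

# Hypothetical usage of the COLMAP model readers defined above.
import numpy as np
from sgm.data.colmap import read_model, qvec2rotmat

cameras, images, points3D = read_model("sparse/0")  # auto-detects .bin vs .txt

c2ws = {}
for image_id, img in images.items():
    w2c = np.eye(4)
    w2c[:3, :3] = qvec2rotmat(img.qvec)  # world-to-camera rotation
    w2c[:3, 3] = img.tvec                # world-to-camera translation
    c2ws[img.name] = np.linalg.inv(w2c)  # camera-to-world pose

print(len(cameras), "cameras,", len(images), "images,", len(points3D), "points")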
sgm/data/dataset.py ADDED
@@ -0,0 +1,80 @@
1
+ from typing import Optional
2
+
3
+ import torchdata.datapipes.iter
4
+ import webdataset as wds
5
+ from omegaconf import DictConfig
6
+ from pytorch_lightning import LightningDataModule
7
+
8
+ try:
9
+ from sdata import create_dataset, create_dummy_dataset, create_loader
10
+ except ImportError as e:
11
+ print("#" * 100)
12
+ print("Datasets not yet available")
13
+ print("to enable, we need to add stable-datasets as a submodule")
14
+ print("please use ``git submodule update --init --recursive``")
15
+ print("and do ``pip install -e stable-datasets/`` from the root of this repo")
16
+ print("#" * 100)
17
+ exit(1)
18
+
19
+
20
+ class StableDataModuleFromConfig(LightningDataModule):
21
+ def __init__(
22
+ self,
23
+ train: DictConfig,
24
+ validation: Optional[DictConfig] = None,
25
+ test: Optional[DictConfig] = None,
26
+ skip_val_loader: bool = False,
27
+ dummy: bool = False,
28
+ ):
29
+ super().__init__()
30
+ self.train_config = train
31
+ assert (
32
+ "datapipeline" in self.train_config and "loader" in self.train_config
33
+ ), "train config requires the fields `datapipeline` and `loader`"
34
+
35
+ self.val_config = validation
36
+ if not skip_val_loader:
37
+ if self.val_config is not None:
38
+ assert (
39
+ "datapipeline" in self.val_config and "loader" in self.val_config
40
+ ), "validation config requires the fields `datapipeline` and `loader`"
41
+ else:
42
+ print(
43
+ "Warning: No Validation datapipeline defined, using that one from training"
44
+ )
45
+ self.val_config = train
46
+
47
+ self.test_config = test
48
+ if self.test_config is not None:
49
+ assert (
50
+ "datapipeline" in self.test_config and "loader" in self.test_config
51
+ ), "test config requires the fields `datapipeline` and `loader`"
52
+
53
+ self.dummy = dummy
54
+ if self.dummy:
55
+ print("#" * 100)
56
+ print("USING DUMMY DATASET: HOPE YOU'RE DEBUGGING ;)")
57
+ print("#" * 100)
58
+
59
+ def setup(self, stage: str) -> None:
60
+ print("Preparing datasets")
61
+ if self.dummy:
62
+ data_fn = create_dummy_dataset
63
+ else:
64
+ data_fn = create_dataset
65
+
66
+ self.train_datapipeline = data_fn(**self.train_config.datapipeline)
67
+ if self.val_config:
68
+ self.val_datapipeline = data_fn(**self.val_config.datapipeline)
69
+ if self.test_config:
70
+ self.test_datapipeline = data_fn(**self.test_config.datapipeline)
71
+
72
+ def train_dataloader(self) -> torchdata.datapipes.iter.IterDataPipe:
73
+ loader = create_loader(self.train_datapipeline, **self.train_config.loader)
74
+ return loader
75
+
76
+ def val_dataloader(self) -> wds.DataPipeline:
77
+ return create_loader(self.val_datapipeline, **self.val_config.loader)
78
+
79
+ def test_dataloader(self) -> wds.DataPipeline:
80
+ return create_loader(self.test_datapipeline, **self.test_config.loader)
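A hedged sketch (not in the commit) of the config shape StableDataModuleFromConfig expects: the class only checks that `datapipeline` and `loader` keys exist, while the inner fields are defined by the optional stable-datasets submodule, so the values below are purely illustrative.

# Illustrative only; importing sgm.data.dataset exits early unless the
# stable-datasets submodule mentioned in the import guard above is installed.
from omegaconf import OmegaConf
from sgm.data.dataset import StableDataModuleFromConfig

train_cfg = OmegaConf.create(
    {
        "datapipeline": {"urls": ["/data/shards/{00000..00099}.tar"]},  # placeholder fields
        "loader": {"batch_size": 4, "num_workers": 2},
    }
)
dm = StableDataModuleFromConfig(train=train_cfg, dummy=True)  # dummy=True for debugging
dm.setup("fit")
train_loader = dm.train_dataloader()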
sgm/data/joint3d.py ADDED
@@ -0,0 +1,10 @@
1
+ import torch
2
+ from torch.utils.data import Dataset
3
+
4
+ default_sub_data_config = {}
5
+
6
+
7
+ class Joint3D(Dataset):
8
+ def __init__(self, sub_data_config: dict) -> None:
9
+ super().__init__()
10
+ self.sub_data_config = sub_data_config
sgm/data/json_index_dataset.py ADDED
@@ -0,0 +1,1080 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import copy
8
+ import functools
9
+ import gzip
10
+ import hashlib
11
+ import json
12
+ import logging
13
+ import os
14
+ import random
15
+ import warnings
16
+ from collections import defaultdict
17
+ from itertools import islice
18
+ from pathlib import Path
19
+ from typing import (
20
+ Any,
21
+ ClassVar,
22
+ Dict,
23
+ Iterable,
24
+ List,
25
+ Optional,
26
+ Sequence,
27
+ Tuple,
28
+ Type,
29
+ TYPE_CHECKING,
30
+ Union,
31
+ )
32
+
33
+ import numpy as np
34
+ import torch
35
+ from PIL import Image
36
+ from pytorch3d.implicitron.tools.config import registry, ReplaceableBase
37
+ from pytorch3d.io import IO
38
+ from pytorch3d.renderer.camera_utils import join_cameras_as_batch
39
+ from pytorch3d.renderer.cameras import CamerasBase, PerspectiveCameras
40
+ from pytorch3d.structures.pointclouds import Pointclouds
41
+ from tqdm import tqdm
42
+
43
+ from pytorch3d.implicitron.dataset import types
44
+ from pytorch3d.implicitron.dataset.dataset_base import DatasetBase, FrameData
45
+ from pytorch3d.implicitron.dataset.utils import is_known_frame_scalar
46
+
47
+
48
+ logger = logging.getLogger(__name__)
49
+
50
+
51
+ if TYPE_CHECKING:
52
+ from typing import TypedDict
53
+
54
+ class FrameAnnotsEntry(TypedDict):
55
+ subset: Optional[str]
56
+ frame_annotation: types.FrameAnnotation
57
+
58
+ else:
59
+ FrameAnnotsEntry = dict
60
+
61
+
62
+ @registry.register
63
+ class JsonIndexDataset(DatasetBase, ReplaceableBase):
64
+ """
65
+ A dataset with annotations in json files like the Common Objects in 3D
66
+ (CO3D) dataset.
67
+
68
+ Args:
69
+ frame_annotations_file: A zipped json file containing metadata of the
70
+ frames in the dataset, serialized List[types.FrameAnnotation].
71
+ sequence_annotations_file: A zipped json file containing metadata of the
72
+ sequences in the dataset, serialized List[types.SequenceAnnotation].
73
+ subset_lists_file: A json file containing the lists of frames corresponding
74
+ to different subsets (e.g. train/val/test) of the dataset;
75
+ format: {subset: (sequence_name, frame_id, file_path)}.
76
+ subsets: Restrict frames/sequences only to the given list of subsets
77
+ as defined in subset_lists_file (see above).
78
+ limit_to: Limit the dataset to the first #limit_to frames (after other
79
+ filters have been applied).
80
+ limit_sequences_to: Limit the dataset to the first
81
+ #limit_sequences_to sequences (after other sequence filters have been
82
+ applied but before frame-based filters).
83
+ pick_sequence: A list of sequence names to restrict the dataset to.
84
+ exclude_sequence: A list of the names of the sequences to exclude.
85
+ limit_category_to: Restrict the dataset to the given list of categories.
86
+ dataset_root: The root folder of the dataset; all the paths in jsons are
87
+ specified relative to this root (but not json paths themselves).
88
+ load_images: Enable loading the frame RGB data.
89
+ load_depths: Enable loading the frame depth maps.
90
+ load_depth_masks: Enable loading the frame depth map masks denoting the
91
+ depth values used for evaluation (the points consistent across views).
92
+ load_masks: Enable loading frame foreground masks.
93
+ load_point_clouds: Enable loading sequence-level point clouds.
94
+ max_points: Cap on the number of loaded points in the point cloud;
95
+ if reached, they are randomly sampled without replacement.
96
+ mask_images: Whether to mask the images with the loaded foreground masks;
97
+ 0 value is used for background.
98
+ mask_depths: Whether to mask the depth maps with the loaded foreground
99
+ masks; 0 value is used for background.
100
+ image_height: The height of the returned images, masks, and depth maps;
101
+ aspect ratio is preserved during cropping/resizing.
102
+ image_width: The width of the returned images, masks, and depth maps;
103
+ aspect ratio is preserved during cropping/resizing.
104
+ box_crop: Enable cropping of the image around the bounding box inferred
105
+ from the foreground region of the loaded segmentation mask; masks
106
+ and depth maps are cropped accordingly; cameras are corrected.
107
+ box_crop_mask_thr: The threshold used to separate pixels into foreground
108
+ and background based on the foreground_probability mask; if no value
109
+ is greater than this threshold, the loader lowers it and repeats.
110
+ box_crop_context: The amount of additional padding added to each
111
+ dimension of the cropping bounding box, relative to box size.
112
+ remove_empty_masks: Removes the frames with no active foreground pixels
113
+ in the segmentation mask after thresholding (see box_crop_mask_thr).
114
+ n_frames_per_sequence: If > 0, randomly samples #n_frames_per_sequence
115
+ frames in each sequences uniformly without replacement if it has
116
+ more frames than that; applied before other frame-level filters.
117
+ seed: The seed of the random generator sampling #n_frames_per_sequence
118
+ random frames per sequence.
119
+ sort_frames: Enable frame annotations sorting to group frames from the
120
+ same sequences together and order them by timestamps.
121
+ eval_batches: A list of batches that form the evaluation set;
122
+ list of batch-sized lists of indices corresponding to __getitem__
123
+ of this class, thus it can be used directly as a batch sampler.
124
+ eval_batch_index:
125
+ ( Optional[List[List[Union[Tuple[str, int, str], Tuple[str, int]]]]] )
126
+ A list of batches of frames described as (sequence_name, frame_idx)
127
+ that can form the evaluation set, `eval_batches` will be set from this.
128
+
129
+ """
130
+
131
+ frame_annotations_type: ClassVar[
132
+ Type[types.FrameAnnotation]
133
+ ] = types.FrameAnnotation
134
+
135
+ path_manager: Any = None
136
+ frame_annotations_file: str = ""
137
+ sequence_annotations_file: str = ""
138
+ subset_lists_file: str = ""
139
+ subsets: Optional[List[str]] = None
140
+ limit_to: int = 0
141
+ limit_sequences_to: int = 0
142
+ pick_sequence: Tuple[str, ...] = ()
143
+ exclude_sequence: Tuple[str, ...] = ()
144
+ limit_category_to: Tuple[int, ...] = ()
145
+ dataset_root: str = ""
146
+ load_images: bool = True
147
+ load_depths: bool = True
148
+ load_depth_masks: bool = True
149
+ load_masks: bool = True
150
+ load_point_clouds: bool = False
151
+ max_points: int = 0
152
+ mask_images: bool = False
153
+ mask_depths: bool = False
154
+ image_height: Optional[int] = 800
155
+ image_width: Optional[int] = 800
156
+ box_crop: bool = True
157
+ box_crop_mask_thr: float = 0.4
158
+ box_crop_context: float = 0.3
159
+ remove_empty_masks: bool = True
160
+ n_frames_per_sequence: int = -1
161
+ seed: int = 0
162
+ sort_frames: bool = False
163
+ eval_batches: Any = None
164
+ eval_batch_index: Any = None
165
+ # frame_annots: List[FrameAnnotsEntry] = field(init=False)
166
+ # seq_annots: Dict[str, types.SequenceAnnotation] = field(init=False)
167
+
168
+ def __post_init__(self) -> None:
169
+ # pyre-fixme[16]: `JsonIndexDataset` has no attribute `subset_to_image_path`.
170
+ self.subset_to_image_path = None
171
+ self._load_frames()
172
+ self._load_sequences()
173
+ if self.sort_frames:
174
+ self._sort_frames()
175
+ self._load_subset_lists()
176
+ self._filter_db() # also computes sequence indices
177
+ self._extract_and_set_eval_batches()
178
+ logger.info(str(self))
179
+
180
+ def _extract_and_set_eval_batches(self):
181
+ """
182
+ Sets eval_batches based on input eval_batch_index.
183
+ """
184
+ if self.eval_batch_index is not None:
185
+ if self.eval_batches is not None:
186
+ raise ValueError(
187
+ "Cannot define both eval_batch_index and eval_batches."
188
+ )
189
+ self.eval_batches = self.seq_frame_index_to_dataset_index(
190
+ self.eval_batch_index
191
+ )
192
+
193
+ def join(self, other_datasets: Iterable[DatasetBase]) -> None:
194
+ """
195
+ Join the dataset with other JsonIndexDataset objects.
196
+
197
+ Args:
198
+ other_datasets: A list of JsonIndexDataset objects to be joined
199
+ into the current dataset.
200
+ """
201
+ if not all(isinstance(d, JsonIndexDataset) for d in other_datasets):
202
+ raise ValueError("This function can only join a list of JsonIndexDataset")
203
+ # pyre-ignore[16]
204
+ self.frame_annots.extend([fa for d in other_datasets for fa in d.frame_annots])
205
+ # pyre-ignore[16]
206
+ self.seq_annots.update(
207
+ # https://gist.github.com/treyhunner/f35292e676efa0be1728
208
+ functools.reduce(
209
+ lambda a, b: {**a, **b},
210
+ [d.seq_annots for d in other_datasets], # pyre-ignore[16]
211
+ )
212
+ )
213
+ all_eval_batches = [
214
+ self.eval_batches,
215
+ # pyre-ignore
216
+ *[d.eval_batches for d in other_datasets],
217
+ ]
218
+ if not (
219
+ all(ba is None for ba in all_eval_batches)
220
+ or all(ba is not None for ba in all_eval_batches)
221
+ ):
222
+ raise ValueError(
223
+ "When joining datasets, either all joined datasets have to have their"
224
+ " eval_batches defined, or all should have their eval batches undefined."
225
+ )
226
+ if self.eval_batches is not None:
227
+ self.eval_batches = sum(all_eval_batches, [])
228
+ self._invalidate_indexes(filter_seq_annots=True)
229
+
230
+ def is_filtered(self) -> bool:
231
+ """
232
+ Returns `True` in case the dataset has been filtered and thus some frame annotations
233
+ stored on the disk might be missing in the dataset object.
234
+
235
+ Returns:
236
+ is_filtered: `True` if the dataset has been filtered, else `False`.
237
+ """
238
+ return (
239
+ self.remove_empty_masks
240
+ or self.limit_to > 0
241
+ or self.limit_sequences_to > 0
242
+ or len(self.pick_sequence) > 0
243
+ or len(self.exclude_sequence) > 0
244
+ or len(self.limit_category_to) > 0
245
+ or self.n_frames_per_sequence > 0
246
+ )
247
+
248
+ def seq_frame_index_to_dataset_index(
249
+ self,
250
+ seq_frame_index: List[List[Union[Tuple[str, int, str], Tuple[str, int]]]],
251
+ allow_missing_indices: bool = False,
252
+ remove_missing_indices: bool = False,
253
+ suppress_missing_index_warning: bool = True,
254
+ ) -> List[List[Union[Optional[int], int]]]:
255
+ """
256
+ Obtain indices into the dataset object given a list of frame ids.
257
+
258
+ Args:
259
+ seq_frame_index: The list of frame ids specified as
260
+ `List[List[Tuple[sequence_name:str, frame_number:int]]]`. Optionally,
261
+ Image paths relative to the dataset_root can be specified as well:
262
+ `List[List[Tuple[sequence_name:str, frame_number:int, image_path:str]]]`
263
+ allow_missing_indices: If `False`, throws an IndexError upon reaching the first
264
+ entry from `seq_frame_index` which is missing in the dataset.
265
+ Otherwise, depending on `remove_missing_indices`, either returns `None`
266
+ in place of missing entries or removes the indices of missing entries.
267
+ remove_missing_indices: Active when `allow_missing_indices=True`.
268
+ If `False`, returns `None` in place of `seq_frame_index` entries that
269
+ are not present in the dataset.
270
+ If `True` removes missing indices from the returned indices.
271
+ suppress_missing_index_warning:
272
+ Active if `allow_missing_indices==True`. Suppresses a warning message
273
+ in case an entry from `seq_frame_index` is missing in the dataset
274
+ (expected in certain cases - e.g. when setting
275
+ `self.remove_empty_masks=True`).
276
+
277
+ Returns:
278
+ dataset_idx: Indices of dataset entries corresponding to `seq_frame_index`.
279
+ """
280
+ _dataset_seq_frame_n_index = {
281
+ seq: {
282
+ # pyre-ignore[16]
283
+ self.frame_annots[idx]["frame_annotation"].frame_number: idx
284
+ for idx in seq_idx
285
+ }
286
+ # pyre-ignore[16]
287
+ for seq, seq_idx in self._seq_to_idx.items()
288
+ }
289
+
290
+ def _get_dataset_idx(
291
+ seq_name: str, frame_no: int, path: Optional[str] = None
292
+ ) -> Optional[int]:
293
+ idx_seq = _dataset_seq_frame_n_index.get(seq_name, None)
294
+ idx = idx_seq.get(frame_no, None) if idx_seq is not None else None
295
+ if idx is None:
296
+ msg = (
297
+ f"sequence_name={seq_name} / frame_number={frame_no}"
298
+ " not in the dataset!"
299
+ )
300
+ if not allow_missing_indices:
301
+ raise IndexError(msg)
302
+ if not suppress_missing_index_warning:
303
+ warnings.warn(msg)
304
+ return idx
305
+ if path is not None:
306
+ # Check that the loaded frame path is consistent
307
+ # with the one stored in self.frame_annots.
308
+ assert os.path.normpath(
309
+ # pyre-ignore[16]
310
+ self.frame_annots[idx]["frame_annotation"].image.path
311
+ ) == os.path.normpath(
312
+ path
313
+ ), f"Inconsistent frame indices {seq_name, frame_no, path}."
314
+ return idx
315
+
316
+ dataset_idx = [
317
+ [_get_dataset_idx(*b) for b in batch] # pyre-ignore [6]
318
+ for batch in seq_frame_index
319
+ ]
320
+
321
+ if allow_missing_indices and remove_missing_indices:
322
+ # remove all None indices, and also batches with only None entries
323
+ valid_dataset_idx = [
324
+ [b for b in batch if b is not None] for batch in dataset_idx
325
+ ]
326
+ return [ # pyre-ignore[7]
327
+ batch for batch in valid_dataset_idx if len(batch) > 0
328
+ ]
329
+
330
+ return dataset_idx
331
+
332
+ def subset_from_frame_index(
333
+ self,
334
+ frame_index: List[Union[Tuple[str, int], Tuple[str, int, str]]],
335
+ allow_missing_indices: bool = True,
336
+ ) -> "JsonIndexDataset":
337
+ """
338
+ Generate a dataset subset given the list of frames specified in `frame_index`.
339
+
340
+ Args:
341
+ frame_index: The list of frame identifiers (as stored in the metadata)
342
+ specified as `List[Tuple[sequence_name:str, frame_number:int]]`. Optionally,
343
+ Image paths relative to the dataset_root can be specified as well:
344
+ `List[Tuple[sequence_name:str, frame_number:int, image_path:str]]`,
345
+ in the latter case, if image_path does not match the stored paths, an error
346
+ is raised.
347
+ allow_missing_indices: If `False`, throws an IndexError upon reaching the first
348
+ entry from `frame_index` which is missing in the dataset.
349
+ Otherwise, generates a subset consisting of frames entries that actually
350
+ exist in the dataset.
351
+ """
352
+ # Get the indices into the frame annots.
353
+ dataset_indices = self.seq_frame_index_to_dataset_index(
354
+ [frame_index],
355
+ allow_missing_indices=self.is_filtered() and allow_missing_indices,
356
+ )[0]
357
+ valid_dataset_indices = [i for i in dataset_indices if i is not None]
358
+
359
+ # Deep copy the whole dataset except frame_annots, which are large so we
360
+ # deep copy only the requested subset of frame_annots.
361
+ memo = {id(self.frame_annots): None} # pyre-ignore[16]
362
+ dataset_new = copy.deepcopy(self, memo)
363
+ dataset_new.frame_annots = copy.deepcopy(
364
+ [self.frame_annots[i] for i in valid_dataset_indices]
365
+ )
366
+
367
+ # This will kill all unneeded sequence annotations.
368
+ dataset_new._invalidate_indexes(filter_seq_annots=True)
369
+
370
+ # Finally annotate the frame annotations with the name of the subset
371
+ # stored in meta.
372
+ for frame_annot in dataset_new.frame_annots:
373
+ frame_annotation = frame_annot["frame_annotation"]
374
+ if frame_annotation.meta is not None:
375
+ frame_annot["subset"] = frame_annotation.meta.get("frame_type", None)
376
+
377
+ # A sanity check - this will crash in case some entries from frame_index are missing
378
+ # in dataset_new.
379
+ valid_frame_index = [
380
+ fi for fi, di in zip(frame_index, dataset_indices) if di is not None
381
+ ]
382
+ dataset_new.seq_frame_index_to_dataset_index(
383
+ [valid_frame_index], allow_missing_indices=False
384
+ )
385
+
386
+ return dataset_new
387
+
388
+ def __str__(self) -> str:
389
+ # pyre-ignore[16]
390
+ return f"JsonIndexDataset #frames={len(self.frame_annots)}"
391
+
392
+ def __len__(self) -> int:
393
+ # pyre-ignore[16]
394
+ return len(self.frame_annots)
395
+
396
+ def _get_frame_type(self, entry: FrameAnnotsEntry) -> Optional[str]:
397
+ return entry["subset"]
398
+
399
+ def get_all_train_cameras(self) -> CamerasBase:
400
+ """
401
+ Returns the cameras corresponding to all the known frames.
402
+ """
403
+ logger.info("Loading all train cameras.")
404
+ cameras = []
405
+ # pyre-ignore[16]
406
+ for frame_idx, frame_annot in enumerate(tqdm(self.frame_annots)):
407
+ frame_type = self._get_frame_type(frame_annot)
408
+ if frame_type is None:
409
+ raise ValueError("subsets not loaded")
410
+ if is_known_frame_scalar(frame_type):
411
+ cameras.append(self[frame_idx].camera)
412
+ return join_cameras_as_batch(cameras)
413
+
414
+ def __getitem__(self, index) -> FrameData:
415
+ # pyre-ignore[16]
416
+ if index >= len(self.frame_annots):
417
+ raise IndexError(f"index {index} out of range {len(self.frame_annots)}")
418
+
419
+ entry = self.frame_annots[index]["frame_annotation"]
420
+ # pyre-ignore[16]
421
+ point_cloud = self.seq_annots[entry.sequence_name].point_cloud
422
+ frame_data = FrameData(
423
+ frame_number=_safe_as_tensor(entry.frame_number, torch.long),
424
+ frame_timestamp=_safe_as_tensor(entry.frame_timestamp, torch.float),
425
+ sequence_name=entry.sequence_name,
426
+ sequence_category=self.seq_annots[entry.sequence_name].category,
427
+ camera_quality_score=_safe_as_tensor(
428
+ self.seq_annots[entry.sequence_name].viewpoint_quality_score,
429
+ torch.float,
430
+ ),
431
+ point_cloud_quality_score=_safe_as_tensor(
432
+ point_cloud.quality_score, torch.float
433
+ )
434
+ if point_cloud is not None
435
+ else None,
436
+ )
437
+
438
+ # The rest of the fields are optional
439
+ frame_data.frame_type = self._get_frame_type(self.frame_annots[index])
440
+
441
+ (
442
+ frame_data.fg_probability,
443
+ frame_data.mask_path,
444
+ frame_data.bbox_xywh,
445
+ clamp_bbox_xyxy,
446
+ frame_data.crop_bbox_xywh,
447
+ ) = self._load_crop_fg_probability(entry)
448
+
449
+ scale = 1.0
450
+ if self.load_images and entry.image is not None:
451
+ # original image size
452
+ frame_data.image_size_hw = _safe_as_tensor(entry.image.size, torch.long)
453
+
454
+ (
455
+ frame_data.image_rgb,
456
+ frame_data.image_path,
457
+ frame_data.mask_crop,
458
+ scale,
459
+ ) = self._load_crop_images(
460
+ entry, frame_data.fg_probability, clamp_bbox_xyxy
461
+ )
462
+
463
+ if self.load_depths and entry.depth is not None:
464
+ (
465
+ frame_data.depth_map,
466
+ frame_data.depth_path,
467
+ frame_data.depth_mask,
468
+ ) = self._load_mask_depth(entry, clamp_bbox_xyxy, frame_data.fg_probability)
469
+
470
+ if entry.viewpoint is not None:
471
+ frame_data.camera = self._get_pytorch3d_camera(
472
+ entry,
473
+ scale,
474
+ clamp_bbox_xyxy,
475
+ )
476
+
477
+ if self.load_point_clouds and point_cloud is not None:
478
+ pcl_path = self._fix_point_cloud_path(point_cloud.path)
479
+ frame_data.sequence_point_cloud = _load_pointcloud(
480
+ self._local_path(pcl_path), max_points=self.max_points
481
+ )
482
+ frame_data.sequence_point_cloud_path = pcl_path
483
+
484
+ return frame_data
485
+
486
+ def _fix_point_cloud_path(self, path: str) -> str:
487
+ """
488
+ Fix up a point cloud path from the dataset.
489
+ Some files in Co3Dv2 have an accidental absolute path stored.
490
+ """
491
+ unwanted_prefix = (
492
+ "/large_experiments/p3/replay/datasets/co3d/co3d45k_220512/export_v23/"
493
+ )
494
+ if path.startswith(unwanted_prefix):
495
+ path = path[len(unwanted_prefix) :]
496
+ return os.path.join(self.dataset_root, path)
497
+
498
+ def _load_crop_fg_probability(
499
+ self, entry: types.FrameAnnotation
500
+ ) -> Tuple[
501
+ Optional[torch.Tensor],
502
+ Optional[str],
503
+ Optional[torch.Tensor],
504
+ Optional[torch.Tensor],
505
+ Optional[torch.Tensor],
506
+ ]:
507
+ fg_probability = None
508
+ full_path = None
509
+ bbox_xywh = None
510
+ clamp_bbox_xyxy = None
511
+ crop_box_xywh = None
512
+
513
+ if (self.load_masks or self.box_crop) and entry.mask is not None:
514
+ full_path = os.path.join(self.dataset_root, entry.mask.path)
515
+ mask = _load_mask(self._local_path(full_path))
516
+
517
+ if mask.shape[-2:] != entry.image.size:
518
+ raise ValueError(
519
+ f"bad mask size: {mask.shape[-2:]} vs {entry.image.size}!"
520
+ )
521
+
522
+ bbox_xywh = torch.tensor(_get_bbox_from_mask(mask, self.box_crop_mask_thr))
523
+
524
+ if self.box_crop:
525
+ clamp_bbox_xyxy = _clamp_box_to_image_bounds_and_round(
526
+ _get_clamp_bbox(
527
+ bbox_xywh,
528
+ image_path=entry.image.path,
529
+ box_crop_context=self.box_crop_context,
530
+ ),
531
+ image_size_hw=tuple(mask.shape[-2:]),
532
+ )
533
+ crop_box_xywh = _bbox_xyxy_to_xywh(clamp_bbox_xyxy)
534
+
535
+ mask = _crop_around_box(mask, clamp_bbox_xyxy, full_path)
536
+
537
+ fg_probability, _, _ = self._resize_image(mask, mode="nearest")
538
+
539
+ return fg_probability, full_path, bbox_xywh, clamp_bbox_xyxy, crop_box_xywh
540
+
541
+ def _load_crop_images(
542
+ self,
543
+ entry: types.FrameAnnotation,
544
+ fg_probability: Optional[torch.Tensor],
545
+ clamp_bbox_xyxy: Optional[torch.Tensor],
546
+ ) -> Tuple[torch.Tensor, str, torch.Tensor, float]:
547
+ assert self.dataset_root is not None and entry.image is not None
548
+ path = os.path.join(self.dataset_root, entry.image.path)
549
+ image_rgb = _load_image(self._local_path(path))
550
+
551
+ if image_rgb.shape[-2:] != entry.image.size:
552
+ raise ValueError(
553
+ f"bad image size: {image_rgb.shape[-2:]} vs {entry.image.size}!"
554
+ )
555
+
556
+ if self.box_crop:
557
+ assert clamp_bbox_xyxy is not None
558
+ image_rgb = _crop_around_box(image_rgb, clamp_bbox_xyxy, path)
559
+
560
+ image_rgb, scale, mask_crop = self._resize_image(image_rgb)
561
+
562
+ if self.mask_images:
563
+ assert fg_probability is not None
564
+ image_rgb *= fg_probability
565
+
566
+ return image_rgb, path, mask_crop, scale
567
+
568
+ def _load_mask_depth(
569
+ self,
570
+ entry: types.FrameAnnotation,
571
+ clamp_bbox_xyxy: Optional[torch.Tensor],
572
+ fg_probability: Optional[torch.Tensor],
573
+ ) -> Tuple[torch.Tensor, str, torch.Tensor]:
574
+ entry_depth = entry.depth
575
+ assert entry_depth is not None
576
+ path = os.path.join(self.dataset_root, entry_depth.path)
577
+ depth_map = _load_depth(self._local_path(path), entry_depth.scale_adjustment)
578
+
579
+ if self.box_crop:
580
+ assert clamp_bbox_xyxy is not None
581
+ depth_bbox_xyxy = _rescale_bbox(
582
+ clamp_bbox_xyxy, entry.image.size, depth_map.shape[-2:]
583
+ )
584
+ depth_map = _crop_around_box(depth_map, depth_bbox_xyxy, path)
585
+
586
+ depth_map, _, _ = self._resize_image(depth_map, mode="nearest")
587
+
588
+ if self.mask_depths:
589
+ assert fg_probability is not None
590
+ depth_map *= fg_probability
591
+
592
+ if self.load_depth_masks:
593
+ assert entry_depth.mask_path is not None
594
+ mask_path = os.path.join(self.dataset_root, entry_depth.mask_path)
595
+ depth_mask = _load_depth_mask(self._local_path(mask_path))
596
+
597
+ if self.box_crop:
598
+ assert clamp_bbox_xyxy is not None
599
+ depth_mask_bbox_xyxy = _rescale_bbox(
600
+ clamp_bbox_xyxy, entry.image.size, depth_mask.shape[-2:]
601
+ )
602
+ depth_mask = _crop_around_box(
603
+ depth_mask, depth_mask_bbox_xyxy, mask_path
604
+ )
605
+
606
+ depth_mask, _, _ = self._resize_image(depth_mask, mode="nearest")
607
+ else:
608
+ depth_mask = torch.ones_like(depth_map)
609
+
610
+ return depth_map, path, depth_mask
611
+
612
+ def _get_pytorch3d_camera(
613
+ self,
614
+ entry: types.FrameAnnotation,
615
+ scale: float,
616
+ clamp_bbox_xyxy: Optional[torch.Tensor],
617
+ ) -> PerspectiveCameras:
618
+ entry_viewpoint = entry.viewpoint
619
+ assert entry_viewpoint is not None
620
+ # principal point and focal length
621
+ principal_point = torch.tensor(
622
+ entry_viewpoint.principal_point, dtype=torch.float
623
+ )
624
+ focal_length = torch.tensor(entry_viewpoint.focal_length, dtype=torch.float)
625
+
626
+ half_image_size_wh_orig = (
627
+ torch.tensor(list(reversed(entry.image.size)), dtype=torch.float) / 2.0
628
+ )
629
+
630
+ # first, we convert from the dataset's NDC convention to pixels
631
+ format = entry_viewpoint.intrinsics_format
632
+ if format.lower() == "ndc_norm_image_bounds":
633
+ # this is e.g. currently used in CO3D for storing intrinsics
634
+ rescale = half_image_size_wh_orig
635
+ elif format.lower() == "ndc_isotropic":
636
+ rescale = half_image_size_wh_orig.min()
637
+ else:
638
+ raise ValueError(f"Unknown intrinsics format: {format}")
639
+
640
+ # principal point and focal length in pixels
641
+ principal_point_px = half_image_size_wh_orig - principal_point * rescale
642
+ focal_length_px = focal_length * rescale
643
+ if self.box_crop:
644
+ assert clamp_bbox_xyxy is not None
645
+ principal_point_px -= clamp_bbox_xyxy[:2]
646
+
647
+ # now, convert from pixels to PyTorch3D v0.5+ NDC convention
648
+ if self.image_height is None or self.image_width is None:
649
+ out_size = list(reversed(entry.image.size))
650
+ else:
651
+ out_size = [self.image_width, self.image_height]
652
+
653
+ half_image_size_output = torch.tensor(out_size, dtype=torch.float) / 2.0
654
+ half_min_image_size_output = half_image_size_output.min()
655
+
656
+ # rescaled principal point and focal length in ndc
657
+ principal_point = (
658
+ half_image_size_output - principal_point_px * scale
659
+ ) / half_min_image_size_output
660
+ focal_length = focal_length_px * scale / half_min_image_size_output
661
+
662
+ return PerspectiveCameras(
663
+ focal_length=focal_length[None],
664
+ principal_point=principal_point[None],
665
+ R=torch.tensor(entry_viewpoint.R, dtype=torch.float)[None],
666
+ T=torch.tensor(entry_viewpoint.T, dtype=torch.float)[None],
667
+ )
668
+
669
+ def _load_frames(self) -> None:
670
+ logger.info(f"Loading Co3D frames from {self.frame_annotations_file}.")
671
+ local_file = self._local_path(self.frame_annotations_file)
672
+ with gzip.open(local_file, "rt", encoding="utf8") as zipfile:
673
+ frame_annots_list = types.load_dataclass(
674
+ zipfile, List[self.frame_annotations_type]
675
+ )
676
+ if not frame_annots_list:
677
+ raise ValueError("Empty dataset!")
678
+ # pyre-ignore[16]
679
+ self.frame_annots = [
680
+ FrameAnnotsEntry(frame_annotation=a, subset=None) for a in frame_annots_list
681
+ ]
682
+
683
+ def _load_sequences(self) -> None:
684
+ logger.info(f"Loading Co3D sequences from {self.sequence_annotations_file}.")
685
+ local_file = self._local_path(self.sequence_annotations_file)
686
+ with gzip.open(local_file, "rt", encoding="utf8") as zipfile:
687
+ seq_annots = types.load_dataclass(zipfile, List[types.SequenceAnnotation])
688
+ if not seq_annots:
689
+ raise ValueError("Empty sequences file!")
690
+ # pyre-ignore[16]
691
+ self.seq_annots = {entry.sequence_name: entry for entry in seq_annots}
692
+
693
+ def _load_subset_lists(self) -> None:
694
+ logger.info(f"Loading Co3D subset lists from {self.subset_lists_file}.")
695
+ if not self.subset_lists_file:
696
+ return
697
+
698
+ with open(self._local_path(self.subset_lists_file), "r") as f:
699
+ subset_to_seq_frame = json.load(f)
700
+
701
+ frame_path_to_subset = {
702
+ path: subset
703
+ for subset, frames in subset_to_seq_frame.items()
704
+ for _, _, path in frames
705
+ }
706
+ # pyre-ignore[16]
707
+ for frame in self.frame_annots:
708
+ frame["subset"] = frame_path_to_subset.get(
709
+ frame["frame_annotation"].image.path, None
710
+ )
711
+ if frame["subset"] is None:
712
+ warnings.warn(
713
+ "Subset lists are given but don't include "
714
+ + frame["frame_annotation"].image.path
715
+ )
716
+
717
+ def _sort_frames(self) -> None:
718
+ # Sort frames to have them grouped by sequence, ordered by timestamp
719
+ # pyre-ignore[16]
720
+ self.frame_annots = sorted(
721
+ self.frame_annots,
722
+ key=lambda f: (
723
+ f["frame_annotation"].sequence_name,
724
+ f["frame_annotation"].frame_timestamp or 0,
725
+ ),
726
+ )
727
+
728
+ def _filter_db(self) -> None:
729
+ if self.remove_empty_masks:
730
+ logger.info("Removing images with empty masks.")
731
+ # pyre-ignore[16]
732
+ old_len = len(self.frame_annots)
733
+
734
+ msg = "remove_empty_masks needs every MaskAnnotation.mass to be set."
735
+
736
+ def positive_mass(frame_annot: types.FrameAnnotation) -> bool:
737
+ mask = frame_annot.mask
738
+ if mask is None:
739
+ return False
740
+ if mask.mass is None:
741
+ raise ValueError(msg)
742
+ return mask.mass > 1
743
+
744
+ self.frame_annots = [
745
+ frame
746
+ for frame in self.frame_annots
747
+ if positive_mass(frame["frame_annotation"])
748
+ ]
749
+ logger.info("... filtered %d -> %d" % (old_len, len(self.frame_annots)))
750
+
751
+ # this has to be called after joining with categories!!
752
+ subsets = self.subsets
753
+ if subsets:
754
+ if not self.subset_lists_file:
755
+ raise ValueError(
756
+ "Subset filter is on but subset_lists_file was not given"
757
+ )
758
+
759
+ logger.info(f"Limiting Co3D dataset to the '{subsets}' subsets.")
760
+
761
+ # keep only the frames that belong to the requested subsets
762
+ self.frame_annots = [
763
+ entry for entry in self.frame_annots if entry["subset"] in subsets
764
+ ]
765
+ if len(self.frame_annots) == 0:
766
+ raise ValueError(f"There are no frames in the '{subsets}' subsets!")
767
+
768
+ self._invalidate_indexes(filter_seq_annots=True)
769
+
770
+ if len(self.limit_category_to) > 0:
771
+ logger.info(f"Limiting dataset to categories: {self.limit_category_to}")
772
+ # pyre-ignore[16]
773
+ self.seq_annots = {
774
+ name: entry
775
+ for name, entry in self.seq_annots.items()
776
+ if entry.category in self.limit_category_to
777
+ }
778
+
779
+ # sequence filters
780
+ for prefix in ("pick", "exclude"):
781
+ orig_len = len(self.seq_annots)
782
+ attr = f"{prefix}_sequence"
783
+ arr = getattr(self, attr)
784
+ if len(arr) > 0:
785
+ logger.info(f"{attr}: {str(arr)}")
786
+ self.seq_annots = {
787
+ name: entry
788
+ for name, entry in self.seq_annots.items()
789
+ if (name in arr) == (prefix == "pick")
790
+ }
791
+ logger.info("... filtered %d -> %d" % (orig_len, len(self.seq_annots)))
792
+
793
+ if self.limit_sequences_to > 0:
794
+ self.seq_annots = dict(
795
+ islice(self.seq_annots.items(), self.limit_sequences_to)
796
+ )
797
+
798
+ # retain only frames from retained sequences
799
+ self.frame_annots = [
800
+ f
801
+ for f in self.frame_annots
802
+ if f["frame_annotation"].sequence_name in self.seq_annots
803
+ ]
804
+
805
+ self._invalidate_indexes()
806
+
807
+ if self.n_frames_per_sequence > 0:
808
+ logger.info(f"Taking max {self.n_frames_per_sequence} per sequence.")
809
+ keep_idx = []
810
+ # pyre-ignore[16]
811
+ for seq, seq_indices in self._seq_to_idx.items():
812
+ # infer the seed from the sequence name, this is reproducible
813
+ # and makes the selection differ for different sequences
814
+ seed = _seq_name_to_seed(seq) + self.seed
815
+ seq_idx_shuffled = random.Random(seed).sample(
816
+ sorted(seq_indices), len(seq_indices)
817
+ )
818
+ keep_idx.extend(seq_idx_shuffled[: self.n_frames_per_sequence])
819
+
820
+ logger.info(
821
+ "... filtered %d -> %d" % (len(self.frame_annots), len(keep_idx))
822
+ )
823
+ self.frame_annots = [self.frame_annots[i] for i in keep_idx]
824
+ self._invalidate_indexes(filter_seq_annots=False)
825
+ # sequences are not decimated, so self.seq_annots is valid
826
+
827
+ if self.limit_to > 0 and self.limit_to < len(self.frame_annots):
828
+ logger.info(
829
+ "limit_to: filtered %d -> %d" % (len(self.frame_annots), self.limit_to)
830
+ )
831
+ self.frame_annots = self.frame_annots[: self.limit_to]
832
+ self._invalidate_indexes(filter_seq_annots=True)
833
+
834
+ def _invalidate_indexes(self, filter_seq_annots: bool = False) -> None:
835
+ # update _seq_to_idx and filter seq_meta according to frame_annots change
836
+ # if filter_seq_annots, also updates seq_annots based on the changed _seq_to_idx
837
+ self._invalidate_seq_to_idx()
838
+
839
+ if filter_seq_annots:
840
+ # pyre-ignore[16]
841
+ self.seq_annots = {
842
+ k: v
843
+ for k, v in self.seq_annots.items()
844
+ # pyre-ignore[16]
845
+ if k in self._seq_to_idx
846
+ }
847
+
848
+ def _invalidate_seq_to_idx(self) -> None:
849
+ seq_to_idx = defaultdict(list)
850
+ # pyre-ignore[16]
851
+ for idx, entry in enumerate(self.frame_annots):
852
+ seq_to_idx[entry["frame_annotation"].sequence_name].append(idx)
853
+ # pyre-ignore[16]
854
+ self._seq_to_idx = seq_to_idx
855
+
856
+ def _resize_image(
857
+ self, image, mode="bilinear"
858
+ ) -> Tuple[torch.Tensor, float, torch.Tensor]:
859
+ image_height, image_width = self.image_height, self.image_width
860
+ if image_height is None or image_width is None:
861
+ # skip the resizing
862
+ imre_ = torch.from_numpy(image)
863
+ return imre_, 1.0, torch.ones_like(imre_[:1])
864
+ # takes numpy array, returns pytorch tensor
865
+ minscale = min(
866
+ image_height / image.shape[-2],
867
+ image_width / image.shape[-1],
868
+ )
869
+ imre = torch.nn.functional.interpolate(
870
+ torch.from_numpy(image)[None],
871
+ scale_factor=minscale,
872
+ mode=mode,
873
+ align_corners=False if mode == "bilinear" else None,
874
+ recompute_scale_factor=True,
875
+ )[0]
876
+ # pyre-fixme[19]: Expected 1 positional argument.
877
+ imre_ = torch.zeros(image.shape[0], self.image_height, self.image_width)
878
+ imre_[:, 0 : imre.shape[1], 0 : imre.shape[2]] = imre
879
+ # pyre-fixme[6]: For 2nd param expected `int` but got `Optional[int]`.
880
+ # pyre-fixme[6]: For 3rd param expected `int` but got `Optional[int]`.
881
+ mask = torch.zeros(1, self.image_height, self.image_width)
882
+ mask[:, 0 : imre.shape[1], 0 : imre.shape[2]] = 1.0
883
+ return imre_, minscale, mask
884
+
885
+ def _local_path(self, path: str) -> str:
886
+ if self.path_manager is None:
887
+ return path
888
+ return self.path_manager.get_local_path(path)
889
+
890
+ def get_frame_numbers_and_timestamps(
891
+ self, idxs: Sequence[int]
892
+ ) -> List[Tuple[int, float]]:
893
+ out: List[Tuple[int, float]] = []
894
+ for idx in idxs:
895
+ # pyre-ignore[16]
896
+ frame_annotation = self.frame_annots[idx]["frame_annotation"]
897
+ out.append(
898
+ (frame_annotation.frame_number, frame_annotation.frame_timestamp)
899
+ )
900
+ return out
901
+
902
+ def category_to_sequence_names(self) -> Dict[str, List[str]]:
903
+ c2seq = defaultdict(list)
904
+ # pyre-ignore
905
+ for sequence_name, sa in self.seq_annots.items():
906
+ c2seq[sa.category].append(sequence_name)
907
+ return dict(c2seq)
908
+
909
+ def get_eval_batches(self) -> Optional[List[List[int]]]:
910
+ return self.eval_batches
911
+
912
+
913
+ def _seq_name_to_seed(seq_name) -> int:
914
+ return int(hashlib.sha1(seq_name.encode("utf-8")).hexdigest(), 16)
915
+
916
+
917
+ def _load_image(path) -> np.ndarray:
918
+ with Image.open(path) as pil_im:
919
+ im = np.array(pil_im.convert("RGB"))
920
+ im = im.transpose((2, 0, 1))
921
+ im = im.astype(np.float32) / 255.0
922
+ return im
923
+
924
+
925
+ def _load_16big_png_depth(depth_png) -> np.ndarray:
926
+ with Image.open(depth_png) as depth_pil:
927
+ # the image is stored with 16-bit depth but PIL reads it as I (32 bit).
928
+ # we cast it to uint16, then reinterpret as float16, then cast to float32
929
+ depth = (
930
+ np.frombuffer(np.array(depth_pil, dtype=np.uint16), dtype=np.float16)
931
+ .astype(np.float32)
932
+ .reshape((depth_pil.size[1], depth_pil.size[0]))
933
+ )
934
+ return depth
935
+
936
+
937
+ def _load_1bit_png_mask(file: str) -> np.ndarray:
938
+ with Image.open(file) as pil_im:
939
+ mask = (np.array(pil_im.convert("L")) > 0.0).astype(np.float32)
940
+ return mask
941
+
942
+
943
+ def _load_depth_mask(path: str) -> np.ndarray:
944
+ if not path.lower().endswith(".png"):
945
+ raise ValueError('unsupported depth mask file name "%s"' % path)
946
+ m = _load_1bit_png_mask(path)
947
+ return m[None] # fake feature channel
948
+
949
+
950
+ def _load_depth(path, scale_adjustment) -> np.ndarray:
951
+ if not path.lower().endswith(".png"):
952
+ raise ValueError('unsupported depth file name "%s"' % path)
953
+
954
+ d = _load_16big_png_depth(path) * scale_adjustment
955
+ d[~np.isfinite(d)] = 0.0
956
+ return d[None] # fake feature channel
957
+
958
+
959
+ def _load_mask(path) -> np.ndarray:
960
+ with Image.open(path) as pil_im:
961
+ mask = np.array(pil_im)
962
+ mask = mask.astype(np.float32) / 255.0
963
+ return mask[None] # fake feature channel
964
+
965
+
966
+ def _get_1d_bounds(arr) -> Tuple[int, int]:
967
+ nz = np.flatnonzero(arr)
968
+ return nz[0], nz[-1] + 1
969
+
970
+
971
+ def _get_bbox_from_mask(
972
+ mask, thr, decrease_quant: float = 0.05
973
+ ) -> Tuple[int, int, int, int]:
974
+ # bbox in xywh
975
+ masks_for_box = np.zeros_like(mask)
976
+ while masks_for_box.sum() <= 1.0:
977
+ masks_for_box = (mask > thr).astype(np.float32)
978
+ thr -= decrease_quant
979
+ if thr <= 0.0:
980
+ warnings.warn(f"Empty masks_for_bbox (thr={thr}) => using full image.")
981
+
982
+ x0, x1 = _get_1d_bounds(masks_for_box.sum(axis=-2))
983
+ y0, y1 = _get_1d_bounds(masks_for_box.sum(axis=-1))
984
+
985
+ return x0, y0, x1 - x0, y1 - y0
986
+
987
+
988
+ def _get_clamp_bbox(
989
+ bbox: torch.Tensor,
990
+ box_crop_context: float = 0.0,
991
+ image_path: str = "",
992
+ ) -> torch.Tensor:
993
+ # box_crop_context: rate of expansion for bbox
994
+ # returns possibly expanded bbox xyxy as float
995
+
996
+ bbox = bbox.clone() # do not edit bbox in place
997
+
998
+ # increase box size
999
+ if box_crop_context > 0.0:
1000
+ c = box_crop_context
1001
+ bbox = bbox.float()
1002
+ bbox[0] -= bbox[2] * c / 2
1003
+ bbox[1] -= bbox[3] * c / 2
1004
+ bbox[2] += bbox[2] * c
1005
+ bbox[3] += bbox[3] * c
1006
+
1007
+ if (bbox[2:] <= 1.0).any():
1008
+ raise ValueError(
1009
+ f"squashed image {image_path}!! The bounding box contains no pixels."
1010
+ )
1011
+
1012
+ bbox[2:] = torch.clamp(bbox[2:], 2) # set min height, width to 2 along both axes
1013
+ bbox_xyxy = _bbox_xywh_to_xyxy(bbox, clamp_size=2)
1014
+
1015
+ return bbox_xyxy
1016
+
1017
+
1018
+ def _crop_around_box(tensor, bbox, impath: str = ""):
1019
+ # bbox is xyxy, where the upper bound is corrected with +1
1020
+ bbox = _clamp_box_to_image_bounds_and_round(
1021
+ bbox,
1022
+ image_size_hw=tensor.shape[-2:],
1023
+ )
1024
+ tensor = tensor[..., bbox[1] : bbox[3], bbox[0] : bbox[2]]
1025
+ assert all(c > 0 for c in tensor.shape), f"squashed image {impath}"
1026
+ return tensor
1027
+
1028
+
1029
+ def _clamp_box_to_image_bounds_and_round(
1030
+ bbox_xyxy: torch.Tensor,
1031
+ image_size_hw: Tuple[int, int],
1032
+ ) -> torch.LongTensor:
1033
+ bbox_xyxy = bbox_xyxy.clone()
1034
+ bbox_xyxy[[0, 2]] = torch.clamp(bbox_xyxy[[0, 2]], 0, image_size_hw[-1])
1035
+ bbox_xyxy[[1, 3]] = torch.clamp(bbox_xyxy[[1, 3]], 0, image_size_hw[-2])
1036
+ if not isinstance(bbox_xyxy, torch.LongTensor):
1037
+ bbox_xyxy = bbox_xyxy.round().long()
1038
+ return bbox_xyxy # pyre-ignore [7]
1039
+
1040
+
1041
+ def _rescale_bbox(bbox: torch.Tensor, orig_res, new_res) -> torch.Tensor:
1042
+ assert bbox is not None
1043
+ assert np.prod(orig_res) > 1e-8
1044
+ # average ratio of dimensions
1045
+ rel_size = (new_res[0] / orig_res[0] + new_res[1] / orig_res[1]) / 2.0
1046
+ return bbox * rel_size
1047
+
1048
+
1049
+ def _bbox_xyxy_to_xywh(xyxy: torch.Tensor) -> torch.Tensor:
1050
+ wh = xyxy[2:] - xyxy[:2]
1051
+ xywh = torch.cat([xyxy[:2], wh])
1052
+ return xywh
1053
+
1054
+
1055
+ def _bbox_xywh_to_xyxy(
1056
+ xywh: torch.Tensor, clamp_size: Optional[int] = None
1057
+ ) -> torch.Tensor:
1058
+ xyxy = xywh.clone()
1059
+ if clamp_size is not None:
1060
+ xyxy[2:] = torch.clamp(xyxy[2:], clamp_size)
1061
+ xyxy[2:] += xyxy[:2]
1062
+ return xyxy
1063
+
1064
+
1065
+ def _safe_as_tensor(data, dtype):
1066
+ if data is None:
1067
+ return None
1068
+ return torch.tensor(data, dtype=dtype)
1069
+
1070
+
1071
+ # NOTE this cache is per-worker; they are implemented as processes.
1072
+ # each batch is loaded and collated by a single worker;
1073
+ # since sequences tend to co-occur within batches, this is useful.
1074
+ @functools.lru_cache(maxsize=256)
1075
+ def _load_pointcloud(pcl_path: Union[str, Path], max_points: int = 0) -> Pointclouds:
1076
+ pcl = IO().load_pointcloud(pcl_path)
1077
+ if max_points > 0:
1078
+ pcl = pcl.subsample(max_points)
1079
+
1080
+ return pcl
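A minimal usage sketch for the dataset above (a sketch only: the paths, category folder, and subset name are placeholders, not part of this commit; the import path assumes this repo's layout, and depending on the pytorch3d version, expand_args_fields is needed to materialize the declared fields into a constructor):

from pytorch3d.implicitron.tools.config import expand_args_fields
from sgm.data.json_index_dataset import JsonIndexDataset

expand_args_fields(JsonIndexDataset)  # generate an __init__ from the declared fields
category_root = "/data/co3d/teddybear"  # hypothetical CO3D-style category folder
dataset = JsonIndexDataset(
    frame_annotations_file=f"{category_root}/frame_annotations.jgz",
    sequence_annotations_file=f"{category_root}/sequence_annotations.jgz",
    subset_lists_file=f"{category_root}/set_lists.json",
    subsets=["train_known"],
    dataset_root="/data/co3d",
    image_height=256,
    image_width=256,
    n_frames_per_sequence=24,
)
frame = dataset[0]  # FrameData with image_rgb, fg_probability, camera, depth_map, ...
print(len(dataset), frame.sequence_name, frame.image_rgb.shape)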
sgm/data/latent_objaverse.py ADDED
@@ -0,0 +1,52 @@
1
+ import numpy as np
2
+ from pathlib import Path
3
+ from PIL import Image
4
+ import json
5
+ import torch
6
+ from torch.utils.data import Dataset, DataLoader, default_collate
7
+ from torchvision.transforms import ToTensor, Normalize, Compose, Resize
8
+ from pytorch_lightning import LightningDataModule
9
+ from einops import rearrange
10
+
11
+
12
+ class LatentObjaverseSpiral(Dataset):
13
+ def __init__(
14
+ self,
15
+ root_dir,
16
+ split="train",
17
+ transform=None,
18
+ random_front=False,
19
+ max_item=None,
20
+ cond_aug_mean=-3.0,
21
+ cond_aug_std=0.5,
22
+ condition_on_elevation=False,
23
+ **unused_kwargs,
24
+ ):
25
+ print("Using LVIS subset with precomputed Latents")
26
+ self.root_dir = Path(root_dir)
27
+ self.split = split
28
+ self.random_front = random_front
29
+ self.transform = transform
30
+
31
+ self.latent_dir = Path("/mnt/vepfs/3Ddataset/render_results/latents512")
32
+
33
+ self.ids = json.load(open("./assets/lvis_uids.json", "r"))
34
+ self.n_views = 18
35
+ valid_ids = []
36
+ for idx in self.ids:
37
+ if (self.root_dir / idx).exists():
38
+ valid_ids.append(idx)
39
+ self.ids = valid_ids
40
+ print("=" * 30)
41
+ print("Number of valid ids: ", len(self.ids))
42
+ print("=" * 30)
43
+
44
+ self.cond_aug_mean = cond_aug_mean
45
+ self.cond_aug_std = cond_aug_std
46
+ self.condition_on_elevation = condition_on_elevation
47
+
48
+ if max_item is not None:
49
+ self.ids = self.ids[:max_item]
50
+
51
+ ## debug
52
+ self.ids = self.ids * 10000
sgm/data/mnist.py ADDED
@@ -0,0 +1,85 @@
1
+ import pytorch_lightning as pl
2
+ import torchvision
3
+ from torch.utils.data import DataLoader, Dataset
4
+ from torchvision import transforms
5
+
6
+
7
+ class MNISTDataDictWrapper(Dataset):
8
+ def __init__(self, dset):
9
+ super().__init__()
10
+ self.dset = dset
11
+
12
+ def __getitem__(self, i):
13
+ x, y = self.dset[i]
14
+ return {"jpg": x, "cls": y}
15
+
16
+ def __len__(self):
17
+ return len(self.dset)
18
+
19
+
20
+ class MNISTLoader(pl.LightningDataModule):
21
+ def __init__(self, batch_size, num_workers=0, prefetch_factor=2, shuffle=True):
22
+ super().__init__()
23
+
24
+ transform = transforms.Compose(
25
+ [transforms.ToTensor(), transforms.Lambda(lambda x: x * 2.0 - 1.0)]
26
+ )
27
+
28
+ self.batch_size = batch_size
29
+ self.num_workers = num_workers
30
+ self.prefetch_factor = prefetch_factor if num_workers > 0 else 0
31
+ self.shuffle = shuffle
32
+ self.train_dataset = MNISTDataDictWrapper(
33
+ torchvision.datasets.MNIST(
34
+ root=".data/", train=True, download=True, transform=transform
35
+ )
36
+ )
37
+ self.test_dataset = MNISTDataDictWrapper(
38
+ torchvision.datasets.MNIST(
39
+ root=".data/", train=False, download=True, transform=transform
40
+ )
41
+ )
42
+
43
+ def prepare_data(self):
44
+ pass
45
+
46
+ def train_dataloader(self):
47
+ return DataLoader(
48
+ self.train_dataset,
49
+ batch_size=self.batch_size,
50
+ shuffle=self.shuffle,
51
+ num_workers=self.num_workers,
52
+ prefetch_factor=self.prefetch_factor,
53
+ )
54
+
55
+ def test_dataloader(self):
56
+ return DataLoader(
57
+ self.test_dataset,
58
+ batch_size=self.batch_size,
59
+ shuffle=self.shuffle,
60
+ num_workers=self.num_workers,
61
+ prefetch_factor=self.prefetch_factor,
62
+ )
63
+
64
+ def val_dataloader(self):
65
+ return DataLoader(
66
+ self.test_dataset,
67
+ batch_size=self.batch_size,
68
+ shuffle=self.shuffle,
69
+ num_workers=self.num_workers,
70
+ prefetch_factor=self.prefetch_factor,
71
+ )
72
+
73
+
74
+ if __name__ == "__main__":
75
+ dset = MNISTDataDictWrapper(
76
+ torchvision.datasets.MNIST(
77
+ root=".data/",
78
+ train=False,
79
+ download=True,
80
+ transform=transforms.Compose(
81
+ [transforms.ToTensor(), transforms.Lambda(lambda x: x * 2.0 - 1.0)]
82
+ ),
83
+ )
84
+ )
85
+ ex = dset[0]
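A short usage sketch for the loader above (batch size and worker count are illustrative; the import path assumes this repo's layout):

from sgm.data.mnist import MNISTLoader

loader = MNISTLoader(batch_size=64, num_workers=4, shuffle=True)
batch = next(iter(loader.train_dataloader()))
# batch["jpg"]: (64, 1, 28, 28) images rescaled to [-1, 1]; batch["cls"]: digit labels
print(batch["jpg"].shape, batch["jpg"].min().item(), batch["jpg"].max().item())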
sgm/data/mvimagenet.py ADDED
@@ -0,0 +1,408 @@
1
+ import numpy as np
2
+ import torch
3
+ from torch.utils.data import Dataset, DataLoader, default_collate
4
+ from pathlib import Path
5
+ from PIL import Image
6
+ from scipy.spatial.transform import Rotation
7
+ import rembg
8
+ from rembg import remove, new_session
9
+ from einops import rearrange
10
+
11
+ from torchvision.transforms import ToTensor, Normalize, Compose, Resize
12
+ from torchvision.transforms.functional import to_tensor
13
+ from pytorch_lightning import LightningDataModule
14
+
15
+ from sgm.data.colmap import read_cameras_binary, read_images_binary
16
+ from sgm.data.objaverse import video_collate_fn, FLATTEN_FIELDS, flatten_for_video
17
+
18
+
19
+ def qvec2rotmat(qvec):
20
+ return np.array(
21
+ [
22
+ [
23
+ 1 - 2 * qvec[2] ** 2 - 2 * qvec[3] ** 2,
24
+ 2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3],
25
+ 2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2],
26
+ ],
27
+ [
28
+ 2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3],
29
+ 1 - 2 * qvec[1] ** 2 - 2 * qvec[3] ** 2,
30
+ 2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1],
31
+ ],
32
+ [
33
+ 2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2],
34
+ 2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1],
35
+ 1 - 2 * qvec[1] ** 2 - 2 * qvec[2] ** 2,
36
+ ],
37
+ ]
38
+ )
39
+
40
+
41
+ def qt2c2w(q, t):
42
+ # NOTE: remember to convert to opengl coordinate system
43
+ # rot = Rotation.from_quat(q).as_matrix()
44
+ rot = qvec2rotmat(q)
45
+ c2w = np.eye(4)
46
+ c2w[:3, :3] = np.transpose(rot)
47
+ c2w[:3, 3] = -np.transpose(rot) @ t
48
+ c2w[..., 1:3] *= -1
49
+ return c2w
50
+
51
+
52
+ def random_crop():
53
+ pass
54
+
55
+
56
+ class MVImageNet(Dataset):
57
+ def __init__(
58
+ self,
59
+ root_dir,
60
+ split,
61
+ transform,
62
+ reso: int = 256,
63
+ mask_type: str = "random",
64
+ cond_aug_mean=-3.0,
65
+ cond_aug_std=0.5,
66
+ condition_on_elevation=False,
67
+ fps_id=0.0,
68
+ motion_bucket_id=300.0,
69
+ num_frames: int = 24,
70
+ use_mask: bool = True,
71
+ load_pixelnerf: bool = False,
72
+ scale_pose: bool = False,
73
+ max_n_cond: int = 1,
74
+ min_n_cond: int = 1,
75
+ cond_on_multi: bool = False,
76
+ ) -> None:
77
+ super().__init__()
78
+
79
+ self.root_dir = Path(root_dir)
80
+ self.split = split
81
+
82
+ avails = self.root_dir.glob("*/*")
83
+ self.ids = list(
84
+ map(
85
+ lambda x: str(x.relative_to(self.root_dir)),
86
+ filter(lambda x: x.is_dir(), avails),
87
+ )
88
+ )
89
+
90
+ self.transform = transform
91
+ self.reso = reso
92
+ self.num_frames = num_frames
93
+ self.cond_aug_mean = cond_aug_mean
94
+ self.cond_aug_std = cond_aug_std
95
+ self.condition_on_elevation = condition_on_elevation
96
+ self.fps_id = fps_id
97
+ self.motion_bucket_id = motion_bucket_id
98
+ self.mask_type = mask_type
99
+ self.use_mask = use_mask
100
+ self.load_pixelnerf = load_pixelnerf
101
+ self.scale_pose = scale_pose
102
+ self.max_n_cond = max_n_cond
103
+ self.min_n_cond = min_n_cond
104
+ self.cond_on_multi = cond_on_multi
105
+
106
+ if self.cond_on_multi:
107
+ assert self.min_n_cond == self.max_n_cond
108
+ self.session = new_session()
109
+
110
+ def __getitem__(self, index: int):
111
+ # mvimgnet starts with idx==1
112
+ idx_list = np.arange(0, self.num_frames)
113
+ this_image_dir = self.root_dir / self.ids[index] / "images"
114
+ this_camera_dir = self.root_dir / self.ids[index] / "sparse/0"
115
+
116
+ # while not this_camera_dir.exists():
117
+ # index = (index + 1) % len(self.ids)
118
+ # this_image_dir = self.root_dir / self.ids[index] / "images"
119
+ # this_camera_dir = self.root_dir / self.ids[index] / "sparse/0"
120
+ if not this_camera_dir.exists():
121
+ index = 0
122
+ this_image_dir = self.root_dir / self.ids[index] / "images"
123
+ this_camera_dir = self.root_dir / self.ids[index] / "sparse/0"
124
+
125
+ this_images = read_images_binary(this_camera_dir / "images.bin")
126
+ # filenames = list(map(lambda x: f"{x:03d}", this_images.keys()))
127
+ filenames = list(this_images.keys())
128
+
129
+ if len(filenames) == 0:
130
+ index = 0
131
+ this_image_dir = self.root_dir / self.ids[index] / "images"
132
+ this_camera_dir = self.root_dir / self.ids[index] / "sparse/0"
133
+ this_images = read_images_binary(this_camera_dir / "images.bin")
134
+ # filenames = list(map(lambda x: f"{x:03d}", this_images.keys()))
135
+ filenames = list(this_images.keys())
136
+
137
+ filenames = list(
138
+ filter(lambda x: (this_image_dir / this_images[x].name).exists(), filenames)
139
+ )
140
+
141
+ filenames = sorted(filenames, key=lambda x: this_images[x].name)
142
+
143
+ # # debug
144
+ # names = []
145
+ # for v in filenames:
146
+ # names.append(this_images[v].name)
147
+ # breakpoint()
148
+
149
+ while len(filenames) < self.num_frames:
150
+ num_surpass = self.num_frames - len(filenames)
151
+ filenames += list(reversed(filenames[-num_surpass:]))
152
+
153
+ if len(filenames) < self.num_frames:
154
+ print(f"\n\n{self.ids[index]}\n\n")
155
+
156
+ frames = []
157
+ cameras = []
158
+ downsampled_rgb = []
159
+ for view_idx in idx_list:
160
+ this_id = filenames[view_idx]
161
+ frame = Image.open(this_image_dir / this_images[this_id].name)
162
+ w, h = frame.size
163
+
164
+ if self.mask_type == "random":
165
+ image_size = min(h, w)
166
+ left = np.random.randint(0, w - image_size + 1)
167
+ right = left + image_size
168
+ top = np.random.randint(0, h - image_size + 1)
169
+ bottom = top + image_size
170
+ ## need to assign left, right, top, bottom, image_size
171
+ elif self.mask_type == "object":
172
+ pass
173
+ elif self.mask_type == "rembg":
174
+ image_size = min(h, w)
175
+ if (
176
+ cached := this_image_dir
177
+ / f"{this_images[this_id].name[:-4]}_rembg.png"
178
+ ).exists():
179
+ try:
180
+ mask = np.asarray(Image.open(cached, formats=["png"]))[..., 3]
181
+ except:
182
+ mask = remove(frame, session=self.session)
183
+ mask.save(cached)
184
+ mask = np.asarray(mask)[..., 3]
185
+ else:
186
+ mask = remove(frame, session=self.session)
187
+ mask.save(cached)
188
+ mask = np.asarray(mask)[..., 3]
189
+ # in h,w order
190
+ y, x = np.array(mask.nonzero())
191
+ bbox_cx = x.mean()
192
+ bbox_cy = y.mean()
193
+
194
+ if bbox_cy - image_size / 2 < 0:
195
+ top = 0
196
+ elif bbox_cy + image_size / 2 > h:
197
+ top = h - image_size
198
+ else:
199
+ top = int(bbox_cy - image_size / 2)
200
+
201
+ if bbox_cx - image_size / 2 < 0:
202
+ left = 0
203
+ elif bbox_cx + image_size / 2 > w:
204
+ left = w - image_size
205
+ else:
206
+ left = int(bbox_cx - image_size / 2)
207
+
208
+ # top = max(int(bbox_cy - image_size / 2), 0)
209
+ # left = max(int(bbox_cx - image_size / 2), 0)
210
+ bottom = top + image_size
211
+ right = left + image_size
212
+ else:
213
+ raise ValueError(f"Unknown mask type: {self.mask_type}")
214
+
215
+ frame = frame.crop((left, top, right, bottom))
216
+ frame = frame.resize((self.reso, self.reso))
217
+ frames.append(self.transform(frame))
218
+
219
+ if self.load_pixelnerf:
220
+ # extrinsics
221
+ extrinsics = this_images[this_id]
222
+ c2w = qt2c2w(extrinsics.qvec, extrinsics.tvec)
223
+ # intrinsics
224
+ intrinsics = read_cameras_binary(this_camera_dir / "cameras.bin")
225
+ assert len(intrinsics) == 1
226
+ intrinsics = intrinsics[1]
227
+ f, cx, cy, _ = intrinsics.params
228
+ f *= 1 / image_size
229
+ cx -= left
230
+ cy -= top
231
+ cx *= 1 / image_size
232
+ cy *= 1 / image_size # all are relative values
233
+ intrinsics = np.array([[f, 0, cx], [0, f, cy], [0, 0, 1]])
234
+
235
+ this_camera = np.zeros(25)
236
+ this_camera[:16] = c2w.reshape(-1)
237
+ this_camera[16:] = intrinsics.reshape(-1)
238
+
239
+ cameras.append(this_camera)
240
+ downsampled = frame.resize((self.reso // 8, self.reso // 8))
241
+ downsampled_rgb.append((self.transform(downsampled) + 1.0) * 0.5)
242
+
243
+ data = dict()
244
+
245
+ cond_aug = np.exp(
246
+ np.random.randn(1)[0] * self.cond_aug_std + self.cond_aug_mean
247
+ )
248
+ frames = torch.stack(frames)
249
+ cond = frames[0]
250
+ # setting all things in data
251
+ data["frames"] = frames
252
+ data["cond_frames_without_noise"] = cond
253
+ data["cond_aug"] = torch.as_tensor([cond_aug] * self.num_frames)
254
+ data["cond_frames"] = cond + cond_aug * torch.randn_like(cond)
255
+ data["fps_id"] = torch.as_tensor([self.fps_id] * self.num_frames)
256
+ data["motion_bucket_id"] = torch.as_tensor(
257
+ [self.motion_bucket_id] * self.num_frames
258
+ )
259
+ data["num_video_frames"] = self.num_frames
260
+ data["image_only_indicator"] = torch.as_tensor([0.0] * self.num_frames)
261
+
262
+ if self.load_pixelnerf:
263
+ # TODO: normalize camera poses
264
+ data["pixelnerf_input"] = dict()
265
+ data["pixelnerf_input"]["frames"] = frames
266
+ data["pixelnerf_input"]["rgb"] = torch.stack(downsampled_rgb)
267
+
268
+ cameras = torch.from_numpy(np.stack(cameras)).float()
269
+ if self.scale_pose:
270
+ c2ws = cameras[..., :16].reshape(-1, 4, 4)
271
+ center = c2ws[:, :3, 3].mean(0)
272
+ radius = (c2ws[:, :3, 3] - center).norm(dim=-1).max()
273
+ scale = 1.5 / radius
274
+ c2ws[..., :3, 3] = (c2ws[..., :3, 3] - center) * scale
275
+ cameras[..., :16] = c2ws.reshape(-1, 16)
276
+
277
+ # if self.max_n_cond > 1:
278
+ # # TODO implement this
279
+ # n_cond = np.random.randint(1, self.max_n_cond + 1)
280
+ # # debug
281
+ # source_index = [0]
282
+ # if n_cond > 1:
283
+ # source_index += np.random.choice(
284
+ # np.arange(1, self.num_frames),
285
+ # self.max_n_cond - 1,
286
+ # replace=False,
287
+ # ).tolist()
288
+ # data["pixelnerf_input"]["source_index"] = torch.as_tensor(
289
+ # source_index
290
+ # )
291
+ # data["pixelnerf_input"]["n_cond"] = n_cond
292
+ # data["pixelnerf_input"]["source_images"] = frames[source_index]
293
+ # data["pixelnerf_input"]["source_cameras"] = cameras[source_index]
294
+
295
+ data["pixelnerf_input"]["cameras"] = cameras
296
+
297
+ return data
298
+
299
+ def __len__(self):
300
+ return len(self.ids)
301
+
302
+ def collate_fn(self, batch):
303
+ # a hack to add source index and keep consistent within a batch
304
+ if self.max_n_cond > 1:
305
+ # TODO implement this
306
+ n_cond = np.random.randint(self.min_n_cond, self.max_n_cond + 1)
307
+ # debug
308
+ # source_index = [0]
309
+ if n_cond > 1:
310
+ for b in batch:
311
+ source_index = [0] + np.random.choice(
312
+ np.arange(1, self.num_frames),
313
+ self.max_n_cond - 1,
314
+ replace=False,
315
+ ).tolist()
316
+ b["pixelnerf_input"]["source_index"] = torch.as_tensor(source_index)
317
+ b["pixelnerf_input"]["n_cond"] = n_cond
318
+ b["pixelnerf_input"]["source_images"] = b["frames"][source_index]
319
+ b["pixelnerf_input"]["source_cameras"] = b["pixelnerf_input"][
320
+ "cameras"
321
+ ][source_index]
322
+
323
+ if self.cond_on_multi:
324
+ b["cond_frames_without_noise"] = b["frames"][source_index]
325
+
326
+ ret = video_collate_fn(batch)
327
+
328
+ if self.cond_on_multi:
329
+ ret["cond_frames_without_noise"] = rearrange(ret["cond_frames_without_noise"], "b t ... -> (b t) ...")
330
+
331
+ return ret
332
+
333
+
334
+ class MVImageNetFixedCond(MVImageNet):
335
+ def __init__(self, *args, **kwargs):
336
+ super().__init__(*args, **kwargs)
337
+
338
+
339
+ class MVImageNetDataset(LightningDataModule):
340
+ def __init__(
341
+ self,
342
+ root_dir,
343
+ batch_size=2,
344
+ shuffle=True,
345
+ num_workers=10,
346
+ prefetch_factor=2,
347
+ **kwargs,
348
+ ):
349
+ super().__init__()
350
+
351
+ self.batch_size = batch_size
352
+ self.num_workers = num_workers
353
+ self.prefetch_factor = prefetch_factor
354
+ self.shuffle = shuffle
355
+
356
+ self.transform = Compose(
357
+ [
358
+ ToTensor(),
359
+ Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
360
+ ]
361
+ )
362
+
363
+ self.train_dataset = MVImageNet(
364
+ root_dir=root_dir,
365
+ split="train",
366
+ transform=self.transform,
367
+ **kwargs,
368
+ )
369
+
370
+ self.test_dataset = MVImageNet(
371
+ root_dir=root_dir,
372
+ split="test",
373
+ transform=self.transform,
374
+ **kwargs,
375
+ )
376
+
377
+ def train_dataloader(self):
378
+ def worker_init_fn(worker_id):
379
+ np.random.seed(np.random.get_state()[1][0])
380
+
381
+ return DataLoader(
382
+ self.train_dataset,
383
+ batch_size=self.batch_size,
384
+ shuffle=self.shuffle,
385
+ num_workers=self.num_workers,
386
+ prefetch_factor=self.prefetch_factor,
387
+ collate_fn=self.train_dataset.collate_fn,
388
+ )
389
+
390
+ def test_dataloader(self):
391
+ return DataLoader(
392
+ self.test_dataset,
393
+ batch_size=self.batch_size,
394
+ shuffle=self.shuffle,
395
+ num_workers=self.num_workers,
396
+ prefetch_factor=self.prefetch_factor,
397
+ collate_fn=self.test_dataset.collate_fn,
398
+ )
399
+
400
+ def val_dataloader(self):
401
+ return DataLoader(
402
+ self.test_dataset,
403
+ batch_size=self.batch_size,
404
+ shuffle=self.shuffle,
405
+ num_workers=self.num_workers,
406
+ prefetch_factor=self.prefetch_factor,
407
+ collate_fn=video_collate_fn,
408
+ )
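A small sanity check for the COLMAP-to-OpenGL conversion defined above: for the identity quaternion and zero translation, qt2c2w should return a camera-to-world matrix whose rotation is the identity with the y and z columns flipped and whose translation stays at the origin (the import path assumes this repo's layout):

import numpy as np
from sgm.data.mvimagenet import qt2c2w

q = np.array([1.0, 0.0, 0.0, 0.0])  # COLMAP qvec order: w, x, y, z
t = np.zeros(3)
c2w = qt2c2w(q, t)
assert np.allclose(c2w[:3, 3], 0.0)
assert np.allclose(c2w[:3, :3], np.diag([1.0, -1.0, -1.0]))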
sgm/data/objaverse.py ADDED
@@ -0,0 +1,882 @@
1
+ import numpy as np
2
+ from pathlib import Path
3
+ from PIL import Image
4
+ import json
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch.utils.data import Dataset, DataLoader, default_collate
8
+ from torchvision.transforms import ToTensor, Normalize, Compose, Resize
9
+ from torchvision.transforms.functional import to_tensor
10
+ from pytorch_lightning import LightningDataModule
11
+ from einops import rearrange
12
+
13
+
14
+ def read_camera_matrix_single(json_file):
15
+ # for gobjaverse
16
+ with open(json_file, "r", encoding="utf8") as reader:
17
+ json_content = json.load(reader)
18
+
19
+ # negative sign for opencv to opengl
20
+ camera_matrix = torch.zeros(3, 4)
21
+ camera_matrix[:3, 0] = torch.tensor(json_content["x"])
22
+ camera_matrix[:3, 1] = -torch.tensor(json_content["y"])
23
+ camera_matrix[:3, 2] = -torch.tensor(json_content["z"])
24
+ camera_matrix[:3, 3] = torch.tensor(json_content["origin"])
25
+ """
26
+ camera_matrix = np.eye(4)
27
+ camera_matrix[:3, 0] = np.array(json_content['x'])
28
+ camera_matrix[:3, 1] = np.array(json_content['y'])
29
+ camera_matrix[:3, 2] = np.array(json_content['z'])
30
+ camera_matrix[:3, 3] = np.array(json_content['origin'])
31
+ # print(camera_matrix)
32
+ """
33
+
34
+ return camera_matrix
35
+
36
+
37
+ def read_camera_instrinsics_single(json_file, h: int, w: int, scale: float = 1.0):
38
+ with open(json_file, "r", encoding="utf8") as reader:
39
+ json_content = json.load(reader)
40
+
41
+ h = int(h * scale)
42
+ w = int(w * scale)
43
+
44
+ y_fov = json_content["y_fov"]
45
+ x_fov = json_content["x_fov"]
46
+
47
+ fy = h / 2 / np.tan(y_fov / 2)
48
+ fx = w / 2 / np.tan(x_fov / 2)
49
+
50
+ cx = w // 2
51
+ cy = h // 2
52
+
53
+ intrinsics = torch.tensor(
54
+ [
55
+ [fx, fy],
56
+ [cx, cy],
57
+ [w, h],
58
+ ],
59
+ dtype=torch.float32,
60
+ )
61
+ return intrinsics
62
+
63
+
64
+ def compose_extrinsic_RT(RT: torch.Tensor):
65
+ """
66
+ Compose the standard form extrinsic matrix from RT.
67
+ Batched I/O.
68
+ """
69
+ return torch.cat(
70
+ [
71
+ RT,
72
+ torch.tensor([[[0, 0, 0, 1]]], dtype=torch.float32).repeat(
73
+ RT.shape[0], 1, 1
74
+ ),
75
+ ],
76
+ dim=1,
77
+ )
78
+
79
+
80
+ def get_normalized_camera_intrinsics(intrinsics: torch.Tensor):
81
+ """
82
+ intrinsics: (N, 3, 2), [[fx, fy], [cx, cy], [width, height]]
83
+ Return batched fx, fy, cx, cy
84
+ """
85
+ fx, fy = intrinsics[:, 0, 0], intrinsics[:, 0, 1]
86
+ cx, cy = intrinsics[:, 1, 0], intrinsics[:, 1, 1]
87
+ width, height = intrinsics[:, 2, 0], intrinsics[:, 2, 1]
88
+ fx, fy = fx / width, fy / height
89
+ cx, cy = cx / width, cy / height
90
+ return fx, fy, cx, cy
91
+
92
+
93
+ def build_camera_standard(RT: torch.Tensor, intrinsics: torch.Tensor):
94
+ """
95
+ RT: (N, 3, 4)
96
+ intrinsics: (N, 3, 2), [[fx, fy], [cx, cy], [width, height]]
97
+ """
98
+ E = compose_extrinsic_RT(RT)
99
+ fx, fy, cx, cy = get_normalized_camera_intrinsics(intrinsics)
100
+ I = torch.stack(
101
+ [
102
+ torch.stack([fx, torch.zeros_like(fx), cx], dim=-1),
103
+ torch.stack([torch.zeros_like(fy), fy, cy], dim=-1),
104
+ torch.tensor([[0, 0, 1]], dtype=torch.float32).repeat(RT.shape[0], 1),
105
+ ],
106
+ dim=1,
107
+ )
108
+ return torch.cat(
109
+ [
110
+ E.reshape(-1, 16),
111
+ I.reshape(-1, 9),
112
+ ],
113
+ dim=-1,
114
+ )
115
+
116
+
117
+ def calc_elevation(c2w):
118
+ ## works for single or batched c2w
119
+ ## assume world up is (0, 0, 1)
120
+ pos = c2w[..., :3, 3]
121
+
122
+ return np.arcsin(pos[..., 2] / np.linalg.norm(pos, axis=-1, keepdims=False))
123
+
124
+
125
+ def read_camera_matrix_single(json_file):
126
+ with open(json_file, "r", encoding="utf8") as reader:
127
+ json_content = json.load(reader)
128
+
129
+ # negative sign for opencv to opengl
130
+ # camera_matrix = np.zeros([3, 4])
131
+ # camera_matrix[:3, 0] = np.array(json_content["x"])
132
+ # camera_matrix[:3, 1] = -np.array(json_content["y"])
133
+ # camera_matrix[:3, 2] = -np.array(json_content["z"])
134
+ # camera_matrix[:3, 3] = np.array(json_content["origin"])
135
+ camera_matrix = torch.zeros([3, 4])
136
+ camera_matrix[:3, 0] = torch.tensor(json_content["x"])
137
+ camera_matrix[:3, 1] = -torch.tensor(json_content["y"])
138
+ camera_matrix[:3, 2] = -torch.tensor(json_content["z"])
139
+ camera_matrix[:3, 3] = torch.tensor(json_content["origin"])
140
+ """
141
+ camera_matrix = np.eye(4)
142
+ camera_matrix[:3, 0] = np.array(json_content['x'])
143
+ camera_matrix[:3, 1] = np.array(json_content['y'])
144
+ camera_matrix[:3, 2] = np.array(json_content['z'])
145
+ camera_matrix[:3, 3] = np.array(json_content['origin'])
146
+ # print(camera_matrix)
147
+ """
148
+
149
+ return camera_matrix
150
+
151
+
152
+ def blend_white_bg(image):
153
+ new_image = Image.new("RGB", image.size, (255, 255, 255))
154
+ new_image.paste(image, mask=image.split()[3])
155
+
156
+ return new_image
157
+
158
+
159
+ def flatten_for_video(input):
160
+ return input.flatten()
161
+
162
+
163
+ FLATTEN_FIELDS = ["fps_id", "motion_bucket_id", "cond_aug", "elevation"]
164
+
165
+
166
+ def video_collate_fn(batch: list[dict], *args, **kwargs):
167
+ out = {}
168
+ for key in batch[0].keys():
169
+ if key in FLATTEN_FIELDS:
170
+ out[key] = default_collate([item[key] for item in batch])
171
+ out[key] = flatten_for_video(out[key])
172
+ elif key == "num_video_frames":
173
+ out[key] = batch[0][key]
174
+ elif key in ["frames", "latents", "rgb"]:
175
+ out[key] = default_collate([item[key] for item in batch])
176
+ out[key] = rearrange(out[key], "b t c h w -> (b t) c h w")
177
+ else:
178
+ out[key] = default_collate([item[key] for item in batch])
179
+
180
+ if "pixelnerf_input" in out:
181
+ out["pixelnerf_input"]["rgb"] = rearrange(
182
+ out["pixelnerf_input"]["rgb"], "b t c h w -> (b t) c h w"
183
+ )
184
+
185
+ return out
186
+
187
+
188
+ class GObjaverse(Dataset):
189
+ def __init__(
190
+ self,
191
+ root_dir,
192
+ split="train",
193
+ transform=None,
194
+ random_front=False,
195
+ max_item=None,
196
+ cond_aug_mean=-3.0,
197
+ cond_aug_std=0.5,
198
+ condition_on_elevation=False,
199
+ fps_id=0.0,
200
+ motion_bucket_id=300.0,
201
+ use_latents=False,
202
+ load_caps=False,
203
+ front_view_selection="random",
204
+ load_pixelnerf=False,
205
+ debug_base_idx=None,
206
+ scale_pose: bool = False,
207
+ max_n_cond: int = 1,
208
+ **unused_kwargs,
209
+ ):
210
+ self.root_dir = Path(root_dir)
211
+ self.split = split
212
+ self.random_front = random_front
213
+ self.transform = transform
214
+ self.use_latents = use_latents
215
+
216
+ self.ids = json.load(open(self.root_dir / "valid_uids.json", "r"))
217
+ self.n_views = 24
218
+
219
+ self.load_caps = load_caps
220
+ if self.load_caps:
221
+ self.caps = json.load(open(self.root_dir / "text_captions_cap3d.json", "r"))
222
+
223
+ self.cond_aug_mean = cond_aug_mean
224
+ self.cond_aug_std = cond_aug_std
225
+ self.condition_on_elevation = condition_on_elevation
226
+ self.fps_id = fps_id
227
+ self.motion_bucket_id = motion_bucket_id
228
+ self.load_pixelnerf = load_pixelnerf
229
+ self.scale_pose = scale_pose
230
+ self.max_n_cond = max_n_cond
231
+
232
+ if self.use_latents:
233
+ self.latents_dir = self.root_dir / "latents256"
234
+ self.clip_dir = self.root_dir / "clip_emb256"
235
+
236
+ self.front_view_selection = front_view_selection
237
+ if self.front_view_selection == "random":
238
+ pass
239
+ elif self.front_view_selection == "fixed":
240
+ pass
241
+ elif self.front_view_selection.startswith("clip_score"):
242
+ self.clip_scores = torch.load(self.root_dir / "clip_score_per_view.pt")
243
+ self.ids = list(self.clip_scores.keys())
244
+ else:
245
+ raise ValueError(
246
+ f"Unknown front view selection method {self.front_view_selection}"
247
+ )
248
+
249
+ if max_item is not None:
250
+ self.ids = self.ids[:max_item]
251
+ ## debug
252
+ self.ids = self.ids * 10000
253
+
254
+ if debug_base_idx is not None:
255
+ print(f"debug mode with base idx: {debug_base_idx}")
256
+ self.debug_base_idx = debug_base_idx
257
+
258
+ def __getitem__(self, idx: int):
259
+ if hasattr(self, "debug_base_idx"):
260
+ idx = (idx + self.debug_base_idx) % len(self.ids)
261
+ data = {}
262
+ idx_list = np.arange(self.n_views)
263
+ # if self.random_front:
264
+ # roll_idx = np.random.randint(self.n_views)
265
+ # idx_list = np.roll(idx_list, roll_idx)
266
+ if self.front_view_selection == "random":
267
+ roll_idx = np.random.randint(self.n_views)
268
+ idx_list = np.roll(idx_list, roll_idx)
269
+ elif self.front_view_selection == "fixed":
270
+ pass
271
+ elif self.front_view_selection == "clip_score_softmax":
272
+ this_clip_score = (
273
+ F.softmax(self.clip_scores[self.ids[idx]], dim=-1).cpu().numpy()
274
+ )
275
+ roll_idx = np.random.choice(idx_list, p=this_clip_score)
276
+ idx_list = np.roll(idx_list, roll_idx)
277
+ elif self.front_view_selection == "clip_score_max":
278
+ this_clip_score = (
279
+ F.softmax(self.clip_scores[self.ids[idx]], dim=-1).cpu().numpy()
280
+ )
281
+ roll_idx = np.argmax(this_clip_score)
282
+ idx_list = np.roll(idx_list, roll_idx)
283
+ frames = []
284
+ if not self.use_latents:
285
+ try:
286
+ for view_idx in idx_list:
287
+ frame = Image.open(
288
+ self.root_dir
289
+ / "gobjaverse"
290
+ / self.ids[idx]
291
+ / f"{view_idx:05d}/{view_idx:05d}.png"
292
+ )
293
+ frames.append(self.transform(frame))
294
+ except Exception:
295
+ idx = 0
296
+ frames = []
297
+ for view_idx in idx_list:
298
+ frame = Image.open(
299
+ self.root_dir
300
+ / "gobjaverse"
301
+ / self.ids[idx]
302
+ / f"{view_idx:05d}/{view_idx:05d}.png"
303
+ )
304
+ frames.append(self.transform(frame))
305
+ # workaround for occasional broken items in gobjaverse:
306
+ # fall back to idx 0; the repeated item is resolved when gathering results, and the number of valid items can be checked via the length of the results
307
+ frames = torch.stack(frames, dim=0)
308
+ cond = frames[0]
309
+
310
+ cond_aug = np.exp(
311
+ np.random.randn(1)[0] * self.cond_aug_std + self.cond_aug_mean
312
+ )
313
+
314
+ data.update(
315
+ {
316
+ "frames": frames,
317
+ "cond_frames_without_noise": cond,
318
+ "cond_aug": torch.as_tensor([cond_aug] * self.n_views),
319
+ "cond_frames": cond + cond_aug * torch.randn_like(cond),
320
+ "fps_id": torch.as_tensor([self.fps_id] * self.n_views),
321
+ "motion_bucket_id": torch.as_tensor(
322
+ [self.motion_bucket_id] * self.n_views
323
+ ),
324
+ "num_video_frames": 24,
325
+ "image_only_indicator": torch.as_tensor([0.0] * self.n_views),
326
+ }
327
+ )
328
+ else:
329
+ latents = torch.load(self.latents_dir / f"{self.ids[idx]}.pt")[idx_list]
330
+ clip_emb = torch.load(self.clip_dir / f"{self.ids[idx]}.pt")[idx_list][0]
331
+
332
+ cond = latents[0]
333
+
334
+ cond_aug = np.exp(
335
+ np.random.randn(1)[0] * self.cond_aug_std + self.cond_aug_mean
336
+ )
337
+
338
+ data.update(
339
+ {
340
+ "latents": latents,
341
+ "cond_frames_without_noise": clip_emb,
342
+ "cond_aug": torch.as_tensor([cond_aug] * self.n_views),
343
+ "cond_frames": cond + cond_aug * torch.randn_like(cond),
344
+ "fps_id": torch.as_tensor([self.fps_id] * self.n_views),
345
+ "motion_bucket_id": torch.as_tensor(
346
+ [self.motion_bucket_id] * self.n_views
347
+ ),
348
+ "num_video_frames": 24,
349
+ "image_only_indicator": torch.as_tensor([0.0] * self.n_views),
350
+ }
351
+ )
352
+
353
+ if self.condition_on_elevation:
354
+ sample_c2w = read_camera_matrix_single(
355
+ self.root_dir / self.ids[idx] / f"00000/00000.json"
356
+ )
357
+ elevation = calc_elevation(sample_c2w)
358
+ data["elevation"] = torch.as_tensor([elevation] * self.n_views)
359
+
360
+ if self.load_pixelnerf:
361
+ assert "frames" in data, f"pixelnerf cannot work with latents only mode"
362
+ data["pixelnerf_input"] = {}
363
+ RTs = []
364
+ intrinsics = []
365
+ for view_idx in idx_list:
366
+ meta = (
367
+ self.root_dir
368
+ / "gobjaverse"
369
+ / self.ids[idx]
370
+ / f"{view_idx:05d}/{view_idx:05d}.json"
371
+ )
372
+ RTs.append(read_camera_matrix_single(meta)[:3])
373
+ intrinsics.append(read_camera_instrinsics_single(meta, 256, 256))
374
+ RTs = torch.stack(RTs, dim=0)
375
+ intrinsics = torch.stack(intrinsics, dim=0)
376
+ cameras = build_camera_standard(RTs, intrinsics)
377
+ data["pixelnerf_input"]["cameras"] = cameras
378
+
379
+ downsampled = []
380
+ for view_idx in idx_list:
381
+ frame = Image.open(
382
+ self.root_dir
383
+ / "gobjaverse"
384
+ / self.ids[idx]
385
+ / f"{view_idx:05d}/{view_idx:05d}.png"
386
+ ).resize((32, 32))
387
+ downsampled.append(to_tensor(blend_white_bg(frame)))
388
+ data["pixelnerf_input"]["rgb"] = torch.stack(downsampled, dim=0)
389
+ data["pixelnerf_input"]["frames"] = data["frames"]
390
+ if self.scale_pose:
391
+ c2ws = cameras[..., :16].reshape(-1, 4, 4)
392
+ center = c2ws[:, :3, 3].mean(0)
393
+ radius = (c2ws[:, :3, 3] - center).norm(dim=-1).max()
394
+ scale = 1.5 / radius
395
+ c2ws[..., :3, 3] = (c2ws[..., :3, 3] - center) * scale
396
+ cameras[..., :16] = c2ws.reshape(-1, 16)
397
+
398
+ if self.load_caps:
399
+ data["caption"] = self.caps[self.ids[idx]]
400
+ data["ids"] = self.ids[idx]
401
+
402
+ return data
403
+
404
+ def __len__(self):
405
+ return len(self.ids)
406
+
407
+ def collate_fn(self, batch):
408
+ if self.max_n_cond > 1:
409
+ n_cond = np.random.randint(1, self.max_n_cond + 1)
410
+ if n_cond > 1:
411
+ for b in batch:
412
+ source_index = [0] + np.random.choice(
413
+ np.arange(1, self.n_views),
414
+ self.max_n_cond - 1,
415
+ replace=False,
416
+ ).tolist()
417
+ b["pixelnerf_input"]["source_index"] = torch.as_tensor(source_index)
418
+ b["pixelnerf_input"]["n_cond"] = n_cond
419
+ b["pixelnerf_input"]["source_images"] = b["frames"][source_index]
420
+ b["pixelnerf_input"]["source_cameras"] = b["pixelnerf_input"][
421
+ "cameras"
422
+ ][source_index]
423
+
424
+ return video_collate_fn(batch)
425
+
426
+
427
+ class ObjaverseSpiral(Dataset):
428
+ def __init__(
429
+ self,
430
+ root_dir,
431
+ split="train",
432
+ transform=None,
433
+ random_front=False,
434
+ max_item=None,
435
+ cond_aug_mean=-3.0,
436
+ cond_aug_std=0.5,
437
+ condition_on_elevation=False,
438
+ **unused_kwargs,
439
+ ):
440
+ self.root_dir = Path(root_dir)
441
+ self.split = split
442
+ self.random_front = random_front
443
+ self.transform = transform
444
+
445
+ self.ids = json.load(open(self.root_dir / f"{split}_ids.json", "r"))
446
+ self.n_views = 24
447
+ valid_ids = []
448
+ for idx in self.ids:
449
+ if (self.root_dir / idx).exists():
450
+ valid_ids.append(idx)
451
+ self.ids = valid_ids
452
+
453
+ self.cond_aug_mean = cond_aug_mean
454
+ self.cond_aug_std = cond_aug_std
455
+ self.condition_on_elevation = condition_on_elevation
456
+
457
+ if max_item is not None:
458
+ self.ids = self.ids[:max_item]
459
+
460
+ ## debug
461
+ self.ids = self.ids * 10000
462
+
463
+ def __getitem__(self, idx: int):
464
+ frames = []
465
+ idx_list = np.arange(self.n_views)
466
+ if self.random_front:
467
+ roll_idx = np.random.randint(self.n_views)
468
+ idx_list = np.roll(idx_list, roll_idx)
469
+ for view_idx in idx_list:
470
+ frame = Image.open(
471
+ self.root_dir / self.ids[idx] / f"{view_idx:05d}/{view_idx:05d}.png"
472
+ )
473
+ frames.append(self.transform(frame))
474
+
475
+ # data = {"jpg": torch.stack(frames, dim=0)} # [T, C, H, W]
476
+ frames = torch.stack(frames, dim=0)
477
+ cond = frames[0]
478
+
479
+ cond_aug = np.exp(
480
+ np.random.randn(1)[0] * self.cond_aug_std + self.cond_aug_mean
481
+ )
482
+
483
+ data = {
484
+ "frames": frames,
485
+ "cond_frames_without_noise": cond,
486
+ "cond_aug": torch.as_tensor([cond_aug] * self.n_views),
487
+ "cond_frames": cond + cond_aug * torch.randn_like(cond),
488
+ "fps_id": torch.as_tensor([1.0] * self.n_views),
489
+ "motion_bucket_id": torch.as_tensor([300.0] * self.n_views),
490
+ "num_video_frames": 24,
491
+ "image_only_indicator": torch.as_tensor([0.0] * self.n_views),
492
+ }
493
+
494
+ if self.condition_on_elevation:
495
+ sample_c2w = read_camera_matrix_single(
496
+ self.root_dir / self.ids[idx] / f"00000/00000.json"
497
+ )
498
+ elevation = calc_elevation(sample_c2w)
499
+ data["elevation"] = torch.as_tensor([elevation] * self.n_views)
500
+
501
+ return data
502
+
503
+ def __len__(self):
504
+ return len(self.ids)
505
+
506
+
507
+ class ObjaverseLVISSpiral(Dataset):
508
+ def __init__(
509
+ self,
510
+ root_dir,
511
+ split="train",
512
+ transform=None,
513
+ random_front=False,
514
+ max_item=None,
515
+ cond_aug_mean=-3.0,
516
+ cond_aug_std=0.5,
517
+ condition_on_elevation=False,
518
+ use_precomputed_latents=False,
519
+ **unused_kwargs,
520
+ ):
521
+ print("Using LVIS subset")
522
+ self.root_dir = Path(root_dir)
523
+ self.latent_dir = Path("/mnt/vepfs/3Ddataset/render_results/latents512")
524
+ self.split = split
525
+ self.random_front = random_front
526
+ self.transform = transform
527
+ self.use_precomputed_latents = use_precomputed_latents
528
+
529
+ self.ids = json.load(open("./assets/lvis_uids.json", "r"))
530
+ self.n_views = 18
531
+ valid_ids = []
532
+ for idx in self.ids:
533
+ if (self.root_dir / idx).exists():
534
+ valid_ids.append(idx)
535
+ self.ids = valid_ids
536
+ print("=" * 30)
537
+ print("Number of valid ids: ", len(self.ids))
538
+ print("=" * 30)
539
+
540
+ self.cond_aug_mean = cond_aug_mean
541
+ self.cond_aug_std = cond_aug_std
542
+ self.condition_on_elevation = condition_on_elevation
543
+
544
+ if max_item is not None:
545
+ self.ids = self.ids[:max_item]
546
+
547
+ ## debug
548
+ self.ids = self.ids * 10000
549
+
550
+ def __getitem__(self, idx: int):
551
+ frames = []
552
+ idx_list = np.arange(self.n_views)
553
+ if self.random_front:
554
+ roll_idx = np.random.randint(self.n_views)
555
+ idx_list = np.roll(idx_list, roll_idx)
556
+ for view_idx in idx_list:
557
+ frame = Image.open(
558
+ self.root_dir
559
+ / self.ids[idx]
560
+ / "elevations_0"
561
+ / f"colors_{view_idx * 2}.png"
562
+ )
563
+ frames.append(self.transform(frame))
564
+
565
+ frames = torch.stack(frames, dim=0)
566
+ cond = frames[0]
567
+
568
+ cond_aug = np.exp(
569
+ np.random.randn(1)[0] * self.cond_aug_std + self.cond_aug_mean
570
+ )
571
+
572
+ data = {
573
+ "frames": frames,
574
+ "cond_frames_without_noise": cond,
575
+ "cond_aug": torch.as_tensor([cond_aug] * self.n_views),
576
+ "cond_frames": cond + cond_aug * torch.randn_like(cond),
577
+ "fps_id": torch.as_tensor([0.0] * self.n_views),
578
+ "motion_bucket_id": torch.as_tensor([300.0] * self.n_views),
579
+ "num_video_frames": self.n_views,
580
+ "image_only_indicator": torch.as_tensor([0.0] * self.n_views),
581
+ }
582
+
583
+ if self.use_precomputed_latents:
584
+ data["latents"] = torch.load(self.latent_dir / f"{self.ids[idx]}.pt")
585
+
586
+ if self.condition_on_elevation:
587
+ # sample_c2w = read_camera_matrix_single(
588
+ # self.root_dir / self.ids[idx] / f"00000/00000.json"
589
+ # )
590
+ # elevation = calc_elevation(sample_c2w)
591
+ # data["elevation"] = torch.as_tensor([elevation] * self.n_views)
592
+ assert False, "currently assumes elevation 0"
593
+
594
+ return data
595
+
596
+ def __len__(self):
597
+ return len(self.ids)
598
+
599
+
600
+ class ObjaverseALLSpiral(ObjaverseLVISSpiral):
601
+ def __init__(
602
+ self,
603
+ root_dir,
604
+ split="train",
605
+ transform=None,
606
+ random_front=False,
607
+ max_item=None,
608
+ cond_aug_mean=-3.0,
609
+ cond_aug_std=0.5,
610
+ condition_on_elevation=False,
611
+ use_precomputed_latents=False,
612
+ **unused_kwargs,
613
+ ):
614
+ print("Using ALL objects in Objaverse")
615
+ self.root_dir = Path(root_dir)
616
+ self.split = split
617
+ self.random_front = random_front
618
+ self.transform = transform
619
+ self.use_precomputed_latents = use_precomputed_latents
620
+ self.latent_dir = Path("/mnt/vepfs/3Ddataset/render_results/latents512")
621
+
622
+ self.ids = json.load(open("./assets/all_ids.json", "r"))
623
+ self.n_views = 18
624
+ valid_ids = []
625
+ for idx in self.ids:
626
+ if (self.root_dir / idx).exists() and (self.root_dir / idx).is_dir():
627
+ valid_ids.append(idx)
628
+ self.ids = valid_ids
629
+ print("=" * 30)
630
+ print("Number of valid ids: ", len(self.ids))
631
+ print("=" * 30)
632
+
633
+ self.cond_aug_mean = cond_aug_mean
634
+ self.cond_aug_std = cond_aug_std
635
+ self.condition_on_elevation = condition_on_elevation
636
+
637
+ if max_item is not None:
638
+ self.ids = self.ids[:max_item]
639
+
640
+ ## debug
641
+ self.ids = self.ids * 10000
642
+
643
+
644
+ class ObjaverseWithPose(Dataset):
645
+ def __init__(
646
+ self,
647
+ root_dir,
648
+ split="train",
649
+ transform=None,
650
+ random_front=False,
651
+ max_item=None,
652
+ cond_aug_mean=-3.0,
653
+ cond_aug_std=0.5,
654
+ condition_on_elevation=False,
655
+ use_precomputed_latents=False,
656
+ **unused_kwargs,
657
+ ):
658
+ print("Using Objaverse with poses")
659
+ self.root_dir = Path(root_dir)
660
+ self.split = split
661
+ self.random_front = random_front
662
+ self.transform = transform
663
+ self.use_precomputed_latents = use_precomputed_latents
664
+ self.latent_dir = Path("/mnt/vepfs/3Ddataset/render_results/latents512")
665
+
666
+ self.ids = json.load(open("./assets/all_ids.json", "r"))
667
+ self.n_views = 18
668
+ valid_ids = []
669
+ for idx in self.ids:
670
+ if (self.root_dir / idx).exists() and (self.root_dir / idx).is_dir():
671
+ valid_ids.append(idx)
672
+ self.ids = valid_ids
673
+ print("=" * 30)
674
+ print("Number of valid ids: ", len(self.ids))
675
+ print("=" * 30)
676
+
677
+ self.cond_aug_mean = cond_aug_mean
678
+ self.cond_aug_std = cond_aug_std
679
+ self.condition_on_elevation = condition_on_elevation
680
+
681
+ def __getitem__(self, idx: int):
682
+ frames = []
683
+ idx_list = np.arange(self.n_views)
684
+ if self.random_front:
685
+ roll_idx = np.random.randint(self.n_views)
686
+ idx_list = np.roll(idx_list, roll_idx)
687
+ for view_idx in idx_list:
688
+ frame = Image.open(
689
+ self.root_dir
690
+ / self.ids[idx]
691
+ / "elevations_0"
692
+ / f"colors_{view_idx * 2}.png"
693
+ )
694
+ frames.append(self.transform(frame))
695
+
696
+ frames = torch.stack(frames, dim=0)
697
+ cond = frames[0]
698
+
699
+ cond_aug = np.exp(
700
+ np.random.randn(1)[0] * self.cond_aug_std + self.cond_aug_mean
701
+ )
702
+
703
+ data = {
704
+ "frames": frames,
705
+ "cond_frames_without_noise": cond,
706
+ "cond_aug": torch.as_tensor([cond_aug] * self.n_views),
707
+ "cond_frames": cond + cond_aug * torch.randn_like(cond),
708
+ "fps_id": torch.as_tensor([0.0] * self.n_views),
709
+ "motion_bucket_id": torch.as_tensor([300.0] * self.n_views),
710
+ "num_video_frames": self.n_views,
711
+ "image_only_indicator": torch.as_tensor([0.0] * self.n_views),
712
+ }
713
+
714
+ if self.use_precomputed_latents:
715
+ data["latents"] = torch.load(self.latent_dir / f"{self.ids[idx]}.pt")
716
+
717
+ if self.condition_on_elevation:
718
+ assert False, "currently assumes elevation 0"
719
+
720
+ return data
721
+
722
+
723
+ class LatentObjaverse(Dataset):
724
+ def __init__(
725
+ self,
726
+ root_dir,
727
+ split="train",
728
+ random_front=False,
729
+ subset="lvis",
730
+ fps_id=1.0,
731
+ motion_bucket_id=300.0,
732
+ cond_aug_mean=-3.0,
733
+ cond_aug_std=0.5,
734
+ **unused_kwargs,
735
+ ):
736
+ self.root_dir = Path(root_dir)
737
+ self.split = split
738
+ self.random_front = random_front
739
+ self.ids = json.load(open(Path("./assets") / f"{subset}_ids.json", "r"))
740
+ self.clip_emb_dir = self.root_dir / ".." / "clip_emb512"
741
+ self.n_views = 18
742
+ self.fps_id = fps_id
743
+ self.motion_bucket_id = motion_bucket_id
744
+ self.cond_aug_mean = cond_aug_mean
745
+ self.cond_aug_std = cond_aug_std
746
+ if self.random_front:
747
+ print("Using a random view as front view")
748
+
749
+ valid_ids = []
750
+ for idx in self.ids:
751
+ if (self.root_dir / f"{idx}.pt").exists() and (
752
+ self.clip_emb_dir / f"{idx}.pt"
753
+ ).exists():
754
+ valid_ids.append(idx)
755
+ self.ids = valid_ids
756
+ print("=" * 30)
757
+ print("Number of valid ids: ", len(self.ids))
758
+ print("=" * 30)
759
+
760
+ def __getitem__(self, idx: int):
761
+ uid = self.ids[idx]
762
+ idx_list = torch.arange(self.n_views)
763
+ latents = torch.load(self.root_dir / f"{uid}.pt")
764
+ clip_emb = torch.load(self.clip_emb_dir / f"{uid}.pt")
765
+ if self.random_front:
766
+ idx_list = torch.roll(idx_list, np.random.randint(self.n_views))
767
+ latents = latents[idx_list]
768
+ clip_emb = clip_emb[idx_list][0]
769
+
770
+ cond_aug = np.exp(
771
+ np.random.randn(1)[0] * self.cond_aug_std + self.cond_aug_mean
772
+ )
773
+ cond = latents[0]
774
+
775
+ data = {
776
+ "latents": latents,
777
+ "cond_frames_without_noise": clip_emb,
778
+ "cond_frames": cond + cond_aug * torch.randn_like(cond),
779
+ "fps_id": torch.as_tensor([self.fps_id] * self.n_views),
780
+ "motion_bucket_id": torch.as_tensor([self.motion_bucket_id] * self.n_views),
781
+ "cond_aug": torch.as_tensor([cond_aug] * self.n_views),
782
+ "num_video_frames": self.n_views,
783
+ "image_only_indicator": torch.as_tensor([0.0] * self.n_views),
784
+ }
785
+
786
+ return data
787
+
788
+ def __len__(self):
789
+ return len(self.ids)
790
+
791
+
792
+ class ObjaverseSpiralDataset(LightningDataModule):
793
+ def __init__(
794
+ self,
795
+ root_dir,
796
+ random_front=False,
797
+ batch_size=2,
798
+ num_workers=10,
799
+ prefetch_factor=2,
800
+ shuffle=True,
801
+ max_item=None,
802
+ dataset_cls="richdreamer",
803
+ reso: int = 256,
804
+ **kwargs,
805
+ ) -> None:
806
+ super().__init__()
807
+
808
+ self.batch_size = batch_size
809
+ self.num_workers = num_workers
810
+ self.prefetch_factor = prefetch_factor
811
+ self.shuffle = shuffle
812
+ self.max_item = max_item
813
+
814
+ self.transform = Compose(
815
+ [
816
+ blend_white_bg,
817
+ Resize((reso, reso)),
818
+ ToTensor(),
819
+ Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
820
+ ]
821
+ )
822
+
823
+ data_cls = {
824
+ "richdreamer": ObjaverseSpiral,
825
+ "lvis": ObjaverseLVISSpiral,
826
+ "shengshu_all": ObjaverseALLSpiral,
827
+ "latent": LatentObjaverse,
828
+ "gobjaverse": GObjaverse,
829
+ }[dataset_cls]
830
+
831
+ self.train_dataset = data_cls(
832
+ root_dir=root_dir,
833
+ split="train",
834
+ random_front=random_front,
835
+ transform=self.transform,
836
+ max_item=self.max_item,
837
+ **kwargs,
838
+ )
839
+ self.test_dataset = data_cls(
840
+ root_dir=root_dir,
841
+ split="val",
842
+ random_front=random_front,
843
+ transform=self.transform,
844
+ max_item=self.max_item,
845
+ **kwargs,
846
+ )
847
+
848
+ def train_dataloader(self):
849
+ return DataLoader(
850
+ self.train_dataset,
851
+ batch_size=self.batch_size,
852
+ shuffle=self.shuffle,
853
+ num_workers=self.num_workers,
854
+ prefetch_factor=self.prefetch_factor,
855
+ collate_fn=video_collate_fn
856
+ if not hasattr(self.train_dataset, "collate_fn")
857
+ else self.train_dataset.collate_fn,
858
+ )
859
+
860
+ def test_dataloader(self):
861
+ return DataLoader(
862
+ self.test_dataset,
863
+ batch_size=self.batch_size,
864
+ shuffle=self.shuffle,
865
+ num_workers=self.num_workers,
866
+ prefetch_factor=self.prefetch_factor,
867
+ collate_fn=video_collate_fn
868
+ if not hasattr(self.test_dataset, "collate_fn")
869
+ else self.test_dataset.collate_fn,
870
+ )
871
+
872
+ def val_dataloader(self):
873
+ return DataLoader(
874
+ self.test_dataset,
875
+ batch_size=self.batch_size,
876
+ shuffle=self.shuffle,
877
+ num_workers=self.num_workers,
878
+ prefetch_factor=self.prefetch_factor,
879
+ collate_fn=video_collate_fn
880
+ if not hasattr(self.test_dataset, "collate_fn")
881
+ else self.test_dataset.collate_fn,
882
+ )
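For reference, the collation used throughout these datasets folds the view axis into the batch axis so that every frame is treated as a batch element downstream. A small self-contained sketch of that behaviour with dummy tensors (shapes are arbitrary):

```python
import torch
from einops import rearrange
from torch.utils.data import default_collate

# Two fake multi-view samples, 24 views each, mirroring the keys used above.
batch = [
    {"frames": torch.randn(24, 3, 64, 64), "fps_id": torch.zeros(24)}
    for _ in range(2)
]

out = {k: default_collate([b[k] for b in batch]) for k in batch[0]}
out["frames"] = rearrange(out["frames"], "b t c h w -> (b t) c h w")
out["fps_id"] = out["fps_id"].flatten()

print(out["frames"].shape)  # torch.Size([48, 3, 64, 64])
print(out["fps_id"].shape)  # torch.Size([48])
```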
sgm/inference/api.py ADDED
@@ -0,0 +1,385 @@
1
+ import pathlib
2
+ from dataclasses import asdict, dataclass
3
+ from enum import Enum
4
+ from typing import Optional
5
+
6
+ from omegaconf import OmegaConf
7
+
8
+ from sgm.inference.helpers import (Img2ImgDiscretizationWrapper, do_img2img,
9
+ do_sample)
10
+ from sgm.modules.diffusionmodules.sampling import (DPMPP2MSampler,
11
+ DPMPP2SAncestralSampler,
12
+ EulerAncestralSampler,
13
+ EulerEDMSampler,
14
+ HeunEDMSampler,
15
+ LinearMultistepSampler)
16
+ from sgm.util import load_model_from_config
17
+
18
+
19
+ class ModelArchitecture(str, Enum):
20
+ SD_2_1 = "stable-diffusion-v2-1"
21
+ SD_2_1_768 = "stable-diffusion-v2-1-768"
22
+ SDXL_V0_9_BASE = "stable-diffusion-xl-v0-9-base"
23
+ SDXL_V0_9_REFINER = "stable-diffusion-xl-v0-9-refiner"
24
+ SDXL_V1_BASE = "stable-diffusion-xl-v1-base"
25
+ SDXL_V1_REFINER = "stable-diffusion-xl-v1-refiner"
26
+
27
+
28
+ class Sampler(str, Enum):
29
+ EULER_EDM = "EulerEDMSampler"
30
+ HEUN_EDM = "HeunEDMSampler"
31
+ EULER_ANCESTRAL = "EulerAncestralSampler"
32
+ DPMPP2S_ANCESTRAL = "DPMPP2SAncestralSampler"
33
+ DPMPP2M = "DPMPP2MSampler"
34
+ LINEAR_MULTISTEP = "LinearMultistepSampler"
35
+
36
+
37
+ class Discretization(str, Enum):
38
+ LEGACY_DDPM = "LegacyDDPMDiscretization"
39
+ EDM = "EDMDiscretization"
40
+
41
+
42
+ class Guider(str, Enum):
43
+ VANILLA = "VanillaCFG"
44
+ IDENTITY = "IdentityGuider"
45
+
46
+
47
+ class Thresholder(str, Enum):
48
+ NONE = "None"
49
+
50
+
51
+ @dataclass
52
+ class SamplingParams:
53
+ width: int = 1024
54
+ height: int = 1024
55
+ steps: int = 50
56
+ sampler: Sampler = Sampler.DPMPP2M
57
+ discretization: Discretization = Discretization.LEGACY_DDPM
58
+ guider: Guider = Guider.VANILLA
59
+ thresholder: Thresholder = Thresholder.NONE
60
+ scale: float = 6.0
61
+ aesthetic_score: float = 5.0
62
+ negative_aesthetic_score: float = 5.0
63
+ img2img_strength: float = 1.0
64
+ orig_width: int = 1024
65
+ orig_height: int = 1024
66
+ crop_coords_top: int = 0
67
+ crop_coords_left: int = 0
68
+ sigma_min: float = 0.0292
69
+ sigma_max: float = 14.6146
70
+ rho: float = 3.0
71
+ s_churn: float = 0.0
72
+ s_tmin: float = 0.0
73
+ s_tmax: float = 999.0
74
+ s_noise: float = 1.0
75
+ eta: float = 1.0
76
+ order: int = 4
77
+
78
+
79
+ @dataclass
80
+ class SamplingSpec:
81
+ width: int
82
+ height: int
83
+ channels: int
84
+ factor: int
85
+ is_legacy: bool
86
+ config: str
87
+ ckpt: str
88
+ is_guided: bool
89
+
90
+
91
+ model_specs = {
92
+ ModelArchitecture.SD_2_1: SamplingSpec(
93
+ height=512,
94
+ width=512,
95
+ channels=4,
96
+ factor=8,
97
+ is_legacy=True,
98
+ config="sd_2_1.yaml",
99
+ ckpt="v2-1_512-ema-pruned.safetensors",
100
+ is_guided=True,
101
+ ),
102
+ ModelArchitecture.SD_2_1_768: SamplingSpec(
103
+ height=768,
104
+ width=768,
105
+ channels=4,
106
+ factor=8,
107
+ is_legacy=True,
108
+ config="sd_2_1_768.yaml",
109
+ ckpt="v2-1_768-ema-pruned.safetensors",
110
+ is_guided=True,
111
+ ),
112
+ ModelArchitecture.SDXL_V0_9_BASE: SamplingSpec(
113
+ height=1024,
114
+ width=1024,
115
+ channels=4,
116
+ factor=8,
117
+ is_legacy=False,
118
+ config="sd_xl_base.yaml",
119
+ ckpt="sd_xl_base_0.9.safetensors",
120
+ is_guided=True,
121
+ ),
122
+ ModelArchitecture.SDXL_V0_9_REFINER: SamplingSpec(
123
+ height=1024,
124
+ width=1024,
125
+ channels=4,
126
+ factor=8,
127
+ is_legacy=True,
128
+ config="sd_xl_refiner.yaml",
129
+ ckpt="sd_xl_refiner_0.9.safetensors",
130
+ is_guided=True,
131
+ ),
132
+ ModelArchitecture.SDXL_V1_BASE: SamplingSpec(
133
+ height=1024,
134
+ width=1024,
135
+ channels=4,
136
+ factor=8,
137
+ is_legacy=False,
138
+ config="sd_xl_base.yaml",
139
+ ckpt="sd_xl_base_1.0.safetensors",
140
+ is_guided=True,
141
+ ),
142
+ ModelArchitecture.SDXL_V1_REFINER: SamplingSpec(
143
+ height=1024,
144
+ width=1024,
145
+ channels=4,
146
+ factor=8,
147
+ is_legacy=True,
148
+ config="sd_xl_refiner.yaml",
149
+ ckpt="sd_xl_refiner_1.0.safetensors",
150
+ is_guided=True,
151
+ ),
152
+ }
153
+
154
+
155
+ class SamplingPipeline:
156
+ def __init__(
157
+ self,
158
+ model_id: ModelArchitecture,
159
+ model_path="checkpoints",
160
+ config_path="configs/inference",
161
+ device="cuda",
162
+ use_fp16=True,
163
+ ) -> None:
164
+ if model_id not in model_specs:
165
+ raise ValueError(f"Model {model_id} not supported")
166
+ self.model_id = model_id
167
+ self.specs = model_specs[self.model_id]
168
+ self.config = str(pathlib.Path(config_path, self.specs.config))
169
+ self.ckpt = str(pathlib.Path(model_path, self.specs.ckpt))
170
+ self.device = device
171
+ self.model = self._load_model(device=device, use_fp16=use_fp16)
172
+
173
+ def _load_model(self, device="cuda", use_fp16=True):
174
+ config = OmegaConf.load(self.config)
175
+ model = load_model_from_config(config, self.ckpt)
176
+ if model is None:
177
+ raise ValueError(f"Model {self.model_id} could not be loaded")
178
+ model.to(device)
179
+ if use_fp16:
180
+ model.conditioner.half()
181
+ model.model.half()
182
+ return model
183
+
184
+ def text_to_image(
185
+ self,
186
+ params: SamplingParams,
187
+ prompt: str,
188
+ negative_prompt: str = "",
189
+ samples: int = 1,
190
+ return_latents: bool = False,
191
+ ):
192
+ sampler = get_sampler_config(params)
193
+ value_dict = asdict(params)
194
+ value_dict["prompt"] = prompt
195
+ value_dict["negative_prompt"] = negative_prompt
196
+ value_dict["target_width"] = params.width
197
+ value_dict["target_height"] = params.height
198
+ return do_sample(
199
+ self.model,
200
+ sampler,
201
+ value_dict,
202
+ samples,
203
+ params.height,
204
+ params.width,
205
+ self.specs.channels,
206
+ self.specs.factor,
207
+ force_uc_zero_embeddings=["txt"] if not self.specs.is_legacy else [],
208
+ return_latents=return_latents,
209
+ filter=None,
210
+ )
211
+
212
+ def image_to_image(
213
+ self,
214
+ params: SamplingParams,
215
+ image,
216
+ prompt: str,
217
+ negative_prompt: str = "",
218
+ samples: int = 1,
219
+ return_latents: bool = False,
220
+ ):
221
+ sampler = get_sampler_config(params)
222
+
223
+ if params.img2img_strength < 1.0:
224
+ sampler.discretization = Img2ImgDiscretizationWrapper(
225
+ sampler.discretization,
226
+ strength=params.img2img_strength,
227
+ )
228
+ height, width = image.shape[2], image.shape[3]
229
+ value_dict = asdict(params)
230
+ value_dict["prompt"] = prompt
231
+ value_dict["negative_prompt"] = negative_prompt
232
+ value_dict["target_width"] = width
233
+ value_dict["target_height"] = height
234
+ return do_img2img(
235
+ image,
236
+ self.model,
237
+ sampler,
238
+ value_dict,
239
+ samples,
240
+ force_uc_zero_embeddings=["txt"] if not self.specs.is_legacy else [],
241
+ return_latents=return_latents,
242
+ filter=None,
243
+ )
244
+
245
+ def refiner(
246
+ self,
247
+ params: SamplingParams,
248
+ image,
249
+ prompt: str,
250
+ negative_prompt: Optional[str] = None,
251
+ samples: int = 1,
252
+ return_latents: bool = False,
253
+ ):
254
+ sampler = get_sampler_config(params)
255
+ value_dict = {
256
+ "orig_width": image.shape[3] * 8,
257
+ "orig_height": image.shape[2] * 8,
258
+ "target_width": image.shape[3] * 8,
259
+ "target_height": image.shape[2] * 8,
260
+ "prompt": prompt,
261
+ "negative_prompt": negative_prompt,
262
+ "crop_coords_top": 0,
263
+ "crop_coords_left": 0,
264
+ "aesthetic_score": 6.0,
265
+ "negative_aesthetic_score": 2.5,
266
+ }
267
+
268
+ return do_img2img(
269
+ image,
270
+ self.model,
271
+ sampler,
272
+ value_dict,
273
+ samples,
274
+ skip_encode=True,
275
+ return_latents=return_latents,
276
+ filter=None,
277
+ )
278
+
279
+
280
+ def get_guider_config(params: SamplingParams):
281
+ if params.guider == Guider.IDENTITY:
282
+ guider_config = {
283
+ "target": "sgm.modules.diffusionmodules.guiders.IdentityGuider"
284
+ }
285
+ elif params.guider == Guider.VANILLA:
286
+ scale = params.scale
287
+
288
+ thresholder = params.thresholder
289
+
290
+ if thresholder == Thresholder.NONE:
291
+ dyn_thresh_config = {
292
+ "target": "sgm.modules.diffusionmodules.sampling_utils.NoDynamicThresholding"
293
+ }
294
+ else:
295
+ raise NotImplementedError
296
+
297
+ guider_config = {
298
+ "target": "sgm.modules.diffusionmodules.guiders.VanillaCFG",
299
+ "params": {"scale": scale, "dyn_thresh_config": dyn_thresh_config},
300
+ }
301
+ else:
302
+ raise NotImplementedError
303
+ return guider_config
304
+
305
+
306
+ def get_discretization_config(params: SamplingParams):
307
+ if params.discretization == Discretization.LEGACY_DDPM:
308
+ discretization_config = {
309
+ "target": "sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization",
310
+ }
311
+ elif params.discretization == Discretization.EDM:
312
+ discretization_config = {
313
+ "target": "sgm.modules.diffusionmodules.discretizer.EDMDiscretization",
314
+ "params": {
315
+ "sigma_min": params.sigma_min,
316
+ "sigma_max": params.sigma_max,
317
+ "rho": params.rho,
318
+ },
319
+ }
320
+ else:
321
+ raise ValueError(f"unknown discretization {params.discretization}")
322
+ return discretization_config
323
+
324
+
325
+ def get_sampler_config(params: SamplingParams):
326
+ discretization_config = get_discretization_config(params)
327
+ guider_config = get_guider_config(params)
328
+ sampler = None
329
+ if params.sampler == Sampler.EULER_EDM:
330
+ return EulerEDMSampler(
331
+ num_steps=params.steps,
332
+ discretization_config=discretization_config,
333
+ guider_config=guider_config,
334
+ s_churn=params.s_churn,
335
+ s_tmin=params.s_tmin,
336
+ s_tmax=params.s_tmax,
337
+ s_noise=params.s_noise,
338
+ verbose=True,
339
+ )
340
+ if params.sampler == Sampler.HEUN_EDM:
341
+ return HeunEDMSampler(
342
+ num_steps=params.steps,
343
+ discretization_config=discretization_config,
344
+ guider_config=guider_config,
345
+ s_churn=params.s_churn,
346
+ s_tmin=params.s_tmin,
347
+ s_tmax=params.s_tmax,
348
+ s_noise=params.s_noise,
349
+ verbose=True,
350
+ )
351
+ if params.sampler == Sampler.EULER_ANCESTRAL:
352
+ return EulerAncestralSampler(
353
+ num_steps=params.steps,
354
+ discretization_config=discretization_config,
355
+ guider_config=guider_config,
356
+ eta=params.eta,
357
+ s_noise=params.s_noise,
358
+ verbose=True,
359
+ )
360
+ if params.sampler == Sampler.DPMPP2S_ANCESTRAL:
361
+ return DPMPP2SAncestralSampler(
362
+ num_steps=params.steps,
363
+ discretization_config=discretization_config,
364
+ guider_config=guider_config,
365
+ eta=params.eta,
366
+ s_noise=params.s_noise,
367
+ verbose=True,
368
+ )
369
+ if params.sampler == Sampler.DPMPP2M:
370
+ return DPMPP2MSampler(
371
+ num_steps=params.steps,
372
+ discretization_config=discretization_config,
373
+ guider_config=guider_config,
374
+ verbose=True,
375
+ )
376
+ if params.sampler == Sampler.LINEAR_MULTISTEP:
377
+ return LinearMultistepSampler(
378
+ num_steps=params.steps,
379
+ discretization_config=discretization_config,
380
+ guider_config=guider_config,
381
+ order=params.order,
382
+ verbose=True,
383
+ )
384
+
385
+ raise ValueError(f"unknown sampler {params.sampler}!")
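A hedged usage sketch of the API above, assuming the `sd_xl_base.yaml` config and `sd_xl_base_1.0.safetensors` checkpoint referenced in `model_specs` are actually present under `configs/inference` and `checkpoints` (prompt and settings are illustrative):

```python
from sgm.inference.api import (ModelArchitecture, Sampler, SamplingParams,
                               SamplingPipeline)

# Loads the SDXL base model and samples one image from a text prompt.
pipeline = SamplingPipeline(ModelArchitecture.SDXL_V1_BASE)
params = SamplingParams(
    width=1024,
    height=1024,
    steps=30,
    sampler=Sampler.EULER_EDM,
    scale=6.0,
)
samples = pipeline.text_to_image(params, prompt="a studio photo of a sneaker")
print(samples.shape)  # (1, 3, 1024, 1024), values in [0, 1]
```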
sgm/inference/helpers.py ADDED
@@ -0,0 +1,305 @@
1
+ import math
2
+ import os
3
+ from typing import List, Optional, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from einops import rearrange
8
+ from imwatermark import WatermarkEncoder
9
+ from omegaconf import ListConfig
10
+ from PIL import Image
11
+ from torch import autocast
12
+
13
+ from sgm.util import append_dims
14
+
15
+
16
+ class WatermarkEmbedder:
17
+ def __init__(self, watermark):
18
+ self.watermark = watermark
19
+ self.num_bits = len(WATERMARK_BITS)
20
+ self.encoder = WatermarkEncoder()
21
+ self.encoder.set_watermark("bits", self.watermark)
22
+
23
+ def __call__(self, image: torch.Tensor) -> torch.Tensor:
24
+ """
25
+ Adds a predefined watermark to the input image
26
+
27
+ Args:
28
+ image: ([N,] B, RGB, H, W) in range [0, 1]
29
+
30
+ Returns:
31
+ same as input but watermarked
32
+ """
33
+ squeeze = len(image.shape) == 4
34
+ if squeeze:
35
+ image = image[None, ...]
36
+ n = image.shape[0]
37
+ image_np = rearrange(
38
+ (255 * image).detach().cpu(), "n b c h w -> (n b) h w c"
39
+ ).numpy()[:, :, :, ::-1]
40
+ # torch (b, c, h, w) in [0, 1] -> numpy (b, h, w, c) [0, 255]
41
+ # watermarking library expects input as cv2 BGR format
42
+ for k in range(image_np.shape[0]):
43
+ image_np[k] = self.encoder.encode(image_np[k], "dwtDct")
44
+ image = torch.from_numpy(
45
+ rearrange(image_np[:, :, :, ::-1], "(n b) h w c -> n b c h w", n=n)
46
+ ).to(image.device)
47
+ image = torch.clamp(image / 255, min=0.0, max=1.0)
48
+ if squeeze:
49
+ image = image[0]
50
+ return image
51
+
52
+
53
+ # A fixed 48-bit message that was chosen at random
54
+ # WATERMARK_MESSAGE = 0xB3EC907BB19E
55
+ WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110
56
+ # bin(x)[2:] gives bits of x as str, use int to convert them to 0/1
57
+ WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]]
58
+ embed_watermark = WatermarkEmbedder(WATERMARK_BITS)
59
+
60
+
61
+ def get_unique_embedder_keys_from_conditioner(conditioner):
62
+ return list({x.input_key for x in conditioner.embedders})
63
+
64
+
65
+ def perform_save_locally(save_path, samples):
66
+ os.makedirs(os.path.join(save_path), exist_ok=True)
67
+ base_count = len(os.listdir(os.path.join(save_path)))
68
+ samples = embed_watermark(samples)
69
+ for sample in samples:
70
+ sample = 255.0 * rearrange(sample.cpu().numpy(), "c h w -> h w c")
71
+ Image.fromarray(sample.astype(np.uint8)).save(
72
+ os.path.join(save_path, f"{base_count:09}.png")
73
+ )
74
+ base_count += 1
75
+
76
+
77
+ class Img2ImgDiscretizationWrapper:
78
+ """
79
+ wraps a discretizer, and prunes the sigmas
80
+ params:
81
+ strength: float between 0.0 and 1.0. 1.0 means full sampling (all sigmas are returned)
82
+ """
83
+
84
+ def __init__(self, discretization, strength: float = 1.0):
85
+ self.discretization = discretization
86
+ self.strength = strength
87
+ assert 0.0 <= self.strength <= 1.0
88
+
89
+ def __call__(self, *args, **kwargs):
90
+ # sigmas start large first, and decrease then
91
+ sigmas = self.discretization(*args, **kwargs)
92
+ print(f"sigmas after discretization, before pruning img2img: ", sigmas)
93
+ sigmas = torch.flip(sigmas, (0,))
94
+ sigmas = sigmas[: max(int(self.strength * len(sigmas)), 1)]
95
+ print("prune index:", max(int(self.strength * len(sigmas)), 1))
96
+ sigmas = torch.flip(sigmas, (0,))
97
+ print(f"sigmas after pruning: ", sigmas)
98
+ return sigmas
99
+
100
+
101
+ def do_sample(
102
+ model,
103
+ sampler,
104
+ value_dict,
105
+ num_samples,
106
+ H,
107
+ W,
108
+ C,
109
+ F,
110
+ force_uc_zero_embeddings: Optional[List] = None,
111
+ batch2model_input: Optional[List] = None,
112
+ return_latents=False,
113
+ filter=None,
114
+ device="cuda",
115
+ ):
116
+ if force_uc_zero_embeddings is None:
117
+ force_uc_zero_embeddings = []
118
+ if batch2model_input is None:
119
+ batch2model_input = []
120
+
121
+ with torch.no_grad():
122
+ with autocast(device) as precision_scope:
123
+ with model.ema_scope():
124
+ num_samples = [num_samples]
125
+ batch, batch_uc = get_batch(
126
+ get_unique_embedder_keys_from_conditioner(model.conditioner),
127
+ value_dict,
128
+ num_samples,
129
+ )
130
+ for key in batch:
131
+ if isinstance(batch[key], torch.Tensor):
132
+ print(key, batch[key].shape)
133
+ elif isinstance(batch[key], list):
134
+ print(key, [len(l) for l in batch[key]])
135
+ else:
136
+ print(key, batch[key])
137
+ c, uc = model.conditioner.get_unconditional_conditioning(
138
+ batch,
139
+ batch_uc=batch_uc,
140
+ force_uc_zero_embeddings=force_uc_zero_embeddings,
141
+ )
142
+
143
+ for k in c:
144
+ if not k == "crossattn":
145
+ c[k], uc[k] = map(
146
+ lambda y: y[k][: math.prod(num_samples)].to(device), (c, uc)
147
+ )
148
+
149
+ additional_model_inputs = {}
150
+ for k in batch2model_input:
151
+ additional_model_inputs[k] = batch[k]
152
+
153
+ shape = (math.prod(num_samples), C, H // F, W // F)
154
+ randn = torch.randn(shape).to(device)
155
+
156
+ def denoiser(input, sigma, c):
157
+ return model.denoiser(
158
+ model.model, input, sigma, c, **additional_model_inputs
159
+ )
160
+
161
+ samples_z = sampler(denoiser, randn, cond=c, uc=uc)
162
+ samples_x = model.decode_first_stage(samples_z)
163
+ samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
164
+
165
+ if filter is not None:
166
+ samples = filter(samples)
167
+
168
+ if return_latents:
169
+ return samples, samples_z
170
+ return samples
171
+
172
+
173
+ def get_batch(keys, value_dict, N: Union[List, ListConfig], device="cuda"):
174
+ # Hardcoded demo setups; might undergo some changes in the future
175
+
176
+ batch = {}
177
+ batch_uc = {}
178
+
179
+ for key in keys:
180
+ if key == "txt":
181
+ batch["txt"] = (
182
+ np.repeat([value_dict["prompt"]], repeats=math.prod(N))
183
+ .reshape(N)
184
+ .tolist()
185
+ )
186
+ batch_uc["txt"] = (
187
+ np.repeat([value_dict["negative_prompt"]], repeats=math.prod(N))
188
+ .reshape(N)
189
+ .tolist()
190
+ )
191
+ elif key == "original_size_as_tuple":
192
+ batch["original_size_as_tuple"] = (
193
+ torch.tensor([value_dict["orig_height"], value_dict["orig_width"]])
194
+ .to(device)
195
+ .repeat(*N, 1)
196
+ )
197
+ elif key == "crop_coords_top_left":
198
+ batch["crop_coords_top_left"] = (
199
+ torch.tensor(
200
+ [value_dict["crop_coords_top"], value_dict["crop_coords_left"]]
201
+ )
202
+ .to(device)
203
+ .repeat(*N, 1)
204
+ )
205
+ elif key == "aesthetic_score":
206
+ batch["aesthetic_score"] = (
207
+ torch.tensor([value_dict["aesthetic_score"]]).to(device).repeat(*N, 1)
208
+ )
209
+ batch_uc["aesthetic_score"] = (
210
+ torch.tensor([value_dict["negative_aesthetic_score"]])
211
+ .to(device)
212
+ .repeat(*N, 1)
213
+ )
214
+
215
+ elif key == "target_size_as_tuple":
216
+ batch["target_size_as_tuple"] = (
217
+ torch.tensor([value_dict["target_height"], value_dict["target_width"]])
218
+ .to(device)
219
+ .repeat(*N, 1)
220
+ )
221
+ else:
222
+ batch[key] = value_dict[key]
223
+
224
+ for key in batch.keys():
225
+ if key not in batch_uc and isinstance(batch[key], torch.Tensor):
226
+ batch_uc[key] = torch.clone(batch[key])
227
+ return batch, batch_uc
228
+
229
+
230
+ def get_input_image_tensor(image: Image.Image, device="cuda"):
231
+ w, h = image.size
232
+ print(f"loaded input image of size ({w}, {h})")
233
+ width, height = map(
234
+ lambda x: x - x % 64, (w, h)
235
+ ) # resize to integer multiple of 64
236
+ image = image.resize((width, height))
237
+ image_array = np.array(image.convert("RGB"))
238
+ image_array = image_array[None].transpose(0, 3, 1, 2)
239
+ image_tensor = torch.from_numpy(image_array).to(dtype=torch.float32) / 127.5 - 1.0
240
+ return image_tensor.to(device)
241
+
242
+
243
+ def do_img2img(
244
+ img,
245
+ model,
246
+ sampler,
247
+ value_dict,
248
+ num_samples,
249
+ force_uc_zero_embeddings=[],
250
+ additional_kwargs={},
251
+ offset_noise_level: float = 0.0,
252
+ return_latents=False,
253
+ skip_encode=False,
254
+ filter=None,
255
+ device="cuda",
256
+ ):
257
+ with torch.no_grad():
258
+ with autocast(device) as precision_scope:
259
+ with model.ema_scope():
260
+ batch, batch_uc = get_batch(
261
+ get_unique_embedder_keys_from_conditioner(model.conditioner),
262
+ value_dict,
263
+ [num_samples],
264
+ )
265
+ c, uc = model.conditioner.get_unconditional_conditioning(
266
+ batch,
267
+ batch_uc=batch_uc,
268
+ force_uc_zero_embeddings=force_uc_zero_embeddings,
269
+ )
270
+
271
+ for k in c:
272
+ c[k], uc[k] = map(lambda y: y[k][:num_samples].to(device), (c, uc))
273
+
274
+ for k in additional_kwargs:
275
+ c[k] = uc[k] = additional_kwargs[k]
276
+ if skip_encode:
277
+ z = img
278
+ else:
279
+ z = model.encode_first_stage(img)
280
+ noise = torch.randn_like(z)
281
+ sigmas = sampler.discretization(sampler.num_steps)
282
+ sigma = sigmas[0].to(z.device)
283
+
284
+ if offset_noise_level > 0.0:
285
+ noise = noise + offset_noise_level * append_dims(
286
+ torch.randn(z.shape[0], device=z.device), z.ndim
287
+ )
288
+ noised_z = z + noise * append_dims(sigma, z.ndim)
289
+ noised_z = noised_z / torch.sqrt(
290
+ 1.0 + sigmas[0] ** 2.0
291
+ ) # Note: hardcoded to DDPM-like scaling. need to generalize later.
292
+
293
+ def denoiser(x, sigma, c):
294
+ return model.denoiser(model.model, x, sigma, c)
295
+
296
+ samples_z = sampler(denoiser, noised_z, cond=c, uc=uc)
297
+ samples_x = model.decode_first_stage(samples_z)
298
+ samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
299
+
300
+ if filter is not None:
301
+ samples = filter(samples)
302
+
303
+ if return_latents:
304
+ return samples, samples_z
305
+ return samples
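To make the sigma pruning of `Img2ImgDiscretizationWrapper` concrete, here is a small sketch with a stand-in discretizer; the linear sigma schedule is purely illustrative:

```python
import torch
from sgm.inference.helpers import Img2ImgDiscretizationWrapper


def fake_discretization(n_steps):
    # Stand-in for a real discretizer: sigmas start large and decrease.
    return torch.linspace(10.0, 0.1, n_steps)


wrapper = Img2ImgDiscretizationWrapper(fake_discretization, strength=0.5)
sigmas = wrapper(10)
# Only the 5 smallest sigmas survive, still ordered large -> small, so an
# img2img run starts part-way down the noise schedule.
print(sigmas)
```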
sgm/lr_scheduler.py ADDED
@@ -0,0 +1,135 @@
1
+ import numpy as np
2
+
3
+
4
+ class LambdaWarmUpCosineScheduler:
5
+ """
6
+ note: use with a base_lr of 1.0
7
+ """
8
+
9
+ def __init__(
10
+ self,
11
+ warm_up_steps,
12
+ lr_min,
13
+ lr_max,
14
+ lr_start,
15
+ max_decay_steps,
16
+ verbosity_interval=0,
17
+ ):
18
+ self.lr_warm_up_steps = warm_up_steps
19
+ self.lr_start = lr_start
20
+ self.lr_min = lr_min
21
+ self.lr_max = lr_max
22
+ self.lr_max_decay_steps = max_decay_steps
23
+ self.last_lr = 0.0
24
+ self.verbosity_interval = verbosity_interval
25
+
26
+ def schedule(self, n, **kwargs):
27
+ if self.verbosity_interval > 0:
28
+ if n % self.verbosity_interval == 0:
29
+ print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
30
+ if n < self.lr_warm_up_steps:
31
+ lr = (
32
+ self.lr_max - self.lr_start
33
+ ) / self.lr_warm_up_steps * n + self.lr_start
34
+ self.last_lr = lr
35
+ return lr
36
+ else:
37
+ t = (n - self.lr_warm_up_steps) / (
38
+ self.lr_max_decay_steps - self.lr_warm_up_steps
39
+ )
40
+ t = min(t, 1.0)
41
+ lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
42
+ 1 + np.cos(t * np.pi)
43
+ )
44
+ self.last_lr = lr
45
+ return lr
46
+
47
+ def __call__(self, n, **kwargs):
48
+ return self.schedule(n, **kwargs)
49
+
50
+
51
+ class LambdaWarmUpCosineScheduler2:
52
+ """
53
+ supports repeated iterations, configurable via lists
54
+ note: use with a base_lr of 1.0.
55
+ """
56
+
57
+ def __init__(
58
+ self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0
59
+ ):
60
+ assert (
61
+ len(warm_up_steps)
62
+ == len(f_min)
63
+ == len(f_max)
64
+ == len(f_start)
65
+ == len(cycle_lengths)
66
+ )
67
+ self.lr_warm_up_steps = warm_up_steps
68
+ self.f_start = f_start
69
+ self.f_min = f_min
70
+ self.f_max = f_max
71
+ self.cycle_lengths = cycle_lengths
72
+ self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
73
+ self.last_f = 0.0
74
+ self.verbosity_interval = verbosity_interval
75
+
76
+ def find_in_interval(self, n):
77
+ interval = 0
78
+ for cl in self.cum_cycles[1:]:
79
+ if n <= cl:
80
+ return interval
81
+ interval += 1
82
+
83
+ def schedule(self, n, **kwargs):
84
+ cycle = self.find_in_interval(n)
85
+ n = n - self.cum_cycles[cycle]
86
+ if self.verbosity_interval > 0:
87
+ if n % self.verbosity_interval == 0:
88
+ print(
89
+ f"current step: {n}, recent lr-multiplier: {self.last_f}, "
90
+ f"current cycle {cycle}"
91
+ )
92
+ if n < self.lr_warm_up_steps[cycle]:
93
+ f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[
94
+ cycle
95
+ ] * n + self.f_start[cycle]
96
+ self.last_f = f
97
+ return f
98
+ else:
99
+ t = (n - self.lr_warm_up_steps[cycle]) / (
100
+ self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]
101
+ )
102
+ t = min(t, 1.0)
103
+ f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (
104
+ 1 + np.cos(t * np.pi)
105
+ )
106
+ self.last_f = f
107
+ return f
108
+
109
+ def __call__(self, n, **kwargs):
110
+ return self.schedule(n, **kwargs)
111
+
112
+
113
+ class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
114
+ def schedule(self, n, **kwargs):
115
+ cycle = self.find_in_interval(n)
116
+ n = n - self.cum_cycles[cycle]
117
+ if self.verbosity_interval > 0:
118
+ if n % self.verbosity_interval == 0:
119
+ print(
120
+ f"current step: {n}, recent lr-multiplier: {self.last_f}, "
121
+ f"current cycle {cycle}"
122
+ )
123
+
124
+ if n < self.lr_warm_up_steps[cycle]:
125
+ f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[
126
+ cycle
127
+ ] * n + self.f_start[cycle]
128
+ self.last_f = f
129
+ return f
130
+ else:
131
+ f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (
132
+ self.cycle_lengths[cycle] - n
133
+ ) / (self.cycle_lengths[cycle])
134
+ self.last_f = f
135
+ return f
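As the docstrings note, these schedulers return a learning-rate multiplier and are meant to be used with a base lr of 1.0; a minimal sketch of wiring one into `torch.optim.lr_scheduler.LambdaLR` (model and hyperparameters are illustrative):

```python
import torch
from sgm.lr_scheduler import LambdaWarmUpCosineScheduler

model = torch.nn.Linear(8, 8)
# Base lr of 1.0 as the docstring suggests; the real peak lr lives in lr_max.
optimizer = torch.optim.AdamW(model.parameters(), lr=1.0)

schedule = LambdaWarmUpCosineScheduler(
    warm_up_steps=100,
    lr_min=1e-6,
    lr_max=1e-4,
    lr_start=0.0,
    max_decay_steps=1000,
)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=schedule)

for step in range(1000):
    optimizer.step()
    scheduler.step()  # linear warm-up for 100 steps, cosine decay afterwards
```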
sgm/models/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .autoencoder import AutoencodingEngine
2
+ from .diffusion import DiffusionEngine
sgm/models/autoencoder.py ADDED
@@ -0,0 +1,615 @@
1
+ import logging
2
+ import math
3
+ import re
4
+ from abc import abstractmethod
5
+ from contextlib import contextmanager
6
+ from typing import Any, Dict, List, Optional, Tuple, Union
7
+
8
+ import pytorch_lightning as pl
9
+ import torch
10
+ import torch.nn as nn
11
+ from einops import rearrange
12
+ from packaging import version
13
+
14
+ from ..modules.autoencoding.regularizers import AbstractRegularizer
15
+ from ..modules.ema import LitEma
16
+ from ..util import (default, get_nested_attribute, get_obj_from_str,
17
+ instantiate_from_config)
18
+
19
+ logpy = logging.getLogger(__name__)
20
+
21
+
22
+ class AbstractAutoencoder(pl.LightningModule):
23
+ """
24
+ This is the base class for all autoencoders, including image autoencoders, image autoencoders with discriminators,
25
+ unCLIP models, etc. Hence, it is fairly general, and specific features
26
+ (e.g. discriminator training, encoding, decoding) must be implemented in subclasses.
27
+ """
28
+
29
+ def __init__(
30
+ self,
31
+ ema_decay: Union[None, float] = None,
32
+ monitor: Union[None, str] = None,
33
+ input_key: str = "jpg",
34
+ ):
35
+ super().__init__()
36
+
37
+ self.input_key = input_key
38
+ self.use_ema = ema_decay is not None
39
+ if monitor is not None:
40
+ self.monitor = monitor
41
+
42
+ if self.use_ema:
43
+ self.model_ema = LitEma(self, decay=ema_decay)
44
+ logpy.info(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
45
+
46
+ if version.parse(torch.__version__) >= version.parse("2.0.0"):
47
+ self.automatic_optimization = False
48
+
49
+ def apply_ckpt(self, ckpt: Union[None, str, dict]):
50
+ if ckpt is None:
51
+ return
52
+ if isinstance(ckpt, str):
53
+ ckpt = {
54
+ "target": "sgm.modules.checkpoint.CheckpointEngine",
55
+ "params": {"ckpt_path": ckpt},
56
+ }
57
+ engine = instantiate_from_config(ckpt)
58
+ engine(self)
59
+
60
+ @abstractmethod
61
+ def get_input(self, batch) -> Any:
62
+ raise NotImplementedError()
63
+
64
+ def on_train_batch_end(self, *args, **kwargs):
65
+ # for EMA computation
66
+ if self.use_ema:
67
+ self.model_ema(self)
68
+
69
+ @contextmanager
70
+ def ema_scope(self, context=None):
71
+ if self.use_ema:
72
+ self.model_ema.store(self.parameters())
73
+ self.model_ema.copy_to(self)
74
+ if context is not None:
75
+ logpy.info(f"{context}: Switched to EMA weights")
76
+ try:
77
+ yield None
78
+ finally:
79
+ if self.use_ema:
80
+ self.model_ema.restore(self.parameters())
81
+ if context is not None:
82
+ logpy.info(f"{context}: Restored training weights")
83
+
84
+ @abstractmethod
85
+ def encode(self, *args, **kwargs) -> torch.Tensor:
86
+ raise NotImplementedError("encode()-method of abstract base class called")
87
+
88
+ @abstractmethod
89
+ def decode(self, *args, **kwargs) -> torch.Tensor:
90
+ raise NotImplementedError("decode()-method of abstract base class called")
91
+
92
+ def instantiate_optimizer_from_config(self, params, lr, cfg):
93
+ logpy.info(f"loading >>> {cfg['target']} <<< optimizer from config")
94
+ return get_obj_from_str(cfg["target"])(
95
+ params, lr=lr, **cfg.get("params", dict())
96
+ )
97
+
98
+ def configure_optimizers(self) -> Any:
99
+ raise NotImplementedError()
100
+
101
+
102
+ class AutoencodingEngine(AbstractAutoencoder):
103
+ """
104
+ Base class for all image autoencoders that we train, like VQGAN or AutoencoderKL
105
+ (we also restore them explicitly as special cases for legacy reasons).
106
+ Regularizations such as KL or VQ are moved to the regularizer class.
107
+ """
108
+
109
+ def __init__(
110
+ self,
111
+ *args,
112
+ encoder_config: Dict,
113
+ decoder_config: Dict,
114
+ loss_config: Dict,
115
+ regularizer_config: Dict,
116
+ optimizer_config: Union[Dict, None] = None,
117
+ lr_g_factor: float = 1.0,
118
+ trainable_ae_params: Optional[List[List[str]]] = None,
119
+ ae_optimizer_args: Optional[List[dict]] = None,
120
+ trainable_disc_params: Optional[List[List[str]]] = None,
121
+ disc_optimizer_args: Optional[List[dict]] = None,
122
+ disc_start_iter: int = 0,
123
+ diff_boost_factor: float = 3.0,
124
+ ckpt_engine: Union[None, str, dict] = None,
125
+ ckpt_path: Optional[str] = None,
126
+ additional_decode_keys: Optional[List[str]] = None,
127
+ **kwargs,
128
+ ):
129
+ super().__init__(*args, **kwargs)
130
+ self.automatic_optimization = False # pytorch lightning
131
+
132
+ self.encoder: torch.nn.Module = instantiate_from_config(encoder_config)
133
+ self.decoder: torch.nn.Module = instantiate_from_config(decoder_config)
134
+ self.loss: torch.nn.Module = instantiate_from_config(loss_config)
135
+ self.regularization: AbstractRegularizer = instantiate_from_config(
136
+ regularizer_config
137
+ )
138
+ self.optimizer_config = default(
139
+ optimizer_config, {"target": "torch.optim.Adam"}
140
+ )
141
+ self.diff_boost_factor = diff_boost_factor
142
+ self.disc_start_iter = disc_start_iter
143
+ self.lr_g_factor = lr_g_factor
144
+ self.trainable_ae_params = trainable_ae_params
145
+ if self.trainable_ae_params is not None:
146
+ self.ae_optimizer_args = default(
147
+ ae_optimizer_args,
148
+ [{} for _ in range(len(self.trainable_ae_params))],
149
+ )
150
+ assert len(self.ae_optimizer_args) == len(self.trainable_ae_params)
151
+ else:
152
+ self.ae_optimizer_args = [{}] # makes type consistent
153
+
154
+ self.trainable_disc_params = trainable_disc_params
155
+ if self.trainable_disc_params is not None:
156
+ self.disc_optimizer_args = default(
157
+ disc_optimizer_args,
158
+ [{} for _ in range(len(self.trainable_disc_params))],
159
+ )
160
+ assert len(self.disc_optimizer_args) == len(self.trainable_disc_params)
161
+ else:
162
+ self.disc_optimizer_args = [{}] # makes type consistent
163
+
164
+ if ckpt_path is not None:
165
+ assert ckpt_engine is None, "Can't set ckpt_engine and ckpt_path"
166
+ logpy.warn("Checkpoint path is deprecated, use `checkpoint_egnine` instead")
167
+ self.apply_ckpt(default(ckpt_path, ckpt_engine))
168
+ self.additional_decode_keys = set(default(additional_decode_keys, []))
169
+
170
+ def get_input(self, batch: Dict) -> torch.Tensor:
171
+ # assuming unified data format, dataloader returns a dict.
172
+ # image tensors should be scaled to -1 ... 1 and in channels-first
173
+ # format (e.g., bchw instead of bhwc)
174
+ return batch[self.input_key]
175
+
176
+ def get_autoencoder_params(self) -> list:
177
+ params = []
178
+ if hasattr(self.loss, "get_trainable_autoencoder_parameters"):
179
+ params += list(self.loss.get_trainable_autoencoder_parameters())
180
+ if hasattr(self.regularization, "get_trainable_parameters"):
181
+ params += list(self.regularization.get_trainable_parameters())
182
+ params = params + list(self.encoder.parameters())
183
+ params = params + list(self.decoder.parameters())
184
+ return params
185
+
186
+ def get_discriminator_params(self) -> list:
187
+ if hasattr(self.loss, "get_trainable_parameters"):
188
+ params = list(self.loss.get_trainable_parameters()) # e.g., discriminator
189
+ else:
190
+ params = []
191
+ return params
192
+
193
+ def get_last_layer(self):
194
+ return self.decoder.get_last_layer()
195
+
196
+ def encode(
197
+ self,
198
+ x: torch.Tensor,
199
+ return_reg_log: bool = False,
200
+ unregularized: bool = False,
201
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, dict]]:
202
+ z = self.encoder(x)
203
+ if unregularized:
204
+ return z, dict()
205
+ z, reg_log = self.regularization(z)
206
+ if return_reg_log:
207
+ return z, reg_log
208
+ return z
209
+
210
+ def decode(self, z: torch.Tensor, **kwargs) -> torch.Tensor:
211
+ x = self.decoder(z, **kwargs)
212
+ return x
213
+
214
+ def forward(
215
+ self, x: torch.Tensor, **additional_decode_kwargs
216
+ ) -> Tuple[torch.Tensor, torch.Tensor, dict]:
217
+ z, reg_log = self.encode(x, return_reg_log=True)
218
+ dec = self.decode(z, **additional_decode_kwargs)
219
+ return z, dec, reg_log
220
+
221
+ def inner_training_step(
222
+ self, batch: dict, batch_idx: int, optimizer_idx: int = 0
223
+ ) -> torch.Tensor:
224
+ x = self.get_input(batch)
225
+ additional_decode_kwargs = {
226
+ key: batch[key] for key in self.additional_decode_keys.intersection(batch)
227
+ }
228
+ z, xrec, regularization_log = self(x, **additional_decode_kwargs)
229
+ if hasattr(self.loss, "forward_keys"):
230
+ extra_info = {
231
+ "z": z,
232
+ "optimizer_idx": optimizer_idx,
233
+ "global_step": self.global_step,
234
+ "last_layer": self.get_last_layer(),
235
+ "split": "train",
236
+ "regularization_log": regularization_log,
237
+ "autoencoder": self,
238
+ }
239
+ extra_info = {k: extra_info[k] for k in self.loss.forward_keys}
240
+ else:
241
+ extra_info = dict()
242
+
243
+ if optimizer_idx == 0:
244
+ # autoencode
245
+ out_loss = self.loss(x, xrec, **extra_info)
246
+ if isinstance(out_loss, tuple):
247
+ aeloss, log_dict_ae = out_loss
248
+ else:
249
+ # simple loss function
250
+ aeloss = out_loss
251
+ log_dict_ae = {"train/loss/rec": aeloss.detach()}
252
+
253
+ self.log_dict(
254
+ log_dict_ae,
255
+ prog_bar=False,
256
+ logger=True,
257
+ on_step=True,
258
+ on_epoch=True,
259
+ sync_dist=False,
260
+ )
261
+ self.log(
262
+ "loss",
263
+ aeloss.mean().detach(),
264
+ prog_bar=True,
265
+ logger=False,
266
+ on_epoch=False,
267
+ on_step=True,
268
+ )
269
+ return aeloss
270
+ elif optimizer_idx == 1:
271
+ # discriminator
272
+ discloss, log_dict_disc = self.loss(x, xrec, **extra_info)
273
+ # -> discriminator always needs to return a tuple
274
+ self.log_dict(
275
+ log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True
276
+ )
277
+ return discloss
278
+ else:
279
+ raise NotImplementedError(f"Unknown optimizer {optimizer_idx}")
280
+
281
+ def training_step(self, batch: dict, batch_idx: int):
282
+ opts = self.optimizers()
283
+ if not isinstance(opts, list):
284
+ # Non-adversarial case
285
+ opts = [opts]
286
+ optimizer_idx = batch_idx % len(opts)
287
+ if self.global_step < self.disc_start_iter:
288
+ optimizer_idx = 0
289
+ opt = opts[optimizer_idx]
290
+ opt.zero_grad()
291
+ with opt.toggle_model():
292
+ loss = self.inner_training_step(
293
+ batch, batch_idx, optimizer_idx=optimizer_idx
294
+ )
295
+ self.manual_backward(loss)
296
+ opt.step()
297
+
298
+ def validation_step(self, batch: dict, batch_idx: int) -> Dict:
299
+ log_dict = self._validation_step(batch, batch_idx)
300
+ with self.ema_scope():
301
+ log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema")
302
+ log_dict.update(log_dict_ema)
303
+ return log_dict
304
+
305
+ def _validation_step(self, batch: dict, batch_idx: int, postfix: str = "") -> Dict:
306
+ x = self.get_input(batch)
307
+
308
+ z, xrec, regularization_log = self(x)
309
+ if hasattr(self.loss, "forward_keys"):
310
+ extra_info = {
311
+ "z": z,
312
+ "optimizer_idx": 0,
313
+ "global_step": self.global_step,
314
+ "last_layer": self.get_last_layer(),
315
+ "split": "val" + postfix,
316
+ "regularization_log": regularization_log,
317
+ "autoencoder": self,
318
+ }
319
+ extra_info = {k: extra_info[k] for k in self.loss.forward_keys}
320
+ else:
321
+ extra_info = dict()
322
+ out_loss = self.loss(x, xrec, **extra_info)
323
+ if isinstance(out_loss, tuple):
324
+ aeloss, log_dict_ae = out_loss
325
+ else:
326
+ # simple loss function
327
+ aeloss = out_loss
328
+ log_dict_ae = {f"val{postfix}/loss/rec": aeloss.detach()}
329
+ full_log_dict = log_dict_ae
330
+
331
+ if "optimizer_idx" in extra_info:
332
+ extra_info["optimizer_idx"] = 1
333
+ discloss, log_dict_disc = self.loss(x, xrec, **extra_info)
334
+ full_log_dict.update(log_dict_disc)
335
+ self.log(
336
+ f"val{postfix}/loss/rec",
337
+ log_dict_ae[f"val{postfix}/loss/rec"],
338
+ sync_dist=True,
339
+ )
340
+ self.log_dict(full_log_dict, sync_dist=True)
341
+ return full_log_dict
342
+
343
+ def get_param_groups(
344
+ self, parameter_names: List[List[str]], optimizer_args: List[dict]
345
+ ) -> Tuple[List[Dict[str, Any]], int]:
346
+ groups = []
347
+ num_params = 0
348
+ for names, args in zip(parameter_names, optimizer_args):
349
+ params = []
350
+ for pattern_ in names:
351
+ pattern_params = []
352
+ pattern = re.compile(pattern_)
353
+ for p_name, param in self.named_parameters():
354
+ if re.match(pattern, p_name):
355
+ pattern_params.append(param)
356
+ num_params += param.numel()
357
+ if len(pattern_params) == 0:
358
+ logpy.warn(f"Did not find parameters for pattern {pattern_}")
359
+ params.extend(pattern_params)
360
+ groups.append({"params": params, **args})
361
+ return groups, num_params
362
+
363
+ def configure_optimizers(self) -> List[torch.optim.Optimizer]:
364
+ if self.trainable_ae_params is None:
365
+ ae_params = self.get_autoencoder_params()
366
+ else:
367
+ ae_params, num_ae_params = self.get_param_groups(
368
+ self.trainable_ae_params, self.ae_optimizer_args
369
+ )
370
+ logpy.info(f"Number of trainable autoencoder parameters: {num_ae_params:,}")
371
+ if self.trainable_disc_params is None:
372
+ disc_params = self.get_discriminator_params()
373
+ else:
374
+ disc_params, num_disc_params = self.get_param_groups(
375
+ self.trainable_disc_params, self.disc_optimizer_args
376
+ )
377
+ logpy.info(
378
+ f"Number of trainable discriminator parameters: {num_disc_params:,}"
379
+ )
380
+ opt_ae = self.instantiate_optimizer_from_config(
381
+ ae_params,
382
+ default(self.lr_g_factor, 1.0) * self.learning_rate,
383
+ self.optimizer_config,
384
+ )
385
+ opts = [opt_ae]
386
+ if len(disc_params) > 0:
387
+ opt_disc = self.instantiate_optimizer_from_config(
388
+ disc_params, self.learning_rate, self.optimizer_config
389
+ )
390
+ opts.append(opt_disc)
391
+
392
+ return opts
393
+
394
+ @torch.no_grad()
395
+ def log_images(
396
+ self, batch: dict, additional_log_kwargs: Optional[Dict] = None, **kwargs
397
+ ) -> dict:
398
+ log = dict()
399
+ additional_decode_kwargs = {}
400
+ x = self.get_input(batch)
401
+ additional_decode_kwargs.update(
402
+ {key: batch[key] for key in self.additional_decode_keys.intersection(batch)}
403
+ )
404
+
405
+ _, xrec, _ = self(x, **additional_decode_kwargs)
406
+ log["inputs"] = x
407
+ log["reconstructions"] = xrec
408
+ diff = 0.5 * torch.abs(torch.clamp(xrec, -1.0, 1.0) - x)
409
+ diff.clamp_(0, 1.0)
410
+ log["diff"] = 2.0 * diff - 1.0
411
+ # diff_boost shows location of small errors, by boosting their
412
+ # brightness.
413
+ log["diff_boost"] = (
414
+ 2.0 * torch.clamp(self.diff_boost_factor * diff, 0.0, 1.0) - 1
415
+ )
416
+ if hasattr(self.loss, "log_images"):
417
+ log.update(self.loss.log_images(x, xrec))
418
+ with self.ema_scope():
419
+ _, xrec_ema, _ = self(x, **additional_decode_kwargs)
420
+ log["reconstructions_ema"] = xrec_ema
421
+ diff_ema = 0.5 * torch.abs(torch.clamp(xrec_ema, -1.0, 1.0) - x)
422
+ diff_ema.clamp_(0, 1.0)
423
+ log["diff_ema"] = 2.0 * diff_ema - 1.0
424
+ log["diff_boost_ema"] = (
425
+ 2.0 * torch.clamp(self.diff_boost_factor * diff_ema, 0.0, 1.0) - 1
426
+ )
427
+ if additional_log_kwargs:
428
+ additional_decode_kwargs.update(additional_log_kwargs)
429
+ _, xrec_add, _ = self(x, **additional_decode_kwargs)
430
+ log_str = "reconstructions-" + "-".join(
431
+ [f"{key}={additional_log_kwargs[key]}" for key in additional_log_kwargs]
432
+ )
433
+ log[log_str] = xrec_add
434
+ return log
435
+
436
+
437
+ class AutoencodingEngineLegacy(AutoencodingEngine):
438
+ def __init__(self, embed_dim: int, **kwargs):
439
+ self.max_batch_size = kwargs.pop("max_batch_size", None)
440
+ ddconfig = kwargs.pop("ddconfig")
441
+ ckpt_path = kwargs.pop("ckpt_path", None)
442
+ ckpt_engine = kwargs.pop("ckpt_engine", None)
443
+ super().__init__(
444
+ encoder_config={
445
+ "target": "sgm.modules.diffusionmodules.model.Encoder",
446
+ "params": ddconfig,
447
+ },
448
+ decoder_config={
449
+ "target": "sgm.modules.diffusionmodules.model.Decoder",
450
+ "params": ddconfig,
451
+ },
452
+ **kwargs,
453
+ )
454
+ self.quant_conv = torch.nn.Conv2d(
455
+ (1 + ddconfig["double_z"]) * ddconfig["z_channels"],
456
+ (1 + ddconfig["double_z"]) * embed_dim,
457
+ 1,
458
+ )
459
+ self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
460
+ self.embed_dim = embed_dim
461
+
462
+ self.apply_ckpt(default(ckpt_path, ckpt_engine))
463
+
464
+ def get_autoencoder_params(self) -> list:
465
+ params = super().get_autoencoder_params()
466
+ return params
467
+
468
+ def encode(
469
+ self, x: torch.Tensor, return_reg_log: bool = False
470
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, dict]]:
471
+ if self.max_batch_size is None:
472
+ z = self.encoder(x)
473
+ z = self.quant_conv(z)
474
+ else:
475
+ N = x.shape[0]
476
+ bs = self.max_batch_size
477
+ n_batches = int(math.ceil(N / bs))
478
+ z = list()
479
+ for i_batch in range(n_batches):
480
+ z_batch = self.encoder(x[i_batch * bs : (i_batch + 1) * bs])
481
+ z_batch = self.quant_conv(z_batch)
482
+ z.append(z_batch)
483
+ z = torch.cat(z, 0)
484
+
485
+ z, reg_log = self.regularization(z)
486
+ if return_reg_log:
487
+ return z, reg_log
488
+ return z
489
+
490
+ def decode(self, z: torch.Tensor, **decoder_kwargs) -> torch.Tensor:
491
+ if self.max_batch_size is None:
492
+ dec = self.post_quant_conv(z)
493
+ dec = self.decoder(dec, **decoder_kwargs)
494
+ else:
495
+ N = z.shape[0]
496
+ bs = self.max_batch_size
497
+ n_batches = int(math.ceil(N / bs))
498
+ dec = list()
499
+ for i_batch in range(n_batches):
500
+ dec_batch = self.post_quant_conv(z[i_batch * bs : (i_batch + 1) * bs])
501
+ dec_batch = self.decoder(dec_batch, **decoder_kwargs)
502
+ dec.append(dec_batch)
503
+ dec = torch.cat(dec, 0)
504
+
505
+ return dec
506
+
507
+
508
+ class AutoencoderKL(AutoencodingEngineLegacy):
509
+ def __init__(self, **kwargs):
510
+ if "lossconfig" in kwargs:
511
+ kwargs["loss_config"] = kwargs.pop("lossconfig")
512
+ super().__init__(
513
+ regularizer_config={
514
+ "target": (
515
+ "sgm.modules.autoencoding.regularizers"
516
+ ".DiagonalGaussianRegularizer"
517
+ )
518
+ },
519
+ **kwargs,
520
+ )
521
+
522
+
523
+ class AutoencoderLegacyVQ(AutoencodingEngineLegacy):
524
+ def __init__(
525
+ self,
526
+ embed_dim: int,
527
+ n_embed: int,
528
+ sane_index_shape: bool = False,
529
+ **kwargs,
530
+ ):
531
+ if "lossconfig" in kwargs:
532
+ logpy.warn(f"Parameter `lossconfig` is deprecated, use `loss_config`.")
533
+ kwargs["loss_config"] = kwargs.pop("lossconfig")
534
+ super().__init__(
535
+ regularizer_config={
536
+ "target": (
537
+ "sgm.modules.autoencoding.regularizers.quantize" ".VectorQuantizer"
538
+ ),
539
+ "params": {
540
+ "n_e": n_embed,
541
+ "e_dim": embed_dim,
542
+ "sane_index_shape": sane_index_shape,
543
+ },
544
+ },
545
+ **kwargs,
546
+ )
547
+
548
+
549
+ class IdentityFirstStage(AbstractAutoencoder):
550
+ def __init__(self, *args, **kwargs):
551
+ super().__init__(*args, **kwargs)
552
+
553
+ def get_input(self, x: Any) -> Any:
554
+ return x
555
+
556
+ def encode(self, x: Any, *args, **kwargs) -> Any:
557
+ return x
558
+
559
+ def decode(self, x: Any, *args, **kwargs) -> Any:
560
+ return x
561
+
562
+
563
+ class AEIntegerWrapper(nn.Module):
564
+ def __init__(
565
+ self,
566
+ model: nn.Module,
567
+ shape: Union[None, Tuple[int, int], List[int]] = (16, 16),
568
+ regularization_key: str = "regularization",
569
+ encoder_kwargs: Optional[Dict[str, Any]] = None,
570
+ ):
571
+ super().__init__()
572
+ self.model = model
573
+ assert hasattr(model, "encode") and hasattr(
574
+ model, "decode"
575
+ ), "Need AE interface"
576
+ self.regularization = get_nested_attribute(model, regularization_key)
577
+ self.shape = shape
578
+ self.encoder_kwargs = default(encoder_kwargs, {"return_reg_log": True})
579
+
580
+ def encode(self, x) -> torch.Tensor:
581
+ assert (
582
+ not self.training
583
+ ), f"{self.__class__.__name__} only supports inference currently"
584
+ _, log = self.model.encode(x, **self.encoder_kwargs)
585
+ assert isinstance(log, dict)
586
+ inds = log["min_encoding_indices"]
587
+ return rearrange(inds, "b ... -> b (...)")
588
+
589
+ def decode(
590
+ self, inds: torch.Tensor, shape: Union[None, tuple, list] = None
591
+ ) -> torch.Tensor:
592
+ # expect inds shape (b, s) with s = h*w
593
+ shape = default(shape, self.shape) # Optional[(h, w)]
594
+ if shape is not None:
595
+ assert len(shape) == 2, f"Unhandled shape {shape}"
596
+ inds = rearrange(inds, "b (h w) -> b h w", h=shape[0], w=shape[1])
597
+ h = self.regularization.get_codebook_entry(inds) # (b, h, w, c)
598
+ h = rearrange(h, "b h w c -> b c h w")
599
+ return self.model.decode(h)
600
+
601
+
602
+ class AutoencoderKLModeOnly(AutoencodingEngineLegacy):
603
+ def __init__(self, **kwargs):
604
+ if "lossconfig" in kwargs:
605
+ kwargs["loss_config"] = kwargs.pop("lossconfig")
606
+ super().__init__(
607
+ regularizer_config={
608
+ "target": (
609
+ "sgm.modules.autoencoding.regularizers"
610
+ ".DiagonalGaussianRegularizer"
611
+ ),
612
+ "params": {"sample": False},
613
+ },
614
+ **kwargs,
615
+ )
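
The chunked encode path above (used by AutoencodingEngineLegacy when `max_batch_size` is set) is easiest to see in isolation. Below is a minimal, runnable sketch of the same batching pattern; the `nn.Conv2d` encoder, the helper name `encode_in_chunks`, and the tensor sizes are illustrative stand-ins for this demo, not code from the commit.

import math
import torch
import torch.nn as nn

def encode_in_chunks(encoder: nn.Module, x: torch.Tensor, max_batch_size: int) -> torch.Tensor:
    # Split the batch into ceil(N / max_batch_size) chunks so large batches fit in memory,
    # then concatenate the per-chunk outputs along the batch dimension.
    n_batches = int(math.ceil(x.shape[0] / max_batch_size))
    outs = []
    for i in range(n_batches):
        outs.append(encoder(x[i * max_batch_size : (i + 1) * max_batch_size]))
    return torch.cat(outs, dim=0)

if __name__ == "__main__":
    encoder = nn.Conv2d(3, 8, kernel_size=3, padding=1)  # stand-in for the real Encoder
    x = torch.randn(10, 3, 64, 64)
    z = encode_in_chunks(encoder, x, max_batch_size=4)   # processed as chunks of 4 + 4 + 2
    print(z.shape)                                        # torch.Size([10, 8, 64, 64])
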
sgm/models/diffusion.py ADDED
@@ -0,0 +1,358 @@
1
+ import math
2
+ from contextlib import contextmanager
3
+ from typing import Any, Dict, List, Optional, Tuple, Union
4
+
5
+ import pytorch_lightning as pl
6
+ import torch
7
+ from omegaconf import ListConfig, OmegaConf
8
+ from safetensors.torch import load_file as load_safetensors
9
+ from torch.optim.lr_scheduler import LambdaLR
10
+ from einops import rearrange
11
+
12
+ from ..modules import UNCONDITIONAL_CONFIG
13
+ from ..modules.autoencoding.temporal_ae import VideoDecoder
14
+ from ..modules.diffusionmodules.wrappers import OPENAIUNETWRAPPER
15
+ from ..modules.ema import LitEma
16
+ from ..util import (
17
+ default,
18
+ disabled_train,
19
+ get_obj_from_str,
20
+ instantiate_from_config,
21
+ log_txt_as_img,
22
+ )
23
+
24
+
25
+ class DiffusionEngine(pl.LightningModule):
26
+ def __init__(
27
+ self,
28
+ network_config,
29
+ denoiser_config,
30
+ first_stage_config,
31
+ conditioner_config: Union[None, Dict, ListConfig, OmegaConf] = None,
32
+ sampler_config: Union[None, Dict, ListConfig, OmegaConf] = None,
33
+ optimizer_config: Union[None, Dict, ListConfig, OmegaConf] = None,
34
+ scheduler_config: Union[None, Dict, ListConfig, OmegaConf] = None,
35
+ loss_fn_config: Union[None, Dict, ListConfig, OmegaConf] = None,
36
+ network_wrapper: Union[None, str] = None,
37
+ ckpt_path: Union[None, str] = None,
38
+ use_ema: bool = False,
39
+ ema_decay_rate: float = 0.9999,
40
+ scale_factor: float = 1.0,
41
+ disable_first_stage_autocast=False,
42
+ input_key: str = "jpg",
43
+ log_keys: Union[List, None] = None,
44
+ no_cond_log: bool = False,
45
+ compile_model: bool = False,
46
+ en_and_decode_n_samples_a_time: Optional[int] = None,
47
+ ):
48
+ super().__init__()
49
+ self.log_keys = log_keys
50
+ self.input_key = input_key
51
+ self.optimizer_config = default(
52
+ optimizer_config, {"target": "torch.optim.AdamW"}
53
+ )
54
+ model = instantiate_from_config(network_config)
55
+ self.model = get_obj_from_str(default(network_wrapper, OPENAIUNETWRAPPER))(
56
+ model, compile_model=compile_model
57
+ )
58
+
59
+ self.denoiser = instantiate_from_config(denoiser_config)
60
+ self.sampler = (
61
+ instantiate_from_config(sampler_config)
62
+ if sampler_config is not None
63
+ else None
64
+ )
65
+ self.conditioner = instantiate_from_config(
66
+ default(conditioner_config, UNCONDITIONAL_CONFIG)
67
+ )
68
+ self.scheduler_config = scheduler_config
69
+ self._init_first_stage(first_stage_config)
70
+
71
+ self.loss_fn = (
72
+ instantiate_from_config(loss_fn_config)
73
+ if loss_fn_config is not None
74
+ else None
75
+ )
76
+
77
+ self.use_ema = use_ema
78
+ if self.use_ema:
79
+ self.model_ema = LitEma(self.model, decay=ema_decay_rate)
80
+ print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
81
+
82
+ self.scale_factor = scale_factor
83
+ self.disable_first_stage_autocast = disable_first_stage_autocast
84
+ self.no_cond_log = no_cond_log
85
+
86
+ if ckpt_path is not None:
87
+ self.init_from_ckpt(ckpt_path)
88
+
89
+ self.en_and_decode_n_samples_a_time = en_and_decode_n_samples_a_time
90
+
91
+ def init_from_ckpt(
92
+ self,
93
+ path: str,
94
+ ) -> None:
95
+ if path.endswith("ckpt"):
96
+ sd = torch.load(path, map_location="cpu")["state_dict"]
97
+ elif path.endswith("safetensors"):
98
+ sd = load_safetensors(path)
99
+ else:
100
+ raise NotImplementedError
101
+
102
+ missing, unexpected = self.load_state_dict(sd, strict=False)
103
+ print(
104
+ f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys"
105
+ )
106
+ if len(missing) > 0:
107
+ print(f"Missing Keys: {missing}")
108
+ if len(unexpected) > 0:
109
+ print(f"Unexpected Keys: {unexpected}")
110
+
111
+ def _init_first_stage(self, config):
112
+ model = instantiate_from_config(config).eval()
113
+ model.train = disabled_train
114
+ for param in model.parameters():
115
+ param.requires_grad = False
116
+ self.first_stage_model = model
117
+
118
+ def get_input(self, batch):
119
+ # assuming unified data format, dataloader returns a dict.
120
+ # image tensors should be scaled to -1 ... 1 and in bchw format
121
+ return batch[self.input_key]
122
+
123
+ @torch.no_grad()
124
+ def decode_first_stage(self, z):
125
+ z = 1.0 / self.scale_factor * z
126
+ n_samples = default(self.en_and_decode_n_samples_a_time, z.shape[0])
127
+
128
+ n_rounds = math.ceil(z.shape[0] / n_samples)
129
+ all_out = []
130
+ with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
131
+ for n in range(n_rounds):
132
+ if isinstance(self.first_stage_model.decoder, VideoDecoder):
133
+ kwargs = {"timesteps": len(z[n * n_samples : (n + 1) * n_samples])}
134
+ else:
135
+ kwargs = {}
136
+ out = self.first_stage_model.decode(
137
+ z[n * n_samples : (n + 1) * n_samples], **kwargs
138
+ )
139
+ all_out.append(out)
140
+ out = torch.cat(all_out, dim=0)
141
+ return out
142
+
143
+ @torch.no_grad()
144
+ def encode_first_stage(self, x):
145
+ bs = x.shape[0]
146
+ is_video_input = False
147
+ if x.dim() == 5:
148
+ is_video_input = True
149
+ # for video diffusion
150
+ x = rearrange(x, "b t c h w -> (b t) c h w")
151
+ n_samples = default(self.en_and_decode_n_samples_a_time, x.shape[0])
152
+ n_rounds = math.ceil(x.shape[0] / n_samples)
153
+ all_out = []
154
+ with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
155
+ for n in range(n_rounds):
156
+ out = self.first_stage_model.encode(
157
+ x[n * n_samples : (n + 1) * n_samples]
158
+ )
159
+ all_out.append(out)
160
+ z = torch.cat(all_out, dim=0)
161
+ z = self.scale_factor * z
162
+
163
+ if is_video_input:
164
+ z = rearrange(z, "(b t) c h w -> b t c h w", b=bs)
165
+
166
+ return z
167
+
168
+ def forward(self, x, batch):
169
+ loss = self.loss_fn(self.model, self.denoiser, self.conditioner, x, batch)
170
+ loss_mean = loss.mean()
171
+ loss_dict = {"loss": loss_mean}
172
+ return loss_mean, loss_dict
173
+
174
+ def shared_step(self, batch: Dict) -> Any:
175
+ x = self.get_input(batch)
176
+ # breakpoint()  # debugging leftover, kept disabled so training does not halt here
177
+ x = self.encode_first_stage(x)
178
+ batch["global_step"] = self.global_step
179
+ loss, loss_dict = self(x, batch)
180
+ return loss, loss_dict
181
+
182
+ def training_step(self, batch, batch_idx):
183
+ loss, loss_dict = self.shared_step(batch)
184
+
185
+ self.log_dict(
186
+ loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=False
187
+ )
188
+
189
+ self.log(
190
+ "global_step",
191
+ self.global_step,
192
+ prog_bar=True,
193
+ logger=True,
194
+ on_step=True,
195
+ on_epoch=False,
196
+ )
197
+
198
+ if self.scheduler_config is not None:
199
+ lr = self.optimizers().param_groups[0]["lr"]
200
+ self.log(
201
+ "lr_abs", lr, prog_bar=True, logger=True, on_step=True, on_epoch=False
202
+ )
203
+
204
+ return loss
205
+
206
+ def on_train_start(self, *args, **kwargs):
207
+ if self.sampler is None or self.loss_fn is None:
208
+ raise ValueError("Sampler and loss function need to be set for training.")
209
+
210
+ def on_train_batch_end(self, *args, **kwargs):
211
+ if self.use_ema:
212
+ self.model_ema(self.model)
213
+
214
+ @contextmanager
215
+ def ema_scope(self, context=None):
216
+ if self.use_ema:
217
+ self.model_ema.store(self.model.parameters())
218
+ self.model_ema.copy_to(self.model)
219
+ if context is not None:
220
+ print(f"{context}: Switched to EMA weights")
221
+ try:
222
+ yield None
223
+ finally:
224
+ if self.use_ema:
225
+ self.model_ema.restore(self.model.parameters())
226
+ if context is not None:
227
+ print(f"{context}: Restored training weights")
228
+
229
+ def instantiate_optimizer_from_config(self, params, lr, cfg):
230
+ return get_obj_from_str(cfg["target"])(
231
+ params, lr=lr, **cfg.get("params", dict())
232
+ )
233
+
234
+ def configure_optimizers(self):
235
+ lr = self.learning_rate
236
+ params = list(self.model.parameters())
237
+ for embedder in self.conditioner.embedders:
238
+ if embedder.is_trainable:
239
+ params = params + list(embedder.parameters())
240
+ opt = self.instantiate_optimizer_from_config(params, lr, self.optimizer_config)
241
+ if self.scheduler_config is not None:
242
+ scheduler = instantiate_from_config(self.scheduler_config)
243
+ print("Setting up LambdaLR scheduler...")
244
+ scheduler = [
245
+ {
246
+ "scheduler": LambdaLR(opt, lr_lambda=scheduler.schedule),
247
+ "interval": "step",
248
+ "frequency": 1,
249
+ }
250
+ ]
251
+ return [opt], scheduler
252
+ return opt
253
+
254
+ @torch.no_grad()
255
+ def sample(
256
+ self,
257
+ cond: Dict,
258
+ uc: Union[Dict, None] = None,
259
+ batch_size: int = 16,
260
+ shape: Union[None, Tuple, List] = None,
261
+ **kwargs,
262
+ ):
263
+ randn = torch.randn(batch_size, *shape).to(self.device)
264
+
265
+ denoiser = lambda input, sigma, c: self.denoiser(
266
+ self.model, input, sigma, c, **kwargs
267
+ )
268
+ samples = self.sampler(denoiser, randn, cond, uc=uc)
269
+ return samples
270
+
271
+ @torch.no_grad()
272
+ def log_conditionings(self, batch: Dict, n: int) -> Dict:
273
+ """
274
+ Defines heuristics to log different conditionings.
275
+ These can be lists of strings (text-to-image), tensors, ints, ...
276
+ """
277
+ image_h, image_w = batch[self.input_key].shape[2:]
278
+ log = dict()
279
+
280
+ for embedder in self.conditioner.embedders:
281
+ if (
282
+ (self.log_keys is None) or (embedder.input_key in self.log_keys)
283
+ ) and not self.no_cond_log:
284
+ x = batch[embedder.input_key][:n]
285
+ if isinstance(x, torch.Tensor):
286
+ if x.dim() == 1:
287
+ # class-conditional, convert integer to string
288
+ x = [str(x[i].item()) for i in range(x.shape[0])]
289
+ xc = log_txt_as_img((image_h, image_w), x, size=image_h // 4)
290
+ elif x.dim() == 2:
291
+ # size and crop cond and the like
292
+ x = [
293
+ "x".join([str(xx) for xx in x[i].tolist()])
294
+ for i in range(x.shape[0])
295
+ ]
296
+ xc = log_txt_as_img((image_h, image_w), x, size=image_h // 20)
297
+ else:
298
+ raise NotImplementedError()
299
+ elif isinstance(x, (List, ListConfig)):
300
+ if isinstance(x[0], str):
301
+ # strings
302
+ xc = log_txt_as_img((image_h, image_w), x, size=image_h // 20)
303
+ else:
304
+ raise NotImplementedError()
305
+ else:
306
+ raise NotImplementedError()
307
+ log[embedder.input_key] = xc
308
+ return log
309
+
310
+ @torch.no_grad()
311
+ def log_images(
312
+ self,
313
+ batch: Dict,
314
+ N: int = 8,
315
+ sample: bool = True,
316
+ ucg_keys: List[str] = None,
317
+ **kwargs,
318
+ ) -> Dict:
319
+ conditioner_input_keys = [e.input_key for e in self.conditioner.embedders]
320
+ if ucg_keys:
321
+ assert all(map(lambda x: x in conditioner_input_keys, ucg_keys)), (
322
+ "Each defined ucg key for sampling must be in the provided conditioner input keys,"
323
+ f"but we have {ucg_keys} vs. {conditioner_input_keys}"
324
+ )
325
+ else:
326
+ ucg_keys = conditioner_input_keys
327
+ log = dict()
328
+
329
+ x = self.get_input(batch)
330
+
331
+ c, uc = self.conditioner.get_unconditional_conditioning(
332
+ batch,
333
+ force_uc_zero_embeddings=ucg_keys
334
+ if len(self.conditioner.embedders) > 0
335
+ else [],
336
+ )
337
+
338
+ sampling_kwargs = {}
339
+
340
+ N = min(x.shape[0], N)
341
+ x = x.to(self.device)[:N]
342
+ log["inputs"] = x
343
+ z = self.encode_first_stage(x)
344
+ log["reconstructions"] = self.decode_first_stage(z)
345
+ log.update(self.log_conditionings(batch, N))
346
+
347
+ for k in c:
348
+ if isinstance(c[k], torch.Tensor):
349
+ c[k], uc[k] = map(lambda y: y[k][:N].to(self.device), (c, uc))
350
+
351
+ if sample:
352
+ with self.ema_scope("Plotting"):
353
+ samples = self.sample(
354
+ c, shape=z.shape[1:], uc=uc, batch_size=N, **sampling_kwargs
355
+ )
356
+ samples = self.decode_first_stage(samples)
357
+ log["samples"] = samples
358
+ return log
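
DiffusionEngine.encode_first_stage folds 5-D video batches into the batch axis before running the frame-wise first-stage encoder, scales the latents by `scale_factor`, and restores the time axis afterwards. A minimal sketch of that reshaping follows, assuming a stand-in `nn.Conv2d` encoder and an example scale factor (0.18215 is only a familiar default here, not a value fixed by this commit).

import torch
import torch.nn as nn
from einops import rearrange

def encode_video_batch(encoder: nn.Module, x: torch.Tensor, scale_factor: float) -> torch.Tensor:
    # x: (b, t, c, h, w) video clips in [-1, 1]; fold frames into the batch axis,
    # encode frame by frame, scale the latents, and restore the time axis.
    b = x.shape[0]
    x = rearrange(x, "b t c h w -> (b t) c h w")
    z = encoder(x) * scale_factor
    return rearrange(z, "(b t) c h w -> b t c h w", b=b)

if __name__ == "__main__":
    encoder = nn.Conv2d(3, 4, kernel_size=1)      # stand-in for the first-stage encoder
    clips = torch.randn(2, 5, 3, 32, 32)          # 2 clips of 5 frames each
    z = encode_video_batch(encoder, clips, scale_factor=0.18215)
    print(z.shape)                                 # torch.Size([2, 5, 4, 32, 32])
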
sgm/models/video3d_diffusion.py ADDED
@@ -0,0 +1,524 @@
1
+ import re
2
+ import math
3
+ from contextlib import contextmanager
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+
6
+ import pytorch_lightning as pl
7
+ from pytorch_lightning.loggers import WandbLogger
8
+ import torch
9
+ from omegaconf import ListConfig, OmegaConf
10
+ from safetensors.torch import load_file as load_safetensors
11
+ from torch.optim.lr_scheduler import LambdaLR
12
+ from torchvision.utils import make_grid
13
+ from einops import rearrange, repeat
14
+
15
+ from ..modules import UNCONDITIONAL_CONFIG
16
+ from ..modules.autoencoding.temporal_ae import VideoDecoder
17
+ from ..modules.diffusionmodules.wrappers import OPENAIUNETWRAPPER
18
+ from ..modules.ema import LitEma
19
+ from ..modules.encoders.modules import VideoPredictionEmbedderWithEncoder
20
+ from ..util import (
21
+ default,
22
+ disabled_train,
23
+ get_obj_from_str,
24
+ instantiate_from_config,
25
+ log_txt_as_img,
26
+ video_frames_as_grid,
27
+ )
28
+
29
+
30
+ def flatten_for_video(input):
31
+ return input.flatten()
32
+
33
+
34
+ class Video3DDiffusionEngine(pl.LightningModule):
35
+ def __init__(
36
+ self,
37
+ network_config,
38
+ denoiser_config,
39
+ first_stage_config,
40
+ conditioner_config: Union[None, Dict, ListConfig, OmegaConf] = None,
41
+ sampler_config: Union[None, Dict, ListConfig, OmegaConf] = None,
42
+ optimizer_config: Union[None, Dict, ListConfig, OmegaConf] = None,
43
+ scheduler_config: Union[None, Dict, ListConfig, OmegaConf] = None,
44
+ loss_fn_config: Union[None, Dict, ListConfig, OmegaConf] = None,
45
+ network_wrapper: Union[None, str] = None,
46
+ ckpt_path: Union[None, str] = None,
47
+ use_ema: bool = False,
48
+ ema_decay_rate: float = 0.9999,
49
+ scale_factor: float = 1.0,
50
+ disable_first_stage_autocast=False,
51
+ input_key: str = "frames", # for video inputs
52
+ log_keys: Union[List, None] = None,
53
+ no_cond_log: bool = False,
54
+ compile_model: bool = False,
55
+ en_and_decode_n_samples_a_time: Optional[int] = None,
56
+ ):
57
+ super().__init__()
58
+ self.log_keys = log_keys
59
+ self.input_key = input_key
60
+ self.optimizer_config = default(
61
+ optimizer_config, {"target": "torch.optim.AdamW"}
62
+ )
63
+ model = instantiate_from_config(network_config)
64
+ self.model = get_obj_from_str(default(network_wrapper, OPENAIUNETWRAPPER))(
65
+ model, compile_model=compile_model
66
+ )
67
+
68
+ self.denoiser = instantiate_from_config(denoiser_config)
69
+ self.sampler = (
70
+ instantiate_from_config(sampler_config)
71
+ if sampler_config is not None
72
+ else None
73
+ )
74
+ self.conditioner = instantiate_from_config(
75
+ default(conditioner_config, UNCONDITIONAL_CONFIG)
76
+ )
77
+ self.scheduler_config = scheduler_config
78
+ self._init_first_stage(first_stage_config)
79
+
80
+ self.loss_fn = (
81
+ instantiate_from_config(loss_fn_config)
82
+ if loss_fn_config is not None
83
+ else None
84
+ )
85
+
86
+ self.use_ema = use_ema
87
+ if self.use_ema:
88
+ self.model_ema = LitEma(self.model, decay=ema_decay_rate)
89
+ print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
90
+
91
+ self.scale_factor = scale_factor
92
+ self.disable_first_stage_autocast = disable_first_stage_autocast
93
+ self.no_cond_log = no_cond_log
94
+
95
+ if ckpt_path is not None:
96
+ self.init_from_ckpt(ckpt_path)
97
+
98
+ self.en_and_decode_n_samples_a_time = en_and_decode_n_samples_a_time
99
+
100
+ def _load_last_embedder(self, original_state_dict):
101
+ original_module_name = "conditioner.embedders.3"
102
+ state_dict = dict()
103
+ for k, v in original_state_dict.items():
104
+ m = re.match(rf"^{original_module_name}\.(.*)$", k)
105
+ if m is None:
106
+ continue
107
+ state_dict[m.group(1)] = v
108
+
109
+ idx = -1
110
+ for i in range(len(self.conditioner.embedders)):
111
+ if isinstance(
112
+ self.conditioner.embedders[i], VideoPredictionEmbedderWithEncoder
113
+ ):
114
+ idx = i
115
+
116
+ print(f"Embedder [{idx}] is the frame encoder, make sure this is expected")
117
+
118
+ self.conditioner.embedders[idx].load_state_dict(state_dict)
119
+
120
+ def init_from_ckpt(
121
+ self,
122
+ path: str,
123
+ ) -> None:
124
+ if path.endswith("ckpt"):
125
+ sd = torch.load(path, map_location="cpu")["state_dict"]
126
+ elif path.endswith("safetensors"):
127
+ sd = load_safetensors(path)
128
+ else:
129
+ raise NotImplementedError
130
+
131
+ self_sd = self.state_dict()
132
+ input_keys = [
133
+ "model.diffusion_model.input_blocks.0.0.weight",
134
+ "model_ema.diffusion_modelinput_blocks00weight",
135
+ ]
136
+ for input_key in input_keys:
137
+ if input_key not in sd or input_key not in self_sd:
138
+ continue
139
+
140
+ input_weight = self_sd[input_key]
141
+
142
+ if input_weight.shape != sd[input_key].shape:
143
+ print("Manual init: {}".format(input_key))
144
+ input_weight.zero_()
145
+ input_weight[:, :8, :, :].copy_(sd[input_key])
146
+
147
+ deleted_keys = []
148
+ for k, v in self.state_dict().items():
149
+ # resolve shape mismatch
150
+ if k in sd:
151
+ if v.shape != sd[k].shape:
152
+ del sd[k]
153
+ deleted_keys.append(k)
154
+
155
+ if len(deleted_keys) > 0:
156
+ print(f"Deleted Keys: {deleted_keys}")
157
+
158
+ missing, unexpected = self.load_state_dict(sd, strict=False)
159
+ print(
160
+ f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys"
161
+ )
162
+ if len(missing) > 0:
163
+ print(f"Missing Keys: {missing}")
164
+ if len(unexpected) > 0:
165
+ print(f"Unexpected Keys: {unexpected}")
166
+ if len(deleted_keys) > 0:
167
+ print(f"Deleted Keys: {deleted_keys}")
168
+
169
+ if len(missing) > 0 or len(unexpected) > 0:
170
+ # means we are loading from a checkpoint that has the old embedder (motion bucket id and fps id)
171
+ print("Modified embedder to support 3d spiral video inputs")
172
+ try:
173
+ self._load_last_embedder(sd)
174
+ except Exception:
175
+ print("Failed to load last embedder, make sure this is expected")
176
+
177
+ def _init_first_stage(self, config):
178
+ model = instantiate_from_config(config).eval()
179
+ model.train = disabled_train
180
+ for param in model.parameters():
181
+ param.requires_grad = False
182
+ self.first_stage_model = model
183
+
184
+ def get_input(self, batch):
185
+ # assuming unified data format, dataloader returns a dict.
186
+ # image tensors should be scaled to -1 ... 1 and in bchw format
187
+ return batch[self.input_key]
188
+
189
+ @torch.no_grad()
190
+ def decode_first_stage(self, z):
191
+ z = 1.0 / self.scale_factor * z
192
+ is_video_input = False
193
+ bs = z.shape[0]
194
+ if z.dim() == 5:
195
+ is_video_input = True
196
+ # for video diffusion
197
+ z = rearrange(z, "b t c h w -> (b t) c h w")
198
+ n_samples = default(self.en_and_decode_n_samples_a_time, z.shape[0])
199
+
200
+ n_rounds = math.ceil(z.shape[0] / n_samples)
201
+ all_out = []
202
+ with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
203
+ for n in range(n_rounds):
204
+ if isinstance(self.first_stage_model.decoder, VideoDecoder):
205
+ kwargs = {"timesteps": len(z[n * n_samples : (n + 1) * n_samples])}
206
+ else:
207
+ kwargs = {}
208
+ out = self.first_stage_model.decode(
209
+ z[n * n_samples : (n + 1) * n_samples], **kwargs
210
+ )
211
+ all_out.append(out)
212
+ out = torch.cat(all_out, dim=0)
213
+
214
+ if is_video_input:
215
+ out = rearrange(out, "(b t) c h w -> b t c h w", b=bs)
216
+
217
+ return out
218
+
219
+ @torch.no_grad()
220
+ def encode_first_stage(self, x):
221
+ if self.input_key == "latents":
222
+ return x
223
+
224
+ bs = x.shape[0]
225
+ is_video_input = False
226
+ if x.dim() == 5:
227
+ is_video_input = True
228
+ # for video diffusion
229
+ x = rearrange(x, "b t c h w -> (b t) c h w")
230
+ n_samples = default(self.en_and_decode_n_samples_a_time, x.shape[0])
231
+ n_rounds = math.ceil(x.shape[0] / n_samples)
232
+ all_out = []
233
+ with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
234
+ for n in range(n_rounds):
235
+ out = self.first_stage_model.encode(
236
+ x[n * n_samples : (n + 1) * n_samples]
237
+ )
238
+ all_out.append(out)
239
+ z = torch.cat(all_out, dim=0)
240
+ z = self.scale_factor * z
241
+
242
+ # if is_video_input:
243
+ # z = rearrange(z, "(b t) c h w -> b t c h w", b=bs)
244
+
245
+ return z
246
+
247
+ def forward(self, x, batch):
248
+ loss, model_output = self.loss_fn(
249
+ self.model,
250
+ self.denoiser,
251
+ self.conditioner,
252
+ x,
253
+ batch,
254
+ return_model_output=True,
255
+ )
256
+ loss_mean = loss.mean()
257
+ loss_dict = {"loss": loss_mean, "model_output": model_output}
258
+ return loss_mean, loss_dict
259
+
260
+ def shared_step(self, batch: Dict) -> Any:
261
+ # TODO: move this shit to collate_fn in dataloader
262
+ # if "fps_id" in batch:
263
+ # batch["fps_id"] = flatten_for_video(batch["fps_id"])
264
+ # if "motion_bucket_id" in batch:
265
+ # batch["motion_bucket_id"] = flatten_for_video(batch["motion_bucket_id"])
266
+ # if "cond_aug" in batch:
267
+ # batch["cond_aug"] = flatten_for_video(batch["cond_aug"])
268
+ x = self.get_input(batch)
269
+ x = self.encode_first_stage(x)
270
+ # ## debug
271
+ # x_recon = self.decode_first_stage(x)
272
+ # video_frames_as_grid((batch["frames"][0] + 1.0) / 2.0, "./tmp/origin.jpg")
273
+ # video_frames_as_grid((x_recon[0] + 1.0) / 2.0, "./tmp/recon.jpg")
274
+ # ## debug
275
+ batch["global_step"] = self.global_step
276
+ loss, loss_dict = self(x, batch)
277
+ return loss, loss_dict
278
+
279
+ def training_step(self, batch, batch_idx):
280
+ loss, loss_dict = self.shared_step(batch)
281
+
282
+ with torch.no_grad():
283
+ if "model_output" in loss_dict:
284
+ if batch_idx % 100 == 0:
285
+ if isinstance(self.logger, WandbLogger):
286
+ model_output = loss_dict["model_output"].detach()[
287
+ : batch["num_video_frames"]
288
+ ]
289
+ recons = (
290
+ (self.decode_first_stage(model_output) + 1.0) / 2.0
291
+ ).clamp(0.0, 1.0)
292
+ recon_grid = make_grid(recons, nrow=4)
293
+ self.logger.log_image(
294
+ key=f"train/model_output_recon",
295
+ images=[recon_grid],
296
+ step=self.global_step,
297
+ )
298
+ del loss_dict["model_output"]
299
+
300
+ if torch.isnan(loss).any():
301
+ print("Nan detected")
302
+ loss = None
303
+
304
+ self.log_dict(
305
+ loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=False
306
+ )
307
+
308
+ self.log(
309
+ "global_step",
310
+ self.global_step,
311
+ prog_bar=True,
312
+ logger=True,
313
+ on_step=True,
314
+ on_epoch=False,
315
+ )
316
+
317
+ if self.scheduler_config is not None:
318
+ lr = self.optimizers().param_groups[0]["lr"]
319
+ self.log(
320
+ "lr_abs", lr, prog_bar=True, logger=True, on_step=True, on_epoch=False
321
+ )
322
+
323
+ return loss
324
+
325
+ def on_train_start(self, *args, **kwargs):
326
+ if self.sampler is None or self.loss_fn is None:
327
+ raise ValueError("Sampler and loss function need to be set for training.")
328
+
329
+ def on_train_batch_end(self, *args, **kwargs):
330
+ if self.use_ema:
331
+ self.model_ema(self.model)
332
+
333
+ @contextmanager
334
+ def ema_scope(self, context=None):
335
+ if self.use_ema:
336
+ self.model_ema.store(self.model.parameters())
337
+ self.model_ema.copy_to(self.model)
338
+ if context is not None:
339
+ print(f"{context}: Switched to EMA weights")
340
+ try:
341
+ yield None
342
+ finally:
343
+ if self.use_ema:
344
+ self.model_ema.restore(self.model.parameters())
345
+ if context is not None:
346
+ print(f"{context}: Restored training weights")
347
+
348
+ def instantiate_optimizer_from_config(self, params, lr, cfg):
349
+ return get_obj_from_str(cfg["target"])(
350
+ params, lr=lr, **cfg.get("params", dict())
351
+ )
352
+
353
+ def configure_optimizers(self):
354
+ lr = self.learning_rate
355
+ params = list(self.model.parameters())
356
+ for embedder in self.conditioner.embedders:
357
+ if embedder.is_trainable:
358
+ params = params + list(embedder.parameters())
359
+ opt = self.instantiate_optimizer_from_config(params, lr, self.optimizer_config)
360
+ if self.scheduler_config is not None:
361
+ scheduler = instantiate_from_config(self.scheduler_config)
362
+ print("Setting up LambdaLR scheduler...")
363
+ scheduler = [
364
+ {
365
+ "scheduler": LambdaLR(opt, lr_lambda=scheduler.schedule),
366
+ "interval": "step",
367
+ "frequency": 1,
368
+ }
369
+ ]
370
+ return [opt], scheduler
371
+ return opt
372
+
373
+ @torch.no_grad()
374
+ def sample(
375
+ self,
376
+ cond: Dict,
377
+ uc: Union[Dict, None] = None,
378
+ batch_size: int = 16,
379
+ shape: Union[None, Tuple, List] = None,
380
+ **kwargs,
381
+ ):
382
+ randn = torch.randn(batch_size, *shape).to(self.device)
383
+
384
+ denoiser = lambda input, sigma, c: self.denoiser(
385
+ self.model, input, sigma, c, **kwargs
386
+ )
387
+ samples = self.sampler(denoiser, randn, cond, uc=uc)
388
+ return samples
389
+
390
+ @torch.no_grad()
391
+ def log_conditionings(self, batch: Dict, n: int) -> Dict:
392
+ """
393
+ Defines heuristics to log different conditionings.
394
+ These can be lists of strings (text-to-image), tensors, ints, ...
395
+ """
396
+ image_h, image_w = batch[self.input_key].shape[-2:]
397
+ log = dict()
398
+
399
+ for embedder in self.conditioner.embedders:
400
+ if (
401
+ (self.log_keys is None) or (embedder.input_key in self.log_keys)
402
+ ) and not self.no_cond_log:
403
+ x = batch[embedder.input_key][:n]
404
+ if isinstance(x, torch.Tensor):
405
+ if x.dim() == 1:
406
+ # class-conditional, convert integer to string
407
+ x = [str(x[i].item()) for i in range(x.shape[0])]
408
+ xc = log_txt_as_img((image_h, image_w), x, size=image_h // 4)
409
+ elif x.dim() == 2:
410
+ # size and crop cond and the like
411
+ x = [
412
+ "x".join([str(xx) for xx in x[i].tolist()])
413
+ for i in range(x.shape[0])
414
+ ]
415
+ xc = log_txt_as_img((image_h, image_w), x, size=image_h // 20)
416
+ elif x.dim() == 4:
417
+ # image
418
+ xc = x
419
+ else:
420
+ raise NotImplementedError()
421
+ elif isinstance(x, (List, ListConfig)):
422
+ if isinstance(x[0], str):
423
+ # strings
424
+ xc = log_txt_as_img((image_h, image_w), x, size=image_h // 20)
425
+ else:
426
+ raise NotImplementedError()
427
+ else:
428
+ raise NotImplementedError()
429
+ log[embedder.input_key] = xc
430
+
431
+ return log
432
+
433
+ # for video diffusions will be logging frames of a video
434
+ @torch.no_grad()
435
+ def log_images(
436
+ self,
437
+ batch: Dict,
438
+ N: int = 1,
439
+ sample: bool = True,
440
+ ucg_keys: List[str] = None,
441
+ **kwargs,
442
+ ) -> Dict:
443
+ # # debug
444
+ # return {}
445
+ # # debug
446
+ assert "num_video_frames" in batch, "num_video_frames must be in batch"
447
+ num_video_frames = batch["num_video_frames"]
448
+ conditioner_input_keys = [e.input_key for e in self.conditioner.embedders]
449
+ conditioner_input_keys = []
450
+ for e in self.conditioner.embedders:
451
+ if e.input_key is not None:
452
+ conditioner_input_keys.append(e.input_key)
453
+ else:
454
+ conditioner_input_keys.extend(e.input_keys)
455
+ if ucg_keys:
456
+ assert all(map(lambda x: x in conditioner_input_keys, ucg_keys)), (
457
+ "Each defined ucg key for sampling must be in the provided conditioner input keys,"
458
+ f"but we have {ucg_keys} vs. {conditioner_input_keys}"
459
+ )
460
+ else:
461
+ ucg_keys = conditioner_input_keys
462
+ log = dict()
463
+
464
+ x = self.get_input(batch)
465
+
466
+ c, uc = self.conditioner.get_unconditional_conditioning(
467
+ batch,
468
+ force_uc_zero_embeddings=ucg_keys
469
+ if len(self.conditioner.embedders) > 0
470
+ else [],
471
+ )
472
+
473
+ sampling_kwargs = {"num_video_frames": num_video_frames}
474
+ n = min(x.shape[0] // num_video_frames, N)
475
+ sampling_kwargs["image_only_indicator"] = torch.cat(
476
+ [batch["image_only_indicator"][:n]] * 2
477
+ )
478
+
479
+ N = min(x.shape[0] // num_video_frames, N) * num_video_frames
480
+ x = x.to(self.device)[:N]
481
+ # log["inputs"] = rearrange(x, "(b t) c h w -> b c h (t w)", t=num_video_frames)
482
+ log["inputs"] = x
483
+ z = self.encode_first_stage(x)
484
+ recon = self.decode_first_stage(z)
485
+ # log["reconstructions"] = rearrange(
486
+ # recon, "(b t) c h w -> b c h (t w)", t=num_video_frames
487
+ # )
488
+ log["reconstructions"] = recon
489
+ log.update(self.log_conditionings(batch, N))
490
+ log["pixelnerf_rgb"] = c["rgb"]
491
+
492
+ for k in ["crossattn", "concat", "vector"]:
493
+ if k in c:
494
+ c[k] = c[k][:N]
495
+ uc[k] = uc[k][:N]
496
+
497
+ # for k in c:
498
+ # if isinstance(c[k], torch.Tensor):
499
+ # if k == "vector":
500
+ # end = N
501
+ # else:
502
+ # end = n
503
+ # c[k], uc[k] = map(lambda y: y[k][:end].to(self.device), (c, uc))
504
+
505
+ # # for k in c:
506
+ # # print(c[k].shape)
507
+
508
+ # breakpoint()
509
+ # for k in ["crossattn", "concat"]:
510
+ # c[k] = repeat(c[k], "b ... -> b t ...", t=num_video_frames)
511
+ # c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_video_frames)
512
+ # uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_video_frames)
513
+ # uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_video_frames)
514
+
515
+ # for k in c:
516
+ # print(c[k].shape)
517
+ if sample:
518
+ with self.ema_scope("Plotting"):
519
+ samples = self.sample(
520
+ c, shape=z.shape[1:], uc=uc, batch_size=N, **sampling_kwargs
521
+ )
522
+ samples = self.decode_first_stage(samples)
523
+ log["samples"] = samples
524
+ return log
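
The `_load_last_embedder` helper above recovers a single conditioner embedder's weights from a full checkpoint by matching keys under the `conditioner.embedders.3` prefix and stripping that prefix before calling `load_state_dict`. A small, self-contained sketch of the same key remapping, with a toy state dict as an assumed example (`re.escape` is added here for robustness and is not in the commit's version):

import re

def extract_submodule_state_dict(full_sd: dict, module_name: str) -> dict:
    # Keep only keys that live under `module_name` and strip that prefix so the
    # result can be passed to submodule.load_state_dict(...).
    out = {}
    for key, value in full_sd.items():
        m = re.match(rf"^{re.escape(module_name)}\.(.*)$", key)
        if m is not None:
            out[m.group(1)] = value
    return out

if __name__ == "__main__":
    toy_sd = {
        "model.diffusion_model.out.0.weight": 0,
        "conditioner.embedders.3.encoder.weight": 1,
        "conditioner.embedders.3.encoder.bias": 2,
    }
    print(extract_submodule_state_dict(toy_sd, "conditioner.embedders.3"))
    # -> {'encoder.weight': 1, 'encoder.bias': 2}
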
sgm/models/video_diffusion.py ADDED
@@ -0,0 +1,503 @@
1
+ import re
2
+ import math
3
+ from contextlib import contextmanager
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+
6
+ import pytorch_lightning as pl
7
+ from pytorch_lightning.loggers import WandbLogger
8
+ import torch
9
+ from omegaconf import ListConfig, OmegaConf
10
+ from safetensors.torch import load_file as load_safetensors
11
+ from torch.optim.lr_scheduler import LambdaLR
12
+ from torchvision.utils import make_grid
13
+ from einops import rearrange, repeat
14
+
15
+ from ..modules import UNCONDITIONAL_CONFIG
16
+ from ..modules.autoencoding.temporal_ae import VideoDecoder
17
+ from ..modules.diffusionmodules.wrappers import OPENAIUNETWRAPPER
18
+ from ..modules.ema import LitEma
19
+ from ..modules.encoders.modules import VideoPredictionEmbedderWithEncoder
20
+ from ..util import (
21
+ default,
22
+ disabled_train,
23
+ get_obj_from_str,
24
+ instantiate_from_config,
25
+ log_txt_as_img,
26
+ video_frames_as_grid,
27
+ )
28
+
29
+
30
+ def flatten_for_video(input):
31
+ return input.flatten()
32
+
33
+
34
+ class DiffusionEngine(pl.LightningModule):
35
+ def __init__(
36
+ self,
37
+ network_config,
38
+ denoiser_config,
39
+ first_stage_config,
40
+ conditioner_config: Union[None, Dict, ListConfig, OmegaConf] = None,
41
+ sampler_config: Union[None, Dict, ListConfig, OmegaConf] = None,
42
+ optimizer_config: Union[None, Dict, ListConfig, OmegaConf] = None,
43
+ scheduler_config: Union[None, Dict, ListConfig, OmegaConf] = None,
44
+ loss_fn_config: Union[None, Dict, ListConfig, OmegaConf] = None,
45
+ network_wrapper: Union[None, str] = None,
46
+ ckpt_path: Union[None, str] = None,
47
+ use_ema: bool = False,
48
+ ema_decay_rate: float = 0.9999,
49
+ scale_factor: float = 1.0,
50
+ disable_first_stage_autocast=False,
51
+ input_key: str = "frames", # for video inputs
52
+ log_keys: Union[List, None] = None,
53
+ no_cond_log: bool = False,
54
+ compile_model: bool = False,
55
+ en_and_decode_n_samples_a_time: Optional[int] = None,
56
+ load_last_embedder: bool = False,
57
+ from_scratch: bool = False,
58
+ ):
59
+ super().__init__()
60
+ self.log_keys = log_keys
61
+ self.input_key = input_key
62
+ self.optimizer_config = default(
63
+ optimizer_config, {"target": "torch.optim.AdamW"}
64
+ )
65
+ model = instantiate_from_config(network_config)
66
+ self.model = get_obj_from_str(default(network_wrapper, OPENAIUNETWRAPPER))(
67
+ model, compile_model=compile_model
68
+ )
69
+
70
+ self.denoiser = instantiate_from_config(denoiser_config)
71
+ self.sampler = (
72
+ instantiate_from_config(sampler_config)
73
+ if sampler_config is not None
74
+ else None
75
+ )
76
+ self.conditioner = instantiate_from_config(
77
+ default(conditioner_config, UNCONDITIONAL_CONFIG)
78
+ )
79
+ self.scheduler_config = scheduler_config
80
+ self._init_first_stage(first_stage_config)
81
+
82
+ self.loss_fn = (
83
+ instantiate_from_config(loss_fn_config)
84
+ if loss_fn_config is not None
85
+ else None
86
+ )
87
+
88
+ self.use_ema = use_ema
89
+ if self.use_ema:
90
+ self.model_ema = LitEma(self.model, decay=ema_decay_rate)
91
+ print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
92
+
93
+ self.scale_factor = scale_factor
94
+ self.disable_first_stage_autocast = disable_first_stage_autocast
95
+ self.no_cond_log = no_cond_log
96
+
97
+ self.load_last_embedder = load_last_embedder
98
+ if ckpt_path is not None:
99
+ self.init_from_ckpt(ckpt_path, from_scratch)
100
+
101
+ self.en_and_decode_n_samples_a_time = en_and_decode_n_samples_a_time
102
+
103
+ def _load_last_embedder(self, original_state_dict):
104
+ original_module_name = "conditioner.embedders.3"
105
+ state_dict = dict()
106
+ for k, v in original_state_dict.items():
107
+ m = re.match(rf"^{original_module_name}\.(.*)$", k)
108
+ if m is None:
109
+ continue
110
+ state_dict[m.group(1)] = v
111
+
112
+ idx = -1
113
+ for i in range(len(self.conditioner.embedders)):
114
+ if isinstance(
115
+ self.conditioner.embedders[i], VideoPredictionEmbedderWithEncoder
116
+ ):
117
+ idx = i
118
+
119
+ print(f"Embedder [{idx}] is the frame encoder, make sure this is expected")
120
+
121
+ self.conditioner.embedders[idx].load_state_dict(state_dict)
122
+
123
+ def init_from_ckpt(
124
+ self,
125
+ path: str,
126
+ from_scratch: bool = False,
127
+ ) -> None:
128
+ if path.endswith("ckpt"):
129
+ sd = torch.load(path, map_location="cpu")["state_dict"]
130
+ elif path.endswith("safetensors"):
131
+ sd = load_safetensors(path)
132
+ else:
133
+ raise NotImplementedError
134
+
135
+ deleted_keys = []
136
+ for k, v in self.state_dict().items():
137
+ # resolve shape mismatch
138
+ if k in sd:
139
+ if v.shape != sd[k].shape:
140
+ del sd[k]
141
+ deleted_keys.append(k)
142
+
143
+ if from_scratch:
144
+ new_sd = {}
145
+ for k in sd:
146
+ if "first_stage_model" in k:
147
+ new_sd[k] = sd[k]
148
+ sd = new_sd
149
+ print(sd.keys())
150
+
151
+ if len(deleted_keys) > 0:
152
+ print(f"Deleted Keys: {deleted_keys}")
153
+
154
+ missing, unexpected = self.load_state_dict(sd, strict=False)
155
+ print(
156
+ f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys"
157
+ )
158
+ if len(missing) > 0:
159
+ print(f"Missing Keys: {missing}")
160
+ if len(unexpected) > 0:
161
+ print(f"Unexpected Keys: {unexpected}")
162
+ if len(deleted_keys) > 0:
163
+ print(f"Deleted Keys: {deleted_keys}")
164
+
165
+ if (len(missing) > 0 or len(unexpected) > 0) and self.load_last_embedder:
166
+ # means we are loading from a checkpoint that has the old embedder (motion bucket id and fps id)
167
+ print("Modified embedder to support 3d spiral video inputs")
168
+ self._load_last_embedder(sd)
169
+
170
+ def _init_first_stage(self, config):
171
+ model = instantiate_from_config(config).eval()
172
+ model.train = disabled_train
173
+ for param in model.parameters():
174
+ param.requires_grad = False
175
+ self.first_stage_model = model
176
+
177
+ def get_input(self, batch):
178
+ # assuming unified data format, dataloader returns a dict.
179
+ # image tensors should be scaled to -1 ... 1 and in bchw format
180
+ return batch[self.input_key]
181
+
182
+ @torch.no_grad()
183
+ def decode_first_stage(self, z):
184
+ z = 1.0 / self.scale_factor * z
185
+ is_video_input = False
186
+ bs = z.shape[0]
187
+ if z.dim() == 5:
188
+ is_video_input = True
189
+ # for video diffusion
190
+ z = rearrange(z, "b t c h w -> (b t) c h w")
191
+ n_samples = default(self.en_and_decode_n_samples_a_time, z.shape[0])
192
+
193
+ n_rounds = math.ceil(z.shape[0] / n_samples)
194
+ all_out = []
195
+ with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
196
+ for n in range(n_rounds):
197
+ if isinstance(self.first_stage_model.decoder, VideoDecoder):
198
+ kwargs = {"timesteps": len(z[n * n_samples : (n + 1) * n_samples])}
199
+ else:
200
+ kwargs = {}
201
+ out = self.first_stage_model.decode(
202
+ z[n * n_samples : (n + 1) * n_samples], **kwargs
203
+ )
204
+ all_out.append(out)
205
+ out = torch.cat(all_out, dim=0)
206
+
207
+ if is_video_input:
208
+ out = rearrange(out, "(b t) c h w -> b t c h w", b=bs)
209
+
210
+ return out
211
+
212
+ @torch.no_grad()
213
+ def encode_first_stage(self, x):
214
+ if self.input_key == "latents":
215
+ return x * self.scale_factor
216
+
217
+ bs = x.shape[0]
218
+ is_video_input = False
219
+ if x.dim() == 5:
220
+ is_video_input = True
221
+ # for video diffusion
222
+ x = rearrange(x, "b t c h w -> (b t) c h w")
223
+ n_samples = default(self.en_and_decode_n_samples_a_time, x.shape[0])
224
+ n_rounds = math.ceil(x.shape[0] / n_samples)
225
+ all_out = []
226
+ with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
227
+ for n in range(n_rounds):
228
+ out = self.first_stage_model.encode(
229
+ x[n * n_samples : (n + 1) * n_samples]
230
+ )
231
+ all_out.append(out)
232
+ z = torch.cat(all_out, dim=0)
233
+ z = self.scale_factor * z
234
+
235
+ # if is_video_input:
236
+ # z = rearrange(z, "(b t) c h w -> b t c h w", b=bs)
237
+
238
+ return z
239
+
240
+ def forward(self, x, batch):
241
+ loss, model_output = self.loss_fn(
242
+ self.model,
243
+ self.denoiser,
244
+ self.conditioner,
245
+ x,
246
+ batch,
247
+ return_model_output=True,
248
+ )
249
+ loss_mean = loss.mean()
250
+ loss_dict = {"loss": loss_mean, "model_output": model_output}
251
+ return loss_mean, loss_dict
252
+
253
+ def shared_step(self, batch: Dict) -> Any:
254
+ # TODO: move this to collate_fn in the dataloader
255
+ # if "fps_id" in batch:
256
+ # batch["fps_id"] = flatten_for_video(batch["fps_id"])
257
+ # if "motion_bucket_id" in batch:
258
+ # batch["motion_bucket_id"] = flatten_for_video(batch["motion_bucket_id"])
259
+ # if "cond_aug" in batch:
260
+ # batch["cond_aug"] = flatten_for_video(batch["cond_aug"])
261
+ x = self.get_input(batch)
262
+ x = self.encode_first_stage(x)
263
+ # ## debug
264
+ # x_recon = self.decode_first_stage(x)
265
+ # video_frames_as_grid((batch["frames"][0] + 1.0) / 2.0, "./tmp/origin.jpg")
266
+ # video_frames_as_grid((x_recon[0] + 1.0) / 2.0, "./tmp/recon.jpg")
267
+ # ## debug
268
+ batch["global_step"] = self.global_step
269
+ # breakpoint()
270
+ loss, loss_dict = self(x, batch)
271
+ return loss, loss_dict
272
+
273
+ def training_step(self, batch, batch_idx):
274
+ loss, loss_dict = self.shared_step(batch)
275
+
276
+ with torch.no_grad():
277
+ if "model_output" in loss_dict:
278
+ if batch_idx % 100 == 0:
279
+ if isinstance(self.logger, WandbLogger):
280
+ model_output = loss_dict["model_output"].detach()[
281
+ : batch["num_video_frames"]
282
+ ]
283
+ recons = (
284
+ (self.decode_first_stage(model_output) + 1.0) / 2.0
285
+ ).clamp(0.0, 1.0)
286
+ recon_grid = make_grid(recons, nrow=4)
287
+ self.logger.log_image(
288
+ key=f"train/model_output_recon",
289
+ images=[recon_grid],
290
+ step=self.global_step,
291
+ )
292
+ del loss_dict["model_output"]
293
+
294
+ self.log_dict(
295
+ loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=False
296
+ )
297
+
298
+ self.log(
299
+ "global_step",
300
+ self.global_step,
301
+ prog_bar=True,
302
+ logger=True,
303
+ on_step=True,
304
+ on_epoch=False,
305
+ )
306
+
307
+ if self.scheduler_config is not None:
308
+ lr = self.optimizers().param_groups[0]["lr"]
309
+ self.log(
310
+ "lr_abs", lr, prog_bar=True, logger=True, on_step=True, on_epoch=False
311
+ )
312
+
313
+ return loss
314
+
315
+ def on_train_start(self, *args, **kwargs):
316
+ if self.sampler is None or self.loss_fn is None:
317
+ raise ValueError("Sampler and loss function need to be set for training.")
318
+
319
+ def on_train_batch_end(self, *args, **kwargs):
320
+ if self.use_ema:
321
+ self.model_ema(self.model)
322
+
323
+ @contextmanager
324
+ def ema_scope(self, context=None):
325
+ if self.use_ema:
326
+ self.model_ema.store(self.model.parameters())
327
+ self.model_ema.copy_to(self.model)
328
+ if context is not None:
329
+ print(f"{context}: Switched to EMA weights")
330
+ try:
331
+ yield None
332
+ finally:
333
+ if self.use_ema:
334
+ self.model_ema.restore(self.model.parameters())
335
+ if context is not None:
336
+ print(f"{context}: Restored training weights")
337
+
338
+ def instantiate_optimizer_from_config(self, params, lr, cfg):
339
+ return get_obj_from_str(cfg["target"])(
340
+ params, lr=lr, **cfg.get("params", dict())
341
+ )
342
+
343
+ def configure_optimizers(self):
344
+ lr = self.learning_rate
345
+ params = list(self.model.parameters())
346
+ for embedder in self.conditioner.embedders:
347
+ if embedder.is_trainable:
348
+ params = params + list(embedder.parameters())
349
+ opt = self.instantiate_optimizer_from_config(params, lr, self.optimizer_config)
350
+ if self.scheduler_config is not None:
351
+ scheduler = instantiate_from_config(self.scheduler_config)
352
+ print("Setting up LambdaLR scheduler...")
353
+ scheduler = [
354
+ {
355
+ "scheduler": LambdaLR(opt, lr_lambda=scheduler.schedule),
356
+ "interval": "step",
357
+ "frequency": 1,
358
+ }
359
+ ]
360
+ return [opt], scheduler
361
+ return opt
362
+
363
+ @torch.no_grad()
364
+ def sample(
365
+ self,
366
+ cond: Dict,
367
+ uc: Union[Dict, None] = None,
368
+ batch_size: int = 16,
369
+ shape: Union[None, Tuple, List] = None,
370
+ **kwargs,
371
+ ):
372
+ randn = torch.randn(batch_size, *shape).to(self.device)
373
+
374
+ denoiser = lambda input, sigma, c: self.denoiser(
375
+ self.model, input, sigma, c, **kwargs
376
+ )
377
+ samples = self.sampler(denoiser, randn, cond, uc=uc)
378
+ return samples
379
+
380
+ @torch.no_grad()
381
+ def log_conditionings(self, batch: Dict, n: int) -> Dict:
382
+ """
383
+ Defines heuristics to log different conditionings.
384
+ These can be lists of strings (text-to-image), tensors, ints, ...
385
+ """
386
+ image_h, image_w = batch[self.input_key].shape[-2:]
387
+ log = dict()
388
+
389
+ for embedder in self.conditioner.embedders:
390
+ if (
391
+ (self.log_keys is None) or (embedder.input_key in self.log_keys)
392
+ ) and not self.no_cond_log:
393
+ x = batch[embedder.input_key][:n]
394
+ if isinstance(x, torch.Tensor):
395
+ if x.dim() == 1:
396
+ # class-conditional, convert integer to string
397
+ x = [str(x[i].item()) for i in range(x.shape[0])]
398
+ xc = log_txt_as_img((image_h, image_w), x, size=image_h // 4)
399
+ elif x.dim() == 2:
400
+ # size and crop cond and the like
401
+ x = [
402
+ "x".join([str(xx) for xx in x[i].tolist()])
403
+ for i in range(x.shape[0])
404
+ ]
405
+ xc = log_txt_as_img((image_h, image_w), x, size=image_h // 20)
406
+ elif x.dim() == 4:
407
+ # image
408
+ xc = x
409
+ else:
410
+ pass
411
+ # breakpoint()
412
+ # raise NotImplementedError()
413
+ elif isinstance(x, (List, ListConfig)):
414
+ if isinstance(x[0], str):
415
+ # strings
416
+ xc = log_txt_as_img((image_h, image_w), x, size=image_h // 20)
417
+ else:
418
+ raise NotImplementedError()
419
+ else:
420
+ raise NotImplementedError()
421
+ log[embedder.input_key] = xc
422
+ return log
423
+
424
+ # for video diffusions will be logging frames of a video
425
+ @torch.no_grad()
426
+ def log_images(
427
+ self,
428
+ batch: Dict,
429
+ N: int = 1,
430
+ sample: bool = True,
431
+ ucg_keys: List[str] = None,
432
+ **kwargs,
433
+ ) -> Dict:
434
+ # # debug
435
+ # return {}
436
+ # # debug
437
+ assert "num_video_frames" in batch, "num_video_frames must be in batch"
438
+ num_video_frames = batch["num_video_frames"]
439
+ conditioner_input_keys = [e.input_key for e in self.conditioner.embedders]
440
+ if ucg_keys:
441
+ assert all(map(lambda x: x in conditioner_input_keys, ucg_keys)), (
442
+ "Each defined ucg key for sampling must be in the provided conditioner input keys,"
443
+ f"but we have {ucg_keys} vs. {conditioner_input_keys}"
444
+ )
445
+ else:
446
+ ucg_keys = conditioner_input_keys
447
+ log = dict()
448
+
449
+ x = self.get_input(batch)
450
+
451
+ c, uc = self.conditioner.get_unconditional_conditioning(
452
+ batch,
453
+ force_uc_zero_embeddings=ucg_keys
454
+ if len(self.conditioner.embedders) > 0
455
+ else [],
456
+ )
457
+
458
+ sampling_kwargs = {"num_video_frames": num_video_frames}
459
+ n = min(x.shape[0] // num_video_frames, N)
460
+ sampling_kwargs["image_only_indicator"] = torch.cat(
461
+ [batch["image_only_indicator"][:n]] * 2
462
+ )
463
+
464
+ N = min(x.shape[0] // num_video_frames, N) * num_video_frames
465
+ x = x.to(self.device)[:N]
466
+ # log["inputs"] = rearrange(x, "(b t) c h w -> b c h (t w)", t=num_video_frames)
467
+ if self.input_key != "latents":
468
+ log["inputs"] = x
469
+ z = self.encode_first_stage(x)
470
+ recon = self.decode_first_stage(z)
471
+ # log["reconstructions"] = rearrange(
472
+ # recon, "(b t) c h w -> b c h (t w)", t=num_video_frames
473
+ # )
474
+ log["reconstructions"] = recon
475
+ log.update(self.log_conditionings(batch, N))
476
+
477
+ for k in c:
478
+ if isinstance(c[k], torch.Tensor):
479
+ if k == "vector":
480
+ end = N
481
+ else:
482
+ end = n
483
+ c[k], uc[k] = map(lambda y: y[k][:end].to(self.device), (c, uc))
484
+
485
+ # for k in c:
486
+ # print(c[k].shape)
487
+
488
+ for k in ["crossattn", "concat"]:
489
+ c[k] = repeat(c[k], "b ... -> b t ...", t=num_video_frames)
490
+ c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_video_frames)
491
+ uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_video_frames)
492
+ uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_video_frames)
493
+
494
+ # for k in c:
495
+ # print(c[k].shape)
496
+ if sample:
497
+ with self.ema_scope("Plotting"):
498
+ samples = self.sample(
499
+ c, shape=z.shape[1:], uc=uc, batch_size=N, **sampling_kwargs
500
+ )
501
+ samples = self.decode_first_stage(samples)
502
+ log["samples"] = samples
503
+ return log
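Side note (editor's sketch, not part of the diff): decode_first_stage / encode_first_stage above bound peak memory by splitting the (b t)-flattened batch into sub-batches of size en_and_decode_n_samples_a_time. A minimal standalone illustration of that chunking pattern, with a toy stand-in for the decoder:

import math
import torch

def chunked_apply(fn, x, n_samples):
    # apply fn to x in sub-batches of n_samples and concatenate the results,
    # mirroring the n_rounds loop in decode_first_stage / encode_first_stage
    n_rounds = math.ceil(x.shape[0] / n_samples)
    return torch.cat(
        [fn(x[n * n_samples : (n + 1) * n_samples]) for n in range(n_rounds)], dim=0
    )

decode = lambda z: z * 2.0  # toy stand-in for first_stage_model.decode (assumption)
z = torch.randn(10, 4, 8, 8)
assert chunked_apply(decode, z, n_samples=3).shape == z.shape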
sgm/modules/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .encoders.modules import GeneralConditioner, ExtraConditioner
2
+
3
+ UNCONDITIONAL_CONFIG = {
4
+ "target": "sgm.modules.GeneralConditioner",
5
+ "params": {"emb_models": []},
6
+ }
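For context, a minimal sketch of how a {"target": ..., "params": ...} dict such as UNCONDITIONAL_CONFIG above is typically resolved into an object; the project's actual helpers live in sgm.util and may differ in detail:

import importlib

def get_obj_from_str(string):
    # "pkg.module.ClassName" -> the ClassName attribute of pkg.module
    module, cls = string.rsplit(".", 1)
    return getattr(importlib.import_module(module), cls)

def instantiate_from_config(config):
    # construct the target with its (possibly empty) params dict
    return get_obj_from_str(config["target"])(**config.get("params", dict()))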
sgm/modules/attention.py ADDED
@@ -0,0 +1,764 @@
1
+ import logging
2
+ import math
3
+ from inspect import isfunction
4
+ from typing import Any, Optional
5
+ from functools import partial
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from einops import rearrange, repeat
10
+ from packaging import version
11
+ from torch import nn
12
+
13
+ # from torch.utils.checkpoint import checkpoint
14
+
15
+ checkpoint = partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
16
+
17
+
18
+ logpy = logging.getLogger(__name__)
19
+
20
+ if version.parse(torch.__version__) >= version.parse("2.0.0"):
21
+ SDP_IS_AVAILABLE = True
22
+ from torch.backends.cuda import SDPBackend, sdp_kernel
23
+
24
+ BACKEND_MAP = {
25
+ SDPBackend.MATH: {
26
+ "enable_math": True,
27
+ "enable_flash": False,
28
+ "enable_mem_efficient": False,
29
+ },
30
+ SDPBackend.FLASH_ATTENTION: {
31
+ "enable_math": False,
32
+ "enable_flash": True,
33
+ "enable_mem_efficient": False,
34
+ },
35
+ SDPBackend.EFFICIENT_ATTENTION: {
36
+ "enable_math": False,
37
+ "enable_flash": False,
38
+ "enable_mem_efficient": True,
39
+ },
40
+ None: {"enable_math": True, "enable_flash": True, "enable_mem_efficient": True},
41
+ }
42
+ else:
43
+ from contextlib import nullcontext
44
+
45
+ SDP_IS_AVAILABLE = False
46
+ sdp_kernel = nullcontext
47
+ BACKEND_MAP = {}
48
+ logpy.warn(
49
+ f"No SDP backend available, likely because you are running in pytorch "
50
+ f"versions < 2.0. In fact, you are using PyTorch {torch.__version__}. "
51
+ f"You might want to consider upgrading."
52
+ )
53
+
54
+ try:
55
+ import xformers
56
+ import xformers.ops
57
+
58
+ XFORMERS_IS_AVAILABLE = True
59
+ except ImportError:
60
+ XFORMERS_IS_AVAILABLE = False
61
+ logpy.warn("no module 'xformers'. Processing without...")
62
+
63
+ # from .diffusionmodules.util import mixed_checkpoint as checkpoint
64
+
65
+
66
+ def exists(val):
67
+ return val is not None
68
+
69
+
70
+ def uniq(arr):
71
+ return {el: True for el in arr}.keys()
72
+
73
+
74
+ def default(val, d):
75
+ if exists(val):
76
+ return val
77
+ return d() if isfunction(d) else d
78
+
79
+
80
+ def max_neg_value(t):
81
+ return -torch.finfo(t.dtype).max
82
+
83
+
84
+ def init_(tensor):
85
+ dim = tensor.shape[-1]
86
+ std = 1 / math.sqrt(dim)
87
+ tensor.uniform_(-std, std)
88
+ return tensor
89
+
90
+
91
+ # feedforward
92
+ class GEGLU(nn.Module):
93
+ def __init__(self, dim_in, dim_out):
94
+ super().__init__()
95
+ self.proj = nn.Linear(dim_in, dim_out * 2)
96
+
97
+ def forward(self, x):
98
+ x, gate = self.proj(x).chunk(2, dim=-1)
99
+ return x * F.gelu(gate)
100
+
101
+
102
+ class FeedForward(nn.Module):
103
+ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
104
+ super().__init__()
105
+ inner_dim = int(dim * mult)
106
+ dim_out = default(dim_out, dim)
107
+ project_in = (
108
+ nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())
109
+ if not glu
110
+ else GEGLU(dim, inner_dim)
111
+ )
112
+
113
+ self.net = nn.Sequential(
114
+ project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
115
+ )
116
+
117
+ def forward(self, x):
118
+ return self.net(x)
119
+
120
+
121
+ def zero_module(module):
122
+ """
123
+ Zero out the parameters of a module and return it.
124
+ """
125
+ for p in module.parameters():
126
+ p.detach().zero_()
127
+ return module
128
+
129
+
130
+ def Normalize(in_channels):
131
+ return torch.nn.GroupNorm(
132
+ num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
133
+ )
134
+
135
+
136
+ class LinearAttention(nn.Module):
137
+ def __init__(self, dim, heads=4, dim_head=32):
138
+ super().__init__()
139
+ self.heads = heads
140
+ hidden_dim = dim_head * heads
141
+ self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
142
+ self.to_out = nn.Conv2d(hidden_dim, dim, 1)
143
+
144
+ def forward(self, x):
145
+ b, c, h, w = x.shape
146
+ qkv = self.to_qkv(x)
147
+ q, k, v = rearrange(
148
+ qkv, "b (qkv heads c) h w -> qkv b heads c (h w)", heads=self.heads, qkv=3
149
+ )
150
+ k = k.softmax(dim=-1)
151
+ context = torch.einsum("bhdn,bhen->bhde", k, v)
152
+ out = torch.einsum("bhde,bhdn->bhen", context, q)
153
+ out = rearrange(
154
+ out, "b heads c (h w) -> b (heads c) h w", heads=self.heads, h=h, w=w
155
+ )
156
+ return self.to_out(out)
157
+
158
+
159
+ class SelfAttention(nn.Module):
160
+ ATTENTION_MODES = ("xformers", "torch", "math")
161
+
162
+ def __init__(
163
+ self,
164
+ dim: int,
165
+ num_heads: int = 8,
166
+ qkv_bias: bool = False,
167
+ qk_scale: Optional[float] = None,
168
+ attn_drop: float = 0.0,
169
+ proj_drop: float = 0.0,
170
+ attn_mode: str = "xformers",
171
+ ):
172
+ super().__init__()
173
+ self.num_heads = num_heads
174
+ head_dim = dim // num_heads
175
+ self.scale = qk_scale or head_dim**-0.5
176
+
177
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
178
+ self.attn_drop = nn.Dropout(attn_drop)
179
+ self.proj = nn.Linear(dim, dim)
180
+ self.proj_drop = nn.Dropout(proj_drop)
181
+ assert attn_mode in self.ATTENTION_MODES
182
+ self.attn_mode = attn_mode
183
+
184
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
185
+ B, L, C = x.shape
186
+
187
+ qkv = self.qkv(x)
188
+ if self.attn_mode == "torch":
189
+ qkv = rearrange(
190
+ qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads
191
+ ).float()
192
+ q, k, v = qkv[0], qkv[1], qkv[2] # B H L D
193
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
194
+ x = rearrange(x, "B H L D -> B L (H D)")
195
+ elif self.attn_mode == "xformers":
196
+ qkv = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
197
+ q, k, v = qkv[0], qkv[1], qkv[2] # B L H D
198
+ x = xformers.ops.memory_efficient_attention(q, k, v)
199
+ x = rearrange(x, "B L H D -> B L (H D)", H=self.num_heads)
200
+ elif self.attn_mode == "math":
201
+ qkv = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
202
+ q, k, v = qkv[0], qkv[1], qkv[2] # B H L D
203
+ attn = (q @ k.transpose(-2, -1)) * self.scale
204
+ attn = attn.softmax(dim=-1)
205
+ attn = self.attn_drop(attn)
206
+ x = (attn @ v).transpose(1, 2).reshape(B, L, C)
207
+ else:
208
+ raise NotImplementedError
209
+
210
+ x = self.proj(x)
211
+ x = self.proj_drop(x)
212
+ return x
213
+
214
+
215
+ class SpatialSelfAttention(nn.Module):
216
+ def __init__(self, in_channels):
217
+ super().__init__()
218
+ self.in_channels = in_channels
219
+
220
+ self.norm = Normalize(in_channels)
221
+ self.q = torch.nn.Conv2d(
222
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
223
+ )
224
+ self.k = torch.nn.Conv2d(
225
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
226
+ )
227
+ self.v = torch.nn.Conv2d(
228
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
229
+ )
230
+ self.proj_out = torch.nn.Conv2d(
231
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
232
+ )
233
+
234
+ def forward(self, x):
235
+ h_ = x
236
+ h_ = self.norm(h_)
237
+ q = self.q(h_)
238
+ k = self.k(h_)
239
+ v = self.v(h_)
240
+
241
+ # compute attention
242
+ b, c, h, w = q.shape
243
+ q = rearrange(q, "b c h w -> b (h w) c")
244
+ k = rearrange(k, "b c h w -> b c (h w)")
245
+ w_ = torch.einsum("bij,bjk->bik", q, k)
246
+
247
+ w_ = w_ * (int(c) ** (-0.5))
248
+ w_ = torch.nn.functional.softmax(w_, dim=2)
249
+
250
+ # attend to values
251
+ v = rearrange(v, "b c h w -> b c (h w)")
252
+ w_ = rearrange(w_, "b i j -> b j i")
253
+ h_ = torch.einsum("bij,bjk->bik", v, w_)
254
+ h_ = rearrange(h_, "b c (h w) -> b c h w", h=h)
255
+ h_ = self.proj_out(h_)
256
+
257
+ return x + h_
258
+
259
+
260
+ class CrossAttention(nn.Module):
261
+ def __init__(
262
+ self,
263
+ query_dim,
264
+ context_dim=None,
265
+ heads=8,
266
+ dim_head=64,
267
+ dropout=0.0,
268
+ backend=None,
269
+ ):
270
+ super().__init__()
271
+ inner_dim = dim_head * heads
272
+ context_dim = default(context_dim, query_dim)
273
+
274
+ self.scale = dim_head**-0.5
275
+ self.heads = heads
276
+
277
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
278
+ self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
279
+ self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
280
+
281
+ self.to_out = nn.Sequential(
282
+ nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
283
+ )
284
+ self.backend = backend
285
+
286
+ def forward(
287
+ self,
288
+ x,
289
+ context=None,
290
+ mask=None,
291
+ additional_tokens=None,
292
+ n_times_crossframe_attn_in_self=0,
293
+ ):
294
+ h = self.heads
295
+
296
+ if additional_tokens is not None:
297
+ # get the number of masked tokens at the beginning of the output sequence
298
+ n_tokens_to_mask = additional_tokens.shape[1]
299
+ # add additional token
300
+ x = torch.cat([additional_tokens, x], dim=1)
301
+
302
+ q = self.to_q(x)
303
+ context = default(context, x)
304
+ k = self.to_k(context)
305
+ v = self.to_v(context)
306
+
307
+ if n_times_crossframe_attn_in_self:
308
+ # reprogramming cross-frame attention as in https://arxiv.org/abs/2303.13439
309
+ assert x.shape[0] % n_times_crossframe_attn_in_self == 0
310
+ n_cp = x.shape[0] // n_times_crossframe_attn_in_self
311
+ k = repeat(
312
+ k[::n_times_crossframe_attn_in_self], "b ... -> (b n) ...", n=n_cp
313
+ )
314
+ v = repeat(
315
+ v[::n_times_crossframe_attn_in_self], "b ... -> (b n) ...", n=n_cp
316
+ )
317
+
318
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
319
+
320
+ ## old
321
+ """
322
+ sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
323
+ del q, k
324
+
325
+ if exists(mask):
326
+ mask = rearrange(mask, 'b ... -> b (...)')
327
+ max_neg_value = -torch.finfo(sim.dtype).max
328
+ mask = repeat(mask, 'b j -> (b h) () j', h=h)
329
+ sim.masked_fill_(~mask, max_neg_value)
330
+
331
+ # attention, what we cannot get enough of
332
+ sim = sim.softmax(dim=-1)
333
+
334
+ out = einsum('b i j, b j d -> b i d', sim, v)
335
+ """
336
+ ## new
337
+ with sdp_kernel(**BACKEND_MAP[self.backend]):
338
+ # print("dispatching into backend", self.backend, "q/k/v shape: ", q.shape, k.shape, v.shape)
339
+ out = F.scaled_dot_product_attention(
340
+ q, k, v, attn_mask=mask
341
+ ) # scale is dim_head ** -0.5 per default
342
+
343
+ del q, k, v
344
+ out = rearrange(out, "b h n d -> b n (h d)", h=h)
345
+
346
+ if additional_tokens is not None:
347
+ # remove additional token
348
+ out = out[:, n_tokens_to_mask:]
349
+ return self.to_out(out)
350
+
351
+
352
+ class MemoryEfficientCrossAttention(nn.Module):
353
+ # https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
354
+ def __init__(
355
+ self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, **kwargs
356
+ ):
357
+ super().__init__()
358
+ logpy.debug(
359
+ f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, "
360
+ f"context_dim is {context_dim} and using {heads} heads with a "
361
+ f"dimension of {dim_head}."
362
+ )
363
+ inner_dim = dim_head * heads
364
+ context_dim = default(context_dim, query_dim)
365
+
366
+ self.heads = heads
367
+ self.dim_head = dim_head
368
+
369
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
370
+ self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
371
+ self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
372
+
373
+ self.to_out = nn.Sequential(
374
+ nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
375
+ )
376
+ self.attention_op: Optional[Any] = None
377
+
378
+ def forward(
379
+ self,
380
+ x,
381
+ context=None,
382
+ mask=None,
383
+ additional_tokens=None,
384
+ n_times_crossframe_attn_in_self=0,
385
+ ):
386
+ if additional_tokens is not None:
387
+ # get the number of masked tokens at the beginning of the output sequence
388
+ n_tokens_to_mask = additional_tokens.shape[1]
389
+ # add additional token
390
+ x = torch.cat([additional_tokens, x], dim=1)
391
+ q = self.to_q(x)
392
+ context = default(context, x)
393
+ k = self.to_k(context)
394
+ v = self.to_v(context)
395
+
396
+ if n_times_crossframe_attn_in_self:
397
+ # reprogramming cross-frame attention as in https://arxiv.org/abs/2303.13439
398
+ assert x.shape[0] % n_times_crossframe_attn_in_self == 0
399
+ # n_cp = x.shape[0]//n_times_crossframe_attn_in_self
400
+ k = repeat(
401
+ k[::n_times_crossframe_attn_in_self],
402
+ "b ... -> (b n) ...",
403
+ n=n_times_crossframe_attn_in_self,
404
+ )
405
+ v = repeat(
406
+ v[::n_times_crossframe_attn_in_self],
407
+ "b ... -> (b n) ...",
408
+ n=n_times_crossframe_attn_in_self,
409
+ )
410
+
411
+ b, _, _ = q.shape
412
+ q, k, v = map(
413
+ lambda t: t.unsqueeze(3)
414
+ .reshape(b, t.shape[1], self.heads, self.dim_head)
415
+ .permute(0, 2, 1, 3)
416
+ .reshape(b * self.heads, t.shape[1], self.dim_head)
417
+ .contiguous(),
418
+ (q, k, v),
419
+ )
420
+
421
+ # actually compute the attention, what we cannot get enough of
422
+ if version.parse(xformers.__version__) >= version.parse("0.0.21"):
423
+ # NOTE: workaround for
424
+ # https://github.com/facebookresearch/xformers/issues/845
425
+ max_bs = 32768
426
+ N = q.shape[0]
427
+ n_batches = math.ceil(N / max_bs)
428
+ out = list()
429
+ for i_batch in range(n_batches):
430
+ batch = slice(i_batch * max_bs, (i_batch + 1) * max_bs)
431
+ out.append(
432
+ xformers.ops.memory_efficient_attention(
433
+ q[batch],
434
+ k[batch],
435
+ v[batch],
436
+ attn_bias=None,
437
+ op=self.attention_op,
438
+ )
439
+ )
440
+ out = torch.cat(out, 0)
441
+ else:
442
+ out = xformers.ops.memory_efficient_attention(
443
+ q, k, v, attn_bias=None, op=self.attention_op
444
+ )
445
+
446
+ # TODO: Use this directly in the attention operation, as a bias
447
+ if exists(mask):
448
+ raise NotImplementedError
449
+ out = (
450
+ out.unsqueeze(0)
451
+ .reshape(b, self.heads, out.shape[1], self.dim_head)
452
+ .permute(0, 2, 1, 3)
453
+ .reshape(b, out.shape[1], self.heads * self.dim_head)
454
+ )
455
+ if additional_tokens is not None:
456
+ # remove additional token
457
+ out = out[:, n_tokens_to_mask:]
458
+ return self.to_out(out)
459
+
460
+
461
+ class BasicTransformerBlock(nn.Module):
462
+ ATTENTION_MODES = {
463
+ "softmax": CrossAttention, # vanilla attention
464
+ "softmax-xformers": MemoryEfficientCrossAttention, # ampere
465
+ }
466
+
467
+ def __init__(
468
+ self,
469
+ dim,
470
+ n_heads,
471
+ d_head,
472
+ dropout=0.0,
473
+ context_dim=None,
474
+ gated_ff=True,
475
+ checkpoint=True,
476
+ disable_self_attn=False,
477
+ attn_mode="softmax",
478
+ sdp_backend=None,
479
+ ):
480
+ super().__init__()
481
+ assert attn_mode in self.ATTENTION_MODES
482
+ if attn_mode != "softmax" and not XFORMERS_IS_AVAILABLE:
483
+ logpy.warn(
484
+ f"Attention mode '{attn_mode}' is not available. Falling "
485
+ f"back to native attention. This is not a problem in "
486
+ f"Pytorch >= 2.0. FYI, you are running with PyTorch "
487
+ f"version {torch.__version__}."
488
+ )
489
+ attn_mode = "softmax"
490
+ elif attn_mode == "softmax" and not SDP_IS_AVAILABLE:
491
+ logpy.warn(
492
+ "We do not support vanilla attention anymore, as it is too "
493
+ "expensive. Sorry."
494
+ )
495
+ if not XFORMERS_IS_AVAILABLE:
496
+ assert (
497
+ False
498
+ ), "Please install xformers via e.g. 'pip install xformers==0.0.16'"
499
+ else:
500
+ logpy.info("Falling back to xformers efficient attention.")
501
+ attn_mode = "softmax-xformers"
502
+ attn_cls = self.ATTENTION_MODES[attn_mode]
503
+ if version.parse(torch.__version__) >= version.parse("2.0.0"):
504
+ assert sdp_backend is None or isinstance(sdp_backend, SDPBackend)
505
+ else:
506
+ assert sdp_backend is None
507
+ self.disable_self_attn = disable_self_attn
508
+ self.attn1 = attn_cls(
509
+ query_dim=dim,
510
+ heads=n_heads,
511
+ dim_head=d_head,
512
+ dropout=dropout,
513
+ context_dim=context_dim if self.disable_self_attn else None,
514
+ backend=sdp_backend,
515
+ ) # is a self-attention if not self.disable_self_attn
516
+ self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
517
+ self.attn2 = attn_cls(
518
+ query_dim=dim,
519
+ context_dim=context_dim,
520
+ heads=n_heads,
521
+ dim_head=d_head,
522
+ dropout=dropout,
523
+ backend=sdp_backend,
524
+ ) # is self-attn if context is none
525
+ self.norm1 = nn.LayerNorm(dim)
526
+ self.norm2 = nn.LayerNorm(dim)
527
+ self.norm3 = nn.LayerNorm(dim)
528
+ self.checkpoint = checkpoint
529
+ if self.checkpoint:
530
+ logpy.debug(f"{self.__class__.__name__} is using checkpointing")
531
+
532
+ def forward(
533
+ self, x, context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0
534
+ ):
535
+ kwargs = {"x": x}
536
+
537
+ if context is not None:
538
+ kwargs.update({"context": context})
539
+
540
+ if additional_tokens is not None:
541
+ kwargs.update({"additional_tokens": additional_tokens})
542
+
543
+ if n_times_crossframe_attn_in_self:
544
+ kwargs.update(
545
+ {"n_times_crossframe_attn_in_self": n_times_crossframe_attn_in_self}
546
+ )
547
+
548
+ # return mixed_checkpoint(self._forward, kwargs, self.parameters(), self.checkpoint)
549
+ if self.checkpoint:
550
+ # inputs = {"x": x, "context": context}
551
+ return checkpoint(self._forward, x, context)
552
+ # return checkpoint(self._forward, inputs, self.parameters(), self.checkpoint)
553
+ else:
554
+ return self._forward(**kwargs)
555
+
556
+ def _forward(
557
+ self, x, context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0
558
+ ):
559
+ x = (
560
+ self.attn1(
561
+ self.norm1(x),
562
+ context=context if self.disable_self_attn else None,
563
+ additional_tokens=additional_tokens,
564
+ n_times_crossframe_attn_in_self=n_times_crossframe_attn_in_self
565
+ if not self.disable_self_attn
566
+ else 0,
567
+ )
568
+ + x
569
+ )
570
+ x = (
571
+ self.attn2(
572
+ self.norm2(x), context=context, additional_tokens=additional_tokens
573
+ )
574
+ + x
575
+ )
576
+ x = self.ff(self.norm3(x)) + x
577
+ return x
578
+
579
+
580
+ class BasicTransformerSingleLayerBlock(nn.Module):
581
+ ATTENTION_MODES = {
582
+ "softmax": CrossAttention, # vanilla attention
583
+ "softmax-xformers": MemoryEfficientCrossAttention # on the A100s not quite as fast as the above version
584
+ # (todo might depend on head_dim, check, falls back to semi-optimized kernels for dim!=[16,32,64,128])
585
+ }
586
+
587
+ def __init__(
588
+ self,
589
+ dim,
590
+ n_heads,
591
+ d_head,
592
+ dropout=0.0,
593
+ context_dim=None,
594
+ gated_ff=True,
595
+ checkpoint=True,
596
+ attn_mode="softmax",
597
+ ):
598
+ super().__init__()
599
+ assert attn_mode in self.ATTENTION_MODES
600
+ attn_cls = self.ATTENTION_MODES[attn_mode]
601
+ self.attn1 = attn_cls(
602
+ query_dim=dim,
603
+ heads=n_heads,
604
+ dim_head=d_head,
605
+ dropout=dropout,
606
+ context_dim=context_dim,
607
+ )
608
+ self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
609
+ self.norm1 = nn.LayerNorm(dim)
610
+ self.norm2 = nn.LayerNorm(dim)
611
+ self.checkpoint = checkpoint
612
+
613
+ def forward(self, x, context=None):
614
+ # inputs = {"x": x, "context": context}
615
+ # return checkpoint(self._forward, inputs, self.parameters(), self.checkpoint)
616
+ return checkpoint(self._forward, x, context)
617
+
618
+ def _forward(self, x, context=None):
619
+ x = self.attn1(self.norm1(x), context=context) + x
620
+ x = self.ff(self.norm2(x)) + x
621
+ return x
622
+
623
+
624
+ class SpatialTransformer(nn.Module):
625
+ """
626
+ Transformer block for image-like data.
627
+ First, project the input (aka embedding)
628
+ and reshape to b, t, d.
629
+ Then apply standard transformer action.
630
+ Finally, reshape to image
631
+ NEW: use_linear for more efficiency instead of the 1x1 convs
632
+ """
633
+
634
+ def __init__(
635
+ self,
636
+ in_channels,
637
+ n_heads,
638
+ d_head,
639
+ depth=1,
640
+ dropout=0.0,
641
+ context_dim=None,
642
+ disable_self_attn=False,
643
+ use_linear=False,
644
+ attn_type="softmax",
645
+ use_checkpoint=True,
646
+ # sdp_backend=SDPBackend.FLASH_ATTENTION
647
+ sdp_backend=None,
648
+ ):
649
+ super().__init__()
650
+ logpy.debug(
651
+ f"constructing {self.__class__.__name__} of depth {depth} w/ "
652
+ f"{in_channels} channels and {n_heads} heads."
653
+ )
654
+
655
+ if exists(context_dim) and not isinstance(context_dim, list):
656
+ context_dim = [context_dim]
657
+ if exists(context_dim) and isinstance(context_dim, list):
658
+ if depth != len(context_dim):
659
+ logpy.warn(
660
+ f"{self.__class__.__name__}: Found context dims "
661
+ f"{context_dim} of depth {len(context_dim)}, which does not "
662
+ f"match the specified 'depth' of {depth}. Setting context_dim "
663
+ f"to {depth * [context_dim[0]]} now."
664
+ )
665
+ # depth does not match context dims.
666
+ assert all(
667
+ map(lambda x: x == context_dim[0], context_dim)
668
+ ), "need homogenous context_dim to match depth automatically"
669
+ context_dim = depth * [context_dim[0]]
670
+ elif context_dim is None:
671
+ context_dim = [None] * depth
672
+ self.in_channels = in_channels
673
+ inner_dim = n_heads * d_head
674
+ self.norm = Normalize(in_channels)
675
+ if not use_linear:
676
+ self.proj_in = nn.Conv2d(
677
+ in_channels, inner_dim, kernel_size=1, stride=1, padding=0
678
+ )
679
+ else:
680
+ self.proj_in = nn.Linear(in_channels, inner_dim)
681
+
682
+ self.transformer_blocks = nn.ModuleList(
683
+ [
684
+ BasicTransformerBlock(
685
+ inner_dim,
686
+ n_heads,
687
+ d_head,
688
+ dropout=dropout,
689
+ context_dim=context_dim[d],
690
+ disable_self_attn=disable_self_attn,
691
+ attn_mode=attn_type,
692
+ checkpoint=use_checkpoint,
693
+ sdp_backend=sdp_backend,
694
+ )
695
+ for d in range(depth)
696
+ ]
697
+ )
698
+ if not use_linear:
699
+ self.proj_out = zero_module(
700
+ nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
701
+ )
702
+ else:
703
+ # self.proj_out = zero_module(nn.Linear(in_channels, inner_dim))
704
+ self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
705
+ self.use_linear = use_linear
706
+
707
+ def forward(self, x, context=None):
708
+ # note: if no context is given, cross-attention defaults to self-attention
709
+ if not isinstance(context, list):
710
+ context = [context]
711
+ b, c, h, w = x.shape
712
+ x_in = x
713
+ x = self.norm(x)
714
+ if not self.use_linear:
715
+ x = self.proj_in(x)
716
+ x = rearrange(x, "b c h w -> b (h w) c").contiguous()
717
+ if self.use_linear:
718
+ x = self.proj_in(x)
719
+ for i, block in enumerate(self.transformer_blocks):
720
+ if i > 0 and len(context) == 1:
721
+ i = 0 # use same context for each block
722
+ x = block(x, context=context[i])
723
+ if self.use_linear:
724
+ x = self.proj_out(x)
725
+ x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w).contiguous()
726
+ if not self.use_linear:
727
+ x = self.proj_out(x)
728
+ return x + x_in
729
+
730
+
731
+ class SimpleTransformer(nn.Module):
732
+ def __init__(
733
+ self,
734
+ dim: int,
735
+ depth: int,
736
+ heads: int,
737
+ dim_head: int,
738
+ context_dim: Optional[int] = None,
739
+ dropout: float = 0.0,
740
+ checkpoint: bool = True,
741
+ ):
742
+ super().__init__()
743
+ self.layers = nn.ModuleList([])
744
+ for _ in range(depth):
745
+ self.layers.append(
746
+ BasicTransformerBlock(
747
+ dim,
748
+ heads,
749
+ dim_head,
750
+ dropout=dropout,
751
+ context_dim=context_dim,
752
+ attn_mode="softmax-xformers",
753
+ checkpoint=checkpoint,
754
+ )
755
+ )
756
+
757
+ def forward(
758
+ self,
759
+ x: torch.Tensor,
760
+ context: Optional[torch.Tensor] = None,
761
+ ) -> torch.Tensor:
762
+ for layer in self.layers:
763
+ x = layer(x, context)
764
+ return x
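A minimal usage sketch for the CrossAttention module defined above (shapes are illustrative assumptions; the default SDP path needs PyTorch >= 2.0):

import torch

attn = CrossAttention(query_dim=320, context_dim=768, heads=8, dim_head=40)
x = torch.randn(2, 64, 320)    # (batch, query tokens, query_dim)
ctx = torch.randn(2, 77, 768)  # (batch, context tokens, context_dim)
out = attn(x, context=ctx)     # attention output projected back to query_dim
assert out.shape == x.shape    # -> (2, 64, 320)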
sgm/modules/autoencoding/__init__.py ADDED
File without changes
sgm/modules/autoencoding/losses/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ __all__ = [
2
+ "GeneralLPIPSWithDiscriminator",
3
+ "LatentLPIPS",
4
+ ]
5
+
6
+ from .discriminator_loss import GeneralLPIPSWithDiscriminator
7
+ from .lpips import LatentLPIPS
sgm/modules/autoencoding/losses/discriminator_loss.py ADDED
@@ -0,0 +1,306 @@
1
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torchvision
7
+ from einops import rearrange
8
+ from matplotlib import colormaps
9
+ from matplotlib import pyplot as plt
10
+
11
+ from ....util import default, instantiate_from_config
12
+ from ..lpips.loss.lpips import LPIPS
13
+ from ..lpips.model.model import weights_init
14
+ from ..lpips.vqperceptual import hinge_d_loss, vanilla_d_loss
15
+
16
+
17
+ class GeneralLPIPSWithDiscriminator(nn.Module):
18
+ def __init__(
19
+ self,
20
+ disc_start: int,
21
+ logvar_init: float = 0.0,
22
+ disc_num_layers: int = 3,
23
+ disc_in_channels: int = 3,
24
+ disc_factor: float = 1.0,
25
+ disc_weight: float = 1.0,
26
+ perceptual_weight: float = 1.0,
27
+ disc_loss: str = "hinge",
28
+ scale_input_to_tgt_size: bool = False,
29
+ dims: int = 2,
30
+ learn_logvar: bool = False,
31
+ regularization_weights: Union[None, Dict[str, float]] = None,
32
+ additional_log_keys: Optional[List[str]] = None,
33
+ discriminator_config: Optional[Dict] = None,
34
+ ):
35
+ super().__init__()
36
+ self.dims = dims
37
+ if self.dims > 2:
38
+ print(
39
+ f"running with dims={dims}. This means that for perceptual loss "
40
+ f"calculation, the LPIPS loss will be applied to each frame "
41
+ f"independently."
42
+ )
43
+ self.scale_input_to_tgt_size = scale_input_to_tgt_size
44
+ assert disc_loss in ["hinge", "vanilla"]
45
+ self.perceptual_loss = LPIPS().eval()
46
+ self.perceptual_weight = perceptual_weight
47
+ # output log variance
48
+ self.logvar = nn.Parameter(
49
+ torch.full((), logvar_init), requires_grad=learn_logvar
50
+ )
51
+ self.learn_logvar = learn_logvar
52
+
53
+ discriminator_config = default(
54
+ discriminator_config,
55
+ {
56
+ "target": "sgm.modules.autoencoding.lpips.model.model.NLayerDiscriminator",
57
+ "params": {
58
+ "input_nc": disc_in_channels,
59
+ "n_layers": disc_num_layers,
60
+ "use_actnorm": False,
61
+ },
62
+ },
63
+ )
64
+
65
+ self.discriminator = instantiate_from_config(discriminator_config).apply(
66
+ weights_init
67
+ )
68
+ self.discriminator_iter_start = disc_start
69
+ self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss
70
+ self.disc_factor = disc_factor
71
+ self.discriminator_weight = disc_weight
72
+ self.regularization_weights = default(regularization_weights, {})
73
+
74
+ self.forward_keys = [
75
+ "optimizer_idx",
76
+ "global_step",
77
+ "last_layer",
78
+ "split",
79
+ "regularization_log",
80
+ ]
81
+
82
+ self.additional_log_keys = set(default(additional_log_keys, []))
83
+ self.additional_log_keys.update(set(self.regularization_weights.keys()))
84
+
85
+ def get_trainable_parameters(self) -> Iterator[nn.Parameter]:
86
+ return self.discriminator.parameters()
87
+
88
+ def get_trainable_autoencoder_parameters(self) -> Iterator[nn.Parameter]:
89
+ if self.learn_logvar:
90
+ yield self.logvar
91
+ yield from ()
92
+
93
+ @torch.no_grad()
94
+ def log_images(
95
+ self, inputs: torch.Tensor, reconstructions: torch.Tensor
96
+ ) -> Dict[str, torch.Tensor]:
97
+ # calc logits of real/fake
98
+ logits_real = self.discriminator(inputs.contiguous().detach())
99
+ if len(logits_real.shape) < 4:
100
+ # Non patch-discriminator
101
+ return dict()
102
+ logits_fake = self.discriminator(reconstructions.contiguous().detach())
103
+ # -> (b, 1, h, w)
104
+
105
+ # parameters for colormapping
106
+ high = max(logits_fake.abs().max(), logits_real.abs().max()).item()
107
+ cmap = colormaps["PiYG"] # diverging colormap
108
+
109
+ def to_colormap(logits: torch.Tensor) -> torch.Tensor:
110
+ """(b, 1, ...) -> (b, 3, ...)"""
111
+ logits = (logits + high) / (2 * high)
112
+ logits_np = cmap(logits.cpu().numpy())[..., :3] # truncate alpha channel
113
+ # -> (b, 1, ..., 3)
114
+ logits = torch.from_numpy(logits_np).to(logits.device)
115
+ return rearrange(logits, "b 1 ... c -> b c ...")
116
+
117
+ logits_real = torch.nn.functional.interpolate(
118
+ logits_real,
119
+ size=inputs.shape[-2:],
120
+ mode="nearest",
121
+ antialias=False,
122
+ )
123
+ logits_fake = torch.nn.functional.interpolate(
124
+ logits_fake,
125
+ size=reconstructions.shape[-2:],
126
+ mode="nearest",
127
+ antialias=False,
128
+ )
129
+
130
+ # alpha value of logits for overlay
131
+ alpha_real = torch.abs(logits_real) / high
132
+ alpha_fake = torch.abs(logits_fake) / high
133
+ # -> (b, 1, h, w) in range [0, 0.5]
134
+ # alpha value of lines don't really matter, since the values are the same
135
+ # for both images and logits anyway
136
+ grid_alpha_real = torchvision.utils.make_grid(alpha_real, nrow=4)
137
+ grid_alpha_fake = torchvision.utils.make_grid(alpha_fake, nrow=4)
138
+ grid_alpha = 0.8 * torch.cat((grid_alpha_real, grid_alpha_fake), dim=1)
139
+ # -> (1, h, w)
140
+ # blend logits and images together
141
+
142
+ # prepare logits for plotting
143
+ logits_real = to_colormap(logits_real)
144
+ logits_fake = to_colormap(logits_fake)
145
+ # resize logits
146
+ # -> (b, 3, h, w)
147
+
148
+ # make some grids
149
+ # add all logits to one plot
150
+ logits_real = torchvision.utils.make_grid(logits_real, nrow=4)
151
+ logits_fake = torchvision.utils.make_grid(logits_fake, nrow=4)
152
+ # I just love how torchvision calls the number of columns `nrow`
153
+ grid_logits = torch.cat((logits_real, logits_fake), dim=1)
154
+ # -> (3, h, w)
155
+
156
+ grid_images_real = torchvision.utils.make_grid(0.5 * inputs + 0.5, nrow=4)
157
+ grid_images_fake = torchvision.utils.make_grid(
158
+ 0.5 * reconstructions + 0.5, nrow=4
159
+ )
160
+ grid_images = torch.cat((grid_images_real, grid_images_fake), dim=1)
161
+ # -> (3, h, w) in range [0, 1]
162
+
163
+ grid_blend = grid_alpha * grid_logits + (1 - grid_alpha) * grid_images
164
+
165
+ # Create labeled colorbar
166
+ dpi = 100
167
+ height = 128 / dpi
168
+ width = grid_logits.shape[2] / dpi
169
+ fig, ax = plt.subplots(figsize=(width, height), dpi=dpi)
170
+ img = ax.imshow(np.array([[-high, high]]), cmap=cmap)
171
+ plt.colorbar(
172
+ img,
173
+ cax=ax,
174
+ orientation="horizontal",
175
+ fraction=0.9,
176
+ aspect=width / height,
177
+ pad=0.0,
178
+ )
179
+ img.set_visible(False)
180
+ fig.tight_layout()
181
+ fig.canvas.draw()
182
+ # manually convert figure to numpy
183
+ cbar_np = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
184
+ cbar_np = cbar_np.reshape(fig.canvas.get_width_height()[::-1] + (3,))
185
+ cbar = torch.from_numpy(cbar_np.copy()).to(grid_logits.dtype) / 255.0
186
+ cbar = rearrange(cbar, "h w c -> c h w").to(grid_logits.device)
187
+
188
+ # Add colorbar to plot
189
+ annotated_grid = torch.cat((grid_logits, cbar), dim=1)
190
+ blended_grid = torch.cat((grid_blend, cbar), dim=1)
191
+ return {
192
+ "vis_logits": 2 * annotated_grid[None, ...] - 1,
193
+ "vis_logits_blended": 2 * blended_grid[None, ...] - 1,
194
+ }
195
+
196
+ def calculate_adaptive_weight(
197
+ self, nll_loss: torch.Tensor, g_loss: torch.Tensor, last_layer: torch.Tensor
198
+ ) -> torch.Tensor:
199
+ nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
200
+ g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
201
+
202
+ d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
203
+ d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
204
+ d_weight = d_weight * self.discriminator_weight
205
+ return d_weight
206
+
207
+ def forward(
208
+ self,
209
+ inputs: torch.Tensor,
210
+ reconstructions: torch.Tensor,
211
+ *, # added because I changed the order here
212
+ regularization_log: Dict[str, torch.Tensor],
213
+ optimizer_idx: int,
214
+ global_step: int,
215
+ last_layer: torch.Tensor,
216
+ split: str = "train",
217
+ weights: Union[None, float, torch.Tensor] = None,
218
+ ) -> Tuple[torch.Tensor, dict]:
219
+ if self.scale_input_to_tgt_size:
220
+ inputs = torch.nn.functional.interpolate(
221
+ inputs, reconstructions.shape[2:], mode="bicubic", antialias=True
222
+ )
223
+
224
+ if self.dims > 2:
225
+ inputs, reconstructions = map(
226
+ lambda x: rearrange(x, "b c t h w -> (b t) c h w"),
227
+ (inputs, reconstructions),
228
+ )
229
+
230
+ rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
231
+ if self.perceptual_weight > 0:
232
+ p_loss = self.perceptual_loss(
233
+ inputs.contiguous(), reconstructions.contiguous()
234
+ )
235
+ rec_loss = rec_loss + self.perceptual_weight * p_loss
236
+
237
+ nll_loss, weighted_nll_loss = self.get_nll_loss(rec_loss, weights)
238
+
239
+ # now the GAN part
240
+ if optimizer_idx == 0:
241
+ # generator update
242
+ if global_step >= self.discriminator_iter_start or not self.training:
243
+ logits_fake = self.discriminator(reconstructions.contiguous())
244
+ g_loss = -torch.mean(logits_fake)
245
+ if self.training:
246
+ d_weight = self.calculate_adaptive_weight(
247
+ nll_loss, g_loss, last_layer=last_layer
248
+ )
249
+ else:
250
+ d_weight = torch.tensor(1.0)
251
+ else:
252
+ d_weight = torch.tensor(0.0)
253
+ g_loss = torch.tensor(0.0, requires_grad=True)
254
+
255
+ loss = weighted_nll_loss + d_weight * self.disc_factor * g_loss
256
+ log = dict()
257
+ for k in regularization_log:
258
+ if k in self.regularization_weights:
259
+ loss = loss + self.regularization_weights[k] * regularization_log[k]
260
+ if k in self.additional_log_keys:
261
+ log[f"{split}/{k}"] = regularization_log[k].detach().float().mean()
262
+
263
+ log.update(
264
+ {
265
+ f"{split}/loss/total": loss.clone().detach().mean(),
266
+ f"{split}/loss/nll": nll_loss.detach().mean(),
267
+ f"{split}/loss/rec": rec_loss.detach().mean(),
268
+ f"{split}/loss/g": g_loss.detach().mean(),
269
+ f"{split}/scalars/logvar": self.logvar.detach(),
270
+ f"{split}/scalars/d_weight": d_weight.detach(),
271
+ }
272
+ )
273
+
274
+ return loss, log
275
+ elif optimizer_idx == 1:
276
+ # second pass for discriminator update
277
+ logits_real = self.discriminator(inputs.contiguous().detach())
278
+ logits_fake = self.discriminator(reconstructions.contiguous().detach())
279
+
280
+ if global_step >= self.discriminator_iter_start or not self.training:
281
+ d_loss = self.disc_factor * self.disc_loss(logits_real, logits_fake)
282
+ else:
283
+ d_loss = torch.tensor(0.0, requires_grad=True)
284
+
285
+ log = {
286
+ f"{split}/loss/disc": d_loss.clone().detach().mean(),
287
+ f"{split}/logits/real": logits_real.detach().mean(),
288
+ f"{split}/logits/fake": logits_fake.detach().mean(),
289
+ }
290
+ return d_loss, log
291
+ else:
292
+ raise NotImplementedError(f"Unknown optimizer_idx {optimizer_idx}")
293
+
294
+ def get_nll_loss(
295
+ self,
296
+ rec_loss: torch.Tensor,
297
+ weights: Optional[Union[float, torch.Tensor]] = None,
298
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
299
+ nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
300
+ weighted_nll_loss = nll_loss
301
+ if weights is not None:
302
+ weighted_nll_loss = weights * nll_loss
303
+ weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
304
+ nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
305
+
306
+ return nll_loss, weighted_nll_loss
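A standalone numeric sketch (not part of the diff) of the two scalar rules above, the heteroscedastic NLL weighting and the adaptive balancing weight from calculate_adaptive_weight; the gradient norms are illustrative assumptions:

import torch

rec_loss = torch.tensor(0.25)
logvar = torch.tensor(0.0)
nll_loss = rec_loss / torch.exp(logvar) + logvar  # == 0.25 when logvar == 0

# adaptive weight: scale the GAN term so its gradient magnitude on the last
# decoder layer roughly matches that of the reconstruction term
nll_grad_norm, g_grad_norm, disc_weight = 2.0, 0.5, 1.0
d_weight = min(max(nll_grad_norm / (g_grad_norm + 1e-4), 0.0), 1e4) * disc_weight
# -> roughly 4.0, so the adversarial term is scaled up by about 4x here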
sgm/modules/autoencoding/losses/lpips.py ADDED
@@ -0,0 +1,73 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from ....util import default, instantiate_from_config
5
+ from ..lpips.loss.lpips import LPIPS
6
+
7
+
8
+ class LatentLPIPS(nn.Module):
9
+ def __init__(
10
+ self,
11
+ decoder_config,
12
+ perceptual_weight=1.0,
13
+ latent_weight=1.0,
14
+ scale_input_to_tgt_size=False,
15
+ scale_tgt_to_input_size=False,
16
+ perceptual_weight_on_inputs=0.0,
17
+ ):
18
+ super().__init__()
19
+ self.scale_input_to_tgt_size = scale_input_to_tgt_size
20
+ self.scale_tgt_to_input_size = scale_tgt_to_input_size
21
+ self.init_decoder(decoder_config)
22
+ self.perceptual_loss = LPIPS().eval()
23
+ self.perceptual_weight = perceptual_weight
24
+ self.latent_weight = latent_weight
25
+ self.perceptual_weight_on_inputs = perceptual_weight_on_inputs
26
+
27
+ def init_decoder(self, config):
28
+ self.decoder = instantiate_from_config(config)
29
+ if hasattr(self.decoder, "encoder"):
30
+ del self.decoder.encoder
31
+
32
+ def forward(self, latent_inputs, latent_predictions, image_inputs, split="train"):
33
+ log = dict()
34
+ loss = (latent_inputs - latent_predictions) ** 2
35
+ log[f"{split}/latent_l2_loss"] = loss.mean().detach()
36
+ image_reconstructions = None
37
+ if self.perceptual_weight > 0.0:
38
+ image_reconstructions = self.decoder.decode(latent_predictions)
39
+ image_targets = self.decoder.decode(latent_inputs)
40
+ perceptual_loss = self.perceptual_loss(
41
+ image_targets.contiguous(), image_reconstructions.contiguous()
42
+ )
43
+ loss = (
44
+ self.latent_weight * loss.mean()
45
+ + self.perceptual_weight * perceptual_loss.mean()
46
+ )
47
+ log[f"{split}/perceptual_loss"] = perceptual_loss.mean().detach()
48
+
49
+ if self.perceptual_weight_on_inputs > 0.0:
50
+ image_reconstructions = default(
51
+ image_reconstructions, self.decoder.decode(latent_predictions)
52
+ )
53
+ if self.scale_input_to_tgt_size:
54
+ image_inputs = torch.nn.functional.interpolate(
55
+ image_inputs,
56
+ image_reconstructions.shape[2:],
57
+ mode="bicubic",
58
+ antialias=True,
59
+ )
60
+ elif self.scale_tgt_to_input_size:
61
+ image_reconstructions = torch.nn.functional.interpolate(
62
+ image_reconstructions,
63
+ image_inputs.shape[2:],
64
+ mode="bicubic",
65
+ antialias=True,
66
+ )
67
+
68
+ perceptual_loss2 = self.perceptual_loss(
69
+ image_inputs.contiguous(), image_reconstructions.contiguous()
70
+ )
71
+ loss = loss + self.perceptual_weight_on_inputs * perceptual_loss2.mean()
72
+ log[f"{split}/perceptual_loss_on_inputs"] = perceptual_loss2.mean().detach()
73
+ return loss, log
sgm/modules/autoencoding/lpips/__init__.py ADDED
File without changes
sgm/modules/autoencoding/lpips/loss/.gitignore ADDED
@@ -0,0 +1 @@
1
+ vgg.pth
sgm/modules/autoencoding/lpips/loss/LICENSE ADDED
@@ -0,0 +1,23 @@
1
+ Copyright (c) 2018, Richard Zhang, Phillip Isola, Alexei A. Efros, Eli Shechtman, Oliver Wang
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ * Redistributions of source code must retain the above copyright notice, this
8
+ list of conditions and the following disclaimer.
9
+
10
+ * Redistributions in binary form must reproduce the above copyright notice,
11
+ this list of conditions and the following disclaimer in the documentation
12
+ and/or other materials provided with the distribution.
13
+
14
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sgm/modules/autoencoding/lpips/loss/__init__.py ADDED
File without changes
sgm/modules/autoencoding/lpips/loss/lpips.py ADDED
@@ -0,0 +1,147 @@
1
+ """Stripped version of https://github.com/richzhang/PerceptualSimilarity/tree/master/models"""
2
+
3
+ from collections import namedtuple
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from torchvision import models
8
+
9
+ from ..util import get_ckpt_path
10
+
11
+
12
+ class LPIPS(nn.Module):
13
+ # Learned perceptual metric
14
+ def __init__(self, use_dropout=True):
15
+ super().__init__()
16
+ self.scaling_layer = ScalingLayer()
17
+ self.chns = [64, 128, 256, 512, 512] # vgg16 features
18
+ self.net = vgg16(pretrained=True, requires_grad=False)
19
+ self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
20
+         self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
+         self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
+         self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
+         self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
+         self.load_from_pretrained()
+         for param in self.parameters():
+             param.requires_grad = False
+
+     def load_from_pretrained(self, name="vgg_lpips"):
+         ckpt = get_ckpt_path(name, "sgm/modules/autoencoding/lpips/loss")
+         self.load_state_dict(
+             torch.load(ckpt, map_location=torch.device("cpu")), strict=False
+         )
+         print("loaded pretrained LPIPS loss from {}".format(ckpt))
+
+     @classmethod
+     def from_pretrained(cls, name="vgg_lpips"):
+         if name != "vgg_lpips":
+             raise NotImplementedError
+         model = cls()
+         ckpt = get_ckpt_path(name)
+         model.load_state_dict(
+             torch.load(ckpt, map_location=torch.device("cpu")), strict=False
+         )
+         return model
+
+     def forward(self, input, target):
+         in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target))
+         outs0, outs1 = self.net(in0_input), self.net(in1_input)
+         feats0, feats1, diffs = {}, {}, {}
+         lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
+         for kk in range(len(self.chns)):
+             feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(
+                 outs1[kk]
+             )
+             diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
+
+         res = [
+             spatial_average(lins[kk].model(diffs[kk]), keepdim=True)
+             for kk in range(len(self.chns))
+         ]
+         val = res[0]
+         for l in range(1, len(self.chns)):
+             val += res[l]
+         return val
+
+
+ class ScalingLayer(nn.Module):
+     def __init__(self):
+         super(ScalingLayer, self).__init__()
+         self.register_buffer(
+             "shift", torch.Tensor([-0.030, -0.088, -0.188])[None, :, None, None]
+         )
+         self.register_buffer(
+             "scale", torch.Tensor([0.458, 0.448, 0.450])[None, :, None, None]
+         )
+
+     def forward(self, inp):
+         return (inp - self.shift) / self.scale
+
+
+ class NetLinLayer(nn.Module):
+     """A single linear layer which does a 1x1 conv"""
+
+     def __init__(self, chn_in, chn_out=1, use_dropout=False):
+         super(NetLinLayer, self).__init__()
+         layers = (
+             [
+                 nn.Dropout(),
+             ]
+             if (use_dropout)
+             else []
+         )
+         layers += [
+             nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False),
+         ]
+         self.model = nn.Sequential(*layers)
+
+
+ class vgg16(torch.nn.Module):
+     def __init__(self, requires_grad=False, pretrained=True):
+         super(vgg16, self).__init__()
+         vgg_pretrained_features = models.vgg16(pretrained=pretrained).features
+         self.slice1 = torch.nn.Sequential()
+         self.slice2 = torch.nn.Sequential()
+         self.slice3 = torch.nn.Sequential()
+         self.slice4 = torch.nn.Sequential()
+         self.slice5 = torch.nn.Sequential()
+         self.N_slices = 5
+         for x in range(4):
+             self.slice1.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(4, 9):
+             self.slice2.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(9, 16):
+             self.slice3.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(16, 23):
+             self.slice4.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(23, 30):
+             self.slice5.add_module(str(x), vgg_pretrained_features[x])
+         if not requires_grad:
+             for param in self.parameters():
+                 param.requires_grad = False
+
+     def forward(self, X):
+         h = self.slice1(X)
+         h_relu1_2 = h
+         h = self.slice2(h)
+         h_relu2_2 = h
+         h = self.slice3(h)
+         h_relu3_3 = h
+         h = self.slice4(h)
+         h_relu4_3 = h
+         h = self.slice5(h)
+         h_relu5_3 = h
+         vgg_outputs = namedtuple(
+             "VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"]
+         )
+         out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
+         return out
+
+
+ def normalize_tensor(x, eps=1e-10):
+     norm_factor = torch.sqrt(torch.sum(x**2, dim=1, keepdim=True))
+     return x / (norm_factor + eps)
+
+
+ def spatial_average(x, keepdim=True):
+     return x.mean([2, 3], keepdim=keepdim)
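A minimal usage sketch for the LPIPS metric defined above. The import path mirrors this repo's layout and the [-1, 1] input range is an assumption based on the ScalingLayer constants, not something stated in the diff:

import torch
from sgm.modules.autoencoding.lpips.loss.lpips import LPIPS  # assumed module path

lpips = LPIPS().eval()                       # loads the VGG-16 backbone and pretrained linear heads
x = torch.rand(2, 3, 256, 256) * 2.0 - 1.0   # "reconstruction" batch, assumed scaled to [-1, 1]
y = torch.rand(2, 3, 256, 256) * 2.0 - 1.0   # "target" batch, same scaling
with torch.no_grad():
    dist = lpips(x, y)                       # shape (2, 1, 1, 1): one perceptual distance per sample
print(dist.flatten())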
sgm/modules/autoencoding/lpips/model/LICENSE ADDED
@@ -0,0 +1,58 @@
+ Copyright (c) 2017, Jun-Yan Zhu and Taesung Park
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+ --------------------------- LICENSE FOR pix2pix --------------------------------
+ BSD License
+
+ For pix2pix software
+ Copyright (c) 2016, Phillip Isola and Jun-Yan Zhu
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ ----------------------------- LICENSE FOR DCGAN --------------------------------
+ BSD License
+
+ For dcgan.torch software
+
+ Copyright (c) 2015, Facebook, Inc. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+ Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+ Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+ Neither the name Facebook nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sgm/modules/autoencoding/lpips/model/__init__.py ADDED
File without changes
sgm/modules/autoencoding/lpips/model/model.py ADDED
@@ -0,0 +1,88 @@
+ import functools
+
+ import torch.nn as nn
+
+ from ..util import ActNorm
+
+
+ def weights_init(m):
+     classname = m.__class__.__name__
+     if classname.find("Conv") != -1:
+         nn.init.normal_(m.weight.data, 0.0, 0.02)
+     elif classname.find("BatchNorm") != -1:
+         nn.init.normal_(m.weight.data, 1.0, 0.02)
+         nn.init.constant_(m.bias.data, 0)
+
+
+ class NLayerDiscriminator(nn.Module):
+     """Defines a PatchGAN discriminator as in Pix2Pix
+     --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
+     """
+
+     def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
+         """Construct a PatchGAN discriminator
+         Parameters:
+             input_nc (int)  -- the number of channels in input images
+             ndf (int)       -- the number of filters in the last conv layer
+             n_layers (int)  -- the number of conv layers in the discriminator
+             norm_layer      -- normalization layer
+         """
+         super(NLayerDiscriminator, self).__init__()
+         if not use_actnorm:
+             norm_layer = nn.BatchNorm2d
+         else:
+             norm_layer = ActNorm
+         if (
+             type(norm_layer) == functools.partial
+         ):  # no need to use bias as BatchNorm2d has affine parameters
+             use_bias = norm_layer.func != nn.BatchNorm2d
+         else:
+             use_bias = norm_layer != nn.BatchNorm2d
+
+         kw = 4
+         padw = 1
+         sequence = [
+             nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw),
+             nn.LeakyReLU(0.2, True),
+         ]
+         nf_mult = 1
+         nf_mult_prev = 1
+         for n in range(1, n_layers):  # gradually increase the number of filters
+             nf_mult_prev = nf_mult
+             nf_mult = min(2**n, 8)
+             sequence += [
+                 nn.Conv2d(
+                     ndf * nf_mult_prev,
+                     ndf * nf_mult,
+                     kernel_size=kw,
+                     stride=2,
+                     padding=padw,
+                     bias=use_bias,
+                 ),
+                 norm_layer(ndf * nf_mult),
+                 nn.LeakyReLU(0.2, True),
+             ]
+
+         nf_mult_prev = nf_mult
+         nf_mult = min(2**n_layers, 8)
+         sequence += [
+             nn.Conv2d(
+                 ndf * nf_mult_prev,
+                 ndf * nf_mult,
+                 kernel_size=kw,
+                 stride=1,
+                 padding=padw,
+                 bias=use_bias,
+             ),
+             norm_layer(ndf * nf_mult),
+             nn.LeakyReLU(0.2, True),
+         ]
+
+         sequence += [
+             nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)
+         ]  # output 1 channel prediction map
+         self.main = nn.Sequential(*sequence)
+
+     def forward(self, input):
+         """Standard forward."""
+         return self.main(input)
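As a rough sketch (import path and input size are assumptions, not part of the diff), the PatchGAN discriminator above is typically built, initialized with weights_init, and applied to an image batch like so:

import torch
from sgm.modules.autoencoding.lpips.model.model import NLayerDiscriminator, weights_init  # assumed path

disc = NLayerDiscriminator(input_nc=3, ndf=64, n_layers=3, use_actnorm=False)
disc.apply(weights_init)                    # normal(0.0, 0.02) init for the conv weights
logits = disc(torch.randn(4, 3, 256, 256))  # random images only for illustration
print(logits.shape)                         # (4, 1, 30, 30): a patch-wise real/fake prediction map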
sgm/modules/autoencoding/lpips/util.py ADDED
@@ -0,0 +1,128 @@
+ import hashlib
+ import os
+
+ import requests
+ import torch
+ import torch.nn as nn
+ from tqdm import tqdm
+
+ URL_MAP = {"vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"}
+
+ CKPT_MAP = {"vgg_lpips": "vgg.pth"}
+
+ MD5_MAP = {"vgg_lpips": "d507d7349b931f0638a25a48a722f98a"}
+
+
+ def download(url, local_path, chunk_size=1024):
+     os.makedirs(os.path.split(local_path)[0], exist_ok=True)
+     with requests.get(url, stream=True) as r:
+         total_size = int(r.headers.get("content-length", 0))
+         with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
+             with open(local_path, "wb") as f:
+                 for data in r.iter_content(chunk_size=chunk_size):
+                     if data:
+                         f.write(data)
+                         pbar.update(chunk_size)
+
+
+ def md5_hash(path):
+     with open(path, "rb") as f:
+         content = f.read()
+     return hashlib.md5(content).hexdigest()
+
+
+ def get_ckpt_path(name, root, check=False):
+     assert name in URL_MAP
+     path = os.path.join(root, CKPT_MAP[name])
+     if not os.path.exists(path) or (check and not md5_hash(path) == MD5_MAP[name]):
+         print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
+         download(URL_MAP[name], path)
+         md5 = md5_hash(path)
+         assert md5 == MD5_MAP[name], md5
+     return path
+
+
+ class ActNorm(nn.Module):
+     def __init__(
+         self, num_features, logdet=False, affine=True, allow_reverse_init=False
+     ):
+         assert affine
+         super().__init__()
+         self.logdet = logdet
+         self.loc = nn.Parameter(torch.zeros(1, num_features, 1, 1))
+         self.scale = nn.Parameter(torch.ones(1, num_features, 1, 1))
+         self.allow_reverse_init = allow_reverse_init
+
+         self.register_buffer("initialized", torch.tensor(0, dtype=torch.uint8))
+
+     def initialize(self, input):
+         with torch.no_grad():
+             flatten = input.permute(1, 0, 2, 3).contiguous().view(input.shape[1], -1)
+             mean = (
+                 flatten.mean(1)
+                 .unsqueeze(1)
+                 .unsqueeze(2)
+                 .unsqueeze(3)
+                 .permute(1, 0, 2, 3)
+             )
+             std = (
+                 flatten.std(1)
+                 .unsqueeze(1)
+                 .unsqueeze(2)
+                 .unsqueeze(3)
+                 .permute(1, 0, 2, 3)
+             )
+
+             self.loc.data.copy_(-mean)
+             self.scale.data.copy_(1 / (std + 1e-6))
+
+     def forward(self, input, reverse=False):
+         if reverse:
+             return self.reverse(input)
+         if len(input.shape) == 2:
+             input = input[:, :, None, None]
+             squeeze = True
+         else:
+             squeeze = False
+
+         _, _, height, width = input.shape
+
+         if self.training and self.initialized.item() == 0:
+             self.initialize(input)
+             self.initialized.fill_(1)
+
+         h = self.scale * (input + self.loc)
+
+         if squeeze:
+             h = h.squeeze(-1).squeeze(-1)
+
+         if self.logdet:
+             log_abs = torch.log(torch.abs(self.scale))
+             logdet = height * width * torch.sum(log_abs)
+             logdet = logdet * torch.ones(input.shape[0]).to(input)
+             return h, logdet
+
+         return h
+
+     def reverse(self, output):
+         if self.training and self.initialized.item() == 0:
+             if not self.allow_reverse_init:
+                 raise RuntimeError(
+                     "Initializing ActNorm in reverse direction is "
+                     "disabled by default. Use allow_reverse_init=True to enable."
+                 )
+             else:
+                 self.initialize(output)
+                 self.initialized.fill_(1)
+
+         if len(output.shape) == 2:
+             output = output[:, :, None, None]
+             squeeze = True
+         else:
+             squeeze = False
+
+         h = output / self.scale - self.loc
+
+         if squeeze:
+             h = h.squeeze(-1).squeeze(-1)
+         return h
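A small sketch of how ActNorm behaves (module path, shapes, and the tolerance are assumptions): the first forward pass in training mode does data-dependent initialization, and the reverse pass inverts the affine map.

import torch
from sgm.modules.autoencoding.lpips.util import ActNorm  # assumed module path

norm = ActNorm(num_features=8, logdet=True)
norm.train()
x = torch.randn(4, 8, 16, 16)
h, logdet = norm(x)                          # first training call initializes loc/scale from this batch
x_rec = norm(h, reverse=True)                # inverse transform: h / scale - loc
print(torch.allclose(x, x_rec, atol=1e-4))   # expected True, up to floating-point error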
sgm/modules/autoencoding/lpips/vqperceptual.py ADDED
@@ -0,0 +1,17 @@
+ import torch
+ import torch.nn.functional as F
+
+
+ def hinge_d_loss(logits_real, logits_fake):
+     loss_real = torch.mean(F.relu(1.0 - logits_real))
+     loss_fake = torch.mean(F.relu(1.0 + logits_fake))
+     d_loss = 0.5 * (loss_real + loss_fake)
+     return d_loss
+
+
+ def vanilla_d_loss(logits_real, logits_fake):
+     d_loss = 0.5 * (
+         torch.mean(torch.nn.functional.softplus(-logits_real))
+         + torch.mean(torch.nn.functional.softplus(logits_fake))
+     )
+     return d_loss
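For intuition, a toy comparison of the two discriminator objectives above (the logits are made-up values and the import path is an assumption):

import torch
from sgm.modules.autoencoding.lpips.vqperceptual import hinge_d_loss, vanilla_d_loss  # assumed path

logits_real = torch.tensor([2.0, 0.5, -0.2])
logits_fake = torch.tensor([-1.5, 0.1, 0.8])
print(hinge_d_loss(logits_real, logits_fake))    # hinge: penalizes real logits below 1 and fake logits above -1
print(vanilla_d_loss(logits_real, logits_fake))  # softplus-based (vanilla GAN) discriminator loss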
sgm/modules/autoencoding/regularizers/__init__.py ADDED
@@ -0,0 +1,31 @@
+ from abc import abstractmethod
+ from typing import Any, Tuple
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from ....modules.distributions.distributions import \
+     DiagonalGaussianDistribution
+ from .base import AbstractRegularizer
+
+
+ class DiagonalGaussianRegularizer(AbstractRegularizer):
+     def __init__(self, sample: bool = True):
+         super().__init__()
+         self.sample = sample
+
+     def get_trainable_parameters(self) -> Any:
+         yield from ()
+
+     def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, dict]:
+         log = dict()
+         posterior = DiagonalGaussianDistribution(z)
+         if self.sample:
+             z = posterior.sample()
+         else:
+             z = posterior.mode()
+         kl_loss = posterior.kl()
+         kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
+         log["kl_loss"] = kl_loss
+         return z, log
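A sketch of the regularizer's contract, assuming DiagonalGaussianDistribution splits the channel dimension into mean and log-variance halves as in the taming-transformers implementation (the import path and shapes are also assumptions):

import torch
from sgm.modules.autoencoding.regularizers import DiagonalGaussianRegularizer  # assumed path

reg = DiagonalGaussianRegularizer(sample=True)
z_params = torch.randn(2, 8, 32, 32)  # 2 * latent channels: [mean | logvar] along dim 1
z, log = reg(z_params)
print(z.shape)          # expected (2, 4, 32, 32) after sampling from the posterior
print(log["kl_loss"])   # scalar KL term, summed per sample and averaged over the batch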
sgm/modules/autoencoding/regularizers/base.py ADDED
@@ -0,0 +1,40 @@
+ from abc import abstractmethod
+ from typing import Any, Tuple
+
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+
+
+ class AbstractRegularizer(nn.Module):
+     def __init__(self):
+         super().__init__()
+
+     def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, dict]:
+         raise NotImplementedError()
+
+     @abstractmethod
+     def get_trainable_parameters(self) -> Any:
+         raise NotImplementedError()
+
+
+ class IdentityRegularizer(AbstractRegularizer):
+     def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, dict]:
+         return z, dict()
+
+     def get_trainable_parameters(self) -> Any:
+         yield from ()
+
+
+ def measure_perplexity(
+     predicted_indices: torch.Tensor, num_centroids: int
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     # src: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py
+     # eval cluster perplexity. when perplexity == num_embeddings then all clusters are used exactly equally
+     encodings = (
+         F.one_hot(predicted_indices, num_centroids).float().reshape(-1, num_centroids)
+     )
+     avg_probs = encodings.mean(0)
+     perplexity = (-(avg_probs * torch.log(avg_probs + 1e-10)).sum()).exp()
+     cluster_use = torch.sum(avg_probs > 0)
+     return perplexity, cluster_use
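A quick sanity check of measure_perplexity (illustrative only; the import path is an assumption): when every centroid is used equally often, the perplexity equals the number of centroids and cluster_use counts all of them.

import torch
from sgm.modules.autoencoding.regularizers.base import measure_perplexity  # assumed path

codes = torch.arange(8).repeat(16)  # 128 indices, each of 8 centroids used 16 times
perplexity, cluster_use = measure_perplexity(codes, num_centroids=8)
print(perplexity)    # approximately 8.0
print(cluster_use)   # tensor(8)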