slothfulxtx committed on
Commit
ca2145e
·
1 Parent(s): 774e213
.gitignore ADDED
@@ -0,0 +1,178 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
163
+
164
+ # Vscode settings
165
+ .vscode/
166
+
167
+ # Temporal files
168
+ /tmp
169
+ tmp*
170
+
171
+ # Workspace
172
+ /workspace
173
+
174
+ # running scripts
175
+ /*.sh
176
+
177
+ # pretrained
178
+ /pretrained_models
.gitmodules ADDED
@@ -0,0 +1,3 @@
1
+ [submodule "third_party/moge"]
2
+ path = third_party/moge
3
+ url = https://github.com/microsoft/MoGe.git
app.py ADDED
@@ -0,0 +1,436 @@
1
+ import gc
2
+ import os
3
+ import uuid
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import spaces
8
+ import gradio as gr
9
+ import torch
10
+ from decord import cpu, VideoReader
11
+ from diffusers.training_utils import set_seed
12
+ import torch.nn.functional as F
13
+ import imageio
14
+ from kornia.filters import canny
15
+ from kornia.morphology import dilation
16
+
17
+ from third_party import MoGe
18
+ from geometrycrafter import (
19
+ GeometryCrafterDiffPipeline,
20
+ GeometryCrafterDetermPipeline,
21
+ PMapAutoencoderKLTemporalDecoder,
22
+ UNetSpatioTemporalConditionModelVid2vid
23
+ )
24
+
25
+ from utils.glb_utils import pmap_to_glb
26
+ from utils.disp_utils import pmap_to_disp
27
+
28
+ examples = [
29
+ # process_length: int,
30
+ # max_res: int,
31
+ # num_inference_steps: int,
32
+ # guidance_scale: float,
33
+ # window_size: int,
34
+ # decode_chunk_size: int,
35
+ # overlap: int,
36
+ ["examples/video1.mp4", 60, 640, 5, 1.0, 110, 8, 25],
37
+ ["examples/video2.mp4", 60, 640, 5, 1.0, 110, 8, 25],
38
+ ["examples/video3.mp4", 60, 640, 5, 1.0, 110, 8, 25],
39
+ ["examples/video4.mp4", 60, 640, 5, 1.0, 110, 8, 25],
40
+ ]
41
+
42
+ model_type = 'diff'
43
+ cache_dir = 'workspace/cache'
44
+
45
+ unet = UNetSpatioTemporalConditionModelVid2vid.from_pretrained(
46
+ 'TencentARC/GeometryCrafter',
47
+ subfolder='unet_diff' if model_type == 'diff' else 'unet_determ',
48
+ low_cpu_mem_usage=True,
49
+ torch_dtype=torch.float16,
50
+ cache_dir=cache_dir
51
+ ).requires_grad_(False).to("cuda", dtype=torch.float16)
52
+ point_map_vae = PMapAutoencoderKLTemporalDecoder.from_pretrained(
53
+ 'TencentARC/GeometryCrafter',
54
+ subfolder='point_map_vae',
55
+ low_cpu_mem_usage=True,
56
+ torch_dtype=torch.float32,
57
+ cache_dir=cache_dir
58
+ ).requires_grad_(False).to("cuda", dtype=torch.float32)
59
+ prior_model = MoGe(
60
+ cache_dir=cache_dir,
61
+ ).requires_grad_(False).to('cuda', dtype=torch.float32)
62
+ if model_type == 'diff':
63
+ pipe = GeometryCrafterDiffPipeline.from_pretrained(
64
+ 'stabilityai/stable-video-diffusion-img2vid-xt',
65
+ unet=unet,
66
+ torch_dtype=torch.float16,
67
+ variant="fp16",
68
+ cache_dir=cache_dir
69
+ ).to("cuda")
70
+ else:
71
+ pipe = GeometryCrafterDetermPipeline.from_pretrained(
72
+ 'stabilityai/stable-video-diffusion-img2vid-xt',
73
+ unet=unet,
74
+ torch_dtype=torch.float16,
75
+ variant="fp16",
76
+ cache_dir=cache_dir
77
+ ).to("cuda")
78
+
79
+ try:
80
+ pipe.enable_xformers_memory_efficient_attention()
81
+ except Exception as e:
82
+ print(e)
83
+ print("Xformers is not enabled")
84
+ # bugs at https://github.com/continue-revolution/sd-webui-animatediff/issues/101
85
+ # pipe.enable_xformers_memory_efficient_attention()
86
+ pipe.enable_attention_slicing()
87
+
88
+ mesh_seqs = []
89
+ frame_seqs = []
90
+ cur_mesh_idx = None
91
+
92
+ def read_video_frames(video_path, process_length, max_res):
93
+ print("==> processing video: ", video_path)
94
+ vid = VideoReader(video_path, ctx=cpu(0))
95
+ fps = vid.get_avg_fps()
96
+ print("==> original video shape: ", (len(vid), *vid.get_batch([0]).shape[1:]))
97
+ original_height, original_width = vid.get_batch([0]).shape[1:3]
98
+ if max(original_height, original_width) > max_res:
99
+ scale = max_res / max(original_height, original_width)
100
+ original_height, original_width = round(original_height * scale), round(original_width * scale)
101
+ else:
102
+ scale = 1.0
103
+ height = round(original_height / 64) * 64 # original_height/width already carry the scale factor
104
+ width = round(original_width / 64) * 64
105
+ vid = VideoReader(video_path, ctx=cpu(0), width=original_width, height=original_height)
106
+ frames_idx = list(range(0, min(len(vid), process_length) if process_length != -1 else len(vid)))
107
+ print(
108
+ f"==> final processing shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}"
109
+ )
110
+ frames = vid.get_batch(frames_idx).asnumpy().astype("float32") / 255.0
111
+ return frames, height, width, fps
112
+
113
+
114
+ def compute_edge_mask(depth: torch.Tensor, edge_dilation_radius: int):
115
+ magnitude, edges = canny(depth[None, None, :, :], low_threshold=0.4, high_threshold=0.5)
116
+ magnitude = magnitude[0, 0]
117
+ edges = edges[0, 0]
118
+ mask = (edges > 0).float()
119
+ mask = dilation(mask[None, None, :, :], torch.ones((edge_dilation_radius,edge_dilation_radius), device=mask.device))
120
+ return mask[0, 0] > 0.5
121
+
122
+ @spaces.GPU(duration=120)
123
+ @torch.inference_mode()
124
+ def infer_geometry(
125
+ video: str,
126
+ process_length: int,
127
+ max_res: int,
128
+ num_inference_steps: int,
129
+ guidance_scale: float,
130
+ window_size: int,
131
+ decode_chunk_size: int,
132
+ overlap: int,
133
+ downsample_ratio: float = 1.0, # downsample pcd for visualization
134
+ num_sample_frames: int =8, # downsample frames for visualization
135
+ remove_edge: bool = True, # remove edge for visualization
136
+ save_folder: str = os.path.join('workspace', 'GeometryCrafterApp'),
137
+ ):
138
+ try:
139
+ global cur_mesh_idx, mesh_seqs, frame_seqs
140
+ run_id = str(uuid.uuid4())
141
+ set_seed(42)
142
+ pipe.enable_xformers_memory_efficient_attention()
143
+
144
+ frames, height, width, fps = read_video_frames(video, process_length, max_res)
145
+ aspect_ratio = width / height
146
+ assert 0.5 <= aspect_ratio <= 2.0
147
+ frames_tensor = torch.tensor(frames.astype("float32"), device='cuda').float().permute(0, 3, 1, 2)
148
+ window_size = min(window_size, len(frames))
149
+ if window_size == len(frames):
150
+ overlap = 0
151
+
152
+ point_maps, valid_masks = pipe(
153
+ frames_tensor,
154
+ point_map_vae,
155
+ prior_model,
156
+ height=height,
157
+ width=width,
158
+ num_inference_steps=num_inference_steps,
159
+ guidance_scale=guidance_scale,
160
+ window_size=window_size,
161
+ decode_chunk_size=decode_chunk_size,
162
+ overlap=overlap,
163
+ force_projection=True,
164
+ force_fixed_focal=True,
165
+ )
166
+ frames_tensor = frames_tensor.cpu()
167
+ point_maps = point_maps.cpu()
168
+ valid_masks = valid_masks.cpu()
169
+
170
+ gc.collect()
171
+ torch.cuda.empty_cache()
172
+ output_npz_path = Path(save_folder, run_id, 'point_maps.npz')
173
+ output_npz_path.parent.mkdir(parents=True, exist_ok=True)
174
+
175
+
176
+ np.savez_compressed(
177
+ output_npz_path,
178
+ point_map=point_maps.cpu().numpy().astype(np.float16),
179
+ valid_mask=valid_masks.cpu().numpy().astype(np.bool_)
180
+ )
181
+
182
+ output_disp_path = Path(save_folder, run_id, 'disp.mp4')
183
+ output_disp_path.parent.mkdir(exist_ok=True)
184
+
185
+ colored_disp = pmap_to_disp(point_maps, valid_masks)
186
+ imageio.mimsave(
187
+ output_disp_path, (colored_disp*255).cpu().numpy().astype(np.uint8), fps=fps, macro_block_size=1)
188
+
189
+
190
+ # downsample for visualization
191
+ if downsample_ratio > 1.0:
192
+ H, W = point_maps.shape[1:3]
193
+ H, W = round(H / downsample_ratio), round(W / downsample_ratio)
194
+ point_maps = F.interpolate(point_maps.permute(0,3,1,2), (H, W)).permute(0,2,3,1)
195
+ frames = F.interpolate(frames_tensor, (H, W)).permute(0,2,3,1)
196
+ valid_masks = F.interpolate(valid_masks.float()[:, None], (H, W))[:, 0] > 0.5
197
+ else:
198
+ H, W = point_maps.shape[1:3]
199
+ frames = frames_tensor.permute(0,2,3,1)
200
+
201
+
202
+ if remove_edge:
203
+ for i in range(len(valid_masks)):
204
+ edge_mask = compute_edge_mask(point_maps[i, :, :, 2], 3)
205
+ valid_masks[i] = valid_masks[i] & (~edge_mask)
206
+
207
+ indices = np.linspace(0, len(point_maps)-1, num_sample_frames)
208
+ indices = np.round(indices).astype(np.int32)
209
+
210
+ mesh_seqs.clear()
+ frame_seqs.clear()
211
+ cur_mesh_idx = None
212
+
213
+ for index in indices:
214
+
215
+ valid_mask = valid_masks[index].cpu().numpy()
216
+ point_map = point_maps[index].cpu().numpy()
217
+ frame = frames[index].cpu().numpy()
218
+ output_glb_path = Path(save_folder, run_id, f'{index:04}.glb')
219
+ output_glb_path.parent.mkdir(exist_ok=True)
220
+ glbscene = pmap_to_glb(point_map, valid_mask, frame)
221
+ glbscene.export(file_obj=output_glb_path)
222
+ mesh_seqs.append(output_glb_path)
223
+ frame_seqs.append(index)
224
+
225
+ cur_mesh_idx = 0
226
+
227
+ gc.collect()
228
+ torch.cuda.empty_cache()
229
+
230
+ return [
231
+ gr.Model3D(value=mesh_seqs[cur_mesh_idx], label=f"Frame: {frame_seqs[cur_mesh_idx]}"),
232
+ gr.Video(value=output_disp_path, label="Disparity", interactive=False),
233
+ gr.DownloadButton("Download Npz File", value=output_npz_path, visible=True)
234
+ ]
235
+ except Exception as e:
236
+ mesh_seqs.clear()
237
+ frame_seqs.clear()
238
+ cur_mesh_idx = None
239
+ gc.collect()
240
+ torch.cuda.empty_cache()
241
+ raise gr.Error(str(e))
242
+ # return [
243
+ # gr.Model3D(
244
+ # label="Point Map",
245
+ # clear_color=[1.0, 1.0, 1.0, 1.0],
246
+ # interactive=False
247
+ # ),
248
+ # gr.Video(label="Disparity", interactive=False),
249
+ # gr.DownloadButton("Download Npz File", visible=False)
250
+ # ]
251
+
252
+ def goto_prev_frame():
253
+ global cur_mesh_idx, mesh_seqs, frame_seqs
254
+ if cur_mesh_idx is not None and len(mesh_seqs) > 0:
255
+ if cur_mesh_idx > 0:
256
+ cur_mesh_idx -= 1
257
+ return gr.Model3D(value=mesh_seqs[cur_mesh_idx], label=f"Frame: {frame_seqs[cur_mesh_idx]}")
258
+
259
+
260
+ def goto_next_frame():
261
+ global cur_mesh_idx, mesh_seqs, frame_seqs
262
+ if cur_mesh_idx is not None and len(mesh_seqs) > 0:
263
+ if cur_mesh_idx < len(mesh_seqs)-1:
264
+ cur_mesh_idx += 1
265
+ return gr.Model3D(value=mesh_seqs[cur_mesh_idx], label=f"Frame: {frame_seqs[cur_mesh_idx]}")
266
+
267
+ def download_file():
268
+ return gr.DownloadButton(visible=False)
269
+
270
+ def build_demo():
271
+ with gr.Blocks(analytics_enabled=False) as gradio_demo:
272
+ gr.Markdown(
273
+ """
274
+ <div align='center'>
275
+ <h1> GeometryCrafter: Consistent Geometry Estimation for Open-world Videos with Diffusion Priors </h1> \
276
+ <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
277
+ <a href='https://scholar.google.com/citations?user=zHp0rMIAAAAJ'>Tian-Xing Xu</a>, \
278
+ <a href='https://scholar.google.com/citations?user=qgdesEcAAAAJ'>Xiangjun Gao</a>, \
279
+ <a href='https://wbhu.github.io'>Wenbo Hu</a>, \
280
+ <a href='https://xiaoyu258.github.io/'>Xiaoyu Li</a>, \
281
+ <a href='https://scholar.google.com/citations?user=AWtV-EQAAAAJ'>Song-Hai Zhang</a>,\
282
+ <a href='https://scholar.google.com/citations?user=4oXBp9UAAAAJ'>Ying Shan</a>\
283
+ </h2> \
284
+ <span style='font-size:18px'>If you find GeometryCrafter useful, please help ⭐ the \
285
+ <a style='font-size:18px' href='https://github.com/TencentARC/GeometryCrafter/'>[Github Repo]</a>\
286
+ , which helps support open-source projects. Thanks!\
287
+ <a style='font-size:18px' href='https://arxiv.org'> [ArXivTODO] </a>\
288
+ <a style='font-size:18px' href='https://geometrycrafter.github.io'> [Project Page] </a>
289
+ </span>
290
+ </div>
291
+ """
292
+ )
293
+
294
+ with gr.Row(equal_height=True):
295
+ with gr.Column(scale=1):
296
+ input_video = gr.Video(
297
+ label="Input Video",
298
+ sources=['upload']
299
+ )
300
+ with gr.Row(equal_height=False):
301
+ with gr.Accordion("Advanced Settings", open=False):
302
+ process_length = gr.Slider(
303
+ label="process length",
304
+ minimum=-1,
305
+ maximum=280,
306
+ value=110,
307
+ step=1,
308
+ )
309
+ max_res = gr.Slider(
310
+ label="max resolution",
311
+ minimum=512,
312
+ maximum=2048,
313
+ value=1024,
314
+ step=64,
315
+ )
316
+ num_denoising_steps = gr.Slider(
317
+ label="num denoising steps",
318
+ minimum=1,
319
+ maximum=25,
320
+ value=5,
321
+ step=1,
322
+ )
323
+ guidance_scale = gr.Slider(
324
+ label="cfg scale",
325
+ minimum=1.0,
326
+ maximum=1.2,
327
+ value=1.0,
328
+ step=0.1,
329
+ )
330
+ window_size = gr.Slider(
331
+ label="shift window size",
332
+ minimum=10,
333
+ maximum=110,
334
+ value=110,
335
+ step=10,
336
+ )
337
+ decode_chunk_size = gr.Slider(
338
+ label="decode chunk size",
339
+ minimum=1,
340
+ maximum=16,
341
+ value=6,
342
+ step=1,
343
+ )
344
+ overlap = gr.Slider(
345
+ label="overlap",
346
+ minimum=1,
347
+ maximum=50,
348
+ value=25,
349
+ step=1,
350
+ )
351
+ generate_btn = gr.Button("Generate")
352
+
353
+ with gr.Column(scale=1):
354
+ output_point_maps = gr.Model3D(
355
+ label="Point Map",
356
+ clear_color=[1.0, 1.0, 1.0, 1.0],
357
+ # display_mode="solid"
358
+ interactive=False
359
+ )
360
+ with gr.Row():
361
+ prev_btn = gr.Button("Prev")
362
+ next_btn = gr.Button("Next")
363
+
364
+ with gr.Column(scale=1):
365
+ output_disp_video = gr.Video(
366
+ label="Disparity",
367
+ interactive=False
368
+ )
369
+ download_btn = gr.DownloadButton("Download Npz File", visible=False)
370
+
371
+ gr.Examples(
372
+ examples=examples,
373
+ fn=infer_geometry,
374
+ inputs=[
375
+ input_video,
376
+ process_length,
377
+ max_res,
378
+ num_denoising_steps,
379
+ guidance_scale,
380
+ window_size,
381
+ decode_chunk_size,
382
+ overlap,
383
+ ],
384
+ outputs=[output_point_maps, output_disp_video, download_btn],
385
+ # cache_examples="lazy",
386
+ )
387
+ gr.Markdown(
388
+ """
389
+ <span style='font-size:18px'>Note:
390
+ To stay within the GPU time quota, the default parameters here favor efficiency,
391
+ at the cost of shorter video length and slightly lower quality.
392
+ If you have enough quota, you may adjust the parameters following our
393
+ <a style='font-size:18px' href='https://github.com/TencentARC/GeometryCrafter/'>[Github Repo]</a>
394
+ for better results. We only provide a simplified visualization
395
+ script on this page, since point cloud sequences are not supported here. You can download
396
+ the npz file and open it with the Viser backend in our repo for better visualization.
397
+ </span>
398
+ """
399
+ )
400
+
401
+ generate_btn.click(
402
+ fn=infer_geometry,
403
+ inputs=[
404
+ input_video,
405
+ process_length,
406
+ max_res,
407
+ num_denoising_steps,
408
+ guidance_scale,
409
+ window_size,
410
+ decode_chunk_size,
411
+ overlap,
412
+ ],
413
+ outputs=[output_point_maps, output_disp_video, download_btn],
414
+ )
415
+
416
+ prev_btn.click(
417
+ fn=goto_prev_frame,
418
+ outputs=output_point_maps,
419
+ )
420
+ next_btn.click(
421
+ fn=goto_next_frame,
422
+ outputs=output_point_maps,
423
+ )
424
+ download_btn.click(
425
+ fn=download_file,
426
+ outputs=download_btn
427
+ )
428
+
429
+ return gradio_demo
430
+
431
+
432
+ if __name__ == "__main__":
433
+ demo = build_demo()
434
+ demo.queue()
435
+ demo.launch(server_name="0.0.0.0", server_port=12345, debug=True, share=False)
436
+ # demo.launch(share=True)
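Note on the saved outputs: infer_geometry above writes the results to point_maps.npz via np.savez_compressed, with keys 'point_map' (float16, shape T,H,W,3) and 'valid_mask' (bool, shape T,H,W), and suggests opening them with the Viser backend in the repo. A minimal sketch, not part of this commit, of loading that archive offline for inspection; the <run_id> placeholder is hypothetical and stands for the per-run UUID folder created by the app:

import numpy as np

# Keys follow the np.savez_compressed call in infer_geometry above.
data = np.load("workspace/GeometryCrafterApp/<run_id>/point_maps.npz")  # <run_id>: hypothetical placeholder
point_map = data["point_map"].astype(np.float32)  # (T, H, W, 3), stored as float16
valid_mask = data["valid_mask"]                   # (T, H, W), boolean
depth = point_map[..., 2]                         # z channel holds per-pixel depth
print(point_map.shape, valid_mask.shape, float(depth[valid_mask].mean()))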
geometrycrafter/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .pmap_vae import PMapAutoencoderKLTemporalDecoder
2
+ from .unet import UNetSpatioTemporalConditionModelVid2vid
3
+ from .diff_ppl import GeometryCrafterDiffPipeline
4
+ from .determ_ppl import GeometryCrafterDetermPipeline
geometrycrafter/determ_ppl.py ADDED
@@ -0,0 +1,453 @@
1
+ from typing import Optional, Union
2
+ import gc
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+
8
+ from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import (
9
+ _resize_with_antialiasing,
10
+ StableVideoDiffusionPipeline,
11
+ )
12
+ from diffusers.utils import logging
13
+ from kornia.utils import create_meshgrid
14
+ from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
15
+
16
+
17
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
18
+
19
+ @torch.no_grad()
20
+ def normalize_point_map(point_map, valid_mask):
21
+ # T,H,W,3 T,H,W
22
+ norm_factor = (point_map[..., 2] * valid_mask.float()).mean() / (valid_mask.float().mean() + 1e-8)
23
+ norm_factor = norm_factor.clip(min=1e-3)
24
+ return point_map / norm_factor
25
+
26
+ def point_map_xy2intrinsic_map(point_map_xy):
27
+ # *,h,w,2
28
+ height, width = point_map_xy.shape[-3], point_map_xy.shape[-2]
29
+ assert height % 2 == 0
30
+ assert width % 2 == 0
31
+ mesh_grid = create_meshgrid(
32
+ height=height,
33
+ width=width,
34
+ normalized_coordinates=True,
35
+ device=point_map_xy.device,
36
+ dtype=point_map_xy.dtype
37
+ )[0] # h,w,2
38
+ assert mesh_grid.abs().min() > 1e-4
39
+ # *,h,w,2
40
+ mesh_grid = mesh_grid.expand_as(point_map_xy)
41
+ nc = point_map_xy.mean(dim=-2).mean(dim=-2) # *, 2
42
+ nc_map = nc[..., None, None, :].expand_as(point_map_xy)
43
+ nf = ((point_map_xy - nc_map) / mesh_grid).mean(dim=-2).mean(dim=-2)
44
+ nf_map = nf[..., None, None, :].expand_as(point_map_xy)
45
+ # print((mesh_grid * nf_map + nc_map - point_map_xy).abs().max())
46
+
47
+ return torch.cat([nc_map, nf_map], dim=-1)
48
+
49
+ def robust_min_max(tensor, quantile=0.99):
50
+ T, H, W = tensor.shape
51
+ min_vals = []
52
+ max_vals = []
53
+ for i in range(T):
54
+ min_vals.append(torch.quantile(tensor[i], q=1-quantile, interpolation='nearest').item())
55
+ max_vals.append(torch.quantile(tensor[i], q=quantile, interpolation='nearest').item())
56
+ return min(min_vals), max(max_vals)
57
+
58
+ class GeometryCrafterDetermPipeline(StableVideoDiffusionPipeline):
59
+
60
+ @torch.inference_mode()
61
+ def encode_video(
62
+ self,
63
+ video: torch.Tensor,
64
+ chunk_size: int = 14,
65
+ ) -> torch.Tensor:
66
+ """
67
+ :param video: [b, c, h, w] in range [-1, 1]; b may span multiple videos or frames
68
+ :param chunk_size: the chunk size to encode video
69
+ :return: image_embeddings in shape of [b, 1024]
70
+ """
71
+
72
+ video_224 = _resize_with_antialiasing(video.float(), (224, 224))
73
+ video_224 = (video_224 + 1.0) / 2.0 # [-1, 1] -> [0, 1]
74
+ embeddings = []
75
+ for i in range(0, video_224.shape[0], chunk_size):
76
+ emb = self.feature_extractor(
77
+ images=video_224[i : i + chunk_size],
78
+ do_normalize=True,
79
+ do_center_crop=False,
80
+ do_resize=False,
81
+ do_rescale=False,
82
+ return_tensors="pt",
83
+ ).pixel_values.to(video.device, dtype=video.dtype)
84
+ embeddings.append(self.image_encoder(emb).image_embeds) # [b, 1024]
85
+
86
+ embeddings = torch.cat(embeddings, dim=0) # [t, 1024]
87
+ return embeddings
88
+
89
+ @torch.inference_mode()
90
+ def encode_vae_video(
91
+ self,
92
+ video: torch.Tensor,
93
+ chunk_size: int = 14,
94
+ ):
95
+ """
96
+ :param video: [b, c, h, w] in range [-1, 1]; b may span multiple videos or frames
97
+ :param chunk_size: the chunk size to encode video
98
+ :return: vae latents in shape of [b, c, h, w]
99
+ """
100
+ video_latents = []
101
+ for i in range(0, video.shape[0], chunk_size):
102
+ video_latents.append(
103
+ self.vae.encode(video[i : i + chunk_size]).latent_dist.mode()
104
+ )
105
+ video_latents = torch.cat(video_latents, dim=0)
106
+ return video_latents
107
+
108
+
109
+ @torch.inference_mode()
110
+ def produce_priors(self, prior_model, frame, chunk_size=8):
111
+ T, _, H, W = frame.shape
112
+ frame = (frame + 1) / 2
113
+ pred_point_maps = []
114
+ pred_masks = []
115
+ for i in range(0, len(frame), chunk_size):
116
+ pred_p, pred_m = prior_model.forward_image(frame[i:i+chunk_size])
117
+ pred_point_maps.append(pred_p)
118
+ pred_masks.append(pred_m)
119
+ pred_point_maps = torch.cat(pred_point_maps, dim=0)
120
+ pred_masks = torch.cat(pred_masks, dim=0)
121
+
122
+ pred_masks = pred_masks.float() * 2 - 1
123
+
124
+ # T,H,W,3 T,H,W
125
+ pred_point_maps = normalize_point_map(pred_point_maps, pred_masks > 0)
126
+
127
+ pred_disps = 1.0 / pred_point_maps[..., 2].clamp_min(1e-3)
128
+ pred_disps = pred_disps * (pred_masks > 0)
129
+ min_disparity, max_disparity = robust_min_max(pred_disps)
130
+ pred_disps = ((pred_disps - min_disparity) / (max_disparity - min_disparity+1e-4)).clamp(0, 1)
131
+ pred_disps = pred_disps * 2 - 1
132
+
133
+ pred_point_maps[..., :2] = pred_point_maps[..., :2] / (pred_point_maps[..., 2:3] + 1e-7)
134
+ pred_point_maps[..., 2] = torch.log(pred_point_maps[..., 2] + 1e-7) * (pred_masks > 0) # [x/z, y/z, log(z)]
135
+
136
+ pred_intr_maps = point_map_xy2intrinsic_map(pred_point_maps[..., :2]).permute(0,3,1,2) # T,H,W,2
137
+ pred_point_maps = pred_point_maps.permute(0,3,1,2)
138
+
139
+ return pred_disps, pred_masks, pred_point_maps, pred_intr_maps
140
+
141
+ @torch.inference_mode()
142
+ def encode_point_map(self, point_map_vae, disparity, valid_mask, point_map, intrinsic_map, chunk_size=8):
143
+ T, _, H, W = point_map.shape
144
+ latents = []
145
+
146
+ pseudo_image = disparity[:, None].repeat(1,3,1,1)
147
+ intrinsic_map = torch.norm(intrinsic_map[:, 2:4], p=2, dim=1, keepdim=False)
148
+
149
+ for i in range(0, T, chunk_size):
150
+ latent_dist = self.vae.encode(pseudo_image[i : i + chunk_size].to(self.vae.dtype)).latent_dist
151
+ latent_dist = point_map_vae.encode(
152
+ torch.cat([
153
+ intrinsic_map[i:i+chunk_size, None],
154
+ point_map[i:i+chunk_size, 2:3],
155
+ disparity[i:i+chunk_size, None],
156
+ valid_mask[i:i+chunk_size, None]], dim=1),
157
+ latent_dist
158
+ )
159
+ if isinstance(latent_dist, DiagonalGaussianDistribution):
160
+ latent = latent_dist.mode()
161
+ else:
162
+ latent = latent_dist
163
+
164
+ assert isinstance(latent, torch.Tensor)
165
+ latents.append(latent)
166
+ latents = torch.cat(latents, dim=0)
167
+ latents = latents * self.vae.config.scaling_factor
168
+ return latents
169
+
170
+ @torch.no_grad()
171
+ def decode_point_map(self, point_map_vae, latents, chunk_size=8, force_projection=True, force_fixed_focal=True, use_extract_interp=False, need_resize=False, height=None, width=None):
172
+ T = latents.shape[0]
173
+ rec_intrinsic_maps = []
174
+ rec_depth_maps = []
175
+ rec_valid_masks = []
176
+ for i in range(0, T, chunk_size):
177
+ lat = latents[i:i+chunk_size]
178
+ rec_imap, rec_dmap, rec_vmask = point_map_vae.decode(
179
+ lat,
180
+ num_frames=lat.shape[0],
181
+ )
182
+ rec_intrinsic_maps.append(rec_imap)
183
+ rec_depth_maps.append(rec_dmap)
184
+ rec_valid_masks.append(rec_vmask)
185
+
186
+ rec_intrinsic_maps = torch.cat(rec_intrinsic_maps, dim=0)
187
+ rec_depth_maps = torch.cat(rec_depth_maps, dim=0)
188
+ rec_valid_masks = torch.cat(rec_valid_masks, dim=0)
189
+
190
+ if need_resize:
191
+ rec_depth_maps = F.interpolate(rec_depth_maps, (height, width), mode='nearest-exact') if use_extract_interp else F.interpolate(rec_depth_maps, (height, width), mode='bilinear', align_corners=False)
192
+ rec_valid_masks = F.interpolate(rec_valid_masks, (height, width), mode='nearest-exact') if use_extract_interp else F.interpolate(rec_valid_masks, (height, width), mode='bilinear', align_corners=False)
193
+ rec_intrinsic_maps = F.interpolate(rec_intrinsic_maps, (height, width), mode='bilinear', align_corners=False)
194
+
195
+ H, W = rec_intrinsic_maps.shape[-2], rec_intrinsic_maps.shape[-1]
196
+ mesh_grid = create_meshgrid(
197
+ H, W,
198
+ normalized_coordinates=True
199
+ ).to(rec_intrinsic_maps.device, rec_intrinsic_maps.dtype, non_blocking=True)
200
+ # 1,h,w,2
201
+ rec_intrinsic_maps = torch.cat([rec_intrinsic_maps * W / np.sqrt(W**2+H**2), rec_intrinsic_maps * H / np.sqrt(W**2+H**2)], dim=1) # t,2,h,w
202
+ mesh_grid = mesh_grid.permute(0,3,1,2)
203
+ rec_valid_masks = rec_valid_masks.squeeze(1) > 0
204
+
205
+ if force_projection:
206
+ if force_fixed_focal:
207
+ nfx = (rec_intrinsic_maps[:, 0, :, :] * rec_valid_masks.float()).mean() / (rec_valid_masks.float().mean() + 1e-4)
208
+ nfy = (rec_intrinsic_maps[:, 1, :, :] * rec_valid_masks.float()).mean() / (rec_valid_masks.float().mean() + 1e-4)
209
+ rec_intrinsic_maps = torch.tensor([nfx, nfy], device=rec_intrinsic_maps.device)[None, :, None, None].repeat(T, 1, 1, 1)
210
+ else:
211
+ nfx = (rec_intrinsic_maps[:, 0, :, :] * rec_valid_masks.float()).mean(dim=[-1, -2]) / (rec_valid_masks.float().mean(dim=[-1, -2]) + 1e-4)
212
+ nfy = (rec_intrinsic_maps[:, 1, :, :] * rec_valid_masks.float()).mean(dim=[-1, -2]) / (rec_valid_masks.float().mean(dim=[-1, -2]) + 1e-4)
213
+ rec_intrinsic_maps = torch.stack([nfx, nfy], dim=-1)[:, :, None, None]
214
+ # t,2,1,1
215
+
216
+ rec_point_maps = torch.cat([rec_intrinsic_maps * mesh_grid, rec_depth_maps], dim=1).permute(0,2,3,1)
217
+ xy, z = rec_point_maps.split([2, 1], dim=-1)
218
+ z = torch.clamp_max(z, 10) # for numerical stability
219
+ z = torch.exp(z)
220
+ rec_point_maps = torch.cat([xy * z, z], dim=-1)
221
+
222
+ return rec_point_maps, rec_valid_masks
223
+
224
+
225
+ @torch.no_grad()
226
+ def __call__(
227
+ self,
228
+ video: Union[np.ndarray, torch.Tensor],
229
+ point_map_vae,
230
+ prior_model,
231
+ height: int = 576,
232
+ width: int = 1024,
233
+ window_size: Optional[int] = 14,
234
+ noise_aug_strength: float = 0.02,
235
+ decode_chunk_size: Optional[int] = None,
236
+ overlap: int = 4,
237
+ force_projection: bool = True,
238
+ force_fixed_focal: bool = True,
239
+ use_extract_interp: bool = False,
240
+ track_time: bool = False,
241
+ **kwargs
242
+ ):
243
+ # video: in shape [t, h, w, c] if np.ndarray or [t, c, h, w] if torch.Tensor, in range [0, 1]
244
+
245
+ # 0. Define height and width for preprocessing
246
+
247
+ if isinstance(video, np.ndarray):
248
+ video = torch.from_numpy(video.transpose(0, 3, 1, 2))
249
+ else:
250
+ assert isinstance(video, torch.Tensor)
251
+
252
+ height = height or video.shape[-2]
253
+ width = width or video.shape[-1]
254
+ original_height = video.shape[-2]
255
+ original_width = video.shape[-1]
256
+ num_frames = video.shape[0]
257
+ decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else 8
258
+ if num_frames <= window_size:
259
+ window_size = num_frames
260
+ overlap = 0
261
+ stride = window_size - overlap
262
+
263
+ # 1. Check inputs. Raise error if not correct
264
+ assert height % 64 == 0 and width % 64 == 0
265
+ if original_height != height or original_width != width:
266
+ need_resize = True
267
+ else:
268
+ need_resize = False
269
+
270
+ # 2. Define call parameters
271
+ batch_size = 1
272
+ device = self._execution_device
273
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
274
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
275
+ # corresponds to doing no classifier free guidance.
276
+ self._guidance_scale = 1.0
277
+
278
+ if track_time:
279
+ start_event = torch.cuda.Event(enable_timing=True)
280
+ prior_event = torch.cuda.Event(enable_timing=True)
281
+ encode_event = torch.cuda.Event(enable_timing=True)
282
+ denoise_event = torch.cuda.Event(enable_timing=True)
283
+ decode_event = torch.cuda.Event(enable_timing=True)
284
+ start_event.record()
285
+
286
+ # 3. Compute prior latents under original resolutions
287
+ pred_disparity, pred_valid_mask, pred_point_map, pred_intrinsic_map = self.produce_priors(
288
+ prior_model,
289
+ video.to(device=device, dtype=torch.float32),
290
+ chunk_size=decode_chunk_size
291
+ ) # T,H,W T,H,W T,3,H,W T,2,H,W
292
+
293
+ if need_resize:
294
+ pred_disparity = F.interpolate(pred_disparity.unsqueeze(1), (height, width), mode='bilinear', align_corners=False).squeeze(1)
295
+ pred_valid_mask = F.interpolate(pred_valid_mask.unsqueeze(1), (height, width), mode='bilinear', align_corners=False).squeeze(1)
296
+ pred_point_map = F.interpolate(pred_point_map, (height, width), mode='bilinear', align_corners=False)
297
+ pred_intrinsic_map = F.interpolate(pred_intrinsic_map, (height, width), mode='bilinear', align_corners=False)
298
+
299
+ if track_time:
300
+ prior_event.record()
301
+ torch.cuda.synchronize()
302
+ elapsed_time_ms = start_event.elapsed_time(prior_event)
303
+ print(f"Elapsed time for computing per-frame prior: {elapsed_time_ms} ms")
304
+ else:
305
+ gc.collect()
306
+ torch.cuda.empty_cache()
307
+
308
+
309
+
310
+ # 3. Encode input video
311
+ if need_resize:
312
+ video = F.interpolate(video, (height, width), mode="bicubic", align_corners=False, antialias=True).clamp(0, 1)
313
+
314
+ video = video.to(device=device, dtype=self.dtype)
315
+ video = video * 2.0 - 1.0 # [0,1] -> [-1,1], in [t, c, h, w]
316
+
317
+
318
+ video_embeddings = self.encode_video(video, chunk_size=decode_chunk_size).unsqueeze(0)
319
+
320
+ prior_latents = self.encode_point_map(
321
+ point_map_vae,
322
+ pred_disparity,
323
+ pred_valid_mask,
324
+ pred_point_map,
325
+ pred_intrinsic_map,
326
+ chunk_size=decode_chunk_size
327
+ ).unsqueeze(0).to(video_embeddings.dtype) # 1,T,C,H,W
328
+
329
+
330
+ # 4. Encode input image using VAE
331
+
332
+ # pdb.set_trace()
333
+ needs_upcasting = (
334
+ self.vae.dtype == torch.float16 and self.vae.config.force_upcast
335
+ )
336
+ if needs_upcasting:
337
+ self.vae.to(dtype=torch.float32)
338
+
339
+ video_latents = self.encode_vae_video(
340
+ video.to(self.vae.dtype),
341
+ chunk_size=decode_chunk_size,
342
+ ).unsqueeze(0).to(video_embeddings.dtype) # [1, t, c, h, w]
343
+
344
+
345
+ if track_time:
346
+ encode_event.record()
347
+ torch.cuda.synchronize()
348
+ elapsed_time_ms = prior_event.elapsed_time(encode_event)
349
+ print(f"Elapsed time for encode prior and frames: {elapsed_time_ms} ms")
350
+ else:
351
+ gc.collect()
352
+ torch.cuda.empty_cache()
353
+
354
+ # cast back to fp16 if needed
355
+ if needs_upcasting:
356
+ self.vae.to(dtype=torch.float16)
357
+
358
+ # 5. Get Added Time IDs
359
+ added_time_ids = self._get_add_time_ids(
360
+ 7,
361
+ 127,
362
+ noise_aug_strength,
363
+ video_embeddings.dtype,
364
+ batch_size,
365
+ 1,
366
+ False,
367
+ ) # [1 or 2, 3]
368
+ added_time_ids = added_time_ids.to(device)
369
+
370
+ # 6. Prepare timesteps
371
+ timestep = 1.6378
372
+ self._num_timesteps = 1
373
+
374
+ # 7. Prepare latent variables
375
+ num_channels_latents = self.unet.config.in_channels
376
+ latents_init = prior_latents # [1, t, c, h, w]
377
+ latents_all = None
378
+
379
+ idx_start = 0
380
+ if overlap > 0:
381
+ weights = torch.linspace(0, 1, overlap, device=device)
382
+ weights = weights.view(1, overlap, 1, 1, 1)
383
+ else:
384
+ weights = None
385
+
386
+ while idx_start < num_frames - overlap:
387
+ idx_end = min(idx_start + window_size, num_frames)
388
+ # 9. Denoising loop
389
+ # latents_init = latents_init.flip(1)
390
+ latents = latents_init[:, idx_start:idx_end]
391
+ video_latents_current = video_latents[:, idx_start:idx_end]
392
+ video_embeddings_current = video_embeddings[:, idx_start:idx_end]
393
+
394
+ latent_model_input = torch.cat(
395
+ [latents, video_latents_current], dim=2
396
+ )
397
+
398
+ model_pred = self.unet(
399
+ latent_model_input,
400
+ timestep,
401
+ encoder_hidden_states=video_embeddings_current,
402
+ added_time_ids=added_time_ids,
403
+ return_dict=False,
404
+ )[0]
405
+
406
+ c_out = -1
407
+ latents = model_pred * c_out
408
+
409
+ if latents_all is None:
410
+ latents_all = latents.clone()
411
+ else:
412
+ if overlap > 0:
413
+ latents_all[:, -overlap:] = latents[
414
+ :, :overlap
415
+ ] * weights + latents_all[:, -overlap:] * (1 - weights)
416
+ latents_all = torch.cat([latents_all, latents[:, overlap:]], dim=1)
417
+
418
+ idx_start += stride
419
+
420
+ latents_all = 1 / self.vae.config.scaling_factor * latents_all.squeeze(0).to(torch.float32)
421
+
422
+ if track_time:
423
+ denoise_event.record()
424
+ torch.cuda.synchronize()
425
+ elapsed_time_ms = encode_event.elapsed_time(denoise_event)
426
+ print(f"Elapsed time for denoise latent: {elapsed_time_ms} ms")
427
+ else:
428
+ gc.collect()
429
+ torch.cuda.empty_cache()
430
+
431
+ point_map, valid_mask = self.decode_point_map(
432
+ point_map_vae,
433
+ latents_all,
434
+ chunk_size=decode_chunk_size,
435
+ force_projection=force_projection,
436
+ force_fixed_focal=force_fixed_focal,
437
+ use_extract_interp=use_extract_interp,
438
+ need_resize=need_resize,
439
+ height=original_height,
440
+ width=original_width)
441
+
442
+ if track_time:
443
+ decode_event.record()
444
+ torch.cuda.synchronize()
445
+ elapsed_time_ms = denoise_event.elapsed_time(decode_event)
446
+ print(f"Elapsed time for decode latent: {elapsed_time_ms} ms")
447
+ else:
448
+ gc.collect()
449
+ torch.cuda.empty_cache()
450
+
451
+ self.maybe_free_model_hooks()
452
+ # t,h,w,3 t,h,w
453
+ return point_map, valid_mask
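Both pipelines handle long videos with a sliding window: each window of latents is predicted separately, and the overlapping frames are linearly cross-faded using weights = torch.linspace(0, 1, overlap) (see __call__ above). A minimal standalone sketch of that blending rule, not part of this commit, using dummy per-window "latents" with toy shapes chosen only for illustration:

import torch

num_frames, window_size, overlap = 20, 8, 3
stride = window_size - overlap

def run_window(start, end):
    # Stand-in for a per-window model prediction: [1, t, c, h, w], toy sizes here.
    return torch.full((1, end - start, 1, 1, 1), float(start))

weights = torch.linspace(0, 1, overlap).view(1, overlap, 1, 1, 1)

latents_all = None
idx_start = 0
while idx_start < num_frames - overlap:
    idx_end = min(idx_start + window_size, num_frames)
    latents = run_window(idx_start, idx_end)
    if latents_all is None:
        latents_all = latents.clone()
    else:
        # Blend the trailing `overlap` frames of the running result with the
        # leading `overlap` frames of the new window, then append the rest.
        latents_all[:, -overlap:] = latents[:, :overlap] * weights + latents_all[:, -overlap:] * (1 - weights)
        latents_all = torch.cat([latents_all, latents[:, overlap:]], dim=1)
    idx_start += stride

print(latents_all.shape)  # torch.Size([1, 20, 1, 1, 1]) -- one latent per frame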
geometrycrafter/diff_ppl.py ADDED
@@ -0,0 +1,526 @@
1
+ from typing import Callable, Dict, List, Optional, Union
2
+ import gc
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+
8
+ from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import (
9
+ _resize_with_antialiasing,
10
+ StableVideoDiffusionPipeline,
11
+ retrieve_timesteps,
12
+ )
13
+ from diffusers.utils import logging
14
+ from kornia.utils import create_meshgrid
15
+ from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
16
+
17
+
18
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
19
+
20
+ @torch.no_grad()
21
+ def normalize_point_map(point_map, valid_mask):
22
+ # T,H,W,3 T,H,W
23
+ norm_factor = (point_map[..., 2] * valid_mask.float()).mean() / (valid_mask.float().mean() + 1e-8)
24
+ norm_factor = norm_factor.clip(min=1e-3)
25
+ return point_map / norm_factor
26
+
27
+ def point_map_xy2intrinsic_map(point_map_xy):
28
+ # *,h,w,2
29
+ height, width = point_map_xy.shape[-3], point_map_xy.shape[-2]
30
+ assert height % 2 == 0
31
+ assert width % 2 == 0
32
+ mesh_grid = create_meshgrid(
33
+ height=height,
34
+ width=width,
35
+ normalized_coordinates=True,
36
+ device=point_map_xy.device,
37
+ dtype=point_map_xy.dtype
38
+ )[0] # h,w,2
39
+ assert mesh_grid.abs().min() > 1e-4
40
+ # *,h,w,2
41
+ mesh_grid = mesh_grid.expand_as(point_map_xy)
42
+ nc = point_map_xy.mean(dim=-2).mean(dim=-2) # *, 2
43
+ nc_map = nc[..., None, None, :].expand_as(point_map_xy)
44
+ nf = ((point_map_xy - nc_map) / mesh_grid).mean(dim=-2).mean(dim=-2)
45
+ nf_map = nf[..., None, None, :].expand_as(point_map_xy)
46
+ # print((mesh_grid * nf_map + nc_map - point_map_xy).abs().max())
47
+
48
+ return torch.cat([nc_map, nf_map], dim=-1)
49
+
50
+ def robust_min_max(tensor, quantile=0.99):
51
+ T, H, W = tensor.shape
52
+ min_vals = []
53
+ max_vals = []
54
+ for i in range(T):
55
+ min_vals.append(torch.quantile(tensor[i], q=1-quantile, interpolation='nearest').item())
56
+ max_vals.append(torch.quantile(tensor[i], q=quantile, interpolation='nearest').item())
57
+ return min(min_vals), max(max_vals)
58
+
59
+ class GeometryCrafterDiffPipeline(StableVideoDiffusionPipeline):
60
+
61
+ @torch.inference_mode()
62
+ def encode_video(
63
+ self,
64
+ video: torch.Tensor,
65
+ chunk_size: int = 14,
66
+ ) -> torch.Tensor:
67
+ """
68
+ :param video: [b, c, h, w] in range [-1, 1]; b may span multiple videos or frames
69
+ :param chunk_size: the chunk size to encode video
70
+ :return: image_embeddings in shape of [b, 1024]
71
+ """
72
+
73
+ video_224 = _resize_with_antialiasing(video.float(), (224, 224))
74
+ video_224 = (video_224 + 1.0) / 2.0 # [-1, 1] -> [0, 1]
75
+ embeddings = []
76
+ for i in range(0, video_224.shape[0], chunk_size):
77
+ emb = self.feature_extractor(
78
+ images=video_224[i : i + chunk_size],
79
+ do_normalize=True,
80
+ do_center_crop=False,
81
+ do_resize=False,
82
+ do_rescale=False,
83
+ return_tensors="pt",
84
+ ).pixel_values.to(video.device, dtype=video.dtype)
85
+ embeddings.append(self.image_encoder(emb).image_embeds) # [b, 1024]
86
+
87
+ embeddings = torch.cat(embeddings, dim=0) # [t, 1024]
88
+ return embeddings
89
+
90
+ @torch.inference_mode()
91
+ def encode_vae_video(
92
+ self,
93
+ video: torch.Tensor,
94
+ chunk_size: int = 14,
95
+ ):
96
+ """
97
+ :param video: [b, c, h, w] in range [-1, 1]; b may span multiple videos or frames
98
+ :param chunk_size: the chunk size to encode video
99
+ :return: vae latents in shape of [b, c, h, w]
100
+ """
101
+ video_latents = []
102
+ for i in range(0, video.shape[0], chunk_size):
103
+ video_latents.append(
104
+ self.vae.encode(video[i : i + chunk_size]).latent_dist.mode()
105
+ )
106
+ video_latents = torch.cat(video_latents, dim=0)
107
+ return video_latents
108
+
109
+ @torch.inference_mode()
110
+ def produce_priors(self, prior_model, frame, chunk_size=8):
111
+ T, _, H, W = frame.shape
112
+ frame = (frame + 1) / 2
113
+ pred_point_maps = []
114
+ pred_masks = []
115
+ for i in range(0, len(frame), chunk_size):
116
+ pred_p, pred_m = prior_model.forward_image(frame[i:i+chunk_size])
117
+ pred_point_maps.append(pred_p)
118
+ pred_masks.append(pred_m)
119
+ pred_point_maps = torch.cat(pred_point_maps, dim=0)
120
+ pred_masks = torch.cat(pred_masks, dim=0)
121
+
122
+ pred_masks = pred_masks.float() * 2 - 1
123
+
124
+ # T,H,W,3 T,H,W
125
+ pred_point_maps = normalize_point_map(pred_point_maps, pred_masks > 0)
126
+
127
+ pred_disps = 1.0 / pred_point_maps[..., 2].clamp_min(1e-3)
128
+ pred_disps = pred_disps * (pred_masks > 0)
129
+ min_disparity, max_disparity = robust_min_max(pred_disps)
130
+ pred_disps = ((pred_disps - min_disparity) / (max_disparity - min_disparity+1e-4)).clamp(0, 1)
131
+ pred_disps = pred_disps * 2 - 1
132
+
133
+ pred_point_maps[..., :2] = pred_point_maps[..., :2] / (pred_point_maps[..., 2:3] + 1e-7)
134
+ pred_point_maps[..., 2] = torch.log(pred_point_maps[..., 2] + 1e-7) * (pred_masks > 0) # [x/z, y/z, log(z)]
135
+
136
+ pred_intr_maps = point_map_xy2intrinsic_map(pred_point_maps[..., :2]).permute(0,3,1,2) # T,H,W,2
137
+ pred_point_maps = pred_point_maps.permute(0,3,1,2)
138
+
139
+ return pred_disps, pred_masks, pred_point_maps, pred_intr_maps
140
+
141
+ @torch.inference_mode()
142
+ def encode_point_map(self, point_map_vae, disparity, valid_mask, point_map, intrinsic_map, chunk_size=8):
143
+ T, _, H, W = point_map.shape
144
+ latents = []
145
+
146
+ psedo_image = disparity[:, None].repeat(1,3,1,1)
147
+ intrinsic_map = torch.norm(intrinsic_map[:, 2:4], p=2, dim=1, keepdim=False)
148
+
149
+ for i in range(0, T, chunk_size):
150
+ latent_dist = self.vae.encode(pseudo_image[i : i + chunk_size].to(self.vae.dtype)).latent_dist
151
+ latent_dist = point_map_vae.encode(
152
+ torch.cat([
153
+ intrinsic_map[i:i+chunk_size, None],
154
+ point_map[i:i+chunk_size, 2:3],
155
+ disparity[i:i+chunk_size, None],
156
+ valid_mask[i:i+chunk_size, None]], dim=1),
157
+ latent_dist
158
+ )
159
+ if isinstance(latent_dist, DiagonalGaussianDistribution):
160
+ latent = latent_dist.mode()
161
+ else:
162
+ latent = latent_dist
163
+
164
+ assert isinstance(latent, torch.Tensor)
165
+ latents.append(latent)
166
+ latents = torch.cat(latents, dim=0)
167
+ latents = latents * self.vae.config.scaling_factor
168
+ return latents
169
+
170
+ @torch.no_grad()
171
+ def decode_point_map(self, point_map_vae, latents, chunk_size=8, force_projection=True, force_fixed_focal=True, use_extract_interp=False, need_resize=False, height=None, width=None):
172
+ T = latents.shape[0]
173
+ rec_intrinsic_maps = []
174
+ rec_depth_maps = []
175
+ rec_valid_masks = []
176
+ for i in range(0, T, chunk_size):
177
+ lat = latents[i:i+chunk_size]
178
+ rec_imap, rec_dmap, rec_vmask = point_map_vae.decode(
179
+ lat,
180
+ num_frames=lat.shape[0],
181
+ )
182
+ rec_intrinsic_maps.append(rec_imap)
183
+ rec_depth_maps.append(rec_dmap)
184
+ rec_valid_masks.append(rec_vmask)
185
+
186
+ rec_intrinsic_maps = torch.cat(rec_intrinsic_maps, dim=0)
187
+ rec_depth_maps = torch.cat(rec_depth_maps, dim=0)
188
+ rec_valid_masks = torch.cat(rec_valid_masks, dim=0)
189
+
190
+ if need_resize:
191
+ rec_depth_maps = F.interpolate(rec_depth_maps, (height, width), mode='nearest-exact') if use_extract_interp else F.interpolate(rec_depth_maps, (height, width), mode='bilinear', align_corners=False)
192
+ rec_valid_masks = F.interpolate(rec_valid_masks, (height, width), mode='nearest-exact') if use_extract_interp else F.interpolate(rec_valid_masks, (height, width), mode='bilinear', align_corners=False)
193
+ rec_intrinsic_maps = F.interpolate(rec_intrinsic_maps, (height, width), mode='bilinear', align_corners=False)
194
+
195
+ H, W = rec_intrinsic_maps.shape[-2], rec_intrinsic_maps.shape[-1]
196
+ mesh_grid = create_meshgrid(
197
+ H, W,
198
+ normalized_coordinates=True
199
+ ).to(rec_intrinsic_maps.device, rec_intrinsic_maps.dtype, non_blocking=True)
200
+ # 1,h,w,2
201
+ rec_intrinsic_maps = torch.cat([rec_intrinsic_maps * W / np.sqrt(W**2+H**2), rec_intrinsic_maps * H / np.sqrt(W**2+H**2)], dim=1) # t,2,h,w
202
+ mesh_grid = mesh_grid.permute(0,3,1,2)
203
+ rec_valid_masks = rec_valid_masks.squeeze(1) > 0
204
+
205
+ if force_projection:
206
+ if force_fixed_focal:
207
+ nfx = (rec_intrinsic_maps[:, 0, :, :] * rec_valid_masks.float()).mean() / (rec_valid_masks.float().mean() + 1e-4)
208
+ nfy = (rec_intrinsic_maps[:, 1, :, :] * rec_valid_masks.float()).mean() / (rec_valid_masks.float().mean() + 1e-4)
209
+ rec_intrinsic_maps = torch.tensor([nfx, nfy], device=rec_intrinsic_maps.device)[None, :, None, None].repeat(T, 1, 1, 1)
210
+ else:
211
+ nfx = (rec_intrinsic_maps[:, 0, :, :] * rec_valid_masks.float()).mean(dim=[-1, -2]) / (rec_valid_masks.float().mean(dim=[-1, -2]) + 1e-4)
212
+ nfy = (rec_intrinsic_maps[:, 1, :, :] * rec_valid_masks.float()).mean(dim=[-1, -2]) / (rec_valid_masks.float().mean(dim=[-1, -2]) + 1e-4)
213
+ rec_intrinsic_maps = torch.stack([nfx, nfy], dim=-1)[:, :, None, None]
214
+ # t,2,1,1
215
+
216
+ rec_point_maps = torch.cat([rec_intrinsic_maps * mesh_grid, rec_depth_maps], dim=1).permute(0,2,3,1)
217
+ xy, z = rec_point_maps.split([2, 1], dim=-1)
218
+ z = torch.clamp_max(z, 10) # for numerical stability
219
+ z = torch.exp(z)
220
+ rec_point_maps = torch.cat([xy * z, z], dim=-1)
221
+
222
+ return rec_point_maps, rec_valid_masks
223
+
224
+
225
+ @torch.no_grad()
226
+ def __call__(
227
+ self,
228
+ video: Union[np.ndarray, torch.Tensor],
229
+ point_map_vae,
230
+ prior_model,
231
+ height: int = 320,
232
+ width: int = 640,
233
+ num_inference_steps: int = 5,
234
+ guidance_scale: float = 1.0,
235
+ window_size: Optional[int] = 14,
236
+ noise_aug_strength: float = 0.02,
237
+ decode_chunk_size: Optional[int] = None,
238
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
239
+ latents: Optional[torch.FloatTensor] = None,
240
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
241
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
242
+ overlap: int = 4,
243
+ force_projection: bool = True,
244
+ force_fixed_focal: bool = True,
245
+ use_extract_interp: bool = False,
246
+ track_time: bool = False,
247
+ ):
248
+
249
+ # video: in shape [t, h, w, c] if np.ndarray or [t, c, h, w] if torch.Tensor, in range [0, 1]
250
+
251
+ # 0. Default height and width to unet
252
+ if isinstance(video, np.ndarray):
253
+ video = torch.from_numpy(video.transpose(0, 3, 1, 2))
254
+ else:
255
+ assert isinstance(video, torch.Tensor)
256
+ height = height or video.shape[-2]
257
+ width = width or video.shape[-1]
258
+ original_height = video.shape[-2]
259
+ original_width = video.shape[-1]
260
+ num_frames = video.shape[0]
261
+ decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else 8
262
+ if num_frames <= window_size:
263
+ window_size = num_frames
264
+ overlap = 0
265
+ stride = window_size - overlap
266
+
267
+ # 1. Check inputs. Raise error if not correct
268
+ assert height % 64 == 0 and width % 64 == 0
269
+ if original_height != height or original_width != width:
270
+ need_resize = True
271
+ else:
272
+ need_resize = False
273
+
274
+ # 2. Define call parameters
275
+ batch_size = 1
276
+ device = self._execution_device
277
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
278
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
279
+ # corresponds to doing no classifier free guidance.
280
+ self._guidance_scale = guidance_scale
281
+
282
+ if track_time:
283
+ start_event = torch.cuda.Event(enable_timing=True)
284
+ prior_event = torch.cuda.Event(enable_timing=True)
285
+ encode_event = torch.cuda.Event(enable_timing=True)
286
+ denoise_event = torch.cuda.Event(enable_timing=True)
287
+ decode_event = torch.cuda.Event(enable_timing=True)
288
+ start_event.record()
289
+
290
+ # 3. Compute per-frame priors at the original resolution
291
+ pred_disparity, pred_valid_mask, pred_point_map, pred_intrinsic_map = self.produce_priors(
292
+ prior_model,
293
+ video.to(device=device, dtype=torch.float32),
294
+ chunk_size=decode_chunk_size
295
+ ) # T,H,W T,H,W T,3,H,W T,2,H,W
296
+
297
+ if need_resize:
298
+ pred_disparity = F.interpolate(pred_disparity.unsqueeze(1), (height, width), mode='bilinear', align_corners=False).squeeze(1)
299
+ pred_valid_mask = F.interpolate(pred_valid_mask.unsqueeze(1), (height, width), mode='bilinear', align_corners=False).squeeze(1)
300
+ pred_point_map = F.interpolate(pred_point_map, (height, width), mode='bilinear', align_corners=False)
301
+ pred_intrinsic_map = F.interpolate(pred_intrinsic_map, (height, width), mode='bilinear', align_corners=False)
302
+
303
+
304
+ if track_time:
305
+ prior_event.record()
306
+ torch.cuda.synchronize()
307
+ elapsed_time_ms = start_event.elapsed_time(prior_event)
308
+ print(f"Elapsed time for computing per-frame prior: {elapsed_time_ms} ms")
309
+ else:
310
+ gc.collect()
311
+ torch.cuda.empty_cache()
312
+
313
+
314
+ # 3. Encode input video
315
+ if need_resize:
316
+ video = F.interpolate(video, (height, width), mode="bicubic", align_corners=False, antialias=True).clamp(0, 1)
317
+ video = video.to(device=device, dtype=self.dtype)
318
+ video = video * 2.0 - 1.0 # [0,1] -> [-1,1], in [t, c, h, w]
319
+
320
+ video_embeddings = self.encode_video(video, chunk_size=decode_chunk_size).unsqueeze(0)
321
+ prior_latents = self.encode_point_map(
322
+ point_map_vae,
323
+ pred_disparity,
324
+ pred_valid_mask,
325
+ pred_point_map,
326
+ pred_intrinsic_map,
327
+ chunk_size=decode_chunk_size
328
+ ).unsqueeze(0).to(video_embeddings.dtype) # 1,T,C,H,W
329
+
330
+ # 4. Encode input image using VAE
331
+
332
+ # pdb.set_trace()
333
+ needs_upcasting = (
334
+ self.vae.dtype == torch.float16 and self.vae.config.force_upcast
335
+ )
336
+ if needs_upcasting:
337
+ self.vae.to(dtype=torch.float32)
338
+
339
+ video_latents = self.encode_vae_video(
340
+ video.to(self.vae.dtype),
341
+ chunk_size=decode_chunk_size,
342
+ ).unsqueeze(0).to(video_embeddings.dtype) # [1, t, c, h, w]
343
+
344
+ torch.cuda.empty_cache()
345
+
346
+ if track_time:
347
+ encode_event.record()
348
+ torch.cuda.synchronize()
349
+ elapsed_time_ms = prior_event.elapsed_time(encode_event)
350
+ print(f"Elapsed time for encode prior and frames: {elapsed_time_ms} ms")
351
+ else:
352
+ gc.collect()
353
+ torch.cuda.empty_cache()
354
+
355
+ # cast back to fp16 if needed
356
+ if needs_upcasting:
357
+ self.vae.to(dtype=torch.float16)
358
+
359
+ # 5. Get Added Time IDs
360
+ added_time_ids = self._get_add_time_ids(
361
+ 7,
362
+ 127,
363
+ noise_aug_strength,
364
+ video_embeddings.dtype,
365
+ batch_size,
366
+ 1,
367
+ False,
368
+ ) # [1 or 2, 3]
369
+ added_time_ids = added_time_ids.to(device)
370
+
371
+ # 6. Prepare timesteps
372
+ timesteps, num_inference_steps = retrieve_timesteps(
373
+ self.scheduler, num_inference_steps, device, None, None
374
+ )
375
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
376
+ self._num_timesteps = len(timesteps)
377
+
378
+ # 7. Prepare latent variables
379
+ # num_channels_latents = self.unet.config.in_channels - prior_latents.shape[1]
380
+ num_channels_latents = 8
381
+ latents_init = self.prepare_latents(
382
+ batch_size,
383
+ window_size,
384
+ num_channels_latents,
385
+ height,
386
+ width,
387
+ video_embeddings.dtype,
388
+ device,
389
+ generator,
390
+ latents,
391
+ ) # [1, t, c, h, w]
392
+ latents_all = None
393
+
394
+ idx_start = 0
395
+ if overlap > 0:
396
+ weights = torch.linspace(0, 1, overlap, device=device)
397
+ weights = weights.view(1, overlap, 1, 1, 1)
398
+ else:
399
+ weights = None
400
+
401
+ while idx_start < num_frames - overlap:
402
+ idx_end = min(idx_start + window_size, num_frames)
403
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
404
+ # 9. Denoising loop
405
+ # latents_init = latents_init.flip(1)
406
+ latents = latents_init[:, : idx_end - idx_start].clone()
407
+ latents_init = torch.cat(
408
+ [latents_init[:, -overlap:], latents_init[:, :stride]], dim=1
409
+ )
410
+
411
+ video_latents_current = video_latents[:, idx_start:idx_end]
412
+ prior_latents_current = prior_latents[:, idx_start:idx_end]
413
+ video_embeddings_current = video_embeddings[:, idx_start:idx_end]
414
+
415
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
416
+ for i, t in enumerate(timesteps):
417
+ if latents_all is not None and i == 0:
418
+ latents[:, :overlap] = (
419
+ latents_all[:, -overlap:]
420
+ + latents[:, :overlap]
421
+ / self.scheduler.init_noise_sigma
422
+ * self.scheduler.sigmas[i]
423
+ )
424
+
425
+ latent_model_input = latents
426
+
427
+ latent_model_input = self.scheduler.scale_model_input(
428
+ latent_model_input, t
429
+ ) # [1 or 2, t, c, h, w]
430
+ latent_model_input = torch.cat(
431
+ [latent_model_input, video_latents_current, prior_latents_current], dim=2
432
+ )
433
+ noise_pred = self.unet(
434
+ latent_model_input,
435
+ t,
436
+ encoder_hidden_states=video_embeddings_current,
437
+ added_time_ids=added_time_ids,
438
+ return_dict=False,
439
+ )[0]
440
+ # pdb.set_trace()
441
+ # perform guidance
442
+ if self.do_classifier_free_guidance:
443
+ latent_model_input = latents
444
+ latent_model_input = self.scheduler.scale_model_input(
445
+ latent_model_input, t
446
+ )
447
+ latent_model_input = torch.cat(
448
+ [latent_model_input, torch.zeros_like(latent_model_input), torch.zeros_like(latent_model_input)],
449
+ dim=2,
450
+ )
451
+ noise_pred_uncond = self.unet(
452
+ latent_model_input,
453
+ t,
454
+ encoder_hidden_states=torch.zeros_like(
455
+ video_embeddings_current
456
+ ),
457
+ added_time_ids=added_time_ids,
458
+ return_dict=False,
459
+ )[0]
460
+ noise_pred = noise_pred_uncond + self.guidance_scale * (
461
+ noise_pred - noise_pred_uncond
462
+ )
463
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
464
+
465
+ if callback_on_step_end is not None:
466
+ callback_kwargs = {}
467
+ for k in callback_on_step_end_tensor_inputs:
468
+ callback_kwargs[k] = locals()[k]
469
+ callback_outputs = callback_on_step_end(
470
+ self, i, t, callback_kwargs
471
+ )
472
+
473
+ latents = callback_outputs.pop("latents", latents)
474
+
475
+ if i == len(timesteps) - 1 or (
476
+ (i + 1) > num_warmup_steps
477
+ and (i + 1) % self.scheduler.order == 0
478
+ ):
479
+ progress_bar.update()
480
+
481
+ if latents_all is None:
482
+ latents_all = latents.clone()
483
+ else:
484
+ if overlap > 0:
485
+ latents_all[:, -overlap:] = latents[
486
+ :, :overlap
487
+ ] * weights + latents_all[:, -overlap:] * (1 - weights)
488
+ latents_all = torch.cat([latents_all, latents[:, overlap:]], dim=1)
489
+
490
+ idx_start += stride
491
+
492
+ latents_all = 1 / self.vae.config.scaling_factor * latents_all.squeeze(0).to(torch.float32)
493
+
494
+ if track_time:
495
+ denoise_event.record()
496
+ torch.cuda.synchronize()
497
+ elapsed_time_ms = encode_event.elapsed_time(denoise_event)
498
+ print(f"Elapsed time for denoise latent: {elapsed_time_ms} ms")
499
+ else:
500
+ gc.collect()
501
+ torch.cuda.empty_cache()
502
+
503
+ point_map, valid_mask = self.decode_point_map(
504
+ point_map_vae,
505
+ latents_all,
506
+ chunk_size=decode_chunk_size,
507
+ force_projection=force_projection,
508
+ force_fixed_focal=force_fixed_focal,
509
+ use_extract_interp=use_extract_interp,
510
+ need_resize=need_resize,
511
+ height=original_height,
512
+ width=original_width)
513
+
514
+
515
+ if track_time:
516
+ decode_event.record()
517
+ torch.cuda.synchronize()
518
+ elapsed_time_ms = denoise_event.elapsed_time(decode_event)
519
+ print(f"Elapsed time for decode latent: {elapsed_time_ms} ms")
520
+ else:
521
+ gc.collect()
522
+ torch.cuda.empty_cache()
523
+
524
+ self.maybe_free_model_hooks()
525
+ # point_map: [t, h, w, 3], valid_mask: [t, h, w]
526
+ return point_map, valid_mask
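
The sliding-window loop above stitches consecutive denoised windows by linearly blending the overlapping frames before appending the non-overlapping remainder. Below is a minimal, self-contained sketch of that blending on dummy tensors; the window_size/overlap/stride values and the 4x8x8 latent shape are made up for illustration, the random tensors stand in for the denoised windows produced by the UNet loop, and it assumes stride == window_size - overlap, as the latent rolling above implies.

import torch

# illustrative values only; the real pipeline receives these as arguments
num_frames, window_size, overlap = 20, 8, 3
stride = window_size - overlap

weights = torch.linspace(0, 1, overlap).view(1, overlap, 1, 1, 1)
latents_all = None

idx_start = 0
while idx_start < num_frames - overlap:
    idx_end = min(idx_start + window_size, num_frames)
    # stand-in for the denoised latents of the current window: [1, t, c, h, w]
    latents = torch.randn(1, idx_end - idx_start, 4, 8, 8)
    if latents_all is None:
        latents_all = latents.clone()
    else:
        # ramp linearly from the previous window to the current one over the overlap ...
        latents_all[:, -overlap:] = (
            latents[:, :overlap] * weights + latents_all[:, -overlap:] * (1 - weights)
        )
        # ... then append the frames that only the current window covers
        latents_all = torch.cat([latents_all, latents[:, overlap:]], dim=1)
    idx_start += stride

print(latents_all.shape)  # torch.Size([1, 20, 4, 8, 8])
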
geometrycrafter/pmap_vae.py ADDED
@@ -0,0 +1,330 @@
1
+ from typing import Dict, Tuple, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
6
+ from diffusers.utils.accelerate_utils import apply_forward_hook
7
+ from diffusers.models.attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor
8
+ from diffusers.models.modeling_utils import ModelMixin
9
+ from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution, Encoder
10
+ from diffusers.utils import is_torch_version
11
+ from diffusers.models.unets.unet_3d_blocks import UpBlockTemporalDecoder, MidBlockTemporalDecoder
12
+ from diffusers.models.resnet import SpatioTemporalResBlock
13
+
14
+ def zero_module(module):
15
+ """
16
+ Zero out the parameters of a module and return it.
17
+ """
18
+ for p in module.parameters():
19
+ p.detach().zero_()
20
+ return module
21
+
22
+ class PMapTemporalDecoder(nn.Module):
23
+ def __init__(
24
+ self,
25
+ in_channels: int = 4,
26
+ out_channels: Tuple[int] = (1, 1, 1),
27
+ block_out_channels: Tuple[int] = (128, 256, 512, 512),
28
+ layers_per_block: int = 2,
29
+ ):
30
+ super().__init__()
31
+
32
+ self.conv_in = nn.Conv2d(
33
+ in_channels,
34
+ block_out_channels[-1],
35
+ kernel_size=3,
36
+ stride=1,
37
+ padding=1
38
+ )
39
+ self.mid_block = MidBlockTemporalDecoder(
40
+ num_layers=layers_per_block,
41
+ in_channels=block_out_channels[-1],
42
+ out_channels=block_out_channels[-1],
43
+ attention_head_dim=block_out_channels[-1],
44
+ )
45
+
46
+ # up
47
+ self.up_blocks = nn.ModuleList([])
48
+ reversed_block_out_channels = list(reversed(block_out_channels))
49
+ output_channel = reversed_block_out_channels[0]
50
+ for i in range(len(block_out_channels)):
51
+ prev_output_channel = output_channel
52
+ output_channel = reversed_block_out_channels[i]
53
+ is_final_block = i == len(block_out_channels) - 1
54
+ up_block = UpBlockTemporalDecoder(
55
+ num_layers=layers_per_block + 1,
56
+ in_channels=prev_output_channel,
57
+ out_channels=output_channel,
58
+ add_upsample=not is_final_block,
59
+ )
60
+ self.up_blocks.append(up_block)
61
+ prev_output_channel = output_channel
62
+
63
+ self.out_blocks = nn.ModuleList([])
64
+ self.time_conv_outs = nn.ModuleList([])
65
+ for out_channel in out_channels:
66
+ self.out_blocks.append(
67
+ nn.ModuleList([
68
+ nn.GroupNorm(num_channels=block_out_channels[0], num_groups=32, eps=1e-6),
69
+ nn.ReLU(inplace=True),
70
+ nn.Conv2d(
71
+ block_out_channels[0],
72
+ block_out_channels[0] // 2,
73
+ kernel_size=3,
74
+ padding=1
75
+ ),
76
+ SpatioTemporalResBlock(
77
+ in_channels=block_out_channels[0] // 2,
78
+ out_channels=block_out_channels[0] // 2,
79
+ temb_channels=None,
80
+ eps=1e-6,
81
+ temporal_eps=1e-5,
82
+ merge_factor=0.0,
83
+ merge_strategy="learned",
84
+ switch_spatial_to_temporal_mix=True
85
+ ),
86
+ nn.ReLU(inplace=True),
87
+ nn.Conv2d(
88
+ block_out_channels[0] // 2,
89
+ out_channel,
90
+ kernel_size=1,
91
+ )
92
+ ])
93
+ )
94
+
95
+ conv_out_kernel_size = (3, 1, 1)
96
+ padding = [int(k // 2) for k in conv_out_kernel_size]
97
+ self.time_conv_outs.append(nn.Conv3d(
98
+ in_channels=out_channel,
99
+ out_channels=out_channel,
100
+ kernel_size=conv_out_kernel_size,
101
+ padding=padding,
102
+ ))
103
+
104
+ self.gradient_checkpointing = False
105
+
106
+ def forward(
107
+ self,
108
+ sample: torch.Tensor,
109
+ image_only_indicator: torch.Tensor,
110
+ num_frames: int = 1,
111
+ ):
112
+ sample = self.conv_in(sample)
113
+
114
+ upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
115
+
116
+ if self.training and self.gradient_checkpointing:
117
+ def create_custom_forward(module):
118
+ def custom_forward(*inputs):
119
+ return module(*inputs)
120
+
121
+ return custom_forward
122
+
123
+ if is_torch_version(">=", "1.11.0"):
124
+ # middle
125
+ sample = torch.utils.checkpoint.checkpoint(
126
+ create_custom_forward(self.mid_block),
127
+ sample,
128
+ image_only_indicator,
129
+ use_reentrant=False,
130
+ )
131
+ sample = sample.to(upscale_dtype)
132
+
133
+ # up
134
+ for up_block in self.up_blocks:
135
+ sample = torch.utils.checkpoint.checkpoint(
136
+ create_custom_forward(up_block),
137
+ sample,
138
+ image_only_indicator,
139
+ use_reentrant=False,
140
+ )
141
+ else:
142
+ # middle
143
+ sample = torch.utils.checkpoint.checkpoint(
144
+ create_custom_forward(self.mid_block),
145
+ sample,
146
+ image_only_indicator,
147
+ )
148
+ sample = sample.to(upscale_dtype)
149
+
150
+ # up
151
+ for up_block in self.up_blocks:
152
+ sample = torch.utils.checkpoint.checkpoint(
153
+ create_custom_forward(up_block),
154
+ sample,
155
+ image_only_indicator,
156
+ )
157
+ else:
158
+ # middle
159
+ sample = self.mid_block(sample, image_only_indicator=image_only_indicator)
160
+ sample = sample.to(upscale_dtype)
161
+
162
+ # up
163
+ for up_block in self.up_blocks:
164
+ sample = up_block(sample, image_only_indicator=image_only_indicator)
165
+
166
+ # post-process
167
+
168
+ output = []
169
+
170
+ for out_block, time_conv_out in zip(self.out_blocks, self.time_conv_outs):
171
+ x = sample
172
+ for layer in out_block:
173
+ if isinstance(layer, SpatioTemporalResBlock):
174
+ x = layer(x, None, image_only_indicator)
175
+ else:
176
+ x = layer(x)
177
+
178
+
179
+ batch_frames, channels, height, width = x.shape
180
+ batch_size = batch_frames // num_frames
181
+ x = x[None, :].reshape(batch_size, num_frames, channels, height, width).permute(0, 2, 1, 3, 4)
182
+ x = time_conv_out(x)
183
+ x = x.permute(0, 2, 1, 3, 4).reshape(batch_frames, channels, height, width)
184
+ output.append(x)
185
+
186
+ return output
187
+
188
+ class PMapAutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin):
189
+
190
+ _supports_gradient_checkpointing = True
191
+
192
+ @register_to_config
193
+ def __init__(
194
+ self,
195
+ in_channels: int = 4,
196
+ latent_channels: int = 4,
197
+ enc_down_block_types: Tuple[str] = (
198
+ "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"
199
+ ),
200
+ enc_block_out_channels: Tuple[int] = (128, 256, 512, 512),
201
+ enc_layers_per_block: int = 2,
202
+ dec_block_out_channels: Tuple[int] = (128, 256, 512, 512),
203
+ dec_layers_per_block: int = 2,
204
+ out_channels: Tuple[int] = (1, 1, 1),
205
+ mid_block_add_attention: bool = True,
206
+ offset_scale_factor: float = 0.1,
207
+ **kwargs
208
+ ):
209
+ super().__init__()
210
+
211
+ self.encoder = Encoder(
212
+ in_channels=in_channels,
213
+ out_channels=latent_channels,
214
+ down_block_types=enc_down_block_types,
215
+ block_out_channels=enc_block_out_channels,
216
+ layers_per_block=enc_layers_per_block,
217
+ double_z=False,
218
+ mid_block_add_attention=mid_block_add_attention
219
+ )
220
+ zero_module(self.encoder.conv_out)
221
+
222
+ self.offset_scale_factor = offset_scale_factor
223
+
224
+ self.decoder = PMapTemporalDecoder(
225
+ in_channels=latent_channels,
226
+ block_out_channels=dec_block_out_channels,
227
+ layers_per_block=dec_layers_per_block,
228
+ out_channels=out_channels
229
+ )
230
+
231
+ def _set_gradient_checkpointing(self, module, value=False):
232
+ if isinstance(module, (Encoder, PMapTemporalDecoder)):
233
+ module.gradient_checkpointing = value
234
+
235
+ @property
236
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
237
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
238
+ r"""
239
+ Returns:
240
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
241
+ indexed by its weight name.
242
+ """
243
+ # set recursively
244
+ processors = {}
245
+
246
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
247
+ if hasattr(module, "get_processor"):
248
+ processors[f"{name}.processor"] = module.get_processor()
249
+
250
+ for sub_name, child in module.named_children():
251
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
252
+
253
+ return processors
254
+
255
+ for name, module in self.named_children():
256
+ fn_recursive_add_processors(name, module, processors)
257
+
258
+ return processors
259
+
260
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
261
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
262
+ r"""
263
+ Sets the attention processor to use to compute attention.
264
+
265
+ Parameters:
266
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
267
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
268
+ for **all** `Attention` layers.
269
+
270
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
271
+ processor. This is strongly recommended when setting trainable attention processors.
272
+
273
+ """
274
+ count = len(self.attn_processors.keys())
275
+
276
+ if isinstance(processor, dict) and len(processor) != count:
277
+ raise ValueError(
278
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
279
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
280
+ )
281
+
282
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
283
+ if hasattr(module, "set_processor"):
284
+ if not isinstance(processor, dict):
285
+ module.set_processor(processor)
286
+ else:
287
+ module.set_processor(processor.pop(f"{name}.processor"))
288
+
289
+ for sub_name, child in module.named_children():
290
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
291
+
292
+ for name, module in self.named_children():
293
+ fn_recursive_attn_processor(name, module, processor)
294
+
295
+ def set_default_attn_processor(self):
296
+ """
297
+ Disables custom attention processors and sets the default attention implementation.
298
+ """
299
+ if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
300
+ processor = AttnProcessor()
301
+ else:
302
+ raise ValueError(
303
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
304
+ )
305
+
306
+ self.set_attn_processor(processor)
307
+
308
+ @apply_forward_hook
309
+ def encode(
310
+ self,
311
+ x: torch.Tensor,
312
+ latent_dist: DiagonalGaussianDistribution
313
+ ) -> DiagonalGaussianDistribution:
314
+ h = self.encoder(x)
315
+ offset = h * self.offset_scale_factor
316
+ param = latent_dist.parameters.to(h.dtype)
317
+ mean, logvar = torch.chunk(param, 2, dim=1)
318
+ posterior = DiagonalGaussianDistribution(torch.cat([mean + offset, logvar], dim=1))
319
+ return posterior
320
+
321
+ @apply_forward_hook
322
+ def decode(
323
+ self,
324
+ z: torch.Tensor,
325
+ num_frames: int
326
+ ) -> torch.Tensor:
327
+ batch_size = z.shape[0] // num_frames
328
+ image_only_indicator = torch.zeros(batch_size, num_frames, dtype=z.dtype, device=z.device)
329
+ decoded = self.decoder(z, num_frames=num_frames, image_only_indicator=image_only_indicator)
330
+ return decoded
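
For intuition, here is a minimal sketch of what the `encode` method above does: the point-map encoder (whose final conv is zero-initialized in `__init__`) predicts an offset that shifts the mean of an existing latent distribution while leaving its log-variance untouched. The 4-channel latent size, the 32x32 resolution, and the random stand-ins are illustrative only.

import torch
from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution

latent_channels, h, w = 4, 32, 32

# stand-in for the posterior from the frozen video VAE: mean and logvar stacked on dim=1
parameters = torch.randn(1, 2 * latent_channels, h, w)
latent_dist = DiagonalGaussianDistribution(parameters)

# stand-in for the encoder output, scaled by offset_scale_factor (0.1 by default)
offset = 0.1 * torch.randn(1, latent_channels, h, w)

mean, logvar = torch.chunk(latent_dist.parameters, 2, dim=1)
posterior = DiagonalGaussianDistribution(torch.cat([mean + offset, logvar], dim=1))
z = posterior.sample()
print(z.shape)  # torch.Size([1, 4, 32, 32])
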
geometrycrafter/unet.py ADDED
@@ -0,0 +1,281 @@
1
+ from typing import Union, Tuple
2
+
3
+ import torch
4
+ from diffusers import UNetSpatioTemporalConditionModel
5
+ from diffusers.models.unets.unet_spatio_temporal_condition import UNetSpatioTemporalConditionOutput
6
+ from diffusers.utils import is_torch_version
7
+
8
+
9
+ class UNetSpatioTemporalConditionModelVid2vid(
10
+ UNetSpatioTemporalConditionModel
11
+ ):
12
+ def enable_gradient_checkpointing(self):
13
+ self.gradient_checkpointing = True
14
+
15
+ def disable_gradient_checkpointing(self):
16
+ self.gradient_checkpointing = False
17
+
18
+ def forward(
19
+ self,
20
+ sample: torch.Tensor,
21
+ timestep: Union[torch.Tensor, float, int],
22
+ encoder_hidden_states: torch.Tensor,
23
+ added_time_ids: torch.Tensor,
24
+ return_dict: bool = True,
25
+ ) -> Union[UNetSpatioTemporalConditionOutput, Tuple]:
26
+
27
+ # 1. time
28
+ timesteps = timestep
29
+ if not torch.is_tensor(timesteps):
30
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
31
+ # This would be a good case for the `match` statement (Python 3.10+)
32
+ is_mps = sample.device.type == "mps"
33
+ if isinstance(timestep, float):
34
+ dtype = torch.float32 if is_mps else torch.float64
35
+ else:
36
+ dtype = torch.int32 if is_mps else torch.int64
37
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
38
+ elif len(timesteps.shape) == 0:
39
+ timesteps = timesteps[None].to(sample.device)
40
+
41
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
42
+ batch_size, num_frames = sample.shape[:2]
43
+ timesteps = timesteps.expand(batch_size)
44
+
45
+ t_emb = self.time_proj(timesteps)
46
+
47
+ # `Timesteps` does not contain any weights and will always return f32 tensors
48
+ # but time_embedding might actually be running in fp16. so we need to cast here.
49
+ # there might be better ways to encapsulate this.
50
+ t_emb = t_emb.to(dtype=self.conv_in.weight.dtype)
51
+
52
+ emb = self.time_embedding(t_emb) # [batch_size * num_frames, channels]
53
+
54
+ time_embeds = self.add_time_proj(added_time_ids.flatten())
55
+ time_embeds = time_embeds.reshape((batch_size, -1))
56
+ time_embeds = time_embeds.to(emb.dtype)
57
+ aug_emb = self.add_embedding(time_embeds)
58
+ emb = emb + aug_emb
59
+
60
+ # Flatten the batch and frames dimensions
61
+ # sample: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width]
62
+ sample = sample.flatten(0, 1)
63
+ # Repeat the embeddings num_video_frames times
64
+ # emb: [batch, channels] -> [batch * frames, channels]
65
+ emb = emb.repeat_interleave(num_frames, dim=0)
66
+ # encoder_hidden_states: [batch, frames, channels] -> [batch * frames, 1, channels]
67
+ encoder_hidden_states = encoder_hidden_states.flatten(0, 1).unsqueeze(1)
68
+
69
+ # 2. pre-process
70
+ sample = sample.to(dtype=self.conv_in.weight.dtype)
71
+ assert sample.dtype == self.conv_in.weight.dtype, (
72
+ f"sample.dtype: {sample.dtype}, "
73
+ f"self.conv_in.weight.dtype: {self.conv_in.weight.dtype}"
74
+ )
75
+ sample = self.conv_in(sample)
76
+
77
+ image_only_indicator = torch.zeros(
78
+ batch_size, num_frames, dtype=sample.dtype, device=sample.device
79
+ )
80
+
81
+ down_block_res_samples = (sample,)
82
+
83
+ if self.training and self.gradient_checkpointing:
84
+ def create_custom_forward(module):
85
+ def custom_forward(*inputs):
86
+ return module(*inputs)
87
+
88
+ return custom_forward
89
+
90
+ if is_torch_version(">=", "1.11.0"):
91
+
92
+ for downsample_block in self.down_blocks:
93
+ if (
94
+ hasattr(downsample_block, "has_cross_attention")
95
+ and downsample_block.has_cross_attention
96
+ ):
97
+ sample, res_samples = torch.utils.checkpoint.checkpoint(
98
+ create_custom_forward(downsample_block),
99
+ sample,
100
+ emb,
101
+ encoder_hidden_states,
102
+ image_only_indicator,
103
+ use_reentrant=False,
104
+ )
105
+ else:
106
+ sample, res_samples = torch.utils.checkpoint.checkpoint(
107
+ create_custom_forward(downsample_block),
108
+ sample,
109
+ emb,
110
+ image_only_indicator,
111
+ use_reentrant=False,
112
+ )
113
+ down_block_res_samples += res_samples
114
+
115
+ # 4. mid
116
+ sample = torch.utils.checkpoint.checkpoint(
117
+ create_custom_forward(self.mid_block),
118
+ sample,
119
+ emb,
120
+ encoder_hidden_states,
121
+ image_only_indicator,
122
+ use_reentrant=False,
123
+ )
124
+
125
+ # 5. up
126
+ for i, upsample_block in enumerate(self.up_blocks):
127
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
128
+ down_block_res_samples = down_block_res_samples[
129
+ : -len(upsample_block.resnets)
130
+ ]
131
+
132
+ if (
133
+ hasattr(upsample_block, "has_cross_attention")
134
+ and upsample_block.has_cross_attention
135
+ ):
136
+ sample = torch.utils.checkpoint.checkpoint(
137
+ create_custom_forward(upsample_block),
138
+ sample,
139
+ res_samples,
140
+ emb,
141
+ encoder_hidden_states,
142
+ image_only_indicator,
143
+ use_reentrant=False,
144
+ )
145
+ else:
146
+ sample = torch.utils.checkpoint.checkpoint(
147
+ create_custom_forward(upsample_block),
148
+ sample,
149
+ res_samples,
150
+ emb,
151
+ image_only_indicator,
152
+ use_reentrant=False,
153
+ )
154
+ else:
155
+
156
+ for downsample_block in self.down_blocks:
157
+ if (
158
+ hasattr(downsample_block, "has_cross_attention")
159
+ and downsample_block.has_cross_attention
160
+ ):
161
+ sample, res_samples = torch.utils.checkpoint.checkpoint(
162
+ create_custom_forward(downsample_block),
163
+ sample,
164
+ emb,
165
+ encoder_hidden_states,
166
+ image_only_indicator,
167
+ )
168
+ else:
169
+ sample, res_samples = torch.utils.checkpoint.checkpoint(
170
+ create_custom_forward(downsample_block),
171
+ sample,
172
+ emb,
173
+ image_only_indicator,
174
+ )
175
+ down_block_res_samples += res_samples
176
+
177
+ # 4. mid
178
+ sample = torch.utils.checkpoint.checkpoint(
179
+ create_custom_forward(self.mid_block),
180
+ sample,
181
+ emb,
182
+ encoder_hidden_states,
183
+ image_only_indicator,
184
+ )
185
+
186
+ # 5. up
187
+ for i, upsample_block in enumerate(self.up_blocks):
188
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
189
+ down_block_res_samples = down_block_res_samples[
190
+ : -len(upsample_block.resnets)
191
+ ]
192
+
193
+ if (
194
+ hasattr(upsample_block, "has_cross_attention")
195
+ and upsample_block.has_cross_attention
196
+ ):
197
+ sample = torch.utils.checkpoint.checkpoint(
198
+ create_custom_forward(upsample_block),
199
+ sample,
200
+ res_samples,
201
+ emb,
202
+ encoder_hidden_states,
203
+ image_only_indicator,
204
+ )
205
+ else:
206
+ sample = torch.utils.checkpoint.checkpoint(
207
+ create_custom_forward(upsample_block),
208
+ sample,
209
+ res_samples,
210
+ emb,
211
+ image_only_indicator,
212
+ )
213
+
214
+ else:
215
+ for downsample_block in self.down_blocks:
216
+ if (
217
+ hasattr(downsample_block, "has_cross_attention")
218
+ and downsample_block.has_cross_attention
219
+ ):
220
+ sample, res_samples = downsample_block(
221
+ hidden_states=sample,
222
+ temb=emb,
223
+ encoder_hidden_states=encoder_hidden_states,
224
+ image_only_indicator=image_only_indicator,
225
+ )
226
+
227
+ else:
228
+ sample, res_samples = downsample_block(
229
+ hidden_states=sample,
230
+ temb=emb,
231
+ image_only_indicator=image_only_indicator,
232
+ )
233
+
234
+ down_block_res_samples += res_samples
235
+
236
+ # 4. mid
237
+ sample = self.mid_block(
238
+ hidden_states=sample,
239
+ temb=emb,
240
+ encoder_hidden_states=encoder_hidden_states,
241
+ image_only_indicator=image_only_indicator,
242
+ )
243
+
244
+ # 5. up
245
+ for i, upsample_block in enumerate(self.up_blocks):
246
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
247
+ down_block_res_samples = down_block_res_samples[
248
+ : -len(upsample_block.resnets)
249
+ ]
250
+
251
+ if (
252
+ hasattr(upsample_block, "has_cross_attention")
253
+ and upsample_block.has_cross_attention
254
+ ):
255
+ sample = upsample_block(
256
+ hidden_states=sample,
257
+ res_hidden_states_tuple=res_samples,
258
+ temb=emb,
259
+ encoder_hidden_states=encoder_hidden_states,
260
+ image_only_indicator=image_only_indicator,
261
+ )
262
+ else:
263
+ sample = upsample_block(
264
+ hidden_states=sample,
265
+ res_hidden_states_tuple=res_samples,
266
+ temb=emb,
267
+ image_only_indicator=image_only_indicator,
268
+ )
269
+
270
+ # 6. post-process
271
+ sample = self.conv_norm_out(sample)
272
+ sample = self.conv_act(sample)
273
+ sample = self.conv_out(sample)
274
+
275
+ # 7. Reshape back to original shape
276
+ sample = sample.reshape(batch_size, num_frames, *sample.shape[1:])
277
+
278
+ if not return_dict:
279
+ return (sample,)
280
+
281
+ return UNetSpatioTemporalConditionOutput(sample=sample)
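
The forward pass above treats the video as a flat batch of frames for the 2D parts of the UNet and restores the frame dimension only at the end. The sketch below shows just that reshaping convention on dummy tensors; the channel, embedding, and spatial sizes are made up and no UNet blocks are actually run.

import torch

batch_size, num_frames, channels, height, width = 2, 6, 8, 32, 32
sample = torch.randn(batch_size, num_frames, channels, height, width)
emb = torch.randn(batch_size, 320)                                  # per-video time embedding
encoder_hidden_states = torch.randn(batch_size, num_frames, 1024)   # per-frame conditioning

# [B, T, C, H, W] -> [B*T, C, H, W]: frames become one large batch for the spatial blocks
sample = sample.flatten(0, 1)
# repeat the per-video embedding so every frame gets its own copy: [B*T, 320]
emb = emb.repeat_interleave(num_frames, dim=0)
# [B, T, C] -> [B*T, 1, C]: one cross-attention token per frame
encoder_hidden_states = encoder_hidden_states.flatten(0, 1).unsqueeze(1)

# ... down / mid / up blocks would run here ...

# restore the frame dimension: [B*T, C, H, W] -> [B, T, C, H, W]
sample = sample.reshape(batch_size, num_frames, *sample.shape[1:])
print(sample.shape)  # torch.Size([2, 6, 8, 32, 32])
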
requirements.txt ADDED
@@ -0,0 +1,16 @@
1
+ torch==2.3.1
2
+ diffusers==0.31.0
3
+ numpy==2.0.1
4
+ matplotlib==3.9.2
5
+ transformers==4.48.0
6
+ accelerate==1.1.1
7
+ xformers==0.0.27
8
+ mediapy==1.2.2
9
+ fire==0.7.0
10
+ decord==0.6.0
11
+ OpenEXR==3.3.2
12
+ kornia==0.7.4
13
+ opencv-python==4.10.0.84
14
+ h5py==3.12.1
15
+ moderngl==5.12.0
16
+ piqp==0.4.2
third_party/__init__.py ADDED
@@ -0,0 +1,22 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import sys
4
+
5
+ sys.path.append('third_party/moge')
6
+ from .moge.moge.model.moge_model import MoGeModel
7
+
8
+ class MoGe(nn.Module):
9
+
10
+ def __init__(self, cache_dir):
11
+ super().__init__()
12
+ self.model = MoGeModel.from_pretrained(
13
+ 'Ruicheng/moge-vitl', cache_dir=cache_dir).eval()
14
+
15
+
16
+ @torch.no_grad()
17
+ def forward_image(self, image: torch.Tensor, **kwargs):
18
+ # image: b, 3, h, w 0,1
19
+ output = self.model.infer(image, resolution_level=9, apply_mask=False, **kwargs)
20
+ points = output['points'] # b,h,w,3
21
+ masks = output['mask'] # b,h,w
22
+ return points, masks
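
A hypothetical way to call this wrapper, assuming the moge submodule has been checked out, the repository root is on PYTHONPATH, and a CUDA device is available; the cache directory, batch size, and input resolution are placeholders, and the pretrained weights are downloaded from the Hugging Face Hub on first use.

import torch
from third_party import MoGe

model = MoGe(cache_dir='./checkpoints').to('cuda')
frames = torch.rand(2, 3, 384, 640, device='cuda')   # RGB frames in [0, 1], shape [b, 3, h, w]
points, masks = model.forward_image(frames)
print(points.shape, masks.shape)                      # [2, 384, 640, 3] and [2, 384, 640]
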
third_party/moge ADDED
@@ -0,0 +1 @@
1
+ Subproject commit dd158c05461f2353287a182afb2adf0fda46436f
utils/__init__.py ADDED
File without changes
utils/disp_utils.py ADDED
@@ -0,0 +1,43 @@
1
+ import torch
2
+ import matplotlib
3
+
4
+ def robust_min_max(tensor, quantile=0.99):
5
+ T, H, W = tensor.shape
6
+ min_vals = []
7
+ max_vals = []
8
+ for i in range(T):
9
+ min_vals.append(torch.quantile(tensor[i], q=1-quantile, interpolation='nearest').item())
10
+ max_vals.append(torch.quantile(tensor[i], q=quantile, interpolation='nearest').item())
11
+ return min(min_vals), max(max_vals)
12
+
13
+
14
+ class ColorMapper:
15
+ def __init__(self, colormap: str = "inferno"):
16
+ self.colormap = torch.tensor(matplotlib.colormaps[colormap].colors)
17
+
18
+ def apply(self, image: torch.Tensor, v_min=None, v_max=None):
19
+ # assert len(image.shape) == 2
20
+ if v_min is None:
21
+ v_min = image.min()
22
+ if v_max is None:
23
+ v_max = image.max()
24
+ image = (image - v_min) / (v_max - v_min)
25
+ image = (image * 255).long()
26
+ colormap = self.colormap.to(image.device)
27
+ image = colormap[image]
28
+ return image
29
+
30
+ def color_video_disp(disp):
31
+ visualizer = ColorMapper()
32
+ disp_img = visualizer.apply(disp, v_min=0, v_max=1)
33
+ return disp_img
34
+
35
+ def pmap_to_disp(point_maps, valid_masks):
36
+ disp_map = 1.0 / (point_maps[..., 2] + 1e-4)
37
+ min_disparity, max_disparity = robust_min_max(disp_map)
38
+ disp_map = torch.clamp((disp_map - min_disparity) / (max_disparity - min_disparity+1e-4), 0, 1)
39
+
40
+ disp_map = color_video_disp(disp_map)
41
+ disp_map[~valid_masks] = 0
42
+ return disp_map
43
+ # imageio.mimsave(os.path.join(args.save_dir, os.path.basename(args.data[:-4])+'_disp.mp4'), disp, fps=24, quality=9, macro_block_size=1)
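
A hypothetical end-to-end use of `pmap_to_disp`: color a point-map sequence as a disparity video and write it with mediapy (pinned in requirements.txt). The tensor sizes, random inputs, and output path are placeholders.

import numpy as np
import torch
import mediapy
from utils.disp_utils import pmap_to_disp

point_maps = torch.rand(16, 240, 320, 3) + 0.1               # [t, h, w, 3] with positive depth
valid_masks = torch.ones(16, 240, 320, dtype=torch.bool)     # [t, h, w]

disp = pmap_to_disp(point_maps, valid_masks)                 # [t, h, w, 3] colors in [0, 1]
mediapy.write_video('disp_vis.mp4', (disp.numpy() * 255).astype(np.uint8), fps=24)
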
utils/glb_utils.py ADDED
@@ -0,0 +1,19 @@
1
+ import trimesh
2
+ import numpy as np
3
+
4
+ def pmap_to_glb(point_map, valid_mask, frame) -> trimesh.Scene:
5
+
6
+
7
+ pts_3d = point_map[valid_mask] * np.array([-1, -1, 1])
8
+ pts_rgb = frame[valid_mask]
9
+
10
+ # Initialize a 3D scene
11
+ scene_3d = trimesh.Scene()
12
+
13
+ # Add point cloud data to the scene
14
+ point_cloud_data = trimesh.PointCloud(
15
+ vertices=pts_3d, colors=pts_rgb
16
+ )
17
+
18
+ scene_3d.add_geometry(point_cloud_data)
19
+ return scene_3d
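
A hypothetical use of `pmap_to_glb`: turn one frame's point map into a point-cloud scene and export it as a .glb with trimesh. The random arrays and output path stand in for a real prediction and its RGB frame.

import numpy as np
from utils.glb_utils import pmap_to_glb

h, w = 240, 320
point_map = np.random.rand(h, w, 3).astype(np.float32)      # per-pixel xyz
valid_mask = np.random.rand(h, w) > 0.2                      # boolean validity mask
frame = (np.random.rand(h, w, 3) * 255).astype(np.uint8)     # RGB colors

scene = pmap_to_glb(point_map, valid_mask, frame)
scene.export('pointcloud.glb')
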