Beijia11 committed on
Commit 3aba902
1 Parent(s): 686bb9b
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitignore +200 -0
  2. .gitmodules +3 -0
  3. app.py +577 -0
  4. config/__init__.py +0 -0
  5. config/base_cfg.py +410 -0
  6. config/ssm_cfg.py +347 -0
  7. config/yacs.py +506 -0
  8. demo.py +206 -0
  9. models/cogvideox_tracking.py +1020 -0
  10. models/pipelines.py +1040 -0
  11. models/spatracker/__init__.py +5 -0
  12. models/spatracker/models/__init__.py +5 -0
  13. models/spatracker/models/build_spatracker.py +51 -0
  14. models/spatracker/models/core/__init__.py +5 -0
  15. models/spatracker/models/core/embeddings.py +250 -0
  16. models/spatracker/models/core/model_utils.py +477 -0
  17. models/spatracker/models/core/spatracker/__init__.py +5 -0
  18. models/spatracker/models/core/spatracker/blocks.py +999 -0
  19. models/spatracker/models/core/spatracker/dpt/__init__.py +0 -0
  20. models/spatracker/models/core/spatracker/dpt/base_model.py +16 -0
  21. models/spatracker/models/core/spatracker/dpt/blocks.py +394 -0
  22. models/spatracker/models/core/spatracker/dpt/midas_net.py +77 -0
  23. models/spatracker/models/core/spatracker/dpt/models.py +231 -0
  24. models/spatracker/models/core/spatracker/dpt/transforms.py +231 -0
  25. models/spatracker/models/core/spatracker/dpt/vit.py +596 -0
  26. models/spatracker/models/core/spatracker/feature_net.py +915 -0
  27. models/spatracker/models/core/spatracker/loftr/__init__.py +1 -0
  28. models/spatracker/models/core/spatracker/loftr/linear_attention.py +81 -0
  29. models/spatracker/models/core/spatracker/loftr/transformer.py +142 -0
  30. models/spatracker/models/core/spatracker/losses.py +90 -0
  31. models/spatracker/models/core/spatracker/softsplat.py +539 -0
  32. models/spatracker/models/core/spatracker/spatracker.py +732 -0
  33. models/spatracker/models/core/spatracker/unet.py +258 -0
  34. models/spatracker/models/core/spatracker/vit/__init__.py +0 -0
  35. models/spatracker/models/core/spatracker/vit/common.py +43 -0
  36. models/spatracker/models/core/spatracker/vit/encoder.py +397 -0
  37. models/spatracker/predictor.py +284 -0
  38. models/spatracker/utils/__init__.py +5 -0
  39. models/spatracker/utils/basic.py +397 -0
  40. models/spatracker/utils/geom.py +547 -0
  41. models/spatracker/utils/improc.py +1447 -0
  42. models/spatracker/utils/misc.py +166 -0
  43. models/spatracker/utils/samp.py +152 -0
  44. models/spatracker/utils/visualizer.py +409 -0
  45. models/spatracker/utils/vox.py +500 -0
  46. requirements.txt +32 -0
  47. submodules/MoGe/.gitignore +425 -0
  48. submodules/MoGe/CHANGELOG.md +15 -0
  49. submodules/MoGe/CODE_OF_CONDUCT.md +9 -0
  50. submodules/MoGe/LICENSE +224 -0
.gitignore ADDED
@@ -0,0 +1,200 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # JetBrains
7
+ .idea
8
+
9
+ # C extensions
10
+ *.so
11
+
12
+ # Distribution / packaging
13
+ .Python
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ wheels/
26
+ share/python-wheels/
27
+ *.egg-info/
28
+ .installed.cfg
29
+ *.egg
30
+ MANIFEST
31
+
32
+ # PyInstaller
33
+ # Usually these files are written by a python script from a template
34
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
35
+ *.manifest
36
+ *.spec
37
+
38
+ # Installer logs
39
+ pip-log.txt
40
+ pip-delete-this-directory.txt
41
+
42
+ # Unit test / coverage reports
43
+ htmlcov/
44
+ .tox/
45
+ .nox/
46
+ .coverage
47
+ .coverage.*
48
+ .cache
49
+ nosetests.xml
50
+ coverage.xml
51
+ *.cover
52
+ *.py,cover
53
+ .hypothesis/
54
+ .pytest_cache/
55
+ cover/
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+ db.sqlite3-journal
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ .pybuilder/
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ # For a library or package, you might want to ignore these files since the code is
90
+ # intended to run in multiple environments; otherwise, check them in:
91
+ # .python-version
92
+
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # poetry
101
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
103
+ # commonly ignored for libraries.
104
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105
+ #poetry.lock
106
+
107
+ # pdm
108
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
109
+ #pdm.lock
110
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
111
+ # in version control.
112
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
113
+ .pdm.toml
114
+ .pdm-python
115
+ .pdm-build/
116
+
117
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
118
+ __pypackages__/
119
+
120
+ # Celery stuff
121
+ celerybeat-schedule
122
+ celerybeat.pid
123
+
124
+ # SageMath parsed files
125
+ *.sage.py
126
+
127
+ # Environments
128
+ .env
129
+ .venv
130
+ env/
131
+ venv/
132
+ ENV/
133
+ env.bak/
134
+ venv.bak/
135
+
136
+ # Spyder project settings
137
+ .spyderproject
138
+ .spyproject
139
+
140
+ # Rope project settings
141
+ .ropeproject
142
+
143
+ # mkdocs documentation
144
+ /site
145
+
146
+ # mypy
147
+ .mypy_cache/
148
+ .dmypy.json
149
+ dmypy.json
150
+
151
+ # Pyre type checker
152
+ .pyre/
153
+
154
+ # pytype static type analyzer
155
+ .pytype/
156
+
157
+ # Cython debug symbols
158
+ cython_debug/
159
+
160
+ # PyCharm
161
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
162
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
163
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
164
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
165
+ #.idea/
166
+
167
+ # manually added
168
+ wandb/
169
+ dump*
170
+
171
+ !requirements.txt
172
+ env/
173
+ datasets/
174
+ validation/
175
+ ckpts/
176
+ .vscode/
177
+ output.mp4
178
+ outputs/
179
+ camctrl_output
180
+ *.code-workspace
181
+
182
+ **/*/.DS_Store
183
+ **/*/__pycache__/*
184
+ .DS_Store
185
+ __pycache__
186
+ vis_results
187
+ checkpoints
188
+ **/*.pth
189
+ **/*.pt
190
+ **/*.mp4
191
+ **/*.npy
192
+
193
+ /assets/**
194
+ ./vis_results/**
195
+ models/monoD/zoeDepth/ckpts/*
196
+ slurm-*.out
197
+ .vscode
198
+
199
+ data/
200
+ tmp/
.gitmodules ADDED
@@ -0,0 +1,3 @@
1
+ [submodule "submodules/MoGe"]
2
+ path = submodules/MoGe
3
+ url = https://github.com/microsoft/MoGe.git
app.py ADDED
@@ -0,0 +1,577 @@
1
+ import os
2
+ import sys
3
+ import gradio as gr
4
+ import torch
5
+ import subprocess
6
+ import argparse
7
+ import glob
8
+
9
+ project_root = os.path.dirname(os.path.abspath(__file__))
10
+ os.environ["GRADIO_TEMP_DIR"] = os.path.join(project_root, "tmp", "gradio")
11
+ sys.path.append(project_root)
12
+
13
+ HERE_PATH = os.path.normpath(os.path.dirname(__file__))
14
+ sys.path.insert(0, HERE_PATH)
15
+ from huggingface_hub import hf_hub_download
16
+ hf_hub_download(repo_id="EXCAI/Diffusion-As-Shader", filename='spatracker/spaT_final.pth', local_dir=f'{HERE_PATH}/checkpoints/')
17
+
18
+
19
+ # Parse command line arguments
20
+ parser = argparse.ArgumentParser(description="Diffusion as Shader Web UI")
21
+ parser.add_argument("--port", type=int, default=7860, help="Port to run the web UI on")
22
+ parser.add_argument("--share", action="store_true", help="Share the web UI")
23
+ parser.add_argument("--gpu", type=int, default=0, help="GPU device ID")
24
+ parser.add_argument("--model_path", type=str, default="EXCAI/Diffusion-As-Shader", help="Path to model checkpoint")
25
+ parser.add_argument("--output_dir", type=str, default="tmp", help="Output directory")
26
+ args = parser.parse_args()
27
+
28
+ # Use the original GPU ID throughout the entire code for consistency
29
+ GPU_ID = args.gpu
30
+
31
+ # Set environment variables - this used to remap the GPU, but we're removing this for consistency
32
+ # Instead, we'll pass the original GPU ID to all commands
33
+ # os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu) # Commented out to ensure consistent GPU ID usage
34
+
35
+ # Check if CUDA is available
36
+ CUDA_AVAILABLE = torch.cuda.is_available()
37
+ if CUDA_AVAILABLE:
38
+ GPU_COUNT = torch.cuda.device_count()
39
+ GPU_NAMES = [f"{i}: {torch.cuda.get_device_name(i)}" for i in range(GPU_COUNT)]
40
+ else:
41
+ GPU_COUNT = 0
42
+ GPU_NAMES = ["CPU (CUDA not available)"]
43
+ GPU_ID = "CPU"
44
+
45
+ DEFAULT_MODEL_PATH = args.model_path
46
+ OUTPUT_DIR = args.output_dir
47
+
48
+ # Create necessary directories
49
+ os.makedirs("outputs", exist_ok=True)
50
+ # Create project tmp directory instead of using system temp
51
+ os.makedirs(os.path.join(project_root, "tmp"), exist_ok=True)
52
+ os.makedirs(os.path.join(project_root, "tmp", "gradio"), exist_ok=True)
53
+
54
+ def save_uploaded_file(file):
55
+ if file is None:
56
+ return None
57
+
58
+ # Use project tmp directory instead of system temp
59
+ temp_dir = os.path.join(project_root, "tmp")
60
+
61
+ if hasattr(file, 'name'):
62
+ filename = file.name
63
+ else:
64
+ # Generate a unique filename if name attribute is missing
65
+ import uuid
66
+ ext = ".tmp"
67
+ if hasattr(file, 'content_type'):
68
+ if "image" in file.content_type:
69
+ ext = ".png"
70
+ elif "video" in file.content_type:
71
+ ext = ".mp4"
72
+ filename = f"{uuid.uuid4()}{ext}"
73
+
74
+ temp_path = os.path.join(temp_dir, filename)
75
+
76
+ try:
77
+ # Check if file is a FileStorage object or already a path
78
+ if hasattr(file, 'save'):
79
+ file.save(temp_path)
80
+ elif isinstance(file, str):
81
+ # It's already a path
82
+ return file
83
+ else:
84
+ # Try to read and save the file
85
+ with open(temp_path, 'wb') as f:
86
+ f.write(file.read() if hasattr(file, 'read') else file)
87
+ except Exception as e:
88
+ print(f"Error saving file: {e}")
89
+ return None
90
+
91
+ return temp_path
92
+
93
+ def create_run_command(args):
94
+ """Create command based on input parameters"""
95
+ cmd = ["python", "demo.py"]
96
+
97
+ if "prompt" not in args or args["prompt"] is None or args["prompt"] == "":
98
+ args["prompt"] = ""
99
+ if "checkpoint_path" not in args or args["checkpoint_path"] is None or args["checkpoint_path"] == "":
100
+ args["checkpoint_path"] = DEFAULT_MODEL_PATH
101
+
102
+ # Debug output
103
+ print(f"DEBUG: Command args: {args}")
104
+
105
+ for key, value in args.items():
106
+ if value is not None:
107
+ # Handle boolean values correctly - for repaint, we need to pass true/false
108
+ if isinstance(value, bool):
109
+ cmd.append(f"--{key}")
110
+ cmd.append(str(value).lower()) # Convert True/False to true/false
111
+ else:
112
+ cmd.append(f"--{key}")
113
+ cmd.append(str(value))
114
+
115
+ return cmd
116
+
117
+ def run_process(cmd):
118
+ """Run command and return output"""
119
+ print(f"Running command: {' '.join(cmd)}")
120
+ process = subprocess.Popen(
121
+ cmd,
122
+ stdout=subprocess.PIPE,
123
+ stderr=subprocess.PIPE,
124
+ universal_newlines=True
125
+ )
126
+
127
+ output = []
128
+ for line in iter(process.stdout.readline, ""):
129
+ print(line, end="")
130
+ output.append(line)
131
+ if not line:
132
+ break
133
+
134
+ process.stdout.close()
135
+ return_code = process.wait()
136
+
137
+ if return_code:
138
+ stderr = process.stderr.read()
139
+ print(f"Error: {stderr}")
140
+ raise subprocess.CalledProcessError(return_code, cmd, output="\n".join(output), stderr=stderr)
141
+
142
+ return "\n".join(output)
143
+
144
+ # Process functions for each tab
145
+ def process_motion_transfer(source, prompt, mt_repaint_option, mt_repaint_image):
146
+ """Process video motion transfer task"""
147
+ try:
148
+ # Save uploaded files
149
+ input_video_path = save_uploaded_file(source)
150
+ if input_video_path is None:
151
+ return None
152
+
153
+ print(f"DEBUG: Repaint option: {mt_repaint_option}")
154
+ print(f"DEBUG: Repaint image: {mt_repaint_image}")
155
+
156
+ args = {
157
+ "input_path": input_video_path,
158
+ "prompt": f"\"{prompt}\"",
159
+ "checkpoint_path": DEFAULT_MODEL_PATH,
160
+ "output_dir": OUTPUT_DIR,
161
+ "gpu": GPU_ID
162
+ }
163
+
164
+ # Priority: Custom Image > Yes > No
165
+ if mt_repaint_image is not None:
166
+ # Custom image takes precedence if provided
167
+ repaint_path = save_uploaded_file(mt_repaint_image)
168
+ print(f"DEBUG: Repaint path: {repaint_path}")
169
+ args["repaint"] = repaint_path
170
+ elif mt_repaint_option == "Yes":
171
+ # Otherwise use Yes/No selection
172
+ args["repaint"] = "true"
173
+
174
+ # Create and run command
175
+ cmd = create_run_command(args)
176
+ output = run_process(cmd)
177
+
178
+ # Find generated video files
179
+ output_files = glob.glob(os.path.join(OUTPUT_DIR, "*.mp4"))
180
+ if output_files:
181
+ # Sort by modification time, return the latest file
182
+ latest_file = max(output_files, key=os.path.getmtime)
183
+ return latest_file
184
+ else:
185
+ return None
186
+ except Exception as e:
187
+ import traceback
188
+ print(f"Processing failed: {str(e)}\n{traceback.format_exc()}")
189
+ return None
190
+
191
+ def process_camera_control(source, prompt, camera_motion, tracking_method):
192
+ """Process camera control task"""
193
+ try:
194
+ # Save uploaded files
195
+ input_media_path = save_uploaded_file(source)
196
+ if input_media_path is None:
197
+ return None
198
+
199
+ print(f"DEBUG: Camera motion: '{camera_motion}'")
200
+ print(f"DEBUG: Tracking method: '{tracking_method}'")
201
+
202
+ args = {
203
+ "input_path": input_media_path,
204
+ "prompt": prompt,
205
+ "checkpoint_path": DEFAULT_MODEL_PATH,
206
+ "output_dir": OUTPUT_DIR,
207
+ "gpu": GPU_ID,
208
+ "tracking_method": tracking_method
209
+ }
210
+
211
+ if camera_motion and camera_motion.strip():
212
+ args["camera_motion"] = camera_motion
213
+
214
+ # Create and run command
215
+ cmd = create_run_command(args)
216
+ output = run_process(cmd)
217
+
218
+ # Find generated video files
219
+ output_files = glob.glob(os.path.join(OUTPUT_DIR, "*.mp4"))
220
+ if output_files:
221
+ # Sort by modification time, return the latest file
222
+ latest_file = max(output_files, key=os.path.getmtime)
223
+ return latest_file
224
+ else:
225
+ return None
226
+ except Exception as e:
227
+ import traceback
228
+ print(f"Processing failed: {str(e)}\n{traceback.format_exc()}")
229
+ return None
230
+
231
+ def process_object_manipulation(source, prompt, object_motion, object_mask, tracking_method):
232
+ """Process object manipulation task"""
233
+ try:
234
+ # Save uploaded files
235
+ input_image_path = save_uploaded_file(source)
236
+ if input_image_path is None:
237
+ return None
238
+
239
+ object_mask_path = save_uploaded_file(object_mask)
240
+
241
+ args = {
242
+ "input_path": input_image_path,
243
+ "prompt": prompt,
244
+ "checkpoint_path": DEFAULT_MODEL_PATH,
245
+ "output_dir": OUTPUT_DIR,
246
+ "gpu": GPU_ID,
247
+ "object_motion": object_motion,
248
+ "object_mask": object_mask_path,
249
+ "tracking_method": tracking_method
250
+ }
251
+
252
+ # Create and run command
253
+ cmd = create_run_command(args)
254
+ output = run_process(cmd)
255
+
256
+ # Find generated video files
257
+ output_files = glob.glob(os.path.join(OUTPUT_DIR, "*.mp4"))
258
+ if output_files:
259
+ # Sort by modification time, return the latest file
260
+ latest_file = max(output_files, key=os.path.getmtime)
261
+ return latest_file
262
+ else:
263
+ return None
264
+ except Exception as e:
265
+ import traceback
266
+ print(f"Processing failed: {str(e)}\n{traceback.format_exc()}")
267
+ return None
268
+
269
+ def process_mesh_animation(source, prompt, tracking_video, ma_repaint_option, ma_repaint_image):
270
+ """Process mesh animation task"""
271
+ try:
272
+ # Save uploaded files
273
+ input_video_path = save_uploaded_file(source)
274
+ if input_video_path is None:
275
+ return None
276
+
277
+ tracking_video_path = save_uploaded_file(tracking_video)
278
+ if tracking_video_path is None:
279
+ return None
280
+
281
+ args = {
282
+ "input_path": input_video_path,
283
+ "prompt": prompt,
284
+ "checkpoint_path": DEFAULT_MODEL_PATH,
285
+ "output_dir": OUTPUT_DIR,
286
+ "gpu": GPU_ID,
287
+ "tracking_path": tracking_video_path
288
+ }
289
+
290
+ # Priority: Custom Image > Yes > No
291
+ if ma_repaint_image is not None:
292
+ # Custom image takes precedence if provided
293
+ repaint_path = save_uploaded_file(ma_repaint_image)
294
+ args["repaint"] = repaint_path
295
+ elif ma_repaint_option == "Yes":
296
+ # Otherwise use Yes/No selection
297
+ args["repaint"] = "true"
298
+
299
+ # Create and run command
300
+ cmd = create_run_command(args)
301
+ output = run_process(cmd)
302
+
303
+ # Find generated video files
304
+ output_files = glob.glob(os.path.join(OUTPUT_DIR, "*.mp4"))
305
+ if output_files:
306
+ # Sort by modification time, return the latest file
307
+ latest_file = max(output_files, key=os.path.getmtime)
308
+ return latest_file
309
+ else:
310
+ return None
311
+ except Exception as e:
312
+ import traceback
313
+ print(f"Processing failed: {str(e)}\n{traceback.format_exc()}")
314
+ return None
315
+
316
+ # Create Gradio interface with updated layout
317
+ with gr.Blocks(title="Diffusion as Shader") as demo:
318
+ gr.Markdown("# Diffusion as Shader Web UI")
319
+ gr.Markdown("### [Project Page](https://igl-hkust.github.io/das/) | [GitHub](https://github.com/IGL-HKUST/DiffusionAsShader)")
320
+
321
+ with gr.Row():
322
+ left_column = gr.Column(scale=1)
323
+ right_column = gr.Column(scale=1)
324
+
325
+ with right_column:
326
+ output_video = gr.Video(label="Generated Video")
327
+
328
+ with left_column:
329
+ source = gr.File(label="Source", file_types=["image", "video"])
330
+ common_prompt = gr.Textbox(label="Prompt", lines=2)
331
+ gr.Markdown(f"**Using GPU: {GPU_ID}**")
332
+
333
+ with gr.Tabs() as task_tabs:
334
+ # Motion Transfer tab
335
+ with gr.TabItem("Motion Transfer"):
336
+ gr.Markdown("## Motion Transfer")
337
+
338
+ # Simplified controls - Radio buttons for Yes/No and separate file upload
339
+ with gr.Row():
340
+ mt_repaint_option = gr.Radio(
341
+ label="Repaint First Frame",
342
+ choices=["No", "Yes"],
343
+ value="No"
344
+ )
345
+ gr.Markdown("### Note: If you want to use your own image as the repainted first frame, please upload it below.")
346
+ # Custom image uploader (always visible)
347
+ mt_repaint_image = gr.File(
348
+ label="Custom Repaint Image",
349
+ file_types=["image"]
350
+ )
351
+
352
+ # Add run button for Motion Transfer tab
353
+ mt_run_btn = gr.Button("Run Motion Transfer", variant="primary", size="lg")
354
+
355
+ # Connect to process function
356
+ mt_run_btn.click(
357
+ fn=process_motion_transfer,
358
+ inputs=[
359
+ source, common_prompt,
360
+ mt_repaint_option, mt_repaint_image
361
+ ],
362
+ outputs=[output_video]
363
+ )
364
+
365
+ # Camera Control tab
366
+ with gr.TabItem("Camera Control"):
367
+ gr.Markdown("## Camera Control")
368
+
369
+ cc_camera_motion = gr.Textbox(
370
+ label="Current Camera Motion Sequence",
371
+ placeholder="Your camera motion sequence will appear here...",
372
+ interactive=False
373
+ )
374
+
375
+ # Use tabs for different motion types
376
+ with gr.Tabs() as cc_motion_tabs:
377
+ # Translation tab
378
+ with gr.TabItem("Translation (trans)"):
379
+ with gr.Row():
380
+ cc_trans_x = gr.Slider(minimum=-1.0, maximum=1.0, value=0.0, step=0.05, label="X-axis Movement")
381
+ cc_trans_y = gr.Slider(minimum=-1.0, maximum=1.0, value=0.0, step=0.05, label="Y-axis Movement")
382
+ cc_trans_z = gr.Slider(minimum=-1.0, maximum=1.0, value=0.0, step=0.05, label="Z-axis Movement (depth)")
383
+
384
+ with gr.Row():
385
+ cc_trans_start = gr.Number(minimum=0, maximum=48, value=0, step=1, label="Start Frame", precision=0)
386
+ cc_trans_end = gr.Number(minimum=0, maximum=48, value=48, step=1, label="End Frame", precision=0)
387
+
388
+ cc_trans_note = gr.Markdown("""
389
+ **Translation Notes:**
390
+ - Positive X: Move right, Negative X: Move left
391
+ - Positive Y: Move down, Negative Y: Move up
392
+ - Positive Z: Zoom in, Negative Z: Zoom out
393
+ """)
394
+
395
+ # Add translation button in the Translation tab
396
+ cc_add_trans = gr.Button("Add Camera Translation", variant="secondary")
397
+
398
+ # Function to add translation motion
399
+ def add_translation_motion(current_motion, trans_x, trans_y, trans_z, trans_start, trans_end):
400
+ # Format: trans dx dy dz [start_frame end_frame]
401
+ frame_range = f" {int(trans_start)} {int(trans_end)}" if trans_start != 0 or trans_end != 48 else ""
402
+ new_motion = f"trans {trans_x:.2f} {trans_y:.2f} {trans_z:.2f}{frame_range}"
403
+
404
+ # Append to existing motion string with semicolon separator if needed
405
+ if current_motion and current_motion.strip():
406
+ updated_motion = f"{current_motion}; {new_motion}"
407
+ else:
408
+ updated_motion = new_motion
409
+
410
+ return updated_motion
411
+
412
+ # Connect translation button
413
+ cc_add_trans.click(
414
+ fn=add_translation_motion,
415
+ inputs=[
416
+ cc_camera_motion,
417
+ cc_trans_x, cc_trans_y, cc_trans_z, cc_trans_start, cc_trans_end
418
+ ],
419
+ outputs=[cc_camera_motion]
420
+ )
421
+
422
+ # Rotation tab
423
+ with gr.TabItem("Rotation (rot)"):
424
+ with gr.Row():
425
+ cc_rot_axis = gr.Dropdown(choices=["x", "y", "z"], value="y", label="Rotation Axis")
426
+ cc_rot_angle = gr.Slider(minimum=-30, maximum=30, value=5, step=1, label="Rotation Angle (degrees)")
427
+
428
+ with gr.Row():
429
+ cc_rot_start = gr.Number(minimum=0, maximum=48, value=0, step=1, label="Start Frame", precision=0)
430
+ cc_rot_end = gr.Number(minimum=0, maximum=48, value=48, step=1, label="End Frame", precision=0)
431
+
432
+ cc_rot_note = gr.Markdown("""
433
+ **Rotation Notes:**
434
+ - X-axis rotation: Tilt camera up/down
435
+ - Y-axis rotation: Pan camera left/right
436
+ - Z-axis rotation: Roll camera
437
+ """)
438
+
439
+ # Add rotation button in the Rotation tab
440
+ cc_add_rot = gr.Button("Add Camera Rotation", variant="secondary")
441
+
442
+ # Function to add rotation motion
443
+ def add_rotation_motion(current_motion, rot_axis, rot_angle, rot_start, rot_end):
444
+ # Format: rot axis angle [start_frame end_frame]
445
+ frame_range = f" {int(rot_start)} {int(rot_end)}" if rot_start != 0 or rot_end != 48 else ""
446
+ new_motion = f"rot {rot_axis} {rot_angle}{frame_range}"
447
+
448
+ # Append to existing motion string with semicolon separator if needed
449
+ if current_motion and current_motion.strip():
450
+ updated_motion = f"{current_motion}; {new_motion}"
451
+ else:
452
+ updated_motion = new_motion
453
+
454
+ return updated_motion
455
+
456
+ # Connect rotation button
457
+ cc_add_rot.click(
458
+ fn=add_rotation_motion,
459
+ inputs=[
460
+ cc_camera_motion,
461
+ cc_rot_axis, cc_rot_angle, cc_rot_start, cc_rot_end
462
+ ],
463
+ outputs=[cc_camera_motion]
464
+ )
465
+
466
+ # Add a clear button to reset the motion sequence
467
+ cc_clear_motion = gr.Button("Clear All Motions", variant="stop")
468
+
469
+ def clear_camera_motion():
470
+ return ""
471
+
472
+ cc_clear_motion.click(
473
+ fn=clear_camera_motion,
474
+ inputs=[],
475
+ outputs=[cc_camera_motion]
476
+ )
477
+
478
+ cc_tracking_method = gr.Radio(
479
+ label="Tracking Method",
480
+ choices=["spatracker", "moge"],
481
+ value="moge"
482
+ )
483
+
484
+ # Add run button for Camera Control tab
485
+ cc_run_btn = gr.Button("Run Camera Control", variant="primary", size="lg")
486
+
487
+ # Connect to process function
488
+ cc_run_btn.click(
489
+ fn=process_camera_control,
490
+ inputs=[
491
+ source, common_prompt,
492
+ cc_camera_motion, cc_tracking_method
493
+ ],
494
+ outputs=[output_video]
495
+ )
496
+
497
+ # Object Manipulation tab
498
+ with gr.TabItem("Object Manipulation"):
499
+ gr.Markdown("## Object Manipulation")
500
+ om_object_mask = gr.File(
501
+ label="Object Mask Image",
502
+ file_types=["image"]
503
+ )
504
+ gr.Markdown("Upload a binary mask image; white areas indicate the object to manipulate")
505
+ om_object_motion = gr.Dropdown(
506
+ label="Object Motion Type",
507
+ choices=["up", "down", "left", "right", "front", "back", "rot"],
508
+ value="up"
509
+ )
510
+ om_tracking_method = gr.Radio(
511
+ label="Tracking Method",
512
+ choices=["spatracker", "moge"],
513
+ value="moge"
514
+ )
515
+
516
+ # Add run button for Object Manipulation tab
517
+ om_run_btn = gr.Button("Run Object Manipulation", variant="primary", size="lg")
518
+
519
+ # Connect to process function
520
+ om_run_btn.click(
521
+ fn=process_object_manipulation,
522
+ inputs=[
523
+ source, common_prompt,
524
+ om_object_motion, om_object_mask, om_tracking_method
525
+ ],
526
+ outputs=[output_video]
527
+ )
528
+
529
+ # Animating meshes to video tab
530
+ with gr.TabItem("Animating meshes to video"):
531
+ gr.Markdown("## Mesh Animation to Video")
532
+ gr.Markdown("""
533
+ Note: Currently only supports tracking videos generated with Blender (version > 4.0).
534
+ Please run the script `scripts/blender.py` in your Blender project to generate tracking videos.
535
+ """)
536
+ ma_tracking_video = gr.File(
537
+ label="Tracking Video",
538
+ file_types=["video"]
539
+ )
540
+ gr.Markdown("Tracking video needs to be generated from Blender")
541
+
542
+ # Simplified controls - Radio buttons for Yes/No and separate file upload
543
+ with gr.Row():
544
+ ma_repaint_option = gr.Radio(
545
+ label="Repaint First Frame",
546
+ choices=["No", "Yes"],
547
+ value="No"
548
+ )
549
+ gr.Markdown("### Note: If you want to use your own image as the repainted first frame, please upload it below.")
550
+ # Custom image uploader (always visible)
551
+ ma_repaint_image = gr.File(
552
+ label="Custom Repaint Image",
553
+ file_types=["image"]
554
+ )
555
+
556
+ # Add run button for Mesh Animation tab
557
+ ma_run_btn = gr.Button("Run Mesh Animation", variant="primary", size="lg")
558
+
559
+ # Connect to process function
560
+ ma_run_btn.click(
561
+ fn=process_mesh_animation,
562
+ inputs=[
563
+ source, common_prompt,
564
+ ma_tracking_video, ma_repaint_option, ma_repaint_image
565
+ ],
566
+ outputs=[output_video]
567
+ )
568
+
569
+ # Launch interface
570
+ if __name__ == "__main__":
571
+ print(f"Using GPU: {GPU_ID}")
572
+ print(f"Web UI will start on port {args.port}")
573
+ if args.share:
574
+ print("Creating public link for remote access")
575
+
576
+ # Launch interface
577
+ demo.launch(share=args.share, server_port=args.port)
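For readers skimming the diff, here is a minimal, self-contained sketch (not part of the commit) of the two string conventions app.py relies on: the camera-motion sequence composed by add_translation_motion / add_rotation_motion, and the demo.py command line assembled by create_run_command. The helper name build_motion, the upload path, and the prompt below are illustrative assumptions.

```python
# Sketch only: mirrors the string formats used in app.py above; values are made up.

def build_motion(current, new):
    # app.py joins successive motions with "; "
    return f"{current}; {new}" if current and current.strip() else new

# "trans dx dy dz [start end]" -- the frame range is omitted when it spans 0..48
motion = build_motion("", "trans 0.00 0.00 0.50")   # zoom in over the whole clip
motion = build_motion(motion, "rot y 10 0 24")      # pan right during frames 0-24
print(motion)  # trans 0.00 0.00 0.50; rot y 10 0 24

# create_run_command() walks an args dict and emits "--key value" pairs for demo.py
args = {
    "input_path": "tmp/input.mp4",             # hypothetical upload path
    "prompt": "a car driving at night",        # hypothetical prompt
    "checkpoint_path": "EXCAI/Diffusion-As-Shader",
    "output_dir": "tmp",
    "gpu": 0,
    "camera_motion": motion,
    "tracking_method": "moge",
}
cmd = ["python", "demo.py"]
for key, value in args.items():
    cmd += [f"--{key}", str(value)]
print(" ".join(cmd))
```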
config/__init__.py ADDED
File without changes
config/base_cfg.py ADDED
@@ -0,0 +1,410 @@
1
+ #python3.10
2
+ """Hierarchical configuration for different pipelines, using `yacs`
3
+ (referring to https://github.com/rbgirshick/yacs)
4
+
5
+ This project contains the configuration for three aspects:
6
+ the regular config for experiment settings
7
+
8
+ NOTE: Each experiment will be assigned a separate working space, and the
9
+ intermediate results will be saved in that working space. The experiment
10
+ folder structure is as follows:
11
+ {
12
+ /${ROOT_WORK_DIR}/
13
+ └── ${PIPELINES_NAME}/
14
+ ├── ${EXP_NAME}/
15
+ ├── ${CHECKPOINT_DIR}/
16
+ ├── ${RESULT_DIR}/
17
+ ├── meta.json/
18
+ └── ${LOG_DIR}
19
+ }
20
+
21
+ """
22
+
23
+ import os, sys
24
+ from .yacs import CfgNode as CN
25
+ import argparse
26
+ import numpy as np
27
+
28
+ # the parser for boolean
29
+ def bool_parser(arg):
30
+ """Parses an argument to boolean."""
31
+ if isinstance(arg, bool):
32
+ return arg
33
+ if arg is None:
34
+ return False
35
+ if arg.lower() in ['1', 'true', 't', 'yes', 'y']:
36
+ return True
37
+ if arg.lower() in ['0', 'false', 'f', 'no', 'n']:
38
+ return False
39
+ raise ValueError(f'`{arg}` cannot be converted to boolean!')
40
+
41
+ # -----------------------------------------------------------------------------
42
+ # base cfg
43
+ # -----------------------------------------------------------------------------
44
+ cfg = CN()
45
+
46
+ # configuration for basic experiments
47
+ cfg.save_dir = "./checkpoints"
48
+ cfg.restore_ckpt = ""
49
+ cfg.model_name = "cotracker"
50
+ cfg.exp_name = ""
51
+
52
+ # NOTE: configuration for datasets and augmentation
53
+ cfg.dataset_root = ""
54
+ cfg.eval_datasets = [""]
55
+ cfg.dont_use_augs = False
56
+ cfg.crop_size = [384, 512]
57
+ cfg.traj_per_sample = 384
58
+ cfg.sample_vis_1st_frame = False
59
+ cfg.depth_near = 0.01 # meter
60
+ cfg.depth_far = 65.0 # meter
61
+ cfg.sequence_len = 24
62
+
63
+ # NOTE: configuration for network arch
64
+ cfg.sliding_window_len = 8
65
+ cfg.remove_space_attn = False
66
+ cfg.updateformer_hidden_size = 384
67
+ cfg.updateformer_num_heads = 8
68
+ cfg.updateformer_space_depth = 6
69
+ cfg.updateformer_time_depth = 6
70
+ cfg.model_stride = 4
71
+ cfg.train_iters = 4
72
+ cfg.if_ARAP = False
73
+ cfg.Embed3D = False
74
+ cfg.Loss_W_feat = 5e-1
75
+ cfg.Loss_W_cls = 1e-4
76
+ cfg.depth_color = False
77
+ cfg.flash_attn = False
78
+ cfg.corr_dp = True
79
+ cfg.support_grid = 0
80
+ cfg.backbone = "CNN"
81
+ cfg.enc_only = False
82
+ cfg.init_match = False
83
+ cfg.Nblock = 4
84
+
85
+ # NOTE: configuration for training and saving
86
+ cfg.nodes_num = 1
87
+ cfg.batch_size = 1
88
+ cfg.num_workers = 6
89
+ cfg.mixed_precision = False
90
+ cfg.lr = 0.0005
91
+ cfg.wdecay = 0.00001
92
+ cfg.num_steps = 200000
93
+ cfg.evaluate_every_n_epoch = 1
94
+ cfg.save_every_n_epoch = 1
95
+ cfg.validate_at_start = False
96
+ cfg.save_freq = 100
97
+ cfg.eval_max_seq_len = 1000
98
+ cfg.debug = False
99
+ cfg.fine_tune = False
100
+ cfg.aug_wind_sample = False
101
+ cfg.use_video_flip = False
102
+ cfg.fix_backbone = False
103
+ cfg.tune_backbone = False
104
+ cfg.tune_arap = False
105
+ cfg.tune_per_scene = False
106
+ cfg.use_hier_encoder = False
107
+ cfg.scales = [4, 2]
108
+
109
+
110
+ # NOTE: configuration for monocular depth estimator
111
+ cfg.mde_name = "zoedepth_nk"
112
+
113
+ # -----------------------------------------------------------------------------
114
+
115
+ # configurations for the command line
116
+ parser = argparse.ArgumentParser()
117
+
118
+ # config for the basic experiment
119
+ parser.add_argument("--save_dir", default="./checkpoints", type=str ,help="path to save checkpoints")
120
+ parser.add_argument("--restore_ckpt", default="", help="path to restore a checkpoint")
121
+ parser.add_argument("--model_name", default="cotracker", help="model name")
122
+ parser.add_argument("--exp_name", type=str, default="base",
123
+ help="the name for experiment",
124
+ )
125
+ # config for dataset and augmentation
126
+ parser.add_argument(
127
+ "--dataset_root", type=str, help="path to all the datasets (train and eval)"
128
+ )
129
+ parser.add_argument(
130
+ "--eval_datasets", nargs="+", default=["things", "badja"],
131
+ help="what datasets to use for evaluation",
132
+ )
133
+ parser.add_argument(
134
+ "--dont_use_augs", action="store_true", default=False,
135
+ help="don't apply augmentations during training",
136
+ )
137
+ parser.add_argument(
138
+ "--crop_size", type=int, nargs="+", default=[384, 512],
139
+ help="crop videos to this resolution during training",
140
+ )
141
+ parser.add_argument(
142
+ "--traj_per_sample", type=int, default=768,
143
+ help="the number of trajectories to sample for training",
144
+ )
145
+ parser.add_argument(
146
+ "--depth_near", type=float, default=0.01, help="near plane depth"
147
+ )
148
+ parser.add_argument(
149
+ "--depth_far", type=float, default=65.0, help="far plane depth"
150
+ )
151
+ parser.add_argument(
152
+ "--sample_vis_1st_frame",
153
+ action="store_true",
154
+ default=False,
155
+ help="only sample trajectories with points visible on the first frame",
156
+ )
157
+ parser.add_argument(
158
+ "--sequence_len", type=int, default=24, help="train sequence length"
159
+ )
160
+ # configuration for network arch
161
+ parser.add_argument(
162
+ "--sliding_window_len",
163
+ type=int,
164
+ default=8,
165
+ help="length of the CoTracker sliding window",
166
+ )
167
+ parser.add_argument(
168
+ "--remove_space_attn",
169
+ action="store_true",
170
+ default=False,
171
+ help="remove space attention from CoTracker",
172
+ )
173
+ parser.add_argument(
174
+ "--updateformer_hidden_size",
175
+ type=int,
176
+ default=384,
177
+ help="hidden dimension of the CoTracker transformer model",
178
+ )
179
+ parser.add_argument(
180
+ "--updateformer_num_heads",
181
+ type=int,
182
+ default=8,
183
+ help="number of heads of the CoTracker transformer model",
184
+ )
185
+ parser.add_argument(
186
+ "--updateformer_space_depth",
187
+ type=int,
188
+ default=6,
189
+ help="number of group attention layers in the CoTracker transformer model",
190
+ )
191
+ parser.add_argument(
192
+ "--updateformer_time_depth",
193
+ type=int,
194
+ default=6,
195
+ help="number of time attention layers in the CoTracker transformer model",
196
+ )
197
+ parser.add_argument(
198
+ "--model_stride",
199
+ type=int,
200
+ default=4,
201
+ help="stride of the CoTracker feature network",
202
+ )
203
+ parser.add_argument(
204
+ "--train_iters",
205
+ type=int,
206
+ default=4,
207
+ help="number of updates to the disparity field in each forward pass.",
208
+ )
209
+ parser.add_argument(
210
+ "--if_ARAP",
211
+ action="store_true",
212
+ default=False,
213
+ help="if using ARAP loss in the optimization",
214
+ )
215
+ parser.add_argument(
216
+ "--Embed3D",
217
+ action="store_true",
218
+ default=False,
219
+ help="if using the 3D embedding for image",
220
+ )
221
+ parser.add_argument(
222
+ "--Loss_W_feat",
223
+ type=float,
224
+ default=5e-1,
225
+ help="weight for the feature loss",
226
+ )
227
+ parser.add_argument(
228
+ "--Loss_W_cls",
229
+ type=float,
230
+ default=1e-4,
231
+ help="weight for the classification loss",
232
+ )
233
+ parser.add_argument(
234
+ "--depth_color",
235
+ action="store_true",
236
+ default=False,
237
+ help="if using the color for depth",
238
+ )
239
+ parser.add_argument(
240
+ "--flash_attn",
241
+ action="store_true",
242
+ default=False,
243
+ help="if using the flash attention",
244
+ )
245
+ parser.add_argument(
246
+ "--corr_dp",
247
+ action="store_true",
248
+ default=False,
249
+ help="if using the correlation of depth",
250
+ )
251
+ parser.add_argument(
252
+ "--support_grid",
253
+ type=int,
254
+ default=0,
255
+ help="if using the support grid",
256
+ )
257
+ parser.add_argument(
258
+ "--backbone",
259
+ type=str,
260
+ default="CNN",
261
+ help="backbone for the CoTracker feature network",
262
+ )
263
+ parser.add_argument(
264
+ "--enc_only",
265
+ action="store_true",
266
+ default=False,
267
+ help="if using the encoder only",
268
+ )
269
+ parser.add_argument(
270
+ "--init_match",
271
+ action="store_true",
272
+ default=False,
273
+ help="if using the initial matching",
274
+ )
275
+ parser.add_argument(
276
+ "--Nblock",
277
+ type=int,
278
+ default=4,
279
+ help="number of blocks in the CoTracker feature network",
280
+ )
281
+
282
+ # configuration for training and saving
283
+ parser.add_argument(
284
+ "--nodes_num", type=int, default=1, help="number of nodes used for training."
285
+ )
286
+ parser.add_argument(
287
+ "--batch_size", type=int, default=1, help="batch size used during training."
288
+ )
289
+ parser.add_argument(
290
+ "--num_workers", type=int, default=6, help="number of dataloader workers"
291
+ )
292
+
293
+ parser.add_argument(
294
+ "--mixed_precision",
295
+ action="store_true", default=False,
296
+ help="use mixed precision"
297
+ )
298
+ parser.add_argument("--lr", type=float, default=0.0005, help="max learning rate.")
299
+ parser.add_argument(
300
+ "--wdecay", type=float, default=0.00001, help="Weight decay in optimizer."
301
+ )
302
+ parser.add_argument(
303
+ "--num_steps", type=int, default=200000, help="length of training schedule."
304
+ )
305
+ parser.add_argument(
306
+ "--evaluate_every_n_epoch",
307
+ type=int,
308
+ default=1,
309
+ help="evaluate during training after every n epochs, after every epoch by default",
310
+ )
311
+ parser.add_argument(
312
+ "--save_every_n_epoch",
313
+ type=int,
314
+ default=1,
315
+ help="save checkpoints during training after every n epochs, after every epoch by default",
316
+ )
317
+ parser.add_argument(
318
+ "--validate_at_start",
319
+ action="store_true",
320
+ default=False,
321
+ help="whether to run evaluation before training starts",
322
+ )
323
+ parser.add_argument(
324
+ "--save_freq",
325
+ type=int,
326
+ default=100,
327
+ help="frequency of trajectory visualization during training",
328
+ )
329
+ parser.add_argument(
330
+ "--eval_max_seq_len",
331
+ type=int,
332
+ default=1000,
333
+ help="maximum length of evaluation videos",
334
+ )
335
+ parser.add_argument(
336
+ "--debug",
337
+ action="store_true",
338
+ default=False,
339
+ help="enable debug mode",
340
+ )
341
+ parser.add_argument(
342
+ "--fine_tune",
343
+ action="store_true",
344
+ default=False,
345
+ help="if fine tune the model",
346
+ )
347
+ parser.add_argument(
348
+ "--aug_wind_sample",
349
+ action="store_true",
350
+ default=False,
351
+ help="if using the window sampling",
352
+ )
353
+ parser.add_argument(
354
+ "--use_video_flip",
355
+ action="store_true",
356
+ default=False,
357
+ help="if using the video flip",
358
+ )
359
+ parser.add_argument(
360
+ "--fix_backbone",
361
+ action="store_true",
362
+ default=False,
363
+ help="if fix the backbone",
364
+ )
365
+ parser.add_argument(
366
+ "--tune_backbone",
367
+ action="store_true",
368
+ default=False,
369
+ help="if tune the backbone",
370
+ )
371
+ parser.add_argument(
372
+ "--tune_arap",
373
+ action="store_true",
374
+ default=False,
375
+ help="if tune the ARAP loss",
376
+ )
377
+ parser.add_argument(
378
+ "--tune_per_scene",
379
+ action="store_true",
380
+ default=False,
381
+ help="if tune one scene",
382
+ )
383
+ parser.add_argument(
384
+ "--use_hier_encoder",
385
+ action="store_true",
386
+ default=False,
387
+ help="if using the hierarchical encoder",
388
+ )
389
+ parser.add_argument(
390
+ "--scales",
391
+ type=int,
392
+ nargs="+",
393
+ default=[4, 2],
394
+ help="scales for the CoTracker feature network",
395
+ )
396
+
397
+ # config for monocular depth estimator
398
+ parser.add_argument(
399
+ "--mde_name", type=str, default="zoedepth_nk", help="name of the MDE model"
400
+ )
401
+ args = parser.parse_args()
402
+ args_dict = vars(args)
403
+
404
+ # -----------------------------------------------------------------------------
405
+
406
+ # merge the `args` to the `cfg`
407
+ cfg.merge_from_dict(args_dict)
408
+
409
+ cfg.ckpt_path = os.path.join(args.save_dir, args.model_name, args.exp_name)
410
+
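A minimal sketch of the configuration pattern base_cfg.py follows: defaults live on a yacs CfgNode, argparse values are merged over them, and the checkpoint path is derived from save_dir/model_name/exp_name. It assumes the vendored yacs variant provides merge_from_dict, which base_cfg.py calls but which lies outside the portion of config/yacs.py shown in this view; and since base_cfg.py parses sys.argv at import time, the sketch passes an explicit empty argument list instead.

```python
# Sketch only: reproduces the cfg/argparse merge pattern from config/base_cfg.py.
import os
import argparse

from config.yacs import CfgNode as CN  # the yacs.py added in this commit

cfg = CN()
cfg.save_dir = "./checkpoints"
cfg.model_name = "cotracker"
cfg.exp_name = ""

parser = argparse.ArgumentParser()
parser.add_argument("--save_dir", default="./checkpoints", type=str)
parser.add_argument("--model_name", default="cotracker")
parser.add_argument("--exp_name", type=str, default="base")
args = parser.parse_args([])            # empty list: do not consume the real argv

cfg.merge_from_dict(vars(args))         # same call base_cfg.py makes (assumed API)
cfg.ckpt_path = os.path.join(cfg.save_dir, cfg.model_name, cfg.exp_name)
print(cfg.ckpt_path)                    # ./checkpoints/cotracker/base
```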
config/ssm_cfg.py ADDED
@@ -0,0 +1,347 @@
1
+ #python3.10
2
+ """Hierarchical configuration for different pipelines, using `yacs`
3
+ (referring to https://github.com/rbgirshick/yacs)
4
+
5
+ This project contains the configuration for three aspects:
6
+ the regular config for experiment settings
7
+
8
+ NOTE: Each experiment will be assigned a separate working space, and the
9
+ intermediate results will be saved in that working space. The experiment
10
+ folder structure is as follows:
11
+ {
12
+ /${ROOT_WORK_DIR}/
13
+ └── ${PIPELINES_NAME}/
14
+ ├── ${EXP_NAME}/
15
+ ├── ${CHECKPOINT_DIR}/
16
+ ├── ${RESULT_DIR}/
17
+ ├── meta.json/
18
+ └── ${LOG_DIR}
19
+ }
20
+
21
+ """
22
+
23
+ import os, sys
24
+ from .yacs import CfgNode as CN
25
+ import argparse
26
+ import numpy as np
27
+
28
+ # the parser for boolean
29
+ def bool_parser(arg):
30
+ """Parses an argument to boolean."""
31
+ if isinstance(arg, bool):
32
+ return arg
33
+ if arg is None:
34
+ return False
35
+ if arg.lower() in ['1', 'true', 't', 'yes', 'y']:
36
+ return True
37
+ if arg.lower() in ['0', 'false', 'f', 'no', 'n']:
38
+ return False
39
+ raise ValueError(f'`{arg}` cannot be converted to boolean!')
40
+
41
+ # -----------------------------------------------------------------------------
42
+ # base cfg
43
+ # -----------------------------------------------------------------------------
44
+ cfg = CN()
45
+
46
+ # configuration for basic experiments
47
+ cfg.save_dir = "./checkpoints"
48
+ cfg.restore_ckpt = ""
49
+ cfg.model_name = "cotracker"
50
+ cfg.exp_name = ""
51
+
52
+ # NOTE: configuration for datasets and augmentation
53
+ cfg.dataset_root = ""
54
+ cfg.eval_datasets = [""]
55
+ cfg.dont_use_augs = False
56
+ cfg.crop_size = [384, 512]
57
+ cfg.traj_per_sample = 384
58
+ cfg.sample_vis_1st_frame = False
59
+ cfg.depth_near = 0.01 # meter
60
+ cfg.depth_far = 65.0 # meter
61
+ cfg.sequence_len = 24
62
+
63
+ # NOTE: configuration for network arch
64
+ cfg.hidden_size = 384
65
+ cfg.mamba_depth = 8
66
+ cfg.model_stride = 4
67
+ cfg.train_iters = 4
68
+ cfg.updateformer_num_heads = 8
69
+ cfg.updateformer_hidden_size = 384
70
+ cfg.if_ARAP = False
71
+ cfg.Embed3D = False
72
+ cfg.Loss_W_feat = 5e-1
73
+ cfg.Loss_W_cls = 1e-4
74
+ cfg.depth_color = False
75
+ cfg.flash_attn = False
76
+ cfg.corr_dp = True
77
+ cfg.support_grid = 0
78
+ cfg.backbone = "CNN"
79
+ cfg.enc_only = False
80
+
81
+ # NOTE: configuration for training and saving
82
+ cfg.nodes_num = 1
83
+ cfg.batch_size = 1
84
+ cfg.num_workers = 6
85
+ cfg.mixed_precision = False
86
+ cfg.lr = 0.0005
87
+ cfg.wdecay = 0.00001
88
+ cfg.num_steps = 200000
89
+ cfg.evaluate_every_n_epoch = 1
90
+ cfg.save_every_n_epoch = 1
91
+ cfg.validate_at_start = False
92
+ cfg.save_freq = 100
93
+ cfg.eval_max_seq_len = 1000
94
+ cfg.debug = False
95
+ cfg.fine_tune = False
96
+ cfg.aug_wind_sample = False
97
+ cfg.use_video_flip = False
98
+ cfg.fix_backbone = False
99
+ cfg.tune_backbone = False
100
+
101
+
102
+ # NOTE: configuration for monocular depth estimator
103
+ cfg.mde_name = "zoedepth_nk"
104
+
105
+ # -----------------------------------------------------------------------------
106
+
107
+ # configurations for the command line
108
+ parser = argparse.ArgumentParser()
109
+
110
+ # config for the basic experiment
111
+ parser.add_argument("--save_dir", default="./checkpoints", type=str ,help="path to save checkpoints")
112
+ parser.add_argument("--restore_ckpt", default="", help="path to restore a checkpoint")
113
+ parser.add_argument("--model_name", default="cotracker", help="model name")
114
+ parser.add_argument("--exp_name", type=str, default="base",
115
+ help="the name for experiment",
116
+ )
117
+ # config for dataset and augmentation
118
+ parser.add_argument(
119
+ "--dataset_root", type=str, help="path to all the datasets (train and eval)"
120
+ )
121
+ parser.add_argument(
122
+ "--eval_datasets", nargs="+", default=["things", "badja"],
123
+ help="what datasets to use for evaluation",
124
+ )
125
+ parser.add_argument(
126
+ "--dont_use_augs", action="store_true", default=False,
127
+ help="don't apply augmentations during training",
128
+ )
129
+ parser.add_argument(
130
+ "--crop_size", type=int, nargs="+", default=[384, 512],
131
+ help="crop videos to this resolution during training",
132
+ )
133
+ parser.add_argument(
134
+ "--traj_per_sample", type=int, default=768,
135
+ help="the number of trajectories to sample for training",
136
+ )
137
+ parser.add_argument(
138
+ "--depth_near", type=float, default=0.01, help="near plane depth"
139
+ )
140
+ parser.add_argument(
141
+ "--depth_far", type=float, default=65.0, help="far plane depth"
142
+ )
143
+ parser.add_argument(
144
+ "--sample_vis_1st_frame",
145
+ action="store_true",
146
+ default=False,
147
+ help="only sample trajectories with points visible on the first frame",
148
+ )
149
+ parser.add_argument(
150
+ "--sequence_len", type=int, default=24, help="train sequence length"
151
+ )
152
+ # configuration for network arch
153
+ parser.add_argument(
154
+ "--hidden_size",
155
+ type=int,
156
+ default=384,
157
+ help="hidden dimension of the CoTracker transformer model",
158
+ )
159
+ parser.add_argument(
160
+ "--mamba_depth",
161
+ type=int,
162
+ default=6,
163
+ help="depth (number of layers) of the Mamba update module",
164
+ )
165
+ parser.add_argument(
166
+ "--updateformer_num_heads",
167
+ type=int,
168
+ default=8,
169
+ help="number of heads of the CoTracker transformer model",
170
+ )
171
+ parser.add_argument(
172
+ "--updateformer_hidden_size",
173
+ type=int,
174
+ default=384,
175
+ help="hidden dimension of the CoTracker transformer model",
176
+ )
177
+ parser.add_argument(
178
+ "--model_stride",
179
+ type=int,
180
+ default=4,
181
+ help="stride of the CoTracker feature network",
182
+ )
183
+ parser.add_argument(
184
+ "--train_iters",
185
+ type=int,
186
+ default=4,
187
+ help="number of updates to the disparity field in each forward pass.",
188
+ )
189
+ parser.add_argument(
190
+ "--if_ARAP",
191
+ action="store_true",
192
+ default=False,
193
+ help="if using ARAP loss in the optimization",
194
+ )
195
+ parser.add_argument(
196
+ "--Embed3D",
197
+ action="store_true",
198
+ default=False,
199
+ help="if using the 3D embedding for image",
200
+ )
201
+ parser.add_argument(
202
+ "--Loss_W_feat",
203
+ type=float,
204
+ default=5e-1,
205
+ help="weight for the feature loss",
206
+ )
207
+ parser.add_argument(
208
+ "--Loss_W_cls",
209
+ type=float,
210
+ default=1e-4,
211
+ help="weight for the classification loss",
212
+ )
213
+ parser.add_argument(
214
+ "--depth_color",
215
+ action="store_true",
216
+ default=False,
217
+ help="if using the color for depth",
218
+ )
219
+ parser.add_argument(
220
+ "--flash_attn",
221
+ action="store_true",
222
+ default=False,
223
+ help="if using the flash attention",
224
+ )
225
+ parser.add_argument(
226
+ "--corr_dp",
227
+ action="store_true",
228
+ default=False,
229
+ help="if using the correlation of depth",
230
+ )
231
+ parser.add_argument(
232
+ "--support_grid",
233
+ type=int,
234
+ default=0,
235
+ help="if using the support grid",
236
+ )
237
+ parser.add_argument(
238
+ "--backbone",
239
+ type=str,
240
+ default="CNN",
241
+ help="backbone for the CoTracker feature network",
242
+ )
243
+ parser.add_argument(
244
+ "--enc_only",
245
+ action="store_true",
246
+ default=False,
247
+ help="if using the encoder only",
248
+ )
249
+
250
+ # configuration for training and saving
251
+ parser.add_argument(
252
+ "--nodes_num", type=int, default=1, help="number of nodes used for training."
253
+ )
254
+ parser.add_argument(
255
+ "--batch_size", type=int, default=1, help="batch size used during training."
256
+ )
257
+ parser.add_argument(
258
+ "--num_workers", type=int, default=6, help="number of dataloader workers"
259
+ )
260
+
261
+ parser.add_argument(
262
+ "--mixed_precision",
263
+ action="store_true", default=False,
264
+ help="use mixed precision"
265
+ )
266
+ parser.add_argument("--lr", type=float, default=0.0005, help="max learning rate.")
267
+ parser.add_argument(
268
+ "--wdecay", type=float, default=0.00001, help="Weight decay in optimizer."
269
+ )
270
+ parser.add_argument(
271
+ "--num_steps", type=int, default=200000, help="length of training schedule."
272
+ )
273
+ parser.add_argument(
274
+ "--evaluate_every_n_epoch",
275
+ type=int,
276
+ default=1,
277
+ help="evaluate during training after every n epochs, after every epoch by default",
278
+ )
279
+ parser.add_argument(
280
+ "--save_every_n_epoch",
281
+ type=int,
282
+ default=1,
283
+ help="save checkpoints during training after every n epochs, after every epoch by default",
284
+ )
285
+ parser.add_argument(
286
+ "--validate_at_start",
287
+ action="store_true",
288
+ default=False,
289
+ help="whether to run evaluation before training starts",
290
+ )
291
+ parser.add_argument(
292
+ "--save_freq",
293
+ type=int,
294
+ default=100,
295
+ help="frequency of trajectory visualization during training",
296
+ )
297
+ parser.add_argument(
298
+ "--eval_max_seq_len",
299
+ type=int,
300
+ default=1000,
301
+ help="maximum length of evaluation videos",
302
+ )
303
+ parser.add_argument(
304
+ "--debug",
305
+ action="store_true",
306
+ default=False,
307
+ help="enable debug mode",
308
+ )
309
+ parser.add_argument(
310
+ "--fine_tune",
311
+ action="store_true",
312
+ default=False,
313
+ help="if fine tune the model",
314
+ )
315
+ parser.add_argument(
316
+ "--aug_wind_sample",
317
+ action="store_true",
318
+ default=False,
319
+ help="if using the window sampling",
320
+ )
321
+ parser.add_argument(
322
+ "--use_video_flip",
323
+ action="store_true",
324
+ default=False,
325
+ help="if using the video flip",
326
+ )
327
+ parser.add_argument(
328
+ "--fix_backbone",
329
+ action="store_true",
330
+ default=False,
331
+ help="if fix the backbone",
332
+ )
333
+
334
+ # config for monocular depth estimator
335
+ parser.add_argument(
336
+ "--mde_name", type=str, default="zoedepth_nk", help="name of the MDE model"
337
+ )
338
+ args = parser.parse_args()
339
+ args_dict = vars(args)
340
+
341
+ # -----------------------------------------------------------------------------
342
+
343
+ # merge the `args` to the `cfg`
344
+ cfg.merge_from_dict(args_dict)
345
+
346
+ cfg.ckpt_path = os.path.join(args.save_dir, args.model_name, args.exp_name)
347
+
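Before the vendored config/yacs.py that follows, a quick sketch of the CfgNode behaviour it provides: nested dicts are converted into CfgNodes with attribute access, assignment is allowed while the node is mutable, and dump() serialises the tree back to YAML. Keys and values here are illustrative.

```python
# Sketch only: exercises the CfgNode class defined in config/yacs.py below.
from config.yacs import CfgNode as CN

node = CN({"model": {"name": "cotracker", "stride": 4}, "lr": 5e-4})
print(node.model.name)   # attribute access on a nested node -> "cotracker"
node.lr = 1e-3           # plain assignment works while the node is not frozen
print(node.dump())       # YAML string, e.g. "lr: 0.001\nmodel:\n  name: cotracker\n  stride: 4\n"
```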
config/yacs.py ADDED
@@ -0,0 +1,506 @@
1
+ # Copyright (c) 2018-present, Facebook, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ ##############################################################################
15
+
16
+ """YACS -- Yet Another Configuration System is designed to be a simple
17
+ configuration management system for academic and industrial research
18
+ projects.
19
+
20
+ See README.md for usage and examples.
21
+ """
22
+
23
+ import copy
24
+ import io
25
+ import logging
26
+ import os
27
+ from ast import literal_eval
28
+
29
+ import yaml
30
+
31
+
32
+ # Flag for py2 and py3 compatibility to use when separate code paths are necessary
33
+ # When _PY2 is False, we assume Python 3 is in use
34
+ _PY2 = False
35
+
36
+ # Filename extensions for loading configs from files
37
+ _YAML_EXTS = {"", ".yaml", ".yml"}
38
+ _PY_EXTS = {".py"}
39
+
40
+ # py2 and py3 compatibility for checking file object type
41
+ # We simply use this to infer py2 vs py3
42
+ try:
43
+ _FILE_TYPES = (file, io.IOBase)
44
+ _PY2 = True
45
+ except NameError:
46
+ _FILE_TYPES = (io.IOBase,)
47
+
48
+ # CfgNodes can only contain a limited set of valid types
49
+ _VALID_TYPES = {tuple, list, str, int, float, bool}
50
+ # py2 allow for str and unicode
51
+ if _PY2:
52
+ _VALID_TYPES = _VALID_TYPES.union({unicode}) # noqa: F821
53
+
54
+ # Utilities for importing modules from file paths
55
+ if _PY2:
56
+ # imp is available in both py2 and py3 for now, but is deprecated in py3
57
+ import imp
58
+ else:
59
+ import importlib.util
60
+
61
+ logger = logging.getLogger(__name__)
62
+
63
+
64
+ class CfgNode(dict):
65
+ """
66
+ CfgNode represents an internal node in the configuration tree. It's a simple
67
+ dict-like container that allows for attribute-based access to keys.
68
+ """
69
+
70
+ IMMUTABLE = "__immutable__"
71
+ DEPRECATED_KEYS = "__deprecated_keys__"
72
+ RENAMED_KEYS = "__renamed_keys__"
73
+
74
+ def __init__(self, init_dict=None, key_list=None):
75
+ # Recursively convert nested dictionaries in init_dict into CfgNodes
76
+ init_dict = {} if init_dict is None else init_dict
77
+ key_list = [] if key_list is None else key_list
78
+ for k, v in init_dict.items():
79
+ if type(v) is dict:
80
+ # Convert dict to CfgNode
81
+ init_dict[k] = CfgNode(v, key_list=key_list + [k])
82
+ else:
83
+ # Check for valid leaf type or nested CfgNode
84
+ _assert_with_logging(
85
+ _valid_type(v, allow_cfg_node=True),
86
+ "Key {} with value {} is not a valid type; valid types: {}".format(
87
+ ".".join(key_list + [k]), type(v), _VALID_TYPES
88
+ ),
89
+ )
90
+ super(CfgNode, self).__init__(init_dict)
91
+ # Manage if the CfgNode is frozen or not
92
+ self.__dict__[CfgNode.IMMUTABLE] = False
93
+ # Deprecated options
94
+ # If an option is removed from the code and you don't want to break existing
95
+ # yaml configs, you can add the full config key as a string to the set below.
96
+ self.__dict__[CfgNode.DEPRECATED_KEYS] = set()
97
+ # Renamed options
98
+ # If you rename a config option, record the mapping from the old name to the new
99
+ # name in the dictionary below. Optionally, if the type also changed, you can
100
+ # make the value a tuple that specifies first the renamed key and then
101
+ # instructions for how to edit the config file.
102
+ self.__dict__[CfgNode.RENAMED_KEYS] = {
103
+ # 'EXAMPLE.OLD.KEY': 'EXAMPLE.NEW.KEY', # Dummy example to follow
104
+ # 'EXAMPLE.OLD.KEY': ( # A more complex example to follow
105
+ # 'EXAMPLE.NEW.KEY',
106
+ # "Also convert to a tuple, e.g., 'foo' -> ('foo',) or "
107
+ # + "'foo:bar' -> ('foo', 'bar')"
108
+ # ),
109
+ }
110
+
111
+ def __getattr__(self, name):
112
+ if name in self:
113
+ return self[name]
114
+ else:
115
+ raise AttributeError(name)
116
+
117
+ def __setattr__(self, name, value):
118
+ if self.is_frozen():
119
+ raise AttributeError(
120
+ "Attempted to set {} to {}, but CfgNode is immutable".format(
121
+ name, value
122
+ )
123
+ )
124
+
125
+ _assert_with_logging(
126
+ name not in self.__dict__,
127
+ "Invalid attempt to modify internal CfgNode state: {}".format(name),
128
+ )
129
+ _assert_with_logging(
130
+ _valid_type(value, allow_cfg_node=True),
131
+ "Invalid type {} for key {}; valid types = {}".format(
132
+ type(value), name, _VALID_TYPES
133
+ ),
134
+ )
135
+
136
+ self[name] = value
137
+
138
+ def __str__(self):
139
+ def _indent(s_, num_spaces):
140
+ s = s_.split("\n")
141
+ if len(s) == 1:
142
+ return s_
143
+ first = s.pop(0)
144
+ s = [(num_spaces * " ") + line for line in s]
145
+ s = "\n".join(s)
146
+ s = first + "\n" + s
147
+ return s
148
+
149
+ r = ""
150
+ s = []
151
+ for k, v in sorted(self.items()):
152
+ separator = "\n" if isinstance(v, CfgNode) else " "
153
+ attr_str = "{}:{}{}".format(str(k), separator, str(v))
154
+ attr_str = _indent(attr_str, 2)
155
+ s.append(attr_str)
156
+ r += "\n".join(s)
157
+ return r
158
+
159
+ def __repr__(self):
160
+ return "{}({})".format(self.__class__.__name__, super(CfgNode, self).__repr__())
161
+
162
+ def dump(self):
163
+ """Dump to a string."""
164
+ self_as_dict = _to_dict(self)
165
+ return yaml.safe_dump(self_as_dict)
166
+
167
+ def merge_from_file(self, cfg_filename):
168
+ """Load a yaml config file and merge it into this CfgNode."""
169
+ with open(cfg_filename, "r") as f:
170
+ cfg = load_cfg(f)
171
+ self.merge_from_other_cfg(cfg)
172
+
173
+ def merge_from_other_cfg(self, cfg_other):
174
+ """Merge `cfg_other` into this CfgNode."""
175
+ _merge_a_into_b(cfg_other, self, self, [])
176
+
177
+ def merge_from_list(self, cfg_list):
178
+ """Merge config (keys, values) in a list (e.g., from command line) into
179
+ this CfgNode. For example, `cfg_list = ['FOO.BAR', 0.5]`.
180
+ """
181
+ _assert_with_logging(
182
+ len(cfg_list) % 2 == 0,
183
+ "Override list has odd length: {}; it must be a list of pairs".format(
184
+ cfg_list
185
+ ),
186
+ )
187
+ root = self
188
+ for full_key, v in zip(cfg_list[0::2], cfg_list[1::2]):
189
+ if root.key_is_deprecated(full_key):
190
+ continue
191
+ if root.key_is_renamed(full_key):
192
+ root.raise_key_rename_error(full_key)
193
+ key_list = full_key.split(".")
194
+ d = self
195
+ for subkey in key_list[:-1]:
196
+ _assert_with_logging(
197
+ subkey in d, "Non-existent key: {}".format(full_key)
198
+ )
199
+ d = d[subkey]
200
+ subkey = key_list[-1]
201
+ _assert_with_logging(subkey in d, "Non-existent key: {}".format(full_key))
202
+ value = _decode_cfg_value(v)
203
+ value = _check_and_coerce_cfg_value_type(value, d[subkey], subkey, full_key)
204
+ d[subkey] = value
205
+ def merge_from_dict(self, cfg_dict):
206
+ """Merge config (keys, values) in a dict into this CfgNode."""
207
+ cfg_dict = cfg_dict.items()
208
+ cfg_list = []
209
+ for pair in cfg_dict:
210
+ cfg_list.append(pair[0])
211
+ cfg_list.append(pair[1])
212
+ self.merge_from_list(cfg_list)
213
+
214
+ def freeze(self):
215
+ """Make this CfgNode and all of its children immutable."""
216
+ self._immutable(True)
217
+
218
+ def defrost(self):
219
+ """Make this CfgNode and all of its children mutable."""
220
+ self._immutable(False)
221
+
222
+ def is_frozen(self):
223
+ """Return mutability."""
224
+ return self.__dict__[CfgNode.IMMUTABLE]
225
+
226
+ def _immutable(self, is_immutable):
227
+ """Set immutability to is_immutable and recursively apply the setting
228
+ to all nested CfgNodes.
229
+ """
230
+ self.__dict__[CfgNode.IMMUTABLE] = is_immutable
231
+ # Recursively set immutable state
232
+ for v in self.__dict__.values():
233
+ if isinstance(v, CfgNode):
234
+ v._immutable(is_immutable)
235
+ for v in self.values():
236
+ if isinstance(v, CfgNode):
237
+ v._immutable(is_immutable)
238
+
239
+ def clone(self):
240
+ """Recursively copy this CfgNode."""
241
+ return copy.deepcopy(self)
242
+
243
+ def register_deprecated_key(self, key):
244
+ """Register key (e.g. `FOO.BAR`) as a deprecated option. When merging deprecated
245
+ keys, a warning is generated and the key is ignored.
246
+ """
247
+ _assert_with_logging(
248
+ key not in self.__dict__[CfgNode.DEPRECATED_KEYS],
249
+ "key {} is already registered as a deprecated key".format(key),
250
+ )
251
+ self.__dict__[CfgNode.DEPRECATED_KEYS].add(key)
252
+
253
+ def register_renamed_key(self, old_name, new_name, message=None):
254
+ """Register a key as having been renamed from `old_name` to `new_name`.
255
+ When merging a renamed key, an exception is thrown alerting the user to
256
+ the fact that the key has been renamed.
257
+ """
258
+ _assert_with_logging(
259
+ old_name not in self.__dict__[CfgNode.RENAMED_KEYS],
260
+ "key {} is already registered as a renamed cfg key".format(old_name),
261
+ )
262
+ value = new_name
263
+ if message:
264
+ value = (new_name, message)
265
+ self.__dict__[CfgNode.RENAMED_KEYS][old_name] = value
266
+
267
+ def key_is_deprecated(self, full_key):
268
+ """Test if a key is deprecated."""
269
+ if full_key in self.__dict__[CfgNode.DEPRECATED_KEYS]:
270
+ logger.warning("Deprecated config key (ignoring): {}".format(full_key))
271
+ return True
272
+ return False
273
+
274
+ def key_is_renamed(self, full_key):
275
+ """Test if a key is renamed."""
276
+ return full_key in self.__dict__[CfgNode.RENAMED_KEYS]
277
+
278
+ def raise_key_rename_error(self, full_key):
279
+ new_key = self.__dict__[CfgNode.RENAMED_KEYS][full_key]
280
+ if isinstance(new_key, tuple):
281
+ msg = " Note: " + new_key[1]
282
+ new_key = new_key[0]
283
+ else:
284
+ msg = ""
285
+ raise KeyError(
286
+ "Key {} was renamed to {}; please update your config.{}".format(
287
+ full_key, new_key, msg
288
+ )
289
+ )
290
+
291
+
292
+ def load_cfg(cfg_file_obj_or_str):
293
+ """Load a cfg. Supports loading from:
294
+ - A file object backed by a YAML file
295
+ - A file object backed by a Python source file that exports an attribute
296
+ "cfg" that is either a dict or a CfgNode
297
+ - A string that can be parsed as valid YAML
298
+ """
299
+ _assert_with_logging(
300
+ isinstance(cfg_file_obj_or_str, _FILE_TYPES + (str,)),
301
+ "Expected first argument to be of type {} or {}, but it was {}".format(
302
+ _FILE_TYPES, str, type(cfg_file_obj_or_str)
303
+ ),
304
+ )
305
+ if isinstance(cfg_file_obj_or_str, str):
306
+ return _load_cfg_from_yaml_str(cfg_file_obj_or_str)
307
+ elif isinstance(cfg_file_obj_or_str, _FILE_TYPES):
308
+ return _load_cfg_from_file(cfg_file_obj_or_str)
309
+ else:
310
+ raise NotImplementedError("Impossible to reach here (unless there's a bug)")
311
+
312
+
313
+ def _load_cfg_from_file(file_obj):
314
+ """Load a config from a YAML file or a Python source file."""
315
+ _, file_extension = os.path.splitext(file_obj.name)
316
+ if file_extension in _YAML_EXTS:
317
+ return _load_cfg_from_yaml_str(file_obj.read())
318
+ elif file_extension in _PY_EXTS:
319
+ return _load_cfg_py_source(file_obj.name)
320
+ else:
321
+ raise Exception(
322
+ "Attempt to load from an unsupported file type {}; "
323
+ "only {} are supported".format(file_obj, _YAML_EXTS.union(_PY_EXTS))
324
+ )
325
+
326
+
327
+ def _load_cfg_from_yaml_str(str_obj):
328
+ """Load a config from a YAML string encoding."""
329
+ cfg_as_dict = yaml.safe_load(str_obj)
330
+ return CfgNode(cfg_as_dict)
331
+
332
+
333
+ def _load_cfg_py_source(filename):
334
+ """Load a config from a Python source file."""
335
+ module = _load_module_from_file("yacs.config.override", filename)
336
+ _assert_with_logging(
337
+ hasattr(module, "cfg"),
338
+ "Python module from file {} must have 'cfg' attr".format(filename),
339
+ )
340
+ VALID_ATTR_TYPES = {dict, CfgNode}
341
+ _assert_with_logging(
342
+ type(module.cfg) in VALID_ATTR_TYPES,
343
+ "Imported module 'cfg' attr must be in {} but is {} instead".format(
344
+ VALID_ATTR_TYPES, type(module.cfg)
345
+ ),
346
+ )
347
+ if type(module.cfg) is dict:
348
+ return CfgNode(module.cfg)
349
+ else:
350
+ return module.cfg
351
+
352
+
353
+ def _to_dict(cfg_node):
354
+ """Recursively convert all CfgNode objects to dict objects."""
355
+
356
+ def convert_to_dict(cfg_node, key_list):
357
+ if not isinstance(cfg_node, CfgNode):
358
+ _assert_with_logging(
359
+ _valid_type(cfg_node),
360
+ "Key {} with value {} is not a valid type; valid types: {}".format(
361
+ ".".join(key_list), type(cfg_node), _VALID_TYPES
362
+ ),
363
+ )
364
+ return cfg_node
365
+ else:
366
+ cfg_dict = dict(cfg_node)
367
+ for k, v in cfg_dict.items():
368
+ cfg_dict[k] = convert_to_dict(v, key_list + [k])
369
+ return cfg_dict
370
+
371
+ return convert_to_dict(cfg_node, [])
372
+
373
+
374
+ def _valid_type(value, allow_cfg_node=False):
375
+ return (type(value) in _VALID_TYPES) or (allow_cfg_node and type(value) == CfgNode)
376
+
377
+
378
+ def _merge_a_into_b(a, b, root, key_list):
379
+ """Merge config dictionary a into config dictionary b, clobbering the
380
+ options in b whenever they are also specified in a.
381
+ """
382
+ _assert_with_logging(
383
+ isinstance(a, CfgNode),
384
+ "`a` (cur type {}) must be an instance of {}".format(type(a), CfgNode),
385
+ )
386
+ _assert_with_logging(
387
+ isinstance(b, CfgNode),
388
+ "`b` (cur type {}) must be an instance of {}".format(type(b), CfgNode),
389
+ )
390
+
391
+ for k, v_ in a.items():
392
+ full_key = ".".join(key_list + [k])
393
+ # a must specify keys that are in b
394
+ if k not in b:
395
+ if root.key_is_deprecated(full_key):
396
+ continue
397
+ elif root.key_is_renamed(full_key):
398
+ root.raise_key_rename_error(full_key)
399
+ else:
400
+ v = copy.deepcopy(v_)
401
+ v = _decode_cfg_value(v)
402
+ b.update({k: v})
403
+ else:
404
+ v = copy.deepcopy(v_)
405
+ v = _decode_cfg_value(v)
406
+ v = _check_and_coerce_cfg_value_type(v, b[k], k, full_key)
407
+
408
+ # Recursively merge dicts
409
+ if isinstance(v, CfgNode):
410
+ try:
411
+ _merge_a_into_b(v, b[k], root, key_list + [k])
412
+ except BaseException:
413
+ raise
414
+ else:
415
+ b[k] = v
416
+
417
+
418
+ def _decode_cfg_value(v):
419
+ """Decodes a raw config value (e.g., from a yaml config files or command
420
+ line argument) into a Python object.
421
+ """
422
+ # Configs parsed from raw yaml will contain dictionary keys that need to be
423
+ # converted to CfgNode objects
424
+ if isinstance(v, dict):
425
+ return CfgNode(v)
426
+ # All remaining processing is only applied to strings
427
+ if not isinstance(v, str):
428
+ return v
429
+ # Try to interpret `v` as a:
430
+ # string, number, tuple, list, dict, boolean, or None
431
+ try:
432
+ v = literal_eval(v)
433
+ # The following two excepts allow v to pass through when it represents a
434
+ # string.
435
+ #
436
+ # Longer explanation:
437
+ # The type of v is always a string (before calling literal_eval), but
438
+ # sometimes it *represents* a string and other times a data structure, like
439
+ # a list. In the case that v represents a string, what we got back from the
440
+ # yaml parser is 'foo' *without quotes* (so, not '"foo"'). literal_eval is
441
+ # ok with '"foo"', but will raise a ValueError if given 'foo'. In other
442
+ # cases, like paths (v = 'foo/bar' and not v = '"foo/bar"'), literal_eval
443
+ # will raise a SyntaxError.
444
+ except ValueError:
445
+ pass
446
+ except SyntaxError:
447
+ pass
448
+ return v
449
+
450
+
451
+ def _check_and_coerce_cfg_value_type(replacement, original, key, full_key):
452
+ """Checks that `replacement`, which is intended to replace `original`, is of
453
+ the right type. The type is correct if it matches exactly or is one of a few
454
+ cases in which the type can be easily coerced.
455
+ """
456
+ original_type = type(original)
457
+ replacement_type = type(replacement)
458
+
459
+ # The types must match (with some exceptions)
460
+ if replacement_type == original_type:
461
+ return replacement
462
+
463
+ # Cast replacement from from_type to to_type if the replacement and original
464
+ # types match from_type and to_type
465
+ def conditional_cast(from_type, to_type):
466
+ if replacement_type == from_type and original_type == to_type:
467
+ return True, to_type(replacement)
468
+ else:
469
+ return False, None
470
+
471
+ # Conditionally casts
472
+ # list <-> tuple
473
+ casts = [(tuple, list), (list, tuple)]
474
+ # For py2: allow converting from str (bytes) to a unicode string
475
+ try:
476
+ casts.append((str, unicode)) # noqa: F821
477
+ except Exception:
478
+ pass
479
+
480
+ for (from_type, to_type) in casts:
481
+ converted, converted_value = conditional_cast(from_type, to_type)
482
+ if converted:
483
+ return converted_value
484
+
485
+ raise ValueError(
486
+ "Type mismatch ({} vs. {}) with values ({} vs. {}) for config "
487
+ "key: {}".format(
488
+ original_type, replacement_type, original, replacement, full_key
489
+ )
490
+ )
491
+
492
+
493
+ def _assert_with_logging(cond, msg):
494
+ if not cond:
495
+ logger.debug(msg)
496
+ assert cond, msg
497
+
498
+
499
+ def _load_module_from_file(name, filename):
500
+ if _PY2:
501
+ module = imp.load_source(name, filename)
502
+ else:
503
+ spec = importlib.util.spec_from_file_location(name, filename)
504
+ module = importlib.util.module_from_spec(spec)
505
+ spec.loader.exec_module(module)
506
+ return module
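A minimal usage sketch of the CfgNode API defined in config/yacs.py above (illustrative only, not part of the commit); the import path and the commented-out YAML filename are assumptions:

# Sketch: building, overriding, and freezing a config with the CfgNode class above.
from config.yacs import CfgNode as CN    # assumed import path within this repo

cfg = CN()
cfg.TRAIN = CN()
cfg.TRAIN.LR = 0.001                     # leaf values must be one of _VALID_TYPES
cfg.TRAIN.BATCH_SIZE = 8

# cfg.merge_from_file("configs/experiment.yaml")   # hypothetical YAML override
cfg.merge_from_list(["TRAIN.LR", 0.01])            # command-line style override of an existing key
cfg.freeze()                                       # subsequent attribute writes raise AttributeError
print(cfg.dump())                                  # serialized via yaml.safe_dump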
demo.py ADDED
@@ -0,0 +1,206 @@
1
+ import os
2
+ import sys
3
+ import argparse
4
+ from PIL import Image
5
+ project_root = os.path.dirname(os.path.abspath(__file__))
6
+ try:
7
+ sys.path.append(os.path.join(project_root, "submodules/MoGe"))
8
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
9
+ except Exception:
10
+ print("Warning: MoGe not found, motion transfer will not be applied")
11
+
12
+ import torch
13
+ import numpy as np
14
+ from PIL import Image
15
+ import torchvision.transforms as transforms
16
+ from moviepy.editor import VideoFileClip
17
+ from diffusers.utils import load_image, load_video
18
+
19
+ from models.pipelines import DiffusionAsShaderPipeline, FirstFrameRepainter, CameraMotionGenerator, ObjectMotionGenerator
20
+ from submodules.MoGe.moge.model import MoGeModel
21
+
22
+ def load_media(media_path, max_frames=49, transform=None):
23
+ """Load video or image frames and convert to tensor
24
+
25
+ Args:
26
+ media_path (str): Path to video or image file
27
+ max_frames (int): Maximum number of frames to load
28
+ transform (callable): Transform to apply to frames
29
+
30
+ Returns:
31
+ Tuple[torch.Tensor, float, bool]: Video tensor [T,C,H,W], FPS, and whether the input was a video
32
+ """
33
+ if transform is None:
34
+ transform = transforms.Compose([
35
+ transforms.Resize((480, 720)),
36
+ transforms.ToTensor()
37
+ ])
38
+
39
+ # Determine if input is video or image based on extension
40
+ ext = os.path.splitext(media_path)[1].lower()
41
+ is_video = ext in ['.mp4', '.avi', '.mov']
42
+
43
+ if is_video:
44
+ frames = load_video(media_path)
45
+ fps = len(frames) / VideoFileClip(media_path).duration
46
+ else:
47
+ # Handle image as single frame
48
+ image = load_image(media_path)
49
+ frames = [image]
50
+ fps = 8 # Default fps for images
51
+
52
+ # Ensure we have exactly max_frames
53
+ if len(frames) > max_frames:
54
+ frames = frames[:max_frames]
55
+ elif len(frames) < max_frames:
56
+ last_frame = frames[-1]
57
+ while len(frames) < max_frames:
58
+ frames.append(last_frame.copy())
59
+
60
+ # Convert frames to tensor
61
+ video_tensor = torch.stack([transform(frame) for frame in frames])
62
+
63
+ return video_tensor, fps, is_video
64
+
65
+ if __name__ == "__main__":
66
+ parser = argparse.ArgumentParser()
67
+ parser.add_argument('--input_path', type=str, default=None, help='Path to input video/image')
68
+ parser.add_argument('--prompt', type=str, required=True, help='Repaint prompt')
69
+ parser.add_argument('--output_dir', type=str, default='outputs', help='Output directory')
70
+ parser.add_argument('--gpu', type=int, default=0, help='GPU device ID')
71
+ parser.add_argument('--checkpoint_path', type=str, default="EXCAI/Diffusion-As-Shader", help='Path to model checkpoint')
72
+ parser.add_argument('--depth_path', type=str, default=None, help='Path to depth image')
73
+ parser.add_argument('--tracking_path', type=str, default=None, help='Path to tracking video; if provided, camera motion and object manipulation will not be applied')
74
+ parser.add_argument('--repaint', type=str, default=None,
75
+ help='Path to a repainted image, or "true" to perform repainting; if not provided, the original frame is used')
76
+ parser.add_argument('--camera_motion', type=str, default=None,
77
+ help='Camera motion mode: "trans <dx> <dy> <dz>" or "rot <axis> <angle>" or "spiral <radius>"')
78
+ parser.add_argument('--object_motion', type=str, default=None, help='Object motion mode: up/down/left/right')
79
+ parser.add_argument('--object_mask', type=str, default=None, help='Path to object mask image (binary image)')
80
+ parser.add_argument('--tracking_method', type=str, default='spatracker', choices=['spatracker', 'moge'],
81
+ help='Tracking method to use (spatracker or moge)')
82
+ args = parser.parse_args()
83
+
84
+ # Load input video/image
85
+ video_tensor, fps, is_video = load_media(args.input_path)
86
+ if not is_video:
87
+ args.tracking_method = "moge"
88
+ print("Image input detected, using MoGe for tracking video generation.")
89
+
90
+ # Initialize pipeline
91
+ das = DiffusionAsShaderPipeline(gpu_id=args.gpu, output_dir=args.output_dir)
92
+ if args.tracking_method == "moge" and args.tracking_path is None:
93
+ moge = MoGeModel.from_pretrained("Ruicheng/moge-vitl").to(das.device)
94
+
95
+ # Repaint first frame if requested
96
+ repaint_img_tensor = None
97
+ if args.repaint:
98
+ if args.repaint.lower() == "true":
99
+ repainter = FirstFrameRepainter(gpu_id=args.gpu, output_dir=args.output_dir)
100
+ repaint_img_tensor = repainter.repaint(
101
+ video_tensor[0],
102
+ prompt=args.prompt,
103
+ depth_path=args.depth_path
104
+ )
105
+ else:
106
+ repaint_img_tensor, _, _ = load_media(args.repaint)
107
+ repaint_img_tensor = repaint_img_tensor[0] # Take first frame
108
+
109
+ # Generate tracking if not provided
110
+ tracking_tensor = None
111
+ pred_tracks = None
112
+ cam_motion = CameraMotionGenerator(args.camera_motion)
113
+
114
+ if args.tracking_path:
115
+ tracking_tensor, _, _ = load_media(args.tracking_path)
116
+
117
+ elif args.tracking_method == "moge":
118
+ # Use the first frame from previously loaded video_tensor
119
+ infer_result = moge.infer(video_tensor[0].to(das.device)) # [C, H, W] in range [0,1]
120
+ H, W = infer_result["points"].shape[0:2]
121
+ pred_tracks = infer_result["points"].unsqueeze(0).repeat(49, 1, 1, 1) #[T, H, W, 3]
122
+ cam_motion.set_intr(infer_result["intrinsics"])
123
+
124
+ # Apply object motion if specified
125
+ if args.object_motion:
126
+ if args.object_mask is None:
127
+ raise ValueError("Object motion specified but no mask provided. Please provide a mask image with --object_mask")
128
+
129
+ # Load mask image
130
+ mask_image = Image.open(args.object_mask).convert('L') # Convert to grayscale
131
+ mask_image = transforms.Resize((480, 720))(mask_image) # Resize to match video size
132
+ # Convert to binary mask
133
+ mask = torch.from_numpy(np.array(mask_image) > 127) # Threshold at 127
134
+
135
+ motion_generator = ObjectMotionGenerator(device=das.device)
136
+
137
+ pred_tracks = motion_generator.apply_motion(
138
+ pred_tracks=pred_tracks,
139
+ mask=mask,
140
+ motion_type=args.object_motion,
141
+ distance=50,
142
+ num_frames=49,
143
+ tracking_method="moge"
144
+ )
145
+ print("Object motion applied")
146
+
147
+ # Apply camera motion if specified
148
+ if args.camera_motion:
149
+ poses = cam_motion.get_default_motion() # shape: [49, 4, 4]
150
+ print("Camera motion applied")
151
+ else:
152
+ # no poses
153
+ poses = torch.eye(4).unsqueeze(0).repeat(49, 1, 1)
154
+ # Convert pred_tracks into screen coordinates
155
+ pred_tracks_flatten = pred_tracks.reshape(video_tensor.shape[0], H*W, 3)
156
+ pred_tracks = cam_motion.w2s(pred_tracks_flatten, poses).reshape([video_tensor.shape[0], H, W, 3]) # [T, H, W, 3]
157
+ _, tracking_tensor = das.visualize_tracking_moge(
158
+ pred_tracks.cpu().numpy(),
159
+ infer_result["mask"].cpu().numpy()
160
+ )
161
+ print('Tracking video exported via MoGe.')
162
+
163
+ else:
164
+ # Generate tracking points
165
+ pred_tracks, pred_visibility, T_Firsts = das.generate_tracking_spatracker(video_tensor)
166
+
167
+ # Apply camera motion if specified
168
+ if args.camera_motion:
169
+ poses = cam_motion.get_default_motion() # shape: [49, 4, 4]
170
+ pred_tracks = cam_motion.apply_motion_on_pts(pred_tracks, poses)
171
+ print("Camera motion applied")
172
+
173
+ # Apply object motion if specified
174
+ if args.object_motion:
175
+ if args.object_mask is None:
176
+ raise ValueError("Object motion specified but no mask provided. Please provide a mask image with --object_mask")
177
+
178
+ # Load mask image
179
+ mask_image = Image.open(args.object_mask).convert('L') # Convert to grayscale
180
+ mask_image = transforms.Resize((480, 720))(mask_image) # Resize to match video size
181
+ # Convert to binary mask
182
+ mask = torch.from_numpy(np.array(mask_image) > 127) # Threshold at 127
183
+
184
+ motion_generator = ObjectMotionGenerator(device=das.device)
185
+
186
+ pred_tracks = motion_generator.apply_motion(
187
+ pred_tracks=pred_tracks.squeeze(),
188
+ mask=mask,
189
+ motion_type=args.object_motion,
190
+ distance=50,
191
+ num_frames=49,
192
+ tracking_method="spatracker"
193
+ ).unsqueeze(0)
194
+ print(f"Object motion '{args.object_motion}' applied using mask from {args.object_mask}")
195
+
196
+ # Generate tracking tensor from modified tracks
197
+ _, tracking_tensor = das.visualize_tracking_spatracker(video_tensor, pred_tracks, pred_visibility, T_Firsts)
198
+
199
+ das.apply_tracking(
200
+ video_tensor=video_tensor,
201
+ fps=8,
202
+ tracking_tensor=tracking_tensor,
203
+ img_cond_tensor=repaint_img_tensor,
204
+ prompt=args.prompt,
205
+ checkpoint_path=args.checkpoint_path
206
+ )
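For orientation, a condensed programmatic sketch of the SpaTracker branch that demo.py above walks through (illustrative only, not part of the commit); it reuses only calls that appear in the script, and the input path and prompt are hypothetical placeholders:

# Sketch: the minimal spatracker path from demo.py, distilled.
from demo import load_media                      # load_media is defined in demo.py above
from models.pipelines import DiffusionAsShaderPipeline

video_tensor, fps, is_video = load_media("assets/example.mp4")   # hypothetical input path
das = DiffusionAsShaderPipeline(gpu_id=0, output_dir="outputs")

# Track points, render the tracking video, then condition generation on it.
pred_tracks, pred_visibility, T_Firsts = das.generate_tracking_spatracker(video_tensor)
_, tracking_tensor = das.visualize_tracking_spatracker(video_tensor, pred_tracks,
                                                       pred_visibility, T_Firsts)
das.apply_tracking(
    video_tensor=video_tensor,
    fps=8,
    tracking_tensor=tracking_tensor,
    img_cond_tensor=None,                        # as in the script when no repaint is requested
    prompt="a placeholder prompt",               # hypothetical prompt
    checkpoint_path="EXCAI/Diffusion-As-Shader", # the script's default checkpoint argument
)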
models/cogvideox_tracking.py ADDED
@@ -0,0 +1,1020 @@
1
+ from typing import Any, Dict, Optional, Tuple, Union, List, Callable
2
+
3
+ import torch, os, math
4
+ from torch import nn
5
+ from PIL import Image
6
+ from tqdm import tqdm
7
+
8
+ from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
9
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
10
+ from diffusers.models.transformers.cogvideox_transformer_3d import CogVideoXBlock, CogVideoXTransformer3DModel
11
+
12
+ from diffusers.pipelines.cogvideo.pipeline_cogvideox import CogVideoXPipeline, CogVideoXPipelineOutput
13
+ from diffusers.pipelines.cogvideo.pipeline_cogvideox_image2video import CogVideoXImageToVideoPipeline
14
+ from diffusers.pipelines.cogvideo.pipeline_cogvideox_video2video import CogVideoXVideoToVideoPipeline
15
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
16
+ from diffusers.pipelines.cogvideo.pipeline_cogvideox import retrieve_timesteps
17
+ from transformers import T5EncoderModel, T5Tokenizer
18
+ from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
19
+ from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
20
+ from diffusers.pipelines import DiffusionPipeline
21
+ from diffusers.models.modeling_utils import ModelMixin
22
+
23
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
24
+
25
+ class CogVideoXTransformer3DModelTracking(CogVideoXTransformer3DModel, ModelMixin):
26
+ """
27
+ Add tracking maps to the CogVideoX transformer model.
28
+
29
+ Parameters:
30
+ num_tracking_blocks (`int`, defaults to `18`):
31
+ The number of tracking blocks to use. Must be less than or equal to num_layers.
32
+ """
33
+
34
+ def __init__(
35
+ self,
36
+ num_tracking_blocks: Optional[int] = 18,
37
+ num_attention_heads: int = 30,
38
+ attention_head_dim: int = 64,
39
+ in_channels: int = 16,
40
+ out_channels: Optional[int] = 16,
41
+ flip_sin_to_cos: bool = True,
42
+ freq_shift: int = 0,
43
+ time_embed_dim: int = 512,
44
+ text_embed_dim: int = 4096,
45
+ num_layers: int = 30,
46
+ dropout: float = 0.0,
47
+ attention_bias: bool = True,
48
+ sample_width: int = 90,
49
+ sample_height: int = 60,
50
+ sample_frames: int = 49,
51
+ patch_size: int = 2,
52
+ temporal_compression_ratio: int = 4,
53
+ max_text_seq_length: int = 226,
54
+ activation_fn: str = "gelu-approximate",
55
+ timestep_activation_fn: str = "silu",
56
+ norm_elementwise_affine: bool = True,
57
+ norm_eps: float = 1e-5,
58
+ spatial_interpolation_scale: float = 1.875,
59
+ temporal_interpolation_scale: float = 1.0,
60
+ use_rotary_positional_embeddings: bool = False,
61
+ use_learned_positional_embeddings: bool = False,
62
+ **kwargs
63
+ ):
64
+ super().__init__(
65
+ num_attention_heads=num_attention_heads,
66
+ attention_head_dim=attention_head_dim,
67
+ in_channels=in_channels,
68
+ out_channels=out_channels,
69
+ flip_sin_to_cos=flip_sin_to_cos,
70
+ freq_shift=freq_shift,
71
+ time_embed_dim=time_embed_dim,
72
+ text_embed_dim=text_embed_dim,
73
+ num_layers=num_layers,
74
+ dropout=dropout,
75
+ attention_bias=attention_bias,
76
+ sample_width=sample_width,
77
+ sample_height=sample_height,
78
+ sample_frames=sample_frames,
79
+ patch_size=patch_size,
80
+ temporal_compression_ratio=temporal_compression_ratio,
81
+ max_text_seq_length=max_text_seq_length,
82
+ activation_fn=activation_fn,
83
+ timestep_activation_fn=timestep_activation_fn,
84
+ norm_elementwise_affine=norm_elementwise_affine,
85
+ norm_eps=norm_eps,
86
+ spatial_interpolation_scale=spatial_interpolation_scale,
87
+ temporal_interpolation_scale=temporal_interpolation_scale,
88
+ use_rotary_positional_embeddings=use_rotary_positional_embeddings,
89
+ use_learned_positional_embeddings=use_learned_positional_embeddings,
90
+ **kwargs
91
+ )
92
+
93
+ inner_dim = num_attention_heads * attention_head_dim
94
+ self.num_tracking_blocks = num_tracking_blocks
95
+
96
+ # Ensure num_tracking_blocks is not greater than num_layers
97
+ if num_tracking_blocks > num_layers:
98
+ raise ValueError("num_tracking_blocks must be less than or equal to num_layers")
99
+
100
+ # Create linear layers for combining hidden states and tracking maps
101
+ self.combine_linears = nn.ModuleList(
102
+ [nn.Linear(inner_dim, inner_dim) for _ in range(num_tracking_blocks)]
103
+ )
104
+
105
+ # Initialize weights of combine_linears to zero
106
+ for linear in self.combine_linears:
107
+ linear.weight.data.zero_()
108
+ linear.bias.data.zero_()
109
+
110
+ # Create transformer blocks for processing tracking maps
111
+ self.transformer_blocks_copy = nn.ModuleList(
112
+ [
113
+ CogVideoXBlock(
114
+ dim=inner_dim,
115
+ num_attention_heads=self.config.num_attention_heads,
116
+ attention_head_dim=self.config.attention_head_dim,
117
+ time_embed_dim=self.config.time_embed_dim,
118
+ dropout=self.config.dropout,
119
+ activation_fn=self.config.activation_fn,
120
+ attention_bias=self.config.attention_bias,
121
+ norm_elementwise_affine=self.config.norm_elementwise_affine,
122
+ norm_eps=self.config.norm_eps,
123
+ )
124
+ for _ in range(num_tracking_blocks)
125
+ ]
126
+ )
127
+
128
+ # For initial combination of hidden states and tracking maps
129
+ self.initial_combine_linear = nn.Linear(inner_dim, inner_dim)
130
+ self.initial_combine_linear.weight.data.zero_()
131
+ self.initial_combine_linear.bias.data.zero_()
132
+
133
+ # Freeze all parameters
134
+ for param in self.parameters():
135
+ param.requires_grad = False
136
+
137
+ # Unfreeze parameters that need to be trained
138
+ for linear in self.combine_linears:
139
+ for param in linear.parameters():
140
+ param.requires_grad = True
141
+
142
+ for block in self.transformer_blocks_copy:
143
+ for param in block.parameters():
144
+ param.requires_grad = True
145
+
146
+ for param in self.initial_combine_linear.parameters():
147
+ param.requires_grad = True
148
+
149
+ def forward(
150
+ self,
151
+ hidden_states: torch.Tensor,
152
+ encoder_hidden_states: torch.Tensor,
153
+ tracking_maps: torch.Tensor,
154
+ timestep: Union[int, float, torch.LongTensor],
155
+ timestep_cond: Optional[torch.Tensor] = None,
156
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
157
+ attention_kwargs: Optional[Dict[str, Any]] = None,
158
+ return_dict: bool = True,
159
+ ):
160
+ if attention_kwargs is not None:
161
+ attention_kwargs = attention_kwargs.copy()
162
+ lora_scale = attention_kwargs.pop("scale", 1.0)
163
+ else:
164
+ lora_scale = 1.0
165
+
166
+ if USE_PEFT_BACKEND:
167
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
168
+ scale_lora_layers(self, lora_scale)
169
+ else:
170
+ if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
171
+ logger.warning(
172
+ "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
173
+ )
174
+
175
+ batch_size, num_frames, channels, height, width = hidden_states.shape
176
+
177
+ # 1. Time embedding
178
+ timesteps = timestep
179
+ t_emb = self.time_proj(timesteps)
180
+
181
+ # timesteps does not contain any weights and will always return f32 tensors
182
+ # but time_embedding might actually be running in fp16. so we need to cast here.
183
+ # there might be better ways to encapsulate this.
184
+ t_emb = t_emb.to(dtype=hidden_states.dtype)
185
+ emb = self.time_embedding(t_emb, timestep_cond)
186
+
187
+ # 2. Patch embedding
188
+ hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
189
+ hidden_states = self.embedding_dropout(hidden_states)
190
+
191
+ # Process tracking maps
192
+ prompt_embed = encoder_hidden_states.clone()
193
+ tracking_maps_hidden_states = self.patch_embed(prompt_embed, tracking_maps)
194
+ tracking_maps_hidden_states = self.embedding_dropout(tracking_maps_hidden_states)
195
+ del prompt_embed
196
+
197
+ text_seq_length = encoder_hidden_states.shape[1]
198
+ encoder_hidden_states = hidden_states[:, :text_seq_length]
199
+ hidden_states = hidden_states[:, text_seq_length:]
200
+ tracking_maps = tracking_maps_hidden_states[:, text_seq_length:]
201
+
202
+ # Combine hidden states and tracking maps initially
203
+ combined = hidden_states + tracking_maps
204
+ tracking_maps = self.initial_combine_linear(combined)
205
+
206
+ # Process transformer blocks
207
+ for i in range(len(self.transformer_blocks)):
208
+ if self.training and self.gradient_checkpointing:
209
+ # Gradient checkpointing logic for hidden states
210
+ def create_custom_forward(module):
211
+ def custom_forward(*inputs):
212
+ return module(*inputs)
213
+ return custom_forward
214
+
215
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
216
+ hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
217
+ create_custom_forward(self.transformer_blocks[i]),
218
+ hidden_states,
219
+ encoder_hidden_states,
220
+ emb,
221
+ image_rotary_emb,
222
+ **ckpt_kwargs,
223
+ )
224
+ else:
225
+ hidden_states, encoder_hidden_states = self.transformer_blocks[i](
226
+ hidden_states=hidden_states,
227
+ encoder_hidden_states=encoder_hidden_states,
228
+ temb=emb,
229
+ image_rotary_emb=image_rotary_emb,
230
+ )
231
+
232
+ if i < len(self.transformer_blocks_copy):
233
+ if self.training and self.gradient_checkpointing:
234
+ # Gradient checkpointing logic for tracking maps
235
+ tracking_maps, _ = torch.utils.checkpoint.checkpoint(
236
+ create_custom_forward(self.transformer_blocks_copy[i]),
237
+ tracking_maps,
238
+ encoder_hidden_states,
239
+ emb,
240
+ image_rotary_emb,
241
+ **ckpt_kwargs,
242
+ )
243
+ else:
244
+ tracking_maps, _ = self.transformer_blocks_copy[i](
245
+ hidden_states=tracking_maps,
246
+ encoder_hidden_states=encoder_hidden_states,
247
+ temb=emb,
248
+ image_rotary_emb=image_rotary_emb,
249
+ )
250
+
251
+ # Combine hidden states and tracking maps
252
+ tracking_maps = self.combine_linears[i](tracking_maps)
253
+ hidden_states = hidden_states + tracking_maps
254
+
255
+
256
+ if not self.config.use_rotary_positional_embeddings:
257
+ # CogVideoX-2B
258
+ hidden_states = self.norm_final(hidden_states)
259
+ else:
260
+ # CogVideoX-5B
261
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
262
+ hidden_states = self.norm_final(hidden_states)
263
+ hidden_states = hidden_states[:, text_seq_length:]
264
+
265
+ # 4. Final block
266
+ hidden_states = self.norm_out(hidden_states, temb=emb)
267
+ hidden_states = self.proj_out(hidden_states)
268
+
269
+ # 5. Unpatchify
270
+ # Note: we use `-1` instead of `channels`:
271
+ # - It is okay to use `channels` for CogVideoX-2b and CogVideoX-5b (the number of input channels equals the number of output channels)
272
+ # - However, CogVideoX-5b-I2V also takes concatenated input image latents (the number of input channels is twice the number of output channels)
273
+ p = self.config.patch_size
274
+ output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
275
+ output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
276
+
277
+ if USE_PEFT_BACKEND:
278
+ # remove `lora_scale` from each PEFT layer
279
+ unscale_lora_layers(self, lora_scale)
280
+
281
+ if not return_dict:
282
+ return (output,)
283
+ return Transformer2DModelOutput(sample=output)
284
+
285
+ @classmethod
286
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
287
+ try:
288
+ model = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
289
+ print("Loaded DiffusionAsShader checkpoint directly.")
290
+
291
+ for param in model.parameters():
292
+ param.requires_grad = False
293
+
294
+ for linear in model.combine_linears:
295
+ for param in linear.parameters():
296
+ param.requires_grad = True
297
+
298
+ for block in model.transformer_blocks_copy:
299
+ for param in block.parameters():
300
+ param.requires_grad = True
301
+
302
+ for param in model.initial_combine_linear.parameters():
303
+ param.requires_grad = True
304
+
305
+ return model
306
+
307
+ except Exception as e:
308
+ print(f"Failed to load as DiffusionAsShader: {e}")
309
+ print("Attempting to load as CogVideoXTransformer3DModel and convert...")
310
+
311
+ base_model = CogVideoXTransformer3DModel.from_pretrained(pretrained_model_name_or_path, **kwargs)
312
+
313
+ config = dict(base_model.config)
314
+ config["num_tracking_blocks"] = kwargs.pop("num_tracking_blocks", 18)
315
+
316
+ model = cls(**config)
317
+ model.load_state_dict(base_model.state_dict(), strict=False)
318
+
319
+ model.initial_combine_linear.weight.data.zero_()
320
+ model.initial_combine_linear.bias.data.zero_()
321
+
322
+ for linear in model.combine_linears:
323
+ linear.weight.data.zero_()
324
+ linear.bias.data.zero_()
325
+
326
+ for i in range(model.num_tracking_blocks):
327
+ model.transformer_blocks_copy[i].load_state_dict(model.transformer_blocks[i].state_dict())
328
+
329
+
330
+ for param in model.parameters():
331
+ param.requires_grad = False
332
+
333
+ for linear in model.combine_linears:
334
+ for param in linear.parameters():
335
+ param.requires_grad = True
336
+
337
+ for block in model.transformer_blocks_copy:
338
+ for param in block.parameters():
339
+ param.requires_grad = True
340
+
341
+ for param in model.initial_combine_linear.parameters():
342
+ param.requires_grad = True
343
+
344
+ return model
345
+
346
+ def save_pretrained(
347
+ self,
348
+ save_directory: Union[str, os.PathLike],
349
+ is_main_process: bool = True,
350
+ save_function: Optional[Callable] = None,
351
+ safe_serialization: bool = True,
352
+ variant: Optional[str] = None,
353
+ max_shard_size: Union[int, str] = "5GB",
354
+ push_to_hub: bool = False,
355
+ **kwargs,
356
+ ):
357
+ super().save_pretrained(
358
+ save_directory,
359
+ is_main_process=is_main_process,
360
+ save_function=save_function,
361
+ safe_serialization=safe_serialization,
362
+ variant=variant,
363
+ max_shard_size=max_shard_size,
364
+ push_to_hub=push_to_hub,
365
+ **kwargs,
366
+ )
367
+
368
+ if is_main_process:
369
+ config_dict = dict(self.config)
370
+ config_dict.pop("_name_or_path", None)
371
+ config_dict.pop("_use_default_values", None)
372
+ config_dict["_class_name"] = "CogVideoXTransformer3DModelTracking"
373
+ config_dict["num_tracking_blocks"] = self.num_tracking_blocks
374
+
375
+ os.makedirs(save_directory, exist_ok=True)
376
+ with open(os.path.join(save_directory, "config.json"), "w", encoding="utf-8") as f:
377
+ import json
378
+ json.dump(config_dict, f, indent=2)
379
+
380
+ class CogVideoXPipelineTracking(CogVideoXPipeline, DiffusionPipeline):
381
+
382
+ def __init__(
383
+ self,
384
+ tokenizer: T5Tokenizer,
385
+ text_encoder: T5EncoderModel,
386
+ vae: AutoencoderKLCogVideoX,
387
+ transformer: CogVideoXTransformer3DModelTracking,
388
+ scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
389
+ ):
390
+ super().__init__(tokenizer, text_encoder, vae, transformer, scheduler)
391
+
392
+ if not isinstance(self.transformer, CogVideoXTransformer3DModelTracking):
393
+ raise ValueError("The transformer in this pipeline must be of type CogVideoXTransformer3DModelTracking")
394
+
395
+ @torch.no_grad()
396
+ def __call__(
397
+ self,
398
+ prompt: Optional[Union[str, List[str]]] = None,
399
+ negative_prompt: Optional[Union[str, List[str]]] = None,
400
+ height: int = 480,
401
+ width: int = 720,
402
+ num_frames: int = 49,
403
+ num_inference_steps: int = 50,
404
+ timesteps: Optional[List[int]] = None,
405
+ guidance_scale: float = 6,
406
+ use_dynamic_cfg: bool = False,
407
+ num_videos_per_prompt: int = 1,
408
+ eta: float = 0.0,
409
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
410
+ latents: Optional[torch.FloatTensor] = None,
411
+ prompt_embeds: Optional[torch.FloatTensor] = None,
412
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
413
+ output_type: str = "pil",
414
+ return_dict: bool = True,
415
+ attention_kwargs: Optional[Dict[str, Any]] = None,
416
+ callback_on_step_end: Optional[
417
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
418
+ ] = None,
419
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
420
+ max_sequence_length: int = 226,
421
+ tracking_maps: Optional[torch.Tensor] = None,
422
+ ) -> Union[CogVideoXPipelineOutput, Tuple]:
423
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
424
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
425
+
426
+ num_videos_per_prompt = 1
427
+
428
+ self.check_inputs(
429
+ prompt,
430
+ height,
431
+ width,
432
+ negative_prompt,
433
+ callback_on_step_end_tensor_inputs,
434
+ prompt_embeds,
435
+ negative_prompt_embeds,
436
+ )
437
+ self._guidance_scale = guidance_scale
438
+ self._attention_kwargs = attention_kwargs
439
+ self._interrupt = False
440
+
441
+ if prompt is not None and isinstance(prompt, str):
442
+ batch_size = 1
443
+ elif prompt is not None and isinstance(prompt, list):
444
+ batch_size = len(prompt)
445
+ else:
446
+ batch_size = prompt_embeds.shape[0]
447
+
448
+ device = self._execution_device
449
+
450
+ do_classifier_free_guidance = guidance_scale > 1.0
451
+
452
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
453
+ prompt,
454
+ negative_prompt,
455
+ do_classifier_free_guidance,
456
+ num_videos_per_prompt=num_videos_per_prompt,
457
+ prompt_embeds=prompt_embeds,
458
+ negative_prompt_embeds=negative_prompt_embeds,
459
+ max_sequence_length=max_sequence_length,
460
+ device=device,
461
+ )
462
+ if do_classifier_free_guidance:
463
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
464
+
465
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
466
+ self._num_timesteps = len(timesteps)
467
+
468
+ latent_channels = self.transformer.config.in_channels
469
+ latents = self.prepare_latents(
470
+ batch_size * num_videos_per_prompt,
471
+ latent_channels,
472
+ num_frames,
473
+ height,
474
+ width,
475
+ prompt_embeds.dtype,
476
+ device,
477
+ generator,
478
+ latents,
479
+ )
480
+
481
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
482
+
483
+ image_rotary_emb = (
484
+ self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
485
+ if self.transformer.config.use_rotary_positional_embeddings
486
+ else None
487
+ )
488
+
489
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
490
+
491
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
492
+ old_pred_original_sample = None
493
+ for i, t in enumerate(timesteps):
494
+ if self.interrupt:
495
+ continue
496
+
497
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
498
+ tracking_maps_latent = torch.cat([tracking_maps] * 2) if do_classifier_free_guidance else tracking_maps
499
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
500
+
501
+ timestep = t.expand(latent_model_input.shape[0])
502
+
503
+ noise_pred = self.transformer(
504
+ hidden_states=latent_model_input,
505
+ encoder_hidden_states=prompt_embeds,
506
+ timestep=timestep,
507
+ image_rotary_emb=image_rotary_emb,
508
+ attention_kwargs=attention_kwargs,
509
+ tracking_maps=tracking_maps_latent,
510
+ return_dict=False,
511
+ )[0]
512
+ noise_pred = noise_pred.float()
513
+
514
+ if use_dynamic_cfg:
515
+ self._guidance_scale = 1 + guidance_scale * (
516
+ (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
517
+ )
518
+ if do_classifier_free_guidance:
519
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
520
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
521
+
522
+ if not isinstance(self.scheduler, CogVideoXDPMScheduler):
523
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
524
+ else:
525
+ latents, old_pred_original_sample = self.scheduler.step(
526
+ noise_pred,
527
+ old_pred_original_sample,
528
+ t,
529
+ timesteps[i - 1] if i > 0 else None,
530
+ latents,
531
+ **extra_step_kwargs,
532
+ return_dict=False,
533
+ )
534
+ latents = latents.to(prompt_embeds.dtype)
535
+
536
+ if callback_on_step_end is not None:
537
+ callback_kwargs = {}
538
+ for k in callback_on_step_end_tensor_inputs:
539
+ callback_kwargs[k] = locals()[k]
540
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
541
+
542
+ latents = callback_outputs.pop("latents", latents)
543
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
544
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
545
+
546
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
547
+ progress_bar.update()
548
+
549
+ if not output_type == "latent":
550
+ video = self.decode_latents(latents)
551
+ video = self.video_processor.postprocess_video(video=video, output_type=output_type)
552
+ else:
553
+ video = latents
554
+
555
+ self.maybe_free_model_hooks()
556
+
557
+ if not return_dict:
558
+ return (video,)
559
+ return CogVideoXPipelineOutput(frames=video)
560
+
561
+ class CogVideoXImageToVideoPipelineTracking(CogVideoXImageToVideoPipeline, DiffusionPipeline):
562
+
563
+ def __init__(
564
+ self,
565
+ tokenizer: T5Tokenizer,
566
+ text_encoder: T5EncoderModel,
567
+ vae: AutoencoderKLCogVideoX,
568
+ transformer: CogVideoXTransformer3DModelTracking,
569
+ scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
570
+ ):
571
+ super().__init__(tokenizer, text_encoder, vae, transformer, scheduler)
572
+
573
+ if not isinstance(self.transformer, CogVideoXTransformer3DModelTracking):
574
+ raise ValueError("The transformer in this pipeline must be of type CogVideoXTransformer3DModelTracking")
575
+
576
+ # Print the number of transformer blocks
577
+ print(f"Number of transformer blocks: {len(self.transformer.transformer_blocks)}")
578
+ print(f"Number of tracking transformer blocks: {len(self.transformer.transformer_blocks_copy)}")
579
+ self.transformer = torch.compile(self.transformer)
580
+
581
+ @torch.no_grad()
582
+ def __call__(
583
+ self,
584
+ image: Union[torch.Tensor, Image.Image],
585
+ prompt: Optional[Union[str, List[str]]] = None,
586
+ negative_prompt: Optional[Union[str, List[str]]] = None,
587
+ height: Optional[int] = None,
588
+ width: Optional[int] = None,
589
+ num_frames: int = 49,
590
+ num_inference_steps: int = 50,
591
+ timesteps: Optional[List[int]] = None,
592
+ guidance_scale: float = 6,
593
+ use_dynamic_cfg: bool = False,
594
+ num_videos_per_prompt: int = 1,
595
+ eta: float = 0.0,
596
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
597
+ latents: Optional[torch.FloatTensor] = None,
598
+ prompt_embeds: Optional[torch.FloatTensor] = None,
599
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
600
+ output_type: str = "pil",
601
+ return_dict: bool = True,
602
+ attention_kwargs: Optional[Dict[str, Any]] = None,
603
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
604
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
605
+ max_sequence_length: int = 226,
606
+ tracking_maps: Optional[torch.Tensor] = None,
607
+ tracking_image: Optional[torch.Tensor] = None,
608
+ ) -> Union[CogVideoXPipelineOutput, Tuple]:
609
+ # Most of the implementation remains the same as the parent class
610
+ # We will modify the parts that need to handle tracking_maps
611
+
612
+ # 1. Check inputs and set default values
613
+ self.check_inputs(
614
+ image,
615
+ prompt,
616
+ height,
617
+ width,
618
+ negative_prompt,
619
+ callback_on_step_end_tensor_inputs,
620
+ prompt_embeds,
621
+ negative_prompt_embeds,
622
+ )
623
+ self._guidance_scale = guidance_scale
624
+ self._attention_kwargs = attention_kwargs
625
+ self._interrupt = False
626
+
627
+ if prompt is not None and isinstance(prompt, str):
628
+ batch_size = 1
629
+ elif prompt is not None and isinstance(prompt, list):
630
+ batch_size = len(prompt)
631
+ else:
632
+ batch_size = prompt_embeds.shape[0]
633
+
634
+ device = self._execution_device
635
+
636
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
637
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
638
+ # corresponds to doing no classifier free guidance.
639
+ do_classifier_free_guidance = guidance_scale > 1.0
640
+
641
+ # 3. Encode input prompt
642
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
643
+ prompt=prompt,
644
+ negative_prompt=negative_prompt,
645
+ do_classifier_free_guidance=do_classifier_free_guidance,
646
+ num_videos_per_prompt=num_videos_per_prompt,
647
+ prompt_embeds=prompt_embeds,
648
+ negative_prompt_embeds=negative_prompt_embeds,
649
+ max_sequence_length=max_sequence_length,
650
+ device=device,
651
+ )
652
+ if do_classifier_free_guidance:
653
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
654
+ del negative_prompt_embeds
655
+
656
+ # 4. Prepare timesteps
657
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
658
+ self._num_timesteps = len(timesteps)
659
+
660
+ # 5. Prepare latents
661
+ image = self.video_processor.preprocess(image, height=height, width=width).to(
662
+ device, dtype=prompt_embeds.dtype
663
+ )
664
+
665
+ tracking_image = self.video_processor.preprocess(tracking_image, height=height, width=width).to(
666
+ device, dtype=prompt_embeds.dtype
667
+ )
668
+ if self.transformer.config.in_channels != 16:
669
+ latent_channels = self.transformer.config.in_channels // 2
670
+ else:
671
+ latent_channels = self.transformer.config.in_channels
672
+ latents, image_latents = self.prepare_latents(
673
+ image,
674
+ batch_size * num_videos_per_prompt,
675
+ latent_channels,
676
+ num_frames,
677
+ height,
678
+ width,
679
+ prompt_embeds.dtype,
680
+ device,
681
+ generator,
682
+ latents,
683
+ )
684
+ del image
685
+
686
+ _, tracking_image_latents = self.prepare_latents(
687
+ tracking_image,
688
+ batch_size * num_videos_per_prompt,
689
+ latent_channels,
690
+ num_frames,
691
+ height,
692
+ width,
693
+ prompt_embeds.dtype,
694
+ device,
695
+ generator,
696
+ latents=None,
697
+ )
698
+ del tracking_image
699
+
700
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
701
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
702
+
703
+ # 7. Create rotary embeds if required
704
+ image_rotary_emb = (
705
+ self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
706
+ if self.transformer.config.use_rotary_positional_embeddings
707
+ else None
708
+ )
709
+
710
+ # 8. Denoising loop
711
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
712
+
713
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
714
+ old_pred_original_sample = None
715
+ for i, t in enumerate(timesteps):
716
+ if self.interrupt:
717
+ continue
718
+
719
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
720
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
721
+
722
+ latent_image_input = torch.cat([image_latents] * 2) if do_classifier_free_guidance else image_latents
723
+ latent_model_input = torch.cat([latent_model_input, latent_image_input], dim=2)
724
+ del latent_image_input
725
+
726
+ # Handle tracking maps
727
+ if tracking_maps is not None:
728
+ latents_tracking_image = torch.cat([tracking_image_latents] * 2) if do_classifier_free_guidance else tracking_image_latents
729
+ tracking_maps_input = torch.cat([tracking_maps] * 2) if do_classifier_free_guidance else tracking_maps
730
+ tracking_maps_input = torch.cat([tracking_maps_input, latents_tracking_image], dim=2)
731
+ del latents_tracking_image
732
+ else:
733
+ tracking_maps_input = None
734
+
735
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
736
+ timestep = t.expand(latent_model_input.shape[0])
737
+
738
+ # Predict noise
739
+ self.transformer.to(dtype=latent_model_input.dtype)
740
+ noise_pred = self.transformer(
741
+ hidden_states=latent_model_input,
742
+ encoder_hidden_states=prompt_embeds,
743
+ timestep=timestep,
744
+ image_rotary_emb=image_rotary_emb,
745
+ attention_kwargs=attention_kwargs,
746
+ tracking_maps=tracking_maps_input,
747
+ return_dict=False,
748
+ )[0]
749
+ del latent_model_input
750
+ if tracking_maps_input is not None:
751
+ del tracking_maps_input
752
+ noise_pred = noise_pred.float()
753
+
754
+ # perform guidance
755
+ if use_dynamic_cfg:
756
+ self._guidance_scale = 1 + guidance_scale * (
757
+ (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
758
+ )
759
+ if do_classifier_free_guidance:
760
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
761
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
762
+ del noise_pred_uncond, noise_pred_text
763
+
764
+ # compute the previous noisy sample x_t -> x_t-1
765
+ if not isinstance(self.scheduler, CogVideoXDPMScheduler):
766
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
767
+ else:
768
+ latents, old_pred_original_sample = self.scheduler.step(
769
+ noise_pred,
770
+ old_pred_original_sample,
771
+ t,
772
+ timesteps[i - 1] if i > 0 else None,
773
+ latents,
774
+ **extra_step_kwargs,
775
+ return_dict=False,
776
+ )
777
+ del noise_pred
778
+ latents = latents.to(prompt_embeds.dtype)
779
+
780
+ # call the callback, if provided
781
+ if callback_on_step_end is not None:
782
+ callback_kwargs = {}
783
+ for k in callback_on_step_end_tensor_inputs:
784
+ callback_kwargs[k] = locals()[k]
785
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
786
+
787
+ latents = callback_outputs.pop("latents", latents)
788
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
789
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
790
+
791
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
792
+ progress_bar.update()
793
+
794
+ # 9. Post-processing
795
+ if not output_type == "latent":
796
+ video = self.decode_latents(latents)
797
+ video = self.video_processor.postprocess_video(video=video, output_type=output_type)
798
+ else:
799
+ video = latents
800
+
801
+ # Offload all models
802
+ self.maybe_free_model_hooks()
803
+
804
+ if not return_dict:
805
+ return (video,)
806
+
807
+ return CogVideoXPipelineOutput(frames=video)
808
+
809
+ class CogVideoXVideoToVideoPipelineTracking(CogVideoXVideoToVideoPipeline, DiffusionPipeline):
810
+
811
+ def __init__(
812
+ self,
813
+ tokenizer: T5Tokenizer,
814
+ text_encoder: T5EncoderModel,
815
+ vae: AutoencoderKLCogVideoX,
816
+ transformer: CogVideoXTransformer3DModelTracking,
817
+ scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
818
+ ):
819
+ super().__init__(tokenizer, text_encoder, vae, transformer, scheduler)
820
+
821
+ if not isinstance(self.transformer, CogVideoXTransformer3DModelTracking):
822
+ raise ValueError("The transformer in this pipeline must be of type CogVideoXTransformer3DModelTracking")
823
+
824
+ @torch.no_grad()
825
+ def __call__(
826
+ self,
827
+ video: List[Image.Image] = None,
828
+ prompt: Optional[Union[str, List[str]]] = None,
829
+ negative_prompt: Optional[Union[str, List[str]]] = None,
830
+ height: int = 480,
831
+ width: int = 720,
832
+ num_inference_steps: int = 50,
833
+ timesteps: Optional[List[int]] = None,
834
+ strength: float = 0.8,
835
+ guidance_scale: float = 6,
836
+ use_dynamic_cfg: bool = False,
837
+ num_videos_per_prompt: int = 1,
838
+ eta: float = 0.0,
839
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
840
+ latents: Optional[torch.FloatTensor] = None,
841
+ prompt_embeds: Optional[torch.FloatTensor] = None,
842
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
843
+ output_type: str = "pil",
844
+ return_dict: bool = True,
845
+ attention_kwargs: Optional[Dict[str, Any]] = None,
846
+ callback_on_step_end: Optional[
847
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
848
+ ] = None,
849
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
850
+ max_sequence_length: int = 226,
851
+ tracking_maps: Optional[torch.Tensor] = None,
852
+ ) -> Union[CogVideoXPipelineOutput, Tuple]:
853
+
854
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
855
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
856
+
857
+ num_videos_per_prompt = 1
858
+
859
+ # 1. Check inputs. Raise error if not correct
860
+ self.check_inputs(
861
+ prompt=prompt,
862
+ height=height,
863
+ width=width,
864
+ strength=strength,
865
+ negative_prompt=negative_prompt,
866
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
867
+ video=video,
868
+ latents=latents,
869
+ prompt_embeds=prompt_embeds,
870
+ negative_prompt_embeds=negative_prompt_embeds,
871
+ )
872
+ self._guidance_scale = guidance_scale
873
+ self._attention_kwargs = attention_kwargs
874
+ self._interrupt = False
875
+
876
+ # 2. Default call parameters
877
+ if prompt is not None and isinstance(prompt, str):
878
+ batch_size = 1
879
+ elif prompt is not None and isinstance(prompt, list):
880
+ batch_size = len(prompt)
881
+ else:
882
+ batch_size = prompt_embeds.shape[0]
883
+
884
+ device = self._execution_device
885
+
886
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
887
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
888
+ # corresponds to doing no classifier free guidance.
889
+ do_classifier_free_guidance = guidance_scale > 1.0
890
+
891
+ # 3. Encode input prompt
892
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
893
+ prompt,
894
+ negative_prompt,
895
+ do_classifier_free_guidance,
896
+ num_videos_per_prompt=num_videos_per_prompt,
897
+ prompt_embeds=prompt_embeds,
898
+ negative_prompt_embeds=negative_prompt_embeds,
899
+ max_sequence_length=max_sequence_length,
900
+ device=device,
901
+ )
902
+ if do_classifier_free_guidance:
903
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
904
+
905
+ # 4. Prepare timesteps
906
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
907
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
908
+ latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
909
+ self._num_timesteps = len(timesteps)
910
+
911
+ # 5. Prepare latents
912
+ if latents is None:
913
+ video = self.video_processor.preprocess_video(video, height=height, width=width)
914
+ video = video.to(device=device, dtype=prompt_embeds.dtype)
915
+
916
+ latent_channels = self.transformer.config.in_channels
917
+ latents = self.prepare_latents(
918
+ video,
919
+ batch_size * num_videos_per_prompt,
920
+ latent_channels,
921
+ height,
922
+ width,
923
+ prompt_embeds.dtype,
924
+ device,
925
+ generator,
926
+ latents,
927
+ latent_timestep,
928
+ )
929
+
930
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
931
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
932
+
933
+ # 7. Create rotary embeds if required
934
+ image_rotary_emb = (
935
+ self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
936
+ if self.transformer.config.use_rotary_positional_embeddings
937
+ else None
938
+ )
939
+
940
+ # 8. Denoising loop
941
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
942
+
943
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
944
+ # for DPM-solver++
945
+ old_pred_original_sample = None
946
+ for i, t in enumerate(timesteps):
947
+ if self.interrupt:
948
+ continue
949
+
950
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
951
+ tracking_maps_input = torch.cat([tracking_maps] * 2) if do_classifier_free_guidance else tracking_maps
952
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
953
+
954
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
955
+ timestep = t.expand(latent_model_input.shape[0])
956
+
957
+ # predict noise model_output
958
+ noise_pred = self.transformer(
959
+ hidden_states=latent_model_input,
960
+ encoder_hidden_states=prompt_embeds,
961
+ timestep=timestep,
962
+ image_rotary_emb=image_rotary_emb,
963
+ attention_kwargs=attention_kwargs,
964
+ tracking_maps=tracking_maps_input,
965
+ return_dict=False,
966
+ )[0]
967
+ noise_pred = noise_pred.float()
968
+
969
+ # perform guidance
970
+ if use_dynamic_cfg:
971
+ self._guidance_scale = 1 + guidance_scale * (
972
+ (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
973
+ )
974
+ if do_classifier_free_guidance:
975
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
976
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
977
+
978
+ # compute the previous noisy sample x_t -> x_t-1
979
+ if not isinstance(self.scheduler, CogVideoXDPMScheduler):
980
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
981
+ else:
982
+ latents, old_pred_original_sample = self.scheduler.step(
983
+ noise_pred,
984
+ old_pred_original_sample,
985
+ t,
986
+ timesteps[i - 1] if i > 0 else None,
987
+ latents,
988
+ **extra_step_kwargs,
989
+ return_dict=False,
990
+ )
991
+ latents = latents.to(prompt_embeds.dtype)
992
+
993
+ # call the callback, if provided
994
+ if callback_on_step_end is not None:
995
+ callback_kwargs = {}
996
+ for k in callback_on_step_end_tensor_inputs:
997
+ callback_kwargs[k] = locals()[k]
998
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
999
+
1000
+ latents = callback_outputs.pop("latents", latents)
1001
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1002
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1003
+
1004
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1005
+ progress_bar.update()
1006
+
1007
+ if not output_type == "latent":
1008
+ video = self.decode_latents(latents)
1009
+ video = self.video_processor.postprocess_video(video=video, output_type=output_type)
1010
+ else:
1011
+ video = latents
1012
+
1013
+ # Offload all models
1014
+ self.maybe_free_model_hooks()
1015
+
1016
+ if not return_dict:
1017
+ return (video,)
1018
+
1019
+ return CogVideoXPipelineOutput(frames=video)
1020
+
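+ # Illustrative sketch (not invoked anywhere): one way to drive the video-to-video
+ # tracking pipeline defined above, mirroring how the image-to-video variant is
+ # assembled in models/pipelines.py. The model path and the source of the
+ # pre-encoded `tracking_maps` latents are placeholders, not part of this module.
+ def _example_video_to_video_tracking(frames, tracking_maps, prompt, model_path="path/to/checkpoint"):
+     import torch
+     from transformers import T5EncoderModel, T5Tokenizer
+     tokenizer = T5Tokenizer.from_pretrained(model_path, subfolder="tokenizer")
+     text_encoder = T5EncoderModel.from_pretrained(model_path, subfolder="text_encoder")
+     vae = AutoencoderKLCogVideoX.from_pretrained(model_path, subfolder="vae")
+     transformer = CogVideoXTransformer3DModelTracking.from_pretrained(model_path, subfolder="transformer")
+     scheduler = CogVideoXDDIMScheduler.from_pretrained(model_path, subfolder="scheduler")
+     pipe = CogVideoXVideoToVideoPipelineTracking(tokenizer, text_encoder, vae, transformer, scheduler)
+     pipe.to("cuda", dtype=torch.bfloat16)
+     return pipe(
+         video=frames,                 # list of PIL.Image frames
+         prompt=prompt,
+         tracking_maps=tracking_maps,  # VAE-encoded tracking latents (assumed prepared elsewhere)
+         num_inference_steps=50,
+         guidance_scale=6.0,
+     ).frames[0]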
models/pipelines.py ADDED
@@ -0,0 +1,1040 @@
1
+ import os
2
+ import sys
3
+ import math
4
+ from tqdm import tqdm
5
+ from PIL import Image, ImageDraw
6
+ project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
7
+ try:
8
+ sys.path.append(os.path.join(project_root, "submodules/MoGe"))
9
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
10
+ except Exception:
11
+ print("Warning: MoGe not found, motion transfer will not be applied")
12
+
13
+ import torch
14
+ import numpy as np
15
+ from PIL import Image
16
+ import torchvision.transforms as transforms
17
+ from diffusers import FluxControlPipeline, CogVideoXDPMScheduler
18
+ from diffusers.utils import export_to_video, load_image, load_video
19
+
20
+ from models.spatracker.predictor import SpaTrackerPredictor
21
+ from models.spatracker.utils.visualizer import Visualizer
22
+ from models.cogvideox_tracking import CogVideoXImageToVideoPipelineTracking
23
+
24
+ from submodules.MoGe.moge.model import MoGeModel
25
+ from image_gen_aux import DepthPreprocessor
26
+ from moviepy.editor import ImageSequenceClip
27
+
28
+ class DiffusionAsShaderPipeline:
29
+ def __init__(self, gpu_id=0, output_dir='outputs'):
30
+ """Initialize MotionTransfer class
31
+
32
+ Args:
33
+ gpu_id (int): GPU device ID
34
+ output_dir (str): Output directory path
35
+ """
36
+ # video parameters
37
+ self.max_depth = 65.0
38
+ self.fps = 8
39
+
40
+ # camera parameters
41
+ self.camera_motion=None
42
+ self.fov=55
43
+
44
+ # device
45
+ self.device = f"cuda:{gpu_id}"
46
+ torch.cuda.set_device(gpu_id)
47
+
48
+ # files
49
+ self.output_dir = output_dir
50
+ os.makedirs(output_dir, exist_ok=True)
51
+
52
+ # Initialize transform
53
+ self.transform = transforms.Compose([
54
+ transforms.Resize((480, 720)),
55
+ transforms.ToTensor()
56
+ ])
57
+
58
+ @torch.no_grad()
59
+ def _infer(
60
+ self,
61
+ prompt: str,
62
+ model_path: str,
63
+ tracking_tensor: torch.Tensor = None,
64
+ image_tensor: torch.Tensor = None, # [C,H,W] in range [0,1]
65
+ output_path: str = "./output.mp4",
66
+ num_inference_steps: int = 50,
67
+ guidance_scale: float = 6.0,
68
+ num_videos_per_prompt: int = 1,
69
+ dtype: torch.dtype = torch.bfloat16,
70
+ fps: int = 24,
71
+ seed: int = 42,
72
+ ):
73
+ """
74
+ Generates a video based on the given prompt and saves it to the specified path.
75
+
76
+ Parameters:
77
+ - prompt (str): The description of the video to be generated.
78
+ - model_path (str): The path of the pre-trained model to be used.
79
+ - tracking_tensor (torch.Tensor): Tracking video tensor [T, C, H, W] in range [0,1]
80
+ - image_tensor (torch.Tensor): Input image tensor [C, H, W] in range [0,1]
81
+ - output_path (str): The path where the generated video will be saved.
82
+ - num_inference_steps (int): Number of steps for the inference process.
83
+ - guidance_scale (float): The scale for classifier-free guidance.
84
+ - num_videos_per_prompt (int): Number of videos to generate per prompt.
85
+ - dtype (torch.dtype): The data type for computation.
86
+ - seed (int): The seed for reproducibility.
87
+ """
88
+ from transformers import T5EncoderModel, T5Tokenizer
89
+ from diffusers import AutoencoderKLCogVideoX, CogVideoXDDIMScheduler
90
+ from models.cogvideox_tracking import CogVideoXTransformer3DModelTracking
91
+
92
+ vae = AutoencoderKLCogVideoX.from_pretrained(model_path, subfolder="vae")
93
+ text_encoder = T5EncoderModel.from_pretrained(model_path, subfolder="text_encoder")
94
+ tokenizer = T5Tokenizer.from_pretrained(model_path, subfolder="tokenizer")
95
+ transformer = CogVideoXTransformer3DModelTracking.from_pretrained(model_path, subfolder="transformer")
96
+ scheduler = CogVideoXDDIMScheduler.from_pretrained(model_path, subfolder="scheduler")
97
+
98
+ pipe = CogVideoXImageToVideoPipelineTracking(
99
+ vae=vae,
100
+ text_encoder=text_encoder,
101
+ tokenizer=tokenizer,
102
+ transformer=transformer,
103
+ scheduler=scheduler
104
+ )
105
+
106
+ # Convert tensor to PIL Image
107
+ image_np = (image_tensor.permute(1, 2, 0).numpy() * 255).astype(np.uint8)
108
+ image = Image.fromarray(image_np)
109
+ height, width = image.height, image.width
110
+
111
+ pipe.transformer.eval()
112
+ pipe.text_encoder.eval()
113
+ pipe.vae.eval()
114
+
115
+ # Process tracking tensor
116
+ tracking_maps = tracking_tensor.float() # [T, C, H, W]
117
+ tracking_maps = tracking_maps.to(device=self.device, dtype=dtype)
118
+ tracking_first_frame = tracking_maps[0:1] # Get first frame as [1, C, H, W]
119
+ height, width = tracking_first_frame.shape[2], tracking_first_frame.shape[3]
120
+
121
+ # 2. Set Scheduler.
122
+ pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
123
+
124
+ pipe.to(self.device, dtype=dtype)
125
+ # pipe.enable_sequential_cpu_offload()
126
+
127
+ pipe.vae.enable_slicing()
128
+ pipe.vae.enable_tiling()
129
+ pipe.transformer.eval()
130
+ pipe.text_encoder.eval()
131
+ pipe.vae.eval()
132
+
133
+ pipe.transformer.gradient_checkpointing = False
134
+
135
+ print("Encoding tracking maps")
136
+ tracking_maps = tracking_maps.unsqueeze(0) # [B, T, C, H, W]
137
+ tracking_maps = tracking_maps.permute(0, 2, 1, 3, 4) # [B, C, T, H, W]
138
+ tracking_latent_dist = pipe.vae.encode(tracking_maps).latent_dist
139
+ tracking_maps = tracking_latent_dist.sample() * pipe.vae.config.scaling_factor
140
+ tracking_maps = tracking_maps.permute(0, 2, 1, 3, 4) # [B, F, C, H, W]
141
+
142
+ # 4. Generate the video frames based on the prompt.
143
+ video_generate = pipe(
144
+ prompt=prompt,
145
+ negative_prompt="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion.",
146
+ image=image,
147
+ num_videos_per_prompt=num_videos_per_prompt,
148
+ num_inference_steps=num_inference_steps,
149
+ num_frames=49,
150
+ use_dynamic_cfg=True,
151
+ guidance_scale=guidance_scale,
152
+ generator=torch.Generator().manual_seed(seed),
153
+ tracking_maps=tracking_maps,
154
+ tracking_image=tracking_first_frame,
155
+ height=height,
156
+ width=width,
157
+ ).frames[0]
158
+
159
+ # 5. Export the generated frames to a video file (the base model generates video at 8 fps).
160
+ output_path = output_path if output_path else "result.mp4"
161
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
162
+ export_to_video(video_generate, output_path, fps=fps)
163
+
164
+ #========== camera parameters ==========#
165
+
166
+ def _set_camera_motion(self, camera_motion):
167
+ self.camera_motion = camera_motion
168
+
169
+ def _get_intr(self, fov, H=480, W=720):
170
+ fov_rad = math.radians(fov)
171
+ focal_length = (W / 2) / math.tan(fov_rad / 2)
172
+
173
+ cx = W / 2
174
+ cy = H / 2
175
+
176
+ intr = torch.tensor([
177
+ [focal_length, 0, cx],
178
+ [0, focal_length, cy],
179
+ [0, 0, 1]
180
+ ], dtype=torch.float32)
181
+
182
+ return intr
183
+
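+     # Worked example with the defaults used throughout this class (fov=55, W=720, H=480):
+     #   focal = (720 / 2) / tan(55° / 2) ≈ 360 / 0.5206 ≈ 691.6
+     # so _get_intr(55) is approximately [[691.6, 0, 360], [0, 691.6, 240], [0, 0, 1]],
+     # i.e. a pinhole intrinsic matrix with the principal point at the image center.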
184
+ def _apply_poses(self, pts, intr, poses):
185
+ """
186
+ Args:
187
+ pts (torch.Tensor): pointclouds coordinates [T, N, 3]
188
+ intr (torch.Tensor): camera intrinsics [T, 3, 3]
189
+ poses (numpy.ndarray): camera poses [T, 4, 4]
190
+ """
191
+ poses = torch.from_numpy(poses).float().to(self.device)
192
+
193
+ T, N, _ = pts.shape
194
+ ones = torch.ones(T, N, 1, device=self.device, dtype=torch.float)
195
+ pts_hom = torch.cat([pts[:, :, :2], ones], dim=-1) # (T, N, 3)
196
+ pts_cam = torch.bmm(pts_hom, torch.linalg.inv(intr).transpose(1, 2)) # (T, N, 3)
197
+ pts_cam[:,:, :3] /= pts[:, :, 2:3]
198
+
199
+ # to homogeneous
200
+ pts_cam = torch.cat([pts_cam, ones], dim=-1) # (T, N, 4)
201
+
202
+ if poses.shape[0] == 1:
203
+ poses = poses.repeat(T, 1, 1)
204
+ elif poses.shape[0] != T:
205
+ raise ValueError(f"Poses length ({poses.shape[0]}) must match sequence length ({T})")
206
+
207
+ pts_world = torch.bmm(pts_cam, poses.transpose(1, 2))[:, :, :3] # (T, N, 3)
208
+
209
+ pts_proj = torch.bmm(pts_world, intr.transpose(1, 2)) # (T, N, 3)
210
+ pts_proj[:, :, :2] /= pts_proj[:, :, 2:3]
211
+
212
+ return pts_proj
213
+
214
+ def apply_traj_on_tracking(self, pred_tracks, camera_motion=None, fov=55, frame_num=49):
215
+ intr = self._get_intr(fov).unsqueeze(0).repeat(frame_num, 1, 1).to(self.device)
216
+ tracking_pts = self._apply_poses(pred_tracks.squeeze(), intr, camera_motion).unsqueeze(0)
217
+ return tracking_pts
218
+
219
+ ##============= SpatialTracker =============##
220
+
221
+ def generate_tracking_spatracker(self, video_tensor, density=70):
222
+ """Generate tracking video
223
+
224
+ Args:
225
+ video_tensor (torch.Tensor): Input video tensor [T, C, H, W] in range [0, 1]
+ density (int): Grid size used to sample tracking points
226
+
227
+ Returns:
228
+ tuple: (pred_tracks, pred_visibility, T_Firsts) predicted by SpaTracker
229
+ """
230
+ print("Loading tracking models...")
231
+ # Load tracking model
232
+ tracker = SpaTrackerPredictor(
233
+ checkpoint=os.path.join(project_root, 'checkpoints/spatracker/spaT_final.pth'),
234
+ interp_shape=(384, 576),
235
+ seq_length=12
236
+ ).to(self.device)
237
+
238
+ # Load depth model
239
+ self.depth_preprocessor = DepthPreprocessor.from_pretrained("Intel/zoedepth-nyu-kitti")
240
+ self.depth_preprocessor.to(self.device)
241
+
242
+ try:
243
+ video = video_tensor.unsqueeze(0).to(self.device)
244
+
245
+ video_depths = []
246
+ for i in range(video_tensor.shape[0]):
247
+ frame = (video_tensor[i].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
248
+ depth = self.depth_preprocessor(Image.fromarray(frame))[0]
249
+ depth_tensor = transforms.ToTensor()(depth) # [1, H, W]
250
+ video_depths.append(depth_tensor)
251
+ video_depth = torch.stack(video_depths, dim=0).to(self.device)
252
+ # print("Video depth shape:", video_depth.shape)
253
+
254
+ segm_mask = np.ones((480, 720), dtype=np.uint8)
255
+
256
+ pred_tracks, pred_visibility, T_Firsts = tracker(
257
+ video * 255,
258
+ video_depth=video_depth,
259
+ grid_size=density,
260
+ backward_tracking=False,
261
+ depth_predictor=None,
262
+ grid_query_frame=0,
263
+ segm_mask=torch.from_numpy(segm_mask)[None, None].to(self.device),
264
+ wind_length=12,
265
+ progressive_tracking=False
266
+ )
267
+
268
+ return pred_tracks, pred_visibility, T_Firsts
269
+
270
+ finally:
271
+ # Clean up GPU memory
272
+ del tracker, self.depth_preprocessor
273
+ torch.cuda.empty_cache()
274
+
275
+ def visualize_tracking_spatracker(self, video, pred_tracks, pred_visibility, T_Firsts, save_tracking=True):
276
+ video = video.unsqueeze(0).to(self.device)
277
+ vis = Visualizer(save_dir=self.output_dir, grayscale=False, fps=24, pad_value=0)
278
+ msk_query = (T_Firsts == 0)
279
+ pred_tracks = pred_tracks[:,:,msk_query.squeeze()]
280
+ pred_visibility = pred_visibility[:,:,msk_query.squeeze()]
281
+
282
+ tracking_video = vis.visualize(video=video, tracks=pred_tracks,
283
+ visibility=pred_visibility, save_video=False,
284
+ filename="temp")
285
+
286
+ tracking_video = tracking_video.squeeze(0) # [T, C, H, W]
287
+ wide_list = list(tracking_video.unbind(0))
288
+ wide_list = [wide.permute(1, 2, 0).cpu().numpy() for wide in wide_list]
289
+ clip = ImageSequenceClip(wide_list, fps=self.fps)
290
+
291
+ tracking_path = None
292
+ if save_tracking:
293
+ try:
294
+ tracking_path = os.path.join(self.output_dir, "tracking_video.mp4")
295
+ clip.write_videofile(tracking_path, codec="libx264", fps=self.fps, logger=None)
296
+ print(f"Video saved to {tracking_path}")
297
+ except Exception as e:
298
+ print(f"Warning: Failed to save tracking video: {e}")
299
+ tracking_path = None
300
+
301
+ # Convert tracking_video back to tensor in range [0,1]
302
+ tracking_frames = np.array(list(clip.iter_frames())) / 255.0
303
+ tracking_video = torch.from_numpy(tracking_frames).permute(0, 3, 1, 2).float()
304
+
305
+ return tracking_path, tracking_video
306
+
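+     # Minimal end-to-end sketch of the two SpaTracker methods above; it assumes the
+     # checkpoint at checkpoints/spatracker/spaT_final.pth is available and is not
+     # called anywhere in this module.
+     def _example_generate_and_visualize_tracking(self, video_path):
+         frames = load_video(video_path)  # list of PIL images
+         video_tensor = torch.stack([self.transform(f) for f in frames])  # [T, C, 480, 720] in [0, 1]
+         pred_tracks, pred_visibility, T_Firsts = self.generate_tracking_spatracker(video_tensor)
+         return self.visualize_tracking_spatracker(video_tensor, pred_tracks, pred_visibility, T_Firsts)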
307
+ ##============= MoGe =============##
308
+
309
+ def valid_mask(self, pixels, W, H):
310
+ """Check if pixels are within valid image bounds
311
+
312
+ Args:
313
+ pixels (numpy.ndarray): Pixel coordinates of shape [N, 2]
314
+ W (int): Image width
315
+ H (int): Image height
316
+
317
+ Returns:
318
+ numpy.ndarray: Boolean mask of valid pixels
319
+ """
320
+ return ((pixels[:, 0] >= 0) & (pixels[:, 0] < W) & (pixels[:, 1] > 0) & \
321
+ (pixels[:, 1] < H))
322
+
323
+ def sort_points_by_depth(self, points, depths):
324
+ """Sort points by depth values
325
+
326
+ Args:
327
+ points (numpy.ndarray): Points array of shape [N, 2]
328
+ depths (numpy.ndarray): Depth values of shape [N]
329
+
330
+ Returns:
331
+ tuple: (sorted_points, sorted_depths, sort_index)
332
+ """
333
+ # Combine points and depths into a single array for sorting
334
+ combined = np.hstack((points, depths[:, None])) # Nx3 (points + depth)
335
+ # Sort by depth (last column) in descending order
336
+ sort_index = combined[:, -1].argsort()[::-1]
337
+ sorted_combined = combined[sort_index]
338
+ # Split back into points and depths
339
+ sorted_points = sorted_combined[:, :-1]
340
+ sorted_depths = sorted_combined[:, -1]
341
+ return sorted_points, sorted_depths, sort_index
342
+
343
+ def draw_rectangle(self, rgb, coord, side_length, color=(255, 0, 0)):
344
+ """Draw a rectangle on the image
345
+
346
+ Args:
347
+ rgb (PIL.Image): Image to draw on
348
+ coord (tuple): Center coordinates (x, y)
349
+ side_length (int): Length of rectangle sides
350
+ color (tuple): RGB color tuple
351
+ """
352
+ draw = ImageDraw.Draw(rgb)
353
+ # Calculate the bounding box of the rectangle
354
+ left_up_point = (coord[0] - side_length//2, coord[1] - side_length//2)
355
+ right_down_point = (coord[0] + side_length//2, coord[1] + side_length//2)
356
+ color = tuple(list(color))
357
+
358
+ draw.rectangle(
359
+ [left_up_point, right_down_point],
360
+ fill=tuple(color),
361
+ outline=tuple(color),
362
+ )
363
+
364
+ def visualize_tracking_moge(self, points, mask, save_tracking=True):
365
+ """Visualize tracking results from MoGe model
366
+
367
+ Args:
368
+ points (numpy.ndarray): Points array of shape [T, H, W, 3]
369
+ mask (numpy.ndarray): Binary mask of shape [H, W]
370
+ save_tracking (bool): Whether to save tracking video
371
+
372
+ Returns:
373
+ tuple: (tracking_path, tracking_video)
374
+ - tracking_path (str): Path to saved tracking video, None if save_tracking is False
375
+ - tracking_video (torch.Tensor): Tracking visualization tensor of shape [T, C, H, W] in range [0,1]
376
+ """
377
+ # Create color array
378
+ T, H, W, _ = points.shape
379
+ colors = np.zeros((H, W, 3), dtype=np.uint8)
380
+
381
+ # Set R channel - based on x coordinates (smaller on the left)
382
+ colors[:, :, 0] = np.tile(np.linspace(0, 255, W), (H, 1))
383
+
384
+ # Set G channel - based on y coordinates (smaller on the top)
385
+ colors[:, :, 1] = np.tile(np.linspace(0, 255, H), (W, 1)).T
386
+
387
+ # Set B channel - based on depth
388
+ z_values = points[0, :, :, 2] # get z values
389
+ inv_z = 1 / z_values # calculate 1/z
390
+ # Calculate 2% and 98% percentiles
391
+ p2 = np.percentile(inv_z, 2)
392
+ p98 = np.percentile(inv_z, 98)
393
+ # Normalize to [0,1] range
394
+ normalized_z = np.clip((inv_z - p2) / (p98 - p2), 0, 1)
395
+ colors[:, :, 2] = (normalized_z * 255).astype(np.uint8)
396
+ colors = colors.astype(np.uint8)
397
+ # colors = colors * mask[..., None]
398
+ # points = points * mask[None, :, :, None]
399
+
400
+ points = points.reshape(T, -1, 3)
401
+ colors = colors.reshape(-1, 3)
402
+
403
+ # Initialize list to store frames
404
+ frames = []
405
+
406
+ for i, pts_i in enumerate(tqdm(points)):
407
+ pixels, depths = pts_i[..., :2], pts_i[..., 2]
408
+ pixels[..., 0] = pixels[..., 0] * W
409
+ pixels[..., 1] = pixels[..., 1] * H
410
+ pixels = pixels.astype(int)
411
+
412
+ valid = self.valid_mask(pixels, W, H)
413
+ frame_rgb = colors[valid]
414
+ pixels = pixels[valid]
415
+ depths = depths[valid]
416
+
417
+ img = Image.fromarray(np.uint8(np.zeros([H, W, 3])), mode="RGB")
418
+ sorted_pixels, _, sort_index = self.sort_points_by_depth(pixels, depths)
419
+ step = 1
420
+ sorted_pixels = sorted_pixels[::step]
421
+ sorted_rgb = frame_rgb[sort_index][::step]
422
+
423
+ for j in range(sorted_pixels.shape[0]):
424
+ self.draw_rectangle(
425
+ img,
426
+ coord=(sorted_pixels[j, 0], sorted_pixels[j, 1]),
427
+ side_length=2,
428
+ color=sorted_rgb[j],
429
+ )
430
+ frames.append(np.array(img))
431
+
432
+ # Convert frames to video tensor in range [0,1]
433
+ tracking_video = torch.from_numpy(np.stack(frames)).permute(0, 3, 1, 2).float() / 255.0
434
+
435
+ tracking_path = None
436
+ if save_tracking:
437
+ try:
438
+ tracking_path = os.path.join(self.output_dir, "tracking_video_moge.mp4")
439
+ # Convert back to uint8 for saving
440
+ uint8_frames = [frame.astype(np.uint8) for frame in frames]
441
+ clip = ImageSequenceClip(uint8_frames, fps=self.fps)
442
+ clip.write_videofile(tracking_path, codec="libx264", fps=self.fps, logger=None)
443
+ print(f"Video saved to {tracking_path}")
444
+ except Exception as e:
445
+ print(f"Warning: Failed to save tracking video: {e}")
446
+ tracking_path = None
447
+
448
+ return tracking_path, tracking_video
449
+
450
+ def apply_tracking(self, video_tensor, fps=8, tracking_tensor=None, img_cond_tensor=None, prompt=None, checkpoint_path=None):
451
+ """Generate final video with motion transfer
452
+
453
+ Args:
454
+ video_tensor (torch.Tensor): Input video tensor [T,C,H,W]
455
+ fps (float): Input video FPS
456
+ tracking_tensor (torch.Tensor): Tracking video tensor [T,C,H,W]
457
+ img_cond_tensor (torch.Tensor): First frame tensor [C,H,W] to use for generation
458
+ prompt (str): Generation prompt
459
+ checkpoint_path (str): Path to model checkpoint
460
+ """
461
+ self.fps = fps
462
+
463
+ # Use first frame if no image provided
464
+ if img_cond_tensor is None:
465
+ img_cond_tensor = video_tensor[0]
466
+
467
+ # Generate final video
468
+ final_output = os.path.join(os.path.abspath(self.output_dir), "result.mp4")
469
+ self._infer(
470
+ prompt=prompt,
471
+ model_path=checkpoint_path,
472
+ tracking_tensor=tracking_tensor,
473
+ image_tensor=img_cond_tensor,
474
+ output_path=final_output,
475
+ num_inference_steps=50,
476
+ guidance_scale=6.0,
477
+ dtype=torch.bfloat16,
478
+ fps=self.fps
479
+ )
480
+ print(f"Final video generated successfully at: {final_output}")
481
+
482
+ def _set_object_motion(self, motion_type):
483
+ """Set object motion type
484
+
485
+ Args:
486
+ motion_type (str): Motion direction ('up', 'down', 'left', 'right')
487
+ """
488
+ self.object_motion = motion_type
489
+
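+ # Hedged usage sketch of the full DiffusionAsShaderPipeline flow; the tensors and
+ # checkpoint path are assumed to be prepared by the caller, and the function is not
+ # invoked in this module.
+ def _example_apply_tracking(video_tensor, tracking_tensor, prompt, checkpoint_path):
+     das = DiffusionAsShaderPipeline(gpu_id=0, output_dir="outputs")
+     # video_tensor / tracking_tensor: [T, C, H, W] tensors in [0, 1]
+     das.apply_tracking(
+         video_tensor=video_tensor,
+         fps=8,
+         tracking_tensor=tracking_tensor,
+         img_cond_tensor=video_tensor[0],
+         prompt=prompt,
+         checkpoint_path=checkpoint_path,
+     )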
490
+ class FirstFrameRepainter:
491
+ def __init__(self, gpu_id=0, output_dir='outputs'):
492
+ """Initialize FirstFrameRepainter
493
+
494
+ Args:
495
+ gpu_id (int): GPU device ID
496
+ output_dir (str): Output directory path
497
+ """
498
+ self.device = f"cuda:{gpu_id}"
499
+ self.output_dir = output_dir
500
+ self.max_depth = 65.0
501
+ os.makedirs(output_dir, exist_ok=True)
502
+
503
+ def repaint(self, image_tensor, prompt, depth_path=None, method="dav"):
504
+ """Repaint first frame using Flux
505
+
506
+ Args:
507
+ image_tensor (torch.Tensor): Input image tensor [C,H,W]
508
+ prompt (str): Repaint prompt
509
+ depth_path (str): Path to depth image
510
+ method (str): depth estimator, "moge" or "dav" or "zoedepth"
511
+
512
+ Returns:
513
+ torch.Tensor: Repainted image tensor [C,H,W]
514
+ """
515
+ print("Loading Flux model...")
516
+ # Load Flux model
517
+ flux_pipe = FluxControlPipeline.from_pretrained(
518
+ "black-forest-labs/FLUX.1-Depth-dev",
519
+ torch_dtype=torch.bfloat16
520
+ ).to(self.device)
521
+
522
+ # Get depth map
523
+ if depth_path is None:
524
+ if method == "moge":
525
+ self.moge_model = MoGeModel.from_pretrained("Ruicheng/moge-vitl").to(self.device)
526
+ depth_map = self.moge_model.infer(image_tensor.to(self.device))["depth"]
527
+ depth_map = torch.clamp(depth_map, max=self.max_depth)
528
+ depth_normalized = 1.0 - (depth_map / self.max_depth)
529
+ depth_rgb = (depth_normalized * 255).cpu().numpy().astype(np.uint8)
530
+ control_image = Image.fromarray(depth_rgb).convert("RGB")
531
+ elif method == "zoedepth":
532
+ self.depth_preprocessor = DepthPreprocessor.from_pretrained("Intel/zoedepth-nyu-kitti")
533
+ self.depth_preprocessor.to(self.device)
534
+ image_np = (image_tensor.permute(1, 2, 0).numpy() * 255).astype(np.uint8)
535
+ control_image = self.depth_preprocessor(Image.fromarray(image_np))[0].convert("RGB")
536
+ control_image = control_image.point(lambda x: 255 - x) # the zoedepth depth is inverted
537
+ else:
538
+ self.depth_preprocessor = DepthPreprocessor.from_pretrained("depth-anything/Depth-Anything-V2-Large-hf")
539
+ self.depth_preprocessor.to(self.device)
540
+ image_np = (image_tensor.permute(1, 2, 0).numpy() * 255).astype(np.uint8)
541
+ control_image = self.depth_preprocessor(Image.fromarray(image_np))[0].convert("RGB")
542
+ else:
543
+ control_image = Image.open(depth_path).convert("RGB")
544
+
545
+ try:
546
+ repainted_image = flux_pipe(
547
+ prompt=prompt,
548
+ control_image=control_image,
549
+ height=480,
550
+ width=720,
551
+ num_inference_steps=30,
552
+ guidance_scale=7.5,
553
+ ).images[0]
554
+
555
+ # Save repainted image
556
+ repainted_image.save(os.path.join(self.output_dir, "temp_repainted.png"))
557
+
558
+ # Convert PIL Image to tensor
559
+ transform = transforms.Compose([
560
+ transforms.ToTensor()
561
+ ])
562
+ repainted_tensor = transform(repainted_image)
563
+
564
+ return repainted_tensor
565
+
566
+ finally:
567
+ # Clean up GPU memory
568
+ del flux_pipe
569
+ if method == "moge":
570
+ del self.moge_model
571
+ else:
572
+ del self.depth_preprocessor
573
+ torch.cuda.empty_cache()
574
+
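+ # Hedged usage sketch for FirstFrameRepainter (the depth estimator weights are
+ # fetched from the Hugging Face Hub on first use); not invoked in this module.
+ def _example_repaint_first_frame(image_tensor, prompt):
+     repainter = FirstFrameRepainter(gpu_id=0, output_dir="outputs")
+     # image_tensor: [C, H, W] in [0, 1]; "dav" selects the Depth-Anything-V2 estimator
+     return repainter.repaint(image_tensor, prompt=prompt, method="dav")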
575
+ class CameraMotionGenerator:
576
+ def __init__(self, motion_type, frame_num=49, H=480, W=720, fx=None, fy=None, fov=55, device='cuda'):
577
+ self.motion_type = motion_type
578
+ self.frame_num = frame_num
579
+ self.fov = fov
580
+ self.device = device
581
+ self.W = W
582
+ self.H = H
583
+ self.intr = torch.tensor([
584
+ [0, 0, W / 2],
585
+ [0, 0, H / 2],
586
+ [0, 0, 1]
587
+ ], dtype=torch.float32, device=device)
588
+ # if fx, fy not provided
589
+ if not fx or not fy:
590
+ fov_rad = math.radians(fov)
591
+ fx = fy = (W / 2) / math.tan(fov_rad / 2)
592
+
593
+ self.intr[0, 0] = fx
594
+ self.intr[1, 1] = fy
595
+
596
+ def _apply_poses(self, pts, poses):
597
+ """
598
+ Args:
599
+ pts (torch.Tensor): pointclouds coordinates [T, N, 3]
600
+ (camera intrinsics [3, 3] are read from self.intr)
601
+ poses (numpy.ndarray): camera poses [T, 4, 4]
602
+ """
603
+ if isinstance(poses, np.ndarray):
604
+ poses = torch.from_numpy(poses)
605
+
606
+ intr = self.intr.unsqueeze(0).repeat(self.frame_num, 1, 1).to(torch.float)
607
+ T, N, _ = pts.shape
608
+ ones = torch.ones(T, N, 1, device=self.device, dtype=torch.float)
609
+ pts_hom = torch.cat([pts[:, :, :2], ones], dim=-1) # (T, N, 3)
610
+ pts_cam = torch.bmm(pts_hom, torch.linalg.inv(intr).transpose(1, 2)) # (T, N, 3)
611
+ pts_cam[:,:, :3] *= pts[:, :, 2:3]
612
+
613
+ # to homogeneous
614
+ pts_cam = torch.cat([pts_cam, ones], dim=-1) # (T, N, 4)
615
+
616
+ if poses.shape[0] == 1:
617
+ poses = poses.repeat(T, 1, 1)
618
+ elif poses.shape[0] != T:
619
+ raise ValueError(f"Poses length ({poses.shape[0]}) must match sequence length ({T})")
620
+
621
+ poses = poses.to(torch.float).to(self.device)
622
+ pts_world = torch.bmm(pts_cam, poses.transpose(1, 2))[:, :, :3] # (T, N, 3)
623
+ pts_proj = torch.bmm(pts_world, intr.transpose(1, 2)) # (T, N, 3)
624
+ pts_proj[:, :, :2] /= pts_proj[:, :, 2:3]
625
+
626
+ return pts_proj
627
+
628
+ def w2s(self, pts, poses):
629
+ if isinstance(poses, np.ndarray):
630
+ poses = torch.from_numpy(poses)
631
+ assert poses.shape[0] == self.frame_num
632
+ poses = poses.to(torch.float32).to(self.device)
633
+ T, N, _ = pts.shape # (T, N, 3)
634
+ intr = self.intr.unsqueeze(0).repeat(self.frame_num, 1, 1)
635
+ # Step 1: expand the points to (T, N, 4) by appending a 1 in the last dimension (homogeneous coordinates)
636
+ ones = torch.ones((T, N, 1), device=self.device, dtype=pts.dtype)
637
+ points_world_h = torch.cat([pts, ones], dim=-1)
638
+ points_camera_h = torch.bmm(poses, points_world_h.permute(0, 2, 1))
639
+ points_camera = points_camera_h[:, :3, :].permute(0, 2, 1)
640
+
641
+ points_image_h = torch.bmm(points_camera, intr.permute(0, 2, 1))
642
+
643
+ uv = points_image_h[:, :, :2] / points_image_h[:, :, 2:3]
644
+
645
+ # Step 5: extract the depth (Z) and concatenate it with the screen coordinates
646
+ depth = points_camera[:, :, 2:3] # (T, N, 1)
647
+ uvd = torch.cat([uv, depth], dim=-1) # (T, N, 3)
648
+
649
+ return uvd # screen coordinates + depth (T, N, 3)
650
+
651
+ def apply_motion_on_pts(self, pts, camera_motion):
652
+ tracking_pts = self._apply_poses(pts.squeeze(), camera_motion).unsqueeze(0)
653
+ return tracking_pts
654
+
655
+ def set_intr(self, K):
656
+ if isinstance(K, np.ndarray):
657
+ K = torch.from_numpy(K)
658
+ self.intr = K.to(self.device)
659
+
660
+ def rot_poses(self, angle, axis='y'):
661
+ """Generate a single rotation matrix
662
+
663
+ Args:
664
+ angle (float): Rotation angle in degrees
665
+ axis (str): Rotation axis ('x', 'y', or 'z')
666
+
667
+ Returns:
668
+ torch.Tensor: Single rotation matrix [4, 4]
669
+ """
670
+ angle_rad = math.radians(angle)
671
+ cos_theta = torch.cos(torch.tensor(angle_rad))
672
+ sin_theta = torch.sin(torch.tensor(angle_rad))
673
+
674
+ if axis == 'x':
675
+ rot_mat = torch.tensor([
676
+ [1, 0, 0, 0],
677
+ [0, cos_theta, -sin_theta, 0],
678
+ [0, sin_theta, cos_theta, 0],
679
+ [0, 0, 0, 1]
680
+ ], dtype=torch.float32)
681
+ elif axis == 'y':
682
+ rot_mat = torch.tensor([
683
+ [cos_theta, 0, sin_theta, 0],
684
+ [0, 1, 0, 0],
685
+ [-sin_theta, 0, cos_theta, 0],
686
+ [0, 0, 0, 1]
687
+ ], dtype=torch.float32)
688
+ elif axis == 'z':
689
+ rot_mat = torch.tensor([
690
+ [cos_theta, -sin_theta, 0, 0],
691
+ [sin_theta, cos_theta, 0, 0],
692
+ [0, 0, 1, 0],
693
+ [0, 0, 0, 1]
694
+ ], dtype=torch.float32)
695
+ else:
696
+ raise ValueError("Invalid axis value. Choose 'x', 'y', or 'z'.")
697
+
698
+ return rot_mat.to(self.device)
699
+
700
+ def trans_poses(self, dx, dy, dz):
701
+ """
702
+ params:
703
+ - dx: float, displacement along x axis.
704
+ - dy: float, displacement along y axis.
705
+ - dz: float, displacement along z axis.
706
+
707
+ ret:
708
+ - trans_mats: torch.Tensor, camera pose matrices [frame_num, 4, 4]
709
+ """
710
+ trans_mats = torch.eye(4).unsqueeze(0).repeat(self.frame_num, 1, 1) # (n, 4, 4)
711
+
712
+ delta_x = dx / (self.frame_num - 1)
713
+ delta_y = dy / (self.frame_num - 1)
714
+ delta_z = dz / (self.frame_num - 1)
715
+
716
+ for i in range(self.frame_num):
717
+ trans_mats[i, 0, 3] = i * delta_x
718
+ trans_mats[i, 1, 3] = i * delta_y
719
+ trans_mats[i, 2, 3] = i * delta_z
720
+
721
+ return trans_mats.to(self.device)
722
+
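+     # For example, trans_poses(0, 0, 0.5) yields frame_num identity-rotation poses whose
+     # z-translation grows linearly from 0 at frame 0 to 0.5 at the final frame.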
723
+
724
+ def _look_at(self, camera_position, target_position):
725
+ # look at direction
726
+ direction = target_position - camera_position
727
+ direction /= np.linalg.norm(direction)
728
+ # calculate rotation matrix
729
+ up = np.array([0, 1, 0])
730
+ right = np.cross(up, direction)
731
+ right /= np.linalg.norm(right)
732
+ up = np.cross(direction, right)
733
+ rotation_matrix = np.vstack([right, up, direction])
734
+ rotation_matrix = np.linalg.inv(rotation_matrix)
735
+ return rotation_matrix
736
+
737
+ def spiral_poses(self, radius, forward_ratio = 0.5, backward_ratio = 0.5, rotation_times = 0.1, look_at_times = 0.5):
738
+ """Generate spiral camera poses
739
+
740
+ Args:
741
+ radius (float): Base radius of the spiral
742
+ forward_ratio (float): Scale factor for forward motion
743
+ backward_ratio (float): Scale factor for backward motion
744
+ rotation_times (float): Number of rotations to complete
745
+ look_at_times (float): Scale factor for look-at point distance
746
+
747
+ Returns:
748
+ torch.Tensor: Camera poses of shape [num_frames, 4, 4]
749
+ """
750
+ # Generate spiral trajectory
751
+ t = np.linspace(0, 1, self.frame_num)
752
+ r = np.sin(np.pi * t) * radius * rotation_times
753
+ theta = 2 * np.pi * t
754
+
755
+ # Calculate camera positions
756
+ # Limit y motion for better floor/sky view
757
+ y = r * np.cos(theta) * 0.3
758
+ x = r * np.sin(theta)
759
+ z = -r
760
+ z[z < 0] *= forward_ratio
761
+ z[z > 0] *= backward_ratio
762
+
763
+ # Set look-at target
764
+ target_pos = np.array([0, 0, radius * look_at_times])
765
+ cam_pos = np.vstack([x, y, z]).T
766
+ cam_poses = []
767
+
768
+ for pos in cam_pos:
769
+ rot_mat = self._look_at(pos, target_pos)
770
+ trans_mat = np.eye(4)
771
+ trans_mat[:3, :3] = rot_mat
772
+ trans_mat[:3, 3] = pos
773
+ cam_poses.append(trans_mat[None])
774
+
775
+ camera_poses = np.concatenate(cam_poses, axis=0)
776
+ return torch.from_numpy(camera_poses).to(self.device)
777
+
778
+ def rot(self, pts, angle, axis):
779
+ """
780
+ pts: torch.Tensor, (T, N, 2)
781
+ """
782
+ rot_mats = self.rot_poses(angle, axis)
783
+ pts = self.apply_motion_on_pts(pts, rot_mats)
784
+ return pts
785
+
786
+ def trans(self, pts, dx, dy, dz):
787
+ if pts.shape[-1] != 3:
788
+ raise ValueError("points should be in the 3d coordinate.")
789
+ trans_mats = self.trans_poses(dx, dy, dz)
790
+ pts = self.apply_motion_on_pts(pts, trans_mats)
791
+ return pts
792
+
793
+ def spiral(self, pts, radius):
794
+ spiral_poses = self.spiral_poses(radius)
795
+ pts = self.apply_motion_on_pts(pts, spiral_poses)
796
+ return pts
797
+
798
+ def get_default_motion(self):
799
+ """Parse motion parameters and generate corresponding motion matrices
800
+
801
+ Supported formats:
802
+ - trans <dx> <dy> <dz> [start_frame] [end_frame]: Translation motion
803
+ - rot <axis> <angle> [start_frame] [end_frame]: Rotation motion
804
+ - spiral <radius> [start_frame] [end_frame]: Spiral motion
805
+
806
+ Multiple transformations can be combined using semicolon (;) as separator:
807
+ e.g., "trans 0 0 0.5 0 30; rot x 25 0 30; trans 0.1 0 0 30 48"
808
+
809
+ Note:
810
+ - start_frame and end_frame are optional
811
+ - frame range: 0-49 (will be clamped to this range)
812
+ - if not specified, defaults to 0-49
813
+ - frames after end_frame will maintain the final transformation
814
+ - for combined transformations, they are applied in sequence
815
+
816
+ Returns:
817
+ torch.Tensor: Motion matrices [num_frames, 4, 4]
818
+ """
819
+ if not isinstance(self.motion_type, str):
820
+ raise ValueError(f'camera_motion must be a string, but got {type(self.motion_type)}')
821
+
822
+ # Split combined transformations
823
+ transform_sequences = [s.strip() for s in self.motion_type.split(';')]
824
+
825
+ # Initialize the final motion matrices
826
+ final_motion = torch.eye(4, device=self.device).unsqueeze(0).repeat(49, 1, 1)
827
+
828
+ # Process each transformation in sequence
829
+ for transform in transform_sequences:
830
+ params = transform.lower().split()
831
+ if not params:
832
+ continue
833
+
834
+ motion_type = params[0]
835
+
836
+ # Default frame range
837
+ start_frame = 0
838
+ end_frame = 48 # 49 frames in total (0-48)
839
+
840
+ if motion_type == 'trans':
841
+ # Parse translation parameters
842
+ if len(params) not in [4, 6]:
843
+ raise ValueError(f"trans motion requires 3 or 5 parameters: 'trans <dx> <dy> <dz>' or 'trans <dx> <dy> <dz> <start_frame> <end_frame>', got: {transform}")
844
+
845
+ dx, dy, dz = map(float, params[1:4])
846
+
847
+ if len(params) == 6:
848
+ start_frame = max(0, min(48, int(params[4])))
849
+ end_frame = max(0, min(48, int(params[5])))
850
+ if start_frame > end_frame:
851
+ start_frame, end_frame = end_frame, start_frame
852
+
853
+ # Generate current transformation
854
+ current_motion = torch.eye(4, device=self.device).unsqueeze(0).repeat(49, 1, 1)
855
+ for frame_idx in range(49):
856
+ if frame_idx < start_frame:
857
+ continue
858
+ elif frame_idx <= end_frame:
859
+ t = (frame_idx - start_frame) / (end_frame - start_frame)
860
+ current_motion[frame_idx, :3, 3] = torch.tensor([dx, dy, dz], device=self.device) * t
861
+ else:
862
+ current_motion[frame_idx] = current_motion[end_frame]
863
+
864
+ # Combine with previous transformations
865
+ final_motion = torch.matmul(final_motion, current_motion)
866
+
867
+ elif motion_type == 'rot':
868
+ # Parse rotation parameters
869
+ if len(params) not in [3, 5]:
870
+ raise ValueError(f"rot motion requires 2 or 4 parameters: 'rot <axis> <angle>' or 'rot <axis> <angle> <start_frame> <end_frame>', got: {transform}")
871
+
872
+ axis = params[1]
873
+ if axis not in ['x', 'y', 'z']:
874
+ raise ValueError(f"Invalid rotation axis '{axis}', must be 'x', 'y' or 'z'")
875
+ angle = float(params[2])
876
+
877
+ if len(params) == 5:
878
+ start_frame = max(0, min(48, int(params[3])))
879
+ end_frame = max(0, min(48, int(params[4])))
880
+ if start_frame > end_frame:
881
+ start_frame, end_frame = end_frame, start_frame
882
+
883
+ current_motion = torch.eye(4, device=self.device).unsqueeze(0).repeat(49, 1, 1)
884
+ for frame_idx in range(49):
885
+ if frame_idx < start_frame:
886
+ continue
887
+ elif frame_idx <= end_frame:
888
+ t = (frame_idx - start_frame) / (end_frame - start_frame)
889
+ current_angle = angle * t
890
+ current_motion[frame_idx] = self.rot_poses(current_angle, axis)
891
+ else:
892
+ current_motion[frame_idx] = current_motion[end_frame]
893
+
894
+ # Combine with previous transformations
895
+ final_motion = torch.matmul(final_motion, current_motion)
896
+
897
+ elif motion_type == 'spiral':
898
+ # Parse spiral motion parameters
899
+ if len(params) not in [2, 4]:
900
+ raise ValueError(f"spiral motion requires 1 or 3 parameters: 'spiral <radius>' or 'spiral <radius> <start_frame> <end_frame>', got: {transform}")
901
+
902
+ radius = float(params[1])
903
+
904
+ if len(params) == 4:
905
+ start_frame = max(0, min(48, int(params[2])))
906
+ end_frame = max(0, min(48, int(params[3])))
907
+ if start_frame > end_frame:
908
+ start_frame, end_frame = end_frame, start_frame
909
+
910
+ current_motion = torch.eye(4, device=self.device).unsqueeze(0).repeat(49, 1, 1)
911
+ spiral_motion = self.spiral_poses(radius)
912
+ for frame_idx in range(49):
913
+ if frame_idx < start_frame:
914
+ continue
915
+ elif frame_idx <= end_frame:
916
+ t = (frame_idx - start_frame) / (end_frame - start_frame)
917
+ idx = int(t * (len(spiral_motion) - 1))
918
+ current_motion[frame_idx] = spiral_motion[idx]
919
+ else:
920
+ current_motion[frame_idx] = current_motion[end_frame]
921
+
922
+ # Combine with previous transformations
923
+ final_motion = torch.matmul(final_motion, current_motion)
924
+
925
+ else:
926
+ raise ValueError(f'camera_motion type must be in [trans, spiral, rot], but got {motion_type}')
927
+
928
+ return final_motion
929
+
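+ # Example of the motion-string format parsed by get_default_motion above: a dolly-in
+ # over frames 0-30 combined with a 25-degree rotation about x, followed by a small
+ # sideways translation for the remaining frames. The values are purely illustrative.
+ def _example_camera_motion_matrices():
+     gen = CameraMotionGenerator("trans 0 0 0.5 0 30; rot x 25 0 30; trans 0.1 0 0 30 48")
+     return gen.get_default_motion()  # [49, 4, 4] camera pose matrices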
930
+ class ObjectMotionGenerator:
931
+ def __init__(self, device="cuda:0"):
932
+ self.device = device
933
+ self.num_frames = 49
934
+
935
+ def _get_points_in_mask(self, pred_tracks, mask):
936
+ """Get points that lie within the mask
937
+
938
+ Args:
939
+ pred_tracks (torch.Tensor): Point trajectories [num_frames, num_points, 3]
940
+ mask (torch.Tensor): Binary mask [H, W]
941
+
942
+ Returns:
943
+ torch.Tensor: Boolean mask for selected points [num_points]
944
+ """
945
+ first_frame_points = pred_tracks[0] # [num_points, 3]
946
+ xy_points = first_frame_points[:, :2] # [num_points, 2]
947
+
948
+ xy_pixels = xy_points.round().long()
949
+ xy_pixels[:, 0].clamp_(0, mask.shape[1] - 1)
950
+ xy_pixels[:, 1].clamp_(0, mask.shape[0] - 1)
951
+
952
+ points_in_mask = mask[xy_pixels[:, 1], xy_pixels[:, 0]]
953
+
954
+ return points_in_mask
955
+
956
+ def apply_motion(self, pred_tracks, mask, motion_type, distance, num_frames=49, tracking_method="spatracker"):
957
+
958
+ self.num_frames = num_frames
959
+ pred_tracks = pred_tracks.to(self.device).float()
960
+ mask = mask.to(self.device)
961
+
962
+ template = {
963
+ 'up': ('trans', torch.tensor([0, -1, 0])),
964
+ 'down': ('trans', torch.tensor([0, 1, 0])),
965
+ 'left': ('trans', torch.tensor([-1, 0, 0])),
966
+ 'right': ('trans', torch.tensor([1, 0, 0])),
967
+ 'front': ('trans', torch.tensor([0, 0, 1])),
968
+ 'back': ('trans', torch.tensor([0, 0, -1])),
969
+ 'rot': ('rot', None) # rotate around y axis
970
+ }
971
+
972
+ if motion_type not in template:
973
+ raise ValueError(f"unknown motion type: {motion_type}")
974
+
975
+ motion_type, base_vec = template[motion_type]
976
+ if base_vec is not None:
977
+ base_vec = base_vec.to(self.device) * distance
978
+
979
+ if tracking_method == "moge":
980
+ T, H, W, _ = pred_tracks.shape
981
+ valid_selected = ~torch.any(torch.isnan(pred_tracks[0]), dim=2) & mask
982
+ points = pred_tracks[0][valid_selected].reshape(-1, 3)
983
+ else:
984
+ points_in_mask = self._get_points_in_mask(pred_tracks, mask)
985
+ points = pred_tracks[0, points_in_mask]
986
+
987
+ center = points.mean(dim=0)
988
+
989
+ motions = []
990
+ for frame_idx in range(num_frames):
991
+ t = frame_idx / (num_frames - 1)
992
+ current_motion = torch.eye(4, device=self.device)
993
+ current_motion[:3, 3] = -center
994
+ motion_mat = torch.eye(4, device=self.device)
995
+ if motion_type == 'trans':
996
+ motion_mat[:3, 3] = base_vec * t
997
+ else: # 'rot'
998
+ angle_rad = torch.deg2rad(torch.tensor(distance * t, device=self.device))
999
+ cos_t = torch.cos(angle_rad)
1000
+ sin_t = torch.sin(angle_rad)
1001
+ motion_mat[0, 0] = cos_t
1002
+ motion_mat[0, 2] = sin_t
1003
+ motion_mat[2, 0] = -sin_t
1004
+ motion_mat[2, 2] = cos_t
1005
+
1006
+ current_motion = motion_mat @ current_motion
1007
+ current_motion[:3, 3] += center
1008
+ motions.append(current_motion)
1009
+
1010
+ motions = torch.stack(motions) # [num_frames, 4, 4]
1011
+
1012
+ if tracking_method == "moge":
1013
+ modified_tracks = pred_tracks.clone().reshape(T, -1, 3)
1014
+ valid_selected = valid_selected.reshape([-1])
1015
+
1016
+ for frame_idx in range(self.num_frames):
1017
+ motion_mat = motions[frame_idx]
1018
+ if W > 1:
1019
+ motion_mat = motion_mat.clone()
1020
+ motion_mat[0, 3] /= W
1021
+ motion_mat[1, 3] /= H
1022
+ points = modified_tracks[frame_idx, valid_selected]
1023
+ points_homo = torch.cat([points, torch.ones_like(points[:, :1])], dim=1)
1024
+ transformed_points = torch.matmul(points_homo, motion_mat.T)
1025
+ modified_tracks[frame_idx, valid_selected] = transformed_points[:, :3]
1026
+
1027
+ return modified_tracks.reshape(T, H, W, 3)
1028
+
1029
+ else:
1030
+ points_in_mask = self._get_points_in_mask(pred_tracks, mask)
1031
+ modified_tracks = pred_tracks.clone()
1032
+
1033
+ for frame_idx in range(pred_tracks.shape[0]):
1034
+ motion_mat = motions[frame_idx]
1035
+ points = modified_tracks[frame_idx, points_in_mask]
1036
+ points_homo = torch.cat([points, torch.ones_like(points[:, :1])], dim=1)
1037
+ transformed_points = torch.matmul(points_homo, motion_mat.T)
1038
+ modified_tracks[frame_idx, points_in_mask] = transformed_points[:, :3]
1039
+
1040
+ return modified_tracks
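+ # Hedged sketch of ObjectMotionGenerator with SpaTracker-style trajectories
+ # ([T, N, 3] points plus a binary object mask); not invoked in this module.
+ def _example_object_motion(pred_tracks, mask):
+     gen = ObjectMotionGenerator(device="cuda:0")
+     # Shift the masked object to the right by `distance` units (pixels for SpaTracker tracks).
+     return gen.apply_motion(
+         pred_tracks=pred_tracks,
+         mask=mask,
+         motion_type="right",
+         distance=50,
+         num_frames=49,
+         tracking_method="spatracker",
+     )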
models/spatracker/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
models/spatracker/models/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
models/spatracker/models/build_spatracker.py ADDED
@@ -0,0 +1,51 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+
9
+ from models.spatracker.models.core.spatracker.spatracker import SpaTracker
10
+
11
+
12
+ def build_spatracker(
13
+ checkpoint: str,
14
+ seq_length: int = 8,
15
+ ):
16
+ model_name = checkpoint.split("/")[-1].split(".")[0]
17
+ return build_spatracker_from_cfg(checkpoint=checkpoint, seq_length=seq_length)
18
+
19
+
20
+
21
+ # model used to produce the results in the paper
22
+ def build_spatracker_from_cfg(checkpoint=None, seq_length=8):
23
+ return _build_spatracker(
24
+ stride=4,
25
+ sequence_len=seq_length,
26
+ checkpoint=checkpoint,
27
+ )
28
+
29
+
30
+ def _build_spatracker(
31
+ stride,
32
+ sequence_len,
33
+ checkpoint=None,
34
+ ):
35
+ spatracker = SpaTracker(
36
+ stride=stride,
37
+ S=sequence_len,
38
+ add_space_attn=True,
39
+ space_depth=6,
40
+ time_depth=6,
41
+ )
42
+ if checkpoint is not None:
43
+ with open(checkpoint, "rb") as f:
44
+ state_dict = torch.load(f, map_location="cpu", weights_only=True)
45
+ if "model" in state_dict:
46
+ model_paras = spatracker.state_dict()
47
+ paras_dict = {k: v for k,v in state_dict["model"].items() if k in spatracker.state_dict()}
48
+ model_paras.update(paras_dict)
49
+ state_dict = model_paras
50
+ spatracker.load_state_dict(state_dict)
51
+ return spatracker
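+ # Hedged usage sketch; the checkpoint path mirrors the one referenced in
+ # models/pipelines.py and is an assumption here. Not invoked in this module.
+ def _example_build(checkpoint="checkpoints/spatracker/spaT_final.pth"):
+     model = build_spatracker(checkpoint, seq_length=12)
+     return model.eval()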
models/spatracker/models/core/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
models/spatracker/models/core/embeddings.py ADDED
@@ -0,0 +1,250 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import numpy as np
9
+
10
+ def get_3d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
11
+ """
12
+ grid_size: int of the grid height and width
13
+ return:
14
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
15
+ """
16
+ if isinstance(grid_size, tuple):
17
+ grid_size_h, grid_size_w = grid_size
18
+ else:
19
+ grid_size_h = grid_size_w = grid_size
20
+ grid_h = np.arange(grid_size_h, dtype=np.float32)
21
+ grid_w = np.arange(grid_size_w, dtype=np.float32)
22
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
23
+ grid = np.stack(grid, axis=0)
24
+
25
+ grid = grid.reshape([2, 1, grid_size_h, grid_size_w])
26
+ pos_embed = get_3d_sincos_pos_embed_from_grid(embed_dim, grid)
27
+ if cls_token and extra_tokens > 0:
28
+ pos_embed = np.concatenate(
29
+ [np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0
30
+ )
31
+ return pos_embed
32
+
33
+
34
+ def get_3d_sincos_pos_embed_from_grid(embed_dim, grid):
35
+ assert embed_dim % 3 == 0
36
+
37
+ # use half of dimensions to encode grid_h
38
+ B, S, N, _ = grid.shape
39
+ gridx = grid[..., 0].view(B*S*N).detach().cpu().numpy()
40
+ gridy = grid[..., 1].view(B*S*N).detach().cpu().numpy()
41
+ gridz = grid[..., 2].view(B*S*N).detach().cpu().numpy()
42
+
43
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, gridx) # (N, D/3)
44
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, gridy) # (N, D/3)
45
+ emb_z = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, gridz) # (N, D/3)
46
+
47
+
48
+ emb = np.concatenate([emb_h, emb_w, emb_z], axis=1) # (N, D)
49
+ emb = torch.from_numpy(emb).to(grid.device)
50
+ return emb.view(B, S, N, embed_dim)
51
+
52
+
53
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
54
+ """
55
+ grid_size: int of the grid height and width
56
+ return:
57
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
58
+ """
59
+ if isinstance(grid_size, tuple):
60
+ grid_size_h, grid_size_w = grid_size
61
+ else:
62
+ grid_size_h = grid_size_w = grid_size
63
+ grid_h = np.arange(grid_size_h, dtype=np.float32)
64
+ grid_w = np.arange(grid_size_w, dtype=np.float32)
65
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
66
+ grid = np.stack(grid, axis=0)
67
+
68
+ grid = grid.reshape([2, 1, grid_size_h, grid_size_w])
69
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
70
+ if cls_token and extra_tokens > 0:
71
+ pos_embed = np.concatenate(
72
+ [np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0
73
+ )
74
+ return pos_embed
75
+
76
+
77
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
78
+ assert embed_dim % 2 == 0
79
+
80
+ # use half of dimensions to encode grid_h
81
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
82
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
83
+
84
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
85
+ return emb
86
+
87
+
88
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
89
+ """
90
+ embed_dim: output dimension for each position
91
+ pos: a list of positions to be encoded: size (M,)
92
+ out: (M, D)
93
+ """
94
+ assert embed_dim % 2 == 0
95
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
96
+ omega /= embed_dim / 2.0
97
+ omega = 1.0 / 10000 ** omega # (D/2,)
98
+
99
+ pos = pos.reshape(-1) # (M,)
100
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
101
+
102
+ emb_sin = np.sin(out) # (M, D/2)
103
+ emb_cos = np.cos(out) # (M, D/2)
104
+
105
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
106
+ return emb
107
+
108
+
109
+ def get_2d_embedding(xy, C, cat_coords=True):
110
+ B, N, D = xy.shape
111
+ assert D == 2
112
+
113
+ x = xy[:, :, 0:1]
114
+ y = xy[:, :, 1:2]
115
+ div_term = (
116
+ torch.arange(0, C, 2, device=xy.device, dtype=torch.float32) * (1000.0 / C)
117
+ ).reshape(1, 1, int(C / 2))
118
+
119
+ pe_x = torch.zeros(B, N, C, device=xy.device, dtype=torch.float32)
120
+ pe_y = torch.zeros(B, N, C, device=xy.device, dtype=torch.float32)
121
+
122
+ pe_x[:, :, 0::2] = torch.sin(x * div_term)
123
+ pe_x[:, :, 1::2] = torch.cos(x * div_term)
124
+
125
+ pe_y[:, :, 0::2] = torch.sin(y * div_term)
126
+ pe_y[:, :, 1::2] = torch.cos(y * div_term)
127
+
128
+ pe = torch.cat([pe_x, pe_y], dim=2) # B, N, C*2
129
+ if cat_coords:
130
+ pe = torch.cat([xy, pe], dim=2) # B, N, C*2+2
131
+ return pe
132
+
133
+
134
+ def get_3d_embedding(xyz, C, cat_coords=True):
135
+ B, N, D = xyz.shape
136
+ assert D == 3
137
+
138
+ x = xyz[:, :, 0:1]
139
+ y = xyz[:, :, 1:2]
140
+ z = xyz[:, :, 2:3]
141
+ div_term = (
142
+ torch.arange(0, C, 2, device=xyz.device, dtype=torch.float32) * (1000.0 / C)
143
+ ).reshape(1, 1, int(C / 2))
144
+
145
+ pe_x = torch.zeros(B, N, C, device=xyz.device, dtype=torch.float32)
146
+ pe_y = torch.zeros(B, N, C, device=xyz.device, dtype=torch.float32)
147
+ pe_z = torch.zeros(B, N, C, device=xyz.device, dtype=torch.float32)
148
+
149
+ pe_x[:, :, 0::2] = torch.sin(x * div_term)
150
+ pe_x[:, :, 1::2] = torch.cos(x * div_term)
151
+
152
+ pe_y[:, :, 0::2] = torch.sin(y * div_term)
153
+ pe_y[:, :, 1::2] = torch.cos(y * div_term)
154
+
155
+ pe_z[:, :, 0::2] = torch.sin(z * div_term)
156
+ pe_z[:, :, 1::2] = torch.cos(z * div_term)
157
+
158
+ pe = torch.cat([pe_x, pe_y, pe_z], dim=2) # B, N, C*3
159
+ if cat_coords:
160
+ pe = torch.cat([pe, xyz], dim=2) # B, N, C*3+3
161
+ return pe
162
+
163
+
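+ # Illustrative calls (not in the original file): per-point sin/cos features, with the raw
+ # coordinates optionally appended.
+ # xy = torch.rand(2, 100, 2) * 64                # pixel coordinates, B=2, N=100
+ # get_2d_embedding(xy, C=32).shape               # (2, 100, 32*2 + 2)
+ # xyz = torch.rand(2, 100, 3)
+ # get_3d_embedding(xyz, C=32).shape              # (2, 100, 32*3 + 3)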
164
+ def get_4d_embedding(xyzw, C, cat_coords=True):
165
+ B, N, D = xyzw.shape
166
+ assert D == 4
167
+
168
+ x = xyzw[:, :, 0:1]
169
+ y = xyzw[:, :, 1:2]
170
+ z = xyzw[:, :, 2:3]
171
+ w = xyzw[:, :, 3:4]
172
+ div_term = (
173
+ torch.arange(0, C, 2, device=xyzw.device, dtype=torch.float32) * (1000.0 / C)
174
+ ).reshape(1, 1, int(C / 2))
175
+
176
+ pe_x = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)
177
+ pe_y = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)
178
+ pe_z = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)
179
+ pe_w = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)
180
+
181
+ pe_x[:, :, 0::2] = torch.sin(x * div_term)
182
+ pe_x[:, :, 1::2] = torch.cos(x * div_term)
183
+
184
+ pe_y[:, :, 0::2] = torch.sin(y * div_term)
185
+ pe_y[:, :, 1::2] = torch.cos(y * div_term)
186
+
187
+ pe_z[:, :, 0::2] = torch.sin(z * div_term)
188
+ pe_z[:, :, 1::2] = torch.cos(z * div_term)
189
+
190
+ pe_w[:, :, 0::2] = torch.sin(w * div_term)
191
+ pe_w[:, :, 1::2] = torch.cos(w * div_term)
192
+
193
+ pe = torch.cat([pe_x, pe_y, pe_z, pe_w], dim=2) # B, N, C*4
194
+ if cat_coords:
195
+ pe = torch.cat([pe, xyzw], dim=2) # B, N, C*4+4
196
+ return pe
197
+
198
+ import torch.nn as nn
199
+ class Embedder_Fourier(nn.Module):
200
+ def __init__(self, input_dim, max_freq_log2, N_freqs,
201
+ log_sampling=True, include_input=True,
202
+ periodic_fns=(torch.sin, torch.cos)):
203
+ '''
204
+ :param input_dim: dimension of input to be embedded
205
+ :param max_freq_log2: log2 of max freq; min freq is 1 by default
206
+ :param N_freqs: number of frequency bands
207
+ :param log_sampling: if True, frequency bands are linearly sampled in log-space
208
+ :param include_input: if True, raw input is included in the embedding
209
+ :param periodic_fns: periodic functions used to embed input
210
+ '''
211
+ super(Embedder_Fourier, self).__init__()
212
+
213
+ self.input_dim = input_dim
214
+ self.include_input = include_input
215
+ self.periodic_fns = periodic_fns
216
+
217
+ self.out_dim = 0
218
+ if self.include_input:
219
+ self.out_dim += self.input_dim
220
+
221
+ self.out_dim += self.input_dim * N_freqs * len(self.periodic_fns)
222
+
223
+ if log_sampling:
224
+ self.freq_bands = 2. ** torch.linspace(0., max_freq_log2, N_freqs)
225
+ else:
226
+ self.freq_bands = torch.linspace(
227
+ 2. ** 0., 2. ** max_freq_log2, N_freqs)
228
+
229
+ self.freq_bands = self.freq_bands.numpy().tolist()
230
+
231
+ def forward(self,
232
+ input: torch.Tensor,
233
+ rescale: float = 1.0):
234
+ '''
235
+ :param input: tensor of shape [..., self.input_dim]
236
+ :return: tensor of shape [..., self.out_dim]
237
+ '''
238
+ assert (input.shape[-1] == self.input_dim)
239
+ out = []
240
+ if self.include_input:
241
+ out.append(input/rescale)
242
+
243
+ for i in range(len(self.freq_bands)):
244
+ freq = self.freq_bands[i]
245
+ for p_fn in self.periodic_fns:
246
+ out.append(p_fn(input * freq))
247
+ out = torch.cat(out, dim=-1)
248
+
249
+ assert (out.shape[-1] == self.out_dim)
250
+ return out
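+
+
+ if __name__ == "__main__":
+     # Minimal smoke test, added for illustration only (not part of the original commit).
+     # It assumes `numpy as np` and `torch` are imported at the top of this file, as the
+     # helpers above already require.
+     pos = get_2d_sincos_pos_embed(64, 8)
+     assert pos.shape == (8 * 8, 64)
+     fourier = Embedder_Fourier(input_dim=3, max_freq_log2=9, N_freqs=10)
+     x = torch.rand(4, 3)
+     assert fourier(x).shape == (4, fourier.out_dim)   # out_dim = 3 + 3 * 10 * 2 = 63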
models/spatracker/models/core/model_utils.py ADDED
@@ -0,0 +1,477 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from easydict import EasyDict as edict
10
+ from sklearn.decomposition import PCA
11
+ import matplotlib.pyplot as plt
12
+
13
+ EPS = 1e-6
14
+
15
+ def nearest_sample2d(im, x, y, return_inbounds=False):
16
+ # x and y are each B, N
17
+ # output is B, C, N
18
+ if len(im.shape) == 5:
19
+ B, N, C, H, W = list(im.shape)
20
+ else:
21
+ B, C, H, W = list(im.shape)
22
+ N = list(x.shape)[1]
23
+
24
+ x = x.float()
25
+ y = y.float()
26
+ H_f = torch.tensor(H, dtype=torch.float32)
27
+ W_f = torch.tensor(W, dtype=torch.float32)
28
+
29
+ # inbound_mask = (x>-0.5).float()*(y>-0.5).float()*(x<W_f+0.5).float()*(y<H_f+0.5).float()
30
+
31
+ max_y = (H_f - 1).int()
32
+ max_x = (W_f - 1).int()
33
+
34
+ x0 = torch.floor(x).int()
35
+ x1 = x0 + 1
36
+ y0 = torch.floor(y).int()
37
+ y1 = y0 + 1
38
+
39
+ x0_clip = torch.clamp(x0, 0, max_x)
40
+ x1_clip = torch.clamp(x1, 0, max_x)
41
+ y0_clip = torch.clamp(y0, 0, max_y)
42
+ y1_clip = torch.clamp(y1, 0, max_y)
43
+ dim2 = W
44
+ dim1 = W * H
45
+
46
+ base = torch.arange(0, B, dtype=torch.int64, device=x.device) * dim1
47
+ base = torch.reshape(base, [B, 1]).repeat([1, N])
48
+
49
+ base_y0 = base + y0_clip * dim2
50
+ base_y1 = base + y1_clip * dim2
51
+
52
+ idx_y0_x0 = base_y0 + x0_clip
53
+ idx_y0_x1 = base_y0 + x1_clip
54
+ idx_y1_x0 = base_y1 + x0_clip
55
+ idx_y1_x1 = base_y1 + x1_clip
56
+
57
+ # use the indices to lookup pixels in the flat image
58
+ # im is B x C x H x W
59
+ # move C out to last dim
60
+ if len(im.shape) == 5:
61
+ im_flat = (im.permute(0, 3, 4, 1, 2)).reshape(B * H * W, N, C)
62
+ i_y0_x0 = torch.diagonal(im_flat[idx_y0_x0.long()], dim1=1, dim2=2).permute(
63
+ 0, 2, 1
64
+ )
65
+ i_y0_x1 = torch.diagonal(im_flat[idx_y0_x1.long()], dim1=1, dim2=2).permute(
66
+ 0, 2, 1
67
+ )
68
+ i_y1_x0 = torch.diagonal(im_flat[idx_y1_x0.long()], dim1=1, dim2=2).permute(
69
+ 0, 2, 1
70
+ )
71
+ i_y1_x1 = torch.diagonal(im_flat[idx_y1_x1.long()], dim1=1, dim2=2).permute(
72
+ 0, 2, 1
73
+ )
74
+ else:
75
+ im_flat = (im.permute(0, 2, 3, 1)).reshape(B * H * W, C)
76
+ i_y0_x0 = im_flat[idx_y0_x0.long()]
77
+ i_y0_x1 = im_flat[idx_y0_x1.long()]
78
+ i_y1_x0 = im_flat[idx_y1_x0.long()]
79
+ i_y1_x1 = im_flat[idx_y1_x1.long()]
80
+
81
+ # Finally compute the bilinear weights and keep the nearest (max-weight) corner.
82
+ x0_f = x0.float()
83
+ x1_f = x1.float()
84
+ y0_f = y0.float()
85
+ y1_f = y1.float()
86
+
87
+ w_y0_x0 = ((x1_f - x) * (y1_f - y)).unsqueeze(2)
88
+ w_y0_x1 = ((x - x0_f) * (y1_f - y)).unsqueeze(2)
89
+ w_y1_x0 = ((x1_f - x) * (y - y0_f)).unsqueeze(2)
90
+ w_y1_x1 = ((x - x0_f) * (y - y0_f)).unsqueeze(2)
91
+
92
+ # w_yi_xo is B * N * 1
93
+ max_idx = torch.cat([w_y0_x0, w_y0_x1, w_y1_x0, w_y1_x1], dim=-1).max(dim=-1)[1]
94
+ output = torch.stack([i_y0_x0, i_y0_x1, i_y1_x0, i_y1_x1], dim=-1).gather(-1, max_idx[...,None,None].repeat(1,1,C,1)).squeeze(-1)
95
+
96
+ # output is B*N x C
97
+ output = output.view(B, -1, C)
98
+ output = output.permute(0, 2, 1)
99
+ # output is B x C x N
100
+
101
+ if return_inbounds:
102
+ x_valid = (x > -0.5).byte() & (x < float(W_f - 0.5)).byte()
103
+ y_valid = (y > -0.5).byte() & (y < float(H_f - 0.5)).byte()
104
+ inbounds = (x_valid & y_valid).float()
105
+ inbounds = inbounds.reshape(
106
+ B, N
107
+ ) # something seems wrong here for B>1; i'm getting an error here (or downstream if i put -1)
108
+ return output, inbounds
109
+
110
+ return output # B, C, N
111
+
112
+ def smart_cat(tensor1, tensor2, dim):
113
+ if tensor1 is None:
114
+ return tensor2
115
+ return torch.cat([tensor1, tensor2], dim=dim)
116
+
117
+
118
+ def normalize_single(d):
119
+ # d is a whatever shape torch tensor
120
+ dmin = torch.min(d)
121
+ dmax = torch.max(d)
122
+ d = (d - dmin) / (EPS + (dmax - dmin))
123
+ return d
124
+
125
+
126
+ def normalize(d):
127
+ # d is B x whatever. normalize within each element of the batch
128
+ out = torch.zeros(d.size())
129
+ if d.is_cuda:
130
+ out = out.cuda()
131
+ B = list(d.size())[0]
132
+ for b in list(range(B)):
133
+ out[b] = normalize_single(d[b])
134
+ return out
135
+
136
+
137
+ def meshgrid2d(B, Y, X, stack=False, norm=False, device="cuda"):
138
+ # returns a meshgrid sized B x Y x X
139
+
140
+ grid_y = torch.linspace(0.0, Y - 1, Y, device=torch.device(device))
141
+ grid_y = torch.reshape(grid_y, [1, Y, 1])
142
+ grid_y = grid_y.repeat(B, 1, X)
143
+
144
+ grid_x = torch.linspace(0.0, X - 1, X, device=torch.device(device))
145
+ grid_x = torch.reshape(grid_x, [1, 1, X])
146
+ grid_x = grid_x.repeat(B, Y, 1)
147
+
148
+ if stack:
149
+ # note we stack in xy order
150
+ # (see https://pytorch.org/docs/stable/nn.functional.html#torch.nn.functional.grid_sample)
151
+ grid = torch.stack([grid_x, grid_y], dim=-1)
152
+ return grid
153
+ else:
154
+ return grid_y, grid_x
155
+
156
+
157
+ def reduce_masked_mean(x, mask, dim=None, keepdim=False):
158
+ # x and mask are the same shape, or at least broadcastably so < actually it's safer if you disallow broadcasting
159
+ # returns shape-1
160
+ # axis can be a list of axes
161
+ for (a, b) in zip(x.size(), mask.size()):
162
+ assert a == b # some shape mismatch!
163
+ prod = x * mask
164
+ if dim is None:
165
+ numer = torch.sum(prod)
166
+ denom = EPS + torch.sum(mask)
167
+ else:
168
+ numer = torch.sum(prod, dim=dim, keepdim=keepdim)
169
+ denom = EPS + torch.sum(mask, dim=dim, keepdim=keepdim)
170
+
171
+ mean = numer / denom
172
+ return mean
173
+
174
+
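+ # Illustrative example (not in the original file): average x only where mask == 1.
+ # x    = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
+ # mask = torch.tensor([[1.0, 1.0, 0.0, 0.0]])
+ # reduce_masked_mean(x, mask)          # ~1.5 (the masked-out 3.0 and 4.0 are ignored)
+ # reduce_masked_mean(x, mask, dim=1)   # tensor([1.5])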
175
+ def bilinear_sample2d(im, x, y, return_inbounds=False):
176
+ # x and y are each B, N
177
+ # output is B, C, N
178
+ if len(im.shape) == 5:
179
+ B, N, C, H, W = list(im.shape)
180
+ else:
181
+ B, C, H, W = list(im.shape)
182
+ N = list(x.shape)[1]
183
+
184
+ x = x.float()
185
+ y = y.float()
186
+ H_f = torch.tensor(H, dtype=torch.float32)
187
+ W_f = torch.tensor(W, dtype=torch.float32)
188
+
189
+ # inbound_mask = (x>-0.5).float()*(y>-0.5).float()*(x<W_f+0.5).float()*(y<H_f+0.5).float()
190
+
191
+ max_y = (H_f - 1).int()
192
+ max_x = (W_f - 1).int()
193
+
194
+ x0 = torch.floor(x).int()
195
+ x1 = x0 + 1
196
+ y0 = torch.floor(y).int()
197
+ y1 = y0 + 1
198
+
199
+ x0_clip = torch.clamp(x0, 0, max_x)
200
+ x1_clip = torch.clamp(x1, 0, max_x)
201
+ y0_clip = torch.clamp(y0, 0, max_y)
202
+ y1_clip = torch.clamp(y1, 0, max_y)
203
+ dim2 = W
204
+ dim1 = W * H
205
+
206
+ base = torch.arange(0, B, dtype=torch.int64, device=x.device) * dim1
207
+ base = torch.reshape(base, [B, 1]).repeat([1, N])
208
+
209
+ base_y0 = base + y0_clip * dim2
210
+ base_y1 = base + y1_clip * dim2
211
+
212
+ idx_y0_x0 = base_y0 + x0_clip
213
+ idx_y0_x1 = base_y0 + x1_clip
214
+ idx_y1_x0 = base_y1 + x0_clip
215
+ idx_y1_x1 = base_y1 + x1_clip
216
+
217
+ # use the indices to lookup pixels in the flat image
218
+ # im is B x C x H x W
219
+ # move C out to last dim
220
+ if len(im.shape) == 5:
221
+ im_flat = (im.permute(0, 3, 4, 1, 2)).reshape(B * H * W, N, C)
222
+ i_y0_x0 = torch.diagonal(im_flat[idx_y0_x0.long()], dim1=1, dim2=2).permute(
223
+ 0, 2, 1
224
+ )
225
+ i_y0_x1 = torch.diagonal(im_flat[idx_y0_x1.long()], dim1=1, dim2=2).permute(
226
+ 0, 2, 1
227
+ )
228
+ i_y1_x0 = torch.diagonal(im_flat[idx_y1_x0.long()], dim1=1, dim2=2).permute(
229
+ 0, 2, 1
230
+ )
231
+ i_y1_x1 = torch.diagonal(im_flat[idx_y1_x1.long()], dim1=1, dim2=2).permute(
232
+ 0, 2, 1
233
+ )
234
+ else:
235
+ im_flat = (im.permute(0, 2, 3, 1)).reshape(B * H * W, C)
236
+ i_y0_x0 = im_flat[idx_y0_x0.long()]
237
+ i_y0_x1 = im_flat[idx_y0_x1.long()]
238
+ i_y1_x0 = im_flat[idx_y1_x0.long()]
239
+ i_y1_x1 = im_flat[idx_y1_x1.long()]
240
+
241
+ # Finally calculate interpolated values.
242
+ x0_f = x0.float()
243
+ x1_f = x1.float()
244
+ y0_f = y0.float()
245
+ y1_f = y1.float()
246
+
247
+ w_y0_x0 = ((x1_f - x) * (y1_f - y)).unsqueeze(2)
248
+ w_y0_x1 = ((x - x0_f) * (y1_f - y)).unsqueeze(2)
249
+ w_y1_x0 = ((x1_f - x) * (y - y0_f)).unsqueeze(2)
250
+ w_y1_x1 = ((x - x0_f) * (y - y0_f)).unsqueeze(2)
251
+
252
+ output = (
253
+ w_y0_x0 * i_y0_x0 + w_y0_x1 * i_y0_x1 + w_y1_x0 * i_y1_x0 + w_y1_x1 * i_y1_x1
254
+ )
255
+ # output is B*N x C
256
+ output = output.view(B, -1, C)
257
+ output = output.permute(0, 2, 1)
258
+ # output is B x C x N
259
+
260
+ if return_inbounds:
261
+ x_valid = (x > -0.5).byte() & (x < float(W_f - 0.5)).byte()
262
+ y_valid = (y > -0.5).byte() & (y < float(H_f - 0.5)).byte()
263
+ inbounds = (x_valid & y_valid).float()
264
+ inbounds = inbounds.reshape(
265
+ B, N
266
+ ) # something seems wrong here for B>1; i'm getting an error here (or downstream if i put -1)
267
+ return output, inbounds
268
+
269
+ return output # B, C, N
270
+
271
+
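+ # Illustrative call (not in the original file): sample per-point features from a
+ # B x C x H x W map at sub-pixel (x, y) locations.
+ # im = torch.rand(1, 16, 32, 32)
+ # x = torch.tensor([[3.5, 10.0]]); y = torch.tensor([[7.25, 20.0]])        # B=1, N=2
+ # bilinear_sample2d(im, x, y).shape                                        # (1, 16, 2)
+ # _, inb = bilinear_sample2d(im, x, y, return_inbounds=True)               # inb: (1, 2)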
272
+ def procrustes_analysis(X0,X1,Weight): # [B,N,3]
273
+ # translation
274
+ t0 = X0.mean(dim=1,keepdim=True)
275
+ t1 = X1.mean(dim=1,keepdim=True)
276
+ X0c = X0-t0
277
+ X1c = X1-t1
278
+ # scale
279
+ # s0 = (X0c**2).sum(dim=-1).mean().sqrt()
280
+ # s1 = (X1c**2).sum(dim=-1).mean().sqrt()
281
+ # X0cs = X0c/s0
282
+ # X1cs = X1c/s1
283
+ # rotation (use double for SVD, float loses precision)
284
+ U,_,V = (X0c.t()@X1c).double().svd(some=True)
285
+ R = (U@V.t()).float()
286
+ if R.det()<0: R[2] *= -1
287
+ # align X1 to X0: X1to0 = (X1-t1)@R.t()+t0
288
+ se3 = edict(t0=t0[0],t1=t1[0],R=R)
289
+
290
+ return se3
291
+
292
+ def bilinear_sampler(input, coords, align_corners=True, padding_mode="border"):
293
+ r"""Sample a tensor using bilinear interpolation
294
+
295
+ `bilinear_sampler(input, coords)` samples a tensor :attr:`input` at
296
+ coordinates :attr:`coords` using bilinear interpolation. It is the same
297
+ as `torch.nn.functional.grid_sample()` but with a different coordinate
298
+ convention.
299
+
300
+ The input tensor is assumed to be of shape :math:`(B, C, H, W)`, where
301
+ :math:`B` is the batch size, :math:`C` is the number of channels,
302
+ :math:`H` is the height of the image, and :math:`W` is the width of the
303
+ image. The tensor :attr:`coords` of shape :math:`(B, H_o, W_o, 2)` is
304
+ interpreted as an array of 2D point coordinates :math:`(x_i,y_i)`.
305
+
306
+ Alternatively, the input tensor can be of size :math:`(B, C, T, H, W)`,
307
+ in which case sample points are triplets :math:`(t_i,x_i,y_i)`. Note
308
+ that in this case the order of the components is slightly different
309
+ from `grid_sample()`, which would expect :math:`(x_i,y_i,t_i)`.
310
+
311
+ If `align_corners` is `True`, the coordinate :math:`x` is assumed to be
312
+ in the range :math:`[0,W-1]`, with 0 corresponding to the center of the
313
+ left-most image pixel and :math:`W-1` to the center of the right-most
314
+ pixel.
315
+
316
+ If `align_corners` is `False`, the coordinate :math:`x` is assumed to
317
+ be in the range :math:`[0,W]`, with 0 corresponding to the left edge of
318
+ the left-most pixel and :math:`W` to the right edge of the right-most
319
+ pixel.
320
+
321
+ Similar conventions apply to the :math:`y` for the range
322
+ :math:`[0,H-1]` and :math:`[0,H]` and to :math:`t` for the range
323
+ :math:`[0,T-1]` and :math:`[0,T]`.
324
+
325
+ Args:
326
+ input (Tensor): batch of input images.
327
+ coords (Tensor): batch of coordinates.
328
+ align_corners (bool, optional): Coordinate convention. Defaults to `True`.
329
+ padding_mode (str, optional): Padding mode. Defaults to `"border"`.
330
+
331
+ Returns:
332
+ Tensor: sampled points.
333
+ """
334
+
335
+ sizes = input.shape[2:]
336
+
337
+ assert len(sizes) in [2, 3]
338
+
339
+ if len(sizes) == 3:
340
+ # t x y -> x y t to match dimensions T H W in grid_sample
341
+ coords = coords[..., [1, 2, 0]]
342
+
343
+ if align_corners:
344
+ coords = coords * torch.tensor(
345
+ [2 / max(size - 1, 1) for size in reversed(sizes)], device=coords.device
346
+ )
347
+ else:
348
+ coords = coords * torch.tensor([2 / size for size in reversed(sizes)], device=coords.device)
349
+
350
+ coords -= 1
351
+
352
+ return F.grid_sample(input, coords, align_corners=align_corners, padding_mode=padding_mode)
353
+
354
+
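+ # Illustrative call (not in the original file): with align_corners=True the coordinates
+ # are plain pixel coordinates, (x, y) in [0, W-1] x [0, H-1].
+ # feat = torch.rand(1, 8, 24, 24)
+ # coords = torch.tensor([[[[3.0, 5.0], [10.5, 7.25]]]])   # (B=1, H_o=1, W_o=2, 2)
+ # bilinear_sampler(feat, coords).shape                    # (1, 8, 1, 2)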
355
+ def sample_features4d(input, coords):
356
+ r"""Sample spatial features
357
+
358
+ `sample_features4d(input, coords)` samples the spatial features
359
+ :attr:`input` represented by a 4D tensor :math:`(B, C, H, W)`.
360
+
361
+ The field is sampled at coordinates :attr:`coords` using bilinear
362
+ interpolation. :attr:`coords` is assumed to be of shape :math:`(B, R,
363
+ 2)`, where each sample has the format :math:`(x_i, y_i)`. This uses the
364
+ same convention as :func:`bilinear_sampler` with `align_corners=True`.
365
+
366
+ The output tensor has one feature per point, and has shape :math:`(B,
367
+ R, C)`.
368
+
369
+ Args:
370
+ input (Tensor): spatial features.
371
+ coords (Tensor): points.
372
+
373
+ Returns:
374
+ Tensor: sampled features.
375
+ """
376
+
377
+ B, _, _, _ = input.shape
378
+
379
+ # B R 2 -> B R 1 2
380
+ coords = coords.unsqueeze(2)
381
+
382
+ # B C R 1
383
+ feats = bilinear_sampler(input, coords)
384
+
385
+ return feats.permute(0, 2, 1, 3).view(
386
+ B, -1, feats.shape[1] * feats.shape[3]
387
+ ) # B C R 1 -> B R C
388
+
389
+
390
+ def sample_features5d(input, coords):
391
+ r"""Sample spatio-temporal features
392
+
393
+ `sample_features5d(input, coords)` works in the same way as
394
+ :func:`sample_features4d` but for spatio-temporal features and points:
395
+ :attr:`input` is a 5D tensor :math:`(B, T, C, H, W)`, :attr:`coords` is
396
+ a :math:`(B, R1, R2, 3)` tensor of spatio-temporal point :math:`(t_i,
397
+ x_i, y_i)`. The output tensor has shape :math:`(B, R1, R2, C)`.
398
+
399
+ Args:
400
+ input (Tensor): spatio-temporal features.
401
+ coords (Tensor): spatio-temporal points.
402
+
403
+ Returns:
404
+ Tensor: sampled features.
405
+ """
406
+
407
+ B, T, _, _, _ = input.shape
408
+
409
+ # B T C H W -> B C T H W
410
+ input = input.permute(0, 2, 1, 3, 4)
411
+
412
+ # B R1 R2 3 -> B R1 R2 1 3
413
+ coords = coords.unsqueeze(3)
414
+
415
+ # B C R1 R2 1
416
+ feats = bilinear_sampler(input, coords)
417
+
418
+ return feats.permute(0, 2, 3, 1, 4).view(
419
+ B, feats.shape[2], feats.shape[3], feats.shape[1]
420
+ ) # B C R1 R2 1 -> B R1 R2 C
421
+
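+ # Illustrative calls (not in the original file), matching the shapes documented above.
+ # feat4d = torch.rand(2, 64, 32, 32)                       # (B, C, H, W)
+ # pts2d = torch.rand(2, 50, 2) * 31                        # (B, R, 2) as (x, y)
+ # sample_features4d(feat4d, pts2d).shape                   # (2, 50, 64)
+ # feat5d = torch.rand(2, 4, 64, 32, 32)                    # (B, T, C, H, W)
+ # pts3d = torch.rand(2, 5, 50, 3) * 3                      # (B, R1, R2, 3) as (t, x, y)
+ # sample_features5d(feat5d, pts3d).shape                   # (2, 5, 50, 64)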
422
+ def vis_PCA(fmaps, save_dir):
423
+ """
424
+ visualize the PCA of the feature maps
425
+ args:
426
+ fmaps: feature maps 1 C H W
427
+ save_dir: the directory to save the PCA visualization
428
+ """
429
+
430
+ pca = PCA(n_components=3)
431
+ fmap_vis = fmaps[0,...]
432
+ fmap_vnorm = (
433
+ (fmap_vis-fmap_vis.min())/
434
+ (fmap_vis.max()-fmap_vis.min()))
435
+ H_vis, W_vis = fmap_vis.shape[1:]
436
+ fmap_vnorm = fmap_vnorm.reshape(fmap_vnorm.shape[0],
437
+ -1).permute(1,0)
438
+ fmap_pca = pca.fit_transform(fmap_vnorm.detach().cpu().numpy())
439
+ pca = fmap_pca.reshape(H_vis,W_vis,3)
440
+ plt.imsave(save_dir,
441
+ (
442
+ (pca-pca.min())/
443
+ (pca.max()-pca.min())
444
+ ))
445
+
446
+
447
+ # debug=False
448
+ # if debug==True:
449
+ # pcd_idx = 60
450
+ # vis_PCA(fmapYZ[0,:1], "./yz.png")
451
+ # vis_PCA(fmapXZ[0,:1], "./xz.png")
452
+ # vis_PCA(fmaps[0,:1], "./xy.png")
453
+ # vis_PCA(fmaps[0,-1:], "./xy_.png")
454
+ # fxy_q = fxy[0,0,pcd_idx:pcd_idx+1, :, None, None]
455
+ # fyz_q = fyz[0,0,pcd_idx:pcd_idx+1, :, None, None]
456
+ # fxz_q = fxz[0,0,pcd_idx:pcd_idx+1, :, None, None]
457
+ # corr_map = (fxy_q*fmaps[0,-1:]).sum(dim=1)
458
+ # corr_map_yz = (fyz_q*fmapYZ[0,-1:]).sum(dim=1)
459
+ # corr_map_xz = (fxz_q*fmapXZ[0,-1:]).sum(dim=1)
460
+ # coord_last = coords[0,-1,pcd_idx:pcd_idx+1]
461
+ # coord_last_neigh = coords[0,-1, self.neigh_indx[pcd_idx]]
462
+ # depth_last = depths_dnG[-1,0]
463
+ # abs_res = (depth_last-coord_last[-1,-1]).abs()
464
+ # abs_res = (abs_res - abs_res.min())/(abs_res.max()-abs_res.min())
465
+ # res_dp = torch.exp(-abs_res)
466
+ # enhance_corr = res_dp*corr_map
467
+ # plt.imsave("./res.png", res_dp.detach().cpu().numpy())
468
+ # plt.imsave("./enhance_corr.png", enhance_corr[0].detach().cpu().numpy())
469
+ # plt.imsave("./corr_map.png", corr_map[0].detach().cpu().numpy())
470
+ # plt.imsave("./corr_map_yz.png", corr_map_yz[0].detach().cpu().numpy())
471
+ # plt.imsave("./corr_map_xz.png", corr_map_xz[0].detach().cpu().numpy())
472
+ # img_feat = cv2.imread("./xy.png")
473
+ # cv2.circle(img_feat, (int(coord_last[0,0]), int(coord_last[0,1])), 2, (0, 0, 255), -1)
474
+ # for p_i in coord_last_neigh:
475
+ # cv2.circle(img_feat, (int(p_i[0]), int(p_i[1])), 1, (0, 255, 0), -1)
476
+ # cv2.imwrite("./xy_coord.png", img_feat)
477
+ # import ipdb; ipdb.set_trace()
models/spatracker/models/core/spatracker/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
models/spatracker/models/core/spatracker/blocks.py ADDED
@@ -0,0 +1,999 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from torch.cuda.amp import autocast
11
+ from einops import rearrange
12
+ import collections
13
+ from functools import partial
14
+ from itertools import repeat
15
+ import torchvision.models as tvm
+ import numpy as np  # needed below by Lie.SO3_to_so3 (np.pi); not imported elsewhere in this file
16
+
17
+ from models.spatracker.models.core.spatracker.vit.encoder import ImageEncoderViT as vitEnc
18
+ from models.spatracker.models.core.spatracker.dpt.models import DPTEncoder
19
+ from models.spatracker.models.core.spatracker.loftr import LocalFeatureTransformer
20
+ # from models.monoD.depth_anything.dpt import DPTHeadEnc, DPTHead
21
+
22
+ # From PyTorch internals
23
+ def _ntuple(n):
24
+ def parse(x):
25
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
26
+ return tuple(x)
27
+ return tuple(repeat(x, n))
28
+
29
+ return parse
30
+
31
+
32
+ def exists(val):
33
+ return val is not None
34
+
35
+
36
+ def default(val, d):
37
+ return val if exists(val) else d
38
+
39
+
40
+ to_2tuple = _ntuple(2)
41
+
42
+
43
+ class Mlp(nn.Module):
44
+ """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
45
+
46
+ def __init__(
47
+ self,
48
+ in_features,
49
+ hidden_features=None,
50
+ out_features=None,
51
+ act_layer=nn.GELU,
52
+ norm_layer=None,
53
+ bias=True,
54
+ drop=0.0,
55
+ use_conv=False,
56
+ ):
57
+ super().__init__()
58
+ out_features = out_features or in_features
59
+ hidden_features = hidden_features or in_features
60
+ bias = to_2tuple(bias)
61
+ drop_probs = to_2tuple(drop)
62
+ linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
63
+
64
+ self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0])
65
+ self.act = act_layer()
66
+ self.drop1 = nn.Dropout(drop_probs[0])
67
+ self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity()
68
+ self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1])
69
+ self.drop2 = nn.Dropout(drop_probs[1])
70
+
71
+ def forward(self, x):
72
+ x = self.fc1(x)
73
+ x = self.act(x)
74
+ x = self.drop1(x)
75
+ x = self.fc2(x)
76
+ x = self.drop2(x)
77
+ return x
78
+
79
+ class Attention(nn.Module):
80
+ def __init__(self, query_dim, context_dim=None,
81
+ num_heads=8, dim_head=48, qkv_bias=False, flash=False):
82
+ super().__init__()
83
+ inner_dim = self.inner_dim = dim_head * num_heads
84
+ context_dim = default(context_dim, query_dim)
85
+ self.scale = dim_head**-0.5
86
+ self.heads = num_heads
87
+ self.flash = flash
88
+
89
+ self.qkv = nn.Linear(query_dim, inner_dim*3, bias=qkv_bias)
90
+ self.proj = nn.Linear(inner_dim, query_dim)
91
+
92
+ def forward(self, x, context=None, attn_bias=None):
93
+ B, N1, _ = x.shape
94
+ C = self.inner_dim
95
+ h = self.heads
96
+ # q = self.to_q(x).reshape(B, N1, h, C // h).permute(0, 2, 1, 3)
97
+ # k, v = self.to_kv(context).chunk(2, dim=-1)
98
+ # context = default(context, x)
99
+
100
+ qkv = self.qkv(x).reshape(B, N1, 3, h, C // h)
101
+ q, k, v = qkv[:,:, 0], qkv[:,:, 1], qkv[:,:, 2]
102
+ N2 = x.shape[1]
103
+
104
+ k = k.reshape(B, N2, h, C // h).permute(0, 2, 1, 3)
105
+ v = v.reshape(B, N2, h, C // h).permute(0, 2, 1, 3)
106
+ q = q.reshape(B, N1, h, C // h).permute(0, 2, 1, 3)
107
+ if self.flash==False:
108
+ sim = (q @ k.transpose(-2, -1)) * self.scale
109
+ if attn_bias is not None:
110
+ sim = sim + attn_bias
111
+ attn = sim.softmax(dim=-1)
112
+ x = (attn @ v).transpose(1, 2).reshape(B, N1, C)
113
+ else:
114
+ input_args = [x.half().contiguous() for x in [q, k, v]]
115
+ x = F.scaled_dot_product_attention(*input_args).permute(0,2,1,3).reshape(B,N1,-1) # type: ignore
116
+
117
+ # return self.to_out(x.float())
118
+ return self.proj(x.float())
119
+
120
+ class ResidualBlock(nn.Module):
121
+ def __init__(self, in_planes, planes, norm_fn="group", stride=1):
122
+ super(ResidualBlock, self).__init__()
123
+
124
+ self.conv1 = nn.Conv2d(
125
+ in_planes,
126
+ planes,
127
+ kernel_size=3,
128
+ padding=1,
129
+ stride=stride,
130
+ padding_mode="zeros",
131
+ )
132
+ self.conv2 = nn.Conv2d(
133
+ planes, planes, kernel_size=3, padding=1, padding_mode="zeros"
134
+ )
135
+ self.relu = nn.ReLU(inplace=True)
136
+
137
+ num_groups = planes // 8
138
+
139
+ if norm_fn == "group":
140
+ self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
141
+ self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
142
+ if not stride == 1:
143
+ self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
144
+
145
+ elif norm_fn == "batch":
146
+ self.norm1 = nn.BatchNorm2d(planes)
147
+ self.norm2 = nn.BatchNorm2d(planes)
148
+ if not stride == 1:
149
+ self.norm3 = nn.BatchNorm2d(planes)
150
+
151
+ elif norm_fn == "instance":
152
+ self.norm1 = nn.InstanceNorm2d(planes)
153
+ self.norm2 = nn.InstanceNorm2d(planes)
154
+ if not stride == 1:
155
+ self.norm3 = nn.InstanceNorm2d(planes)
156
+
157
+ elif norm_fn == "none":
158
+ self.norm1 = nn.Sequential()
159
+ self.norm2 = nn.Sequential()
160
+ if not stride == 1:
161
+ self.norm3 = nn.Sequential()
162
+
163
+ if stride == 1:
164
+ self.downsample = None
165
+
166
+ else:
167
+ self.downsample = nn.Sequential(
168
+ nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3
169
+ )
170
+
171
+ def forward(self, x):
172
+ y = x
173
+ y = self.relu(self.norm1(self.conv1(y)))
174
+ y = self.relu(self.norm2(self.conv2(y)))
175
+
176
+ if self.downsample is not None:
177
+ x = self.downsample(x)
178
+
179
+ return self.relu(x + y)
180
+
181
+
182
+ class BasicEncoder(nn.Module):
183
+ def __init__(
184
+ self, input_dim=3, output_dim=128, stride=8, norm_fn="batch", dropout=0.0,
185
+ Embed3D=False
186
+ ):
187
+ super(BasicEncoder, self).__init__()
188
+ self.stride = stride
189
+ self.norm_fn = norm_fn
190
+ self.in_planes = 64
191
+
192
+ if self.norm_fn == "group":
193
+ self.norm1 = nn.GroupNorm(num_groups=8, num_channels=self.in_planes)
194
+ self.norm2 = nn.GroupNorm(num_groups=8, num_channels=output_dim * 2)
195
+
196
+ elif self.norm_fn == "batch":
197
+ self.norm1 = nn.BatchNorm2d(self.in_planes)
198
+ self.norm2 = nn.BatchNorm2d(output_dim * 2)
199
+
200
+ elif self.norm_fn == "instance":
201
+ self.norm1 = nn.InstanceNorm2d(self.in_planes)
202
+ self.norm2 = nn.InstanceNorm2d(output_dim * 2)
203
+
204
+ elif self.norm_fn == "none":
205
+ self.norm1 = nn.Sequential()
206
+
207
+ self.conv1 = nn.Conv2d(
208
+ input_dim,
209
+ self.in_planes,
210
+ kernel_size=7,
211
+ stride=2,
212
+ padding=3,
213
+ padding_mode="zeros",
214
+ )
215
+ self.relu1 = nn.ReLU(inplace=True)
216
+
217
+ self.shallow = False
218
+ if self.shallow:
219
+ self.layer1 = self._make_layer(64, stride=1)
220
+ self.layer2 = self._make_layer(96, stride=2)
221
+ self.layer3 = self._make_layer(128, stride=2)
222
+ self.conv2 = nn.Conv2d(128 + 96 + 64, output_dim, kernel_size=1)
223
+ else:
224
+ if Embed3D:
225
+ self.conv_fuse = nn.Conv2d(64+63,
226
+ self.in_planes, kernel_size=3, padding=1)
227
+ self.layer1 = self._make_layer(64, stride=1)
228
+ self.layer2 = self._make_layer(96, stride=2)
229
+ self.layer3 = self._make_layer(128, stride=2)
230
+ self.layer4 = self._make_layer(128, stride=2)
231
+ self.conv2 = nn.Conv2d(
232
+ 128 + 128 + 96 + 64,
233
+ output_dim * 2,
234
+ kernel_size=3,
235
+ padding=1,
236
+ padding_mode="zeros",
237
+ )
238
+ self.relu2 = nn.ReLU(inplace=True)
239
+ self.conv3 = nn.Conv2d(output_dim * 2, output_dim, kernel_size=1)
240
+
241
+ self.dropout = None
242
+ if dropout > 0:
243
+ self.dropout = nn.Dropout2d(p=dropout)
244
+
245
+ for m in self.modules():
246
+ if isinstance(m, nn.Conv2d):
247
+ nn.init.kaiming_normal_(m.weight, mode="fan_out",
248
+ nonlinearity="relu")
249
+ elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
250
+ if m.weight is not None:
251
+ nn.init.constant_(m.weight, 1)
252
+ if m.bias is not None:
253
+ nn.init.constant_(m.bias, 0)
254
+
255
+ def _make_layer(self, dim, stride=1):
256
+ layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
257
+ layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
258
+ layers = (layer1, layer2)
259
+
260
+ self.in_planes = dim
261
+ return nn.Sequential(*layers)
262
+
263
+ def forward(self, x, feat_PE=None):
264
+ _, _, H, W = x.shape
265
+
266
+ x = self.conv1(x)
267
+ x = self.norm1(x)
268
+ x = self.relu1(x)
269
+
270
+ if self.shallow:
271
+ a = self.layer1(x)
272
+ b = self.layer2(a)
273
+ c = self.layer3(b)
274
+ a = F.interpolate(
275
+ a,
276
+ (H // self.stride, W // self.stride),
277
+ mode="bilinear",
278
+ align_corners=True,
279
+ )
280
+ b = F.interpolate(
281
+ b,
282
+ (H // self.stride, W // self.stride),
283
+ mode="bilinear",
284
+ align_corners=True,
285
+ )
286
+ c = F.interpolate(
287
+ c,
288
+ (H // self.stride, W // self.stride),
289
+ mode="bilinear",
290
+ align_corners=True,
291
+ )
292
+ x = self.conv2(torch.cat([a, b, c], dim=1))
293
+ else:
294
+ if feat_PE is not None:
295
+ x = self.conv_fuse(torch.cat([x, feat_PE], dim=1))
296
+ a = self.layer1(x)
297
+ else:
298
+ a = self.layer1(x)
299
+ b = self.layer2(a)
300
+ c = self.layer3(b)
301
+ d = self.layer4(c)
302
+ a = F.interpolate(
303
+ a,
304
+ (H // self.stride, W // self.stride),
305
+ mode="bilinear",
306
+ align_corners=True,
307
+ )
308
+ b = F.interpolate(
309
+ b,
310
+ (H // self.stride, W // self.stride),
311
+ mode="bilinear",
312
+ align_corners=True,
313
+ )
314
+ c = F.interpolate(
315
+ c,
316
+ (H // self.stride, W // self.stride),
317
+ mode="bilinear",
318
+ align_corners=True,
319
+ )
320
+ d = F.interpolate(
321
+ d,
322
+ (H // self.stride, W // self.stride),
323
+ mode="bilinear",
324
+ align_corners=True,
325
+ )
326
+ x = self.conv2(torch.cat([a, b, c, d], dim=1))
327
+ x = self.norm2(x)
328
+ x = self.relu2(x)
329
+ x = self.conv3(x)
330
+
331
+ if self.training and self.dropout is not None:
332
+ x = self.dropout(x)
333
+ return x
334
+
335
+ class VitEncoder(nn.Module):
336
+ def __init__(self, input_dim=4, output_dim=128, stride=4):
337
+ super(VitEncoder, self).__init__()
338
+ self.vit = vitEnc(img_size=512,
339
+ depth=6, num_heads=8, in_chans=input_dim,
340
+ out_chans=output_dim,embed_dim=384).cuda()
341
+ self.stride = stride
342
+ def forward(self, x):
343
+ T, C, H, W = x.shape
344
+ x_resize = F.interpolate(x.view(-1, C, H, W), size=(512, 512),
345
+ mode='bilinear', align_corners=False)
346
+ x_resize = self.vit(x_resize)
347
+ x = F.interpolate(x_resize, size=(H//self.stride, W//self.stride),
348
+ mode='bilinear', align_corners=False)
349
+ return x
350
+
351
+ class DPTEnc(nn.Module):
352
+ def __init__(self, input_dim=3, output_dim=128, stride=2):
353
+ super(DPTEnc, self).__init__()
354
+ self.dpt = DPTEncoder()
355
+ self.stride = stride
356
+ def forward(self, x):
357
+ T, C, H, W = x.shape
358
+ x = (x-0.5)/0.5
359
+ x_resize = F.interpolate(x.view(-1, C, H, W), size=(384, 384),
360
+ mode='bilinear', align_corners=False)
361
+ x_resize = self.dpt(x_resize)
362
+ x = F.interpolate(x_resize, size=(H//self.stride, W//self.stride),
363
+ mode='bilinear', align_corners=False)
364
+ return x
365
+
366
+ # class DPT_DINOv2(nn.Module):
367
+ # def __init__(self, encoder='vits', features=64, out_channels=[48, 96, 192, 384],
368
+ # use_bn=True, use_clstoken=False, localhub=True, stride=2, enc_only=True):
369
+ # super(DPT_DINOv2, self).__init__()
370
+ # self.stride = stride
371
+ # self.enc_only = enc_only
372
+ # assert encoder in ['vits', 'vitb', 'vitl']
373
+
374
+ # if localhub:
375
+ # self.pretrained = torch.hub.load('models/torchhub/facebookresearch_dinov2_main', 'dinov2_{:}14'.format(encoder), source='local', pretrained=False)
376
+ # else:
377
+ # self.pretrained = torch.hub.load('facebookresearch/dinov2', 'dinov2_{:}14'.format(encoder))
378
+
379
+ # state_dict = torch.load("models/monoD/zoeDepth/ckpts/dinov2_vits14_pretrain.pth")
380
+ # self.pretrained.load_state_dict(state_dict, strict=True)
381
+ # self.pretrained.requires_grad_(False)
382
+ # dim = self.pretrained.blocks[0].attn.qkv.in_features
383
+ # if enc_only == True:
384
+ # out_channels=[128, 128, 128, 128]
385
+
386
+ # self.DPThead = DPTHeadEnc(1, dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken)
387
+
388
+
389
+ # def forward(self, x):
390
+ # mean_ = torch.tensor([0.485, 0.456, 0.406],
391
+ # device=x.device).view(1, 3, 1, 1)
392
+ # std_ = torch.tensor([0.229, 0.224, 0.225],
393
+ # device=x.device).view(1, 3, 1, 1)
394
+ # x = (x+1)/2
395
+ # x = (x - mean_)/std_
396
+ # h, w = x.shape[-2:]
397
+ # h_re, w_re = 560, 560
398
+ # x_resize = F.interpolate(x, size=(h_re, w_re),
399
+ # mode='bilinear', align_corners=False)
400
+ # with torch.no_grad():
401
+ # features = self.pretrained.get_intermediate_layers(x_resize, 4, return_class_token=True)
402
+ # patch_h, patch_w = h_re // 14, w_re // 14
403
+ # feat = self.DPThead(features, patch_h, patch_w, self.enc_only)
404
+ # feat = F.interpolate(feat, size=(h//self.stride, w//self.stride), mode="bilinear", align_corners=True)
405
+
406
+ # return feat
407
+
408
+
409
+ class VGG19(nn.Module):
410
+ def __init__(self, pretrained=False, amp = False, amp_dtype = torch.float16) -> None:
411
+ super().__init__()
412
+ self.layers = nn.ModuleList(tvm.vgg19_bn(pretrained=pretrained).features[:40])
413
+ self.amp = amp
414
+ self.amp_dtype = amp_dtype
415
+
416
+ def forward(self, x, **kwargs):
417
+ with torch.autocast("cuda", enabled=self.amp, dtype = self.amp_dtype):
418
+ feats = {}
419
+ scale = 1
420
+ for layer in self.layers:
421
+ if isinstance(layer, nn.MaxPool2d):
422
+ feats[scale] = x
423
+ scale = scale*2
424
+ x = layer(x)
425
+ return feats
426
+
427
+ class CNNandDinov2(nn.Module):
428
+ def __init__(self, cnn_kwargs = None, amp = True, amp_dtype = torch.float16):
429
+ super().__init__()
430
+ # in case the Internet connection is not stable, please load the DINOv2 locally
431
+ self.dinov2_vitl14 = torch.hub.load('models/torchhub/facebookresearch_dinov2_main',
432
+ 'dinov2_{:}14'.format("vitl"), source='local', pretrained=False)
433
+
434
+ state_dict = torch.load("models/monoD/zoeDepth/ckpts/dinov2_vitl14_pretrain.pth")
435
+ self.dinov2_vitl14.load_state_dict(state_dict, strict=True)
436
+
437
+
438
+ cnn_kwargs = cnn_kwargs if cnn_kwargs is not None else {}
439
+ self.cnn = VGG19(**cnn_kwargs)
440
+ self.amp = amp
441
+ self.amp_dtype = amp_dtype
442
+ if self.amp:
443
+ dinov2_vitl14 = self.dinov2_vitl14.to(self.amp_dtype)
444
+ self.dinov2_vitl14 = [dinov2_vitl14] # ugly hack to not show parameters to DDP
445
+
446
+
447
+ def train(self, mode: bool = True):
448
+ return self.cnn.train(mode)
449
+
450
+ def forward(self, x, upsample = False):
451
+ B,C,H,W = x.shape
452
+ feature_pyramid = self.cnn(x)
453
+
454
+ if not upsample:
455
+ with torch.no_grad():
456
+ if self.dinov2_vitl14[0].device != x.device:
457
+ self.dinov2_vitl14[0] = self.dinov2_vitl14[0].to(x.device).to(self.amp_dtype)
458
+ dinov2_features_16 = self.dinov2_vitl14[0].forward_features(x.to(self.amp_dtype))
459
+ features_16 = dinov2_features_16['x_norm_patchtokens'].permute(0,2,1).reshape(B,1024,H//14, W//14)
460
+ del dinov2_features_16
461
+ feature_pyramid[16] = features_16
462
+ return feature_pyramid
463
+
464
+ class Dinov2(nn.Module):
465
+ def __init__(self, amp = True, amp_dtype = torch.float16):
466
+ super().__init__()
467
+ # in case the Internet connection is not stable, please load the DINOv2 locally
468
+ self.dinov2_vitl14 = torch.hub.load('models/torchhub/facebookresearch_dinov2_main',
469
+ 'dinov2_{:}14'.format("vitl"), source='local', pretrained=False)
470
+
471
+ state_dict = torch.load("models/monoD/zoeDepth/ckpts/dinov2_vitl14_pretrain.pth")
472
+ self.dinov2_vitl14.load_state_dict(state_dict, strict=True)
473
+
474
+ self.amp = amp
475
+ self.amp_dtype = amp_dtype
476
+ if self.amp:
477
+ self.dinov2_vitl14 = self.dinov2_vitl14.to(self.amp_dtype)
478
+
479
+ def forward(self, x, upsample = False):
480
+ B,C,H,W = x.shape
481
+ mean_ = torch.tensor([0.485, 0.456, 0.406],
482
+ device=x.device).view(1, 3, 1, 1)
483
+ std_ = torch.tensor([0.229, 0.224, 0.225],
484
+ device=x.device).view(1, 3, 1, 1)
485
+ x = (x+1)/2
486
+ x = (x - mean_)/std_
487
+ h_re, w_re = 560, 560
488
+ x_resize = F.interpolate(x, size=(h_re, w_re),
489
+ mode='bilinear', align_corners=True)
490
+ if not upsample:
491
+ with torch.no_grad():
492
+ dinov2_features_16 = self.dinov2_vitl14.forward_features(x_resize.to(self.amp_dtype))
493
+ features_16 = dinov2_features_16['x_norm_patchtokens'].permute(0,2,1).reshape(B,1024,h_re//14, w_re//14)
494
+ del dinov2_features_16
495
+ features_16 = F.interpolate(features_16, size=(H//8, W//8), mode="bilinear", align_corners=True)
496
+ return features_16
497
+
498
+ class AttnBlock(nn.Module):
499
+ """
500
+ A standard pre-norm Transformer block (self-attention followed by an MLP).
501
+ """
502
+
503
+ def __init__(self, hidden_size, num_heads, mlp_ratio=4.0,
504
+ flash=False, **block_kwargs):
505
+ super().__init__()
506
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
507
+ self.flash=flash
508
+
509
+ self.attn = Attention(
510
+ hidden_size, num_heads=num_heads, qkv_bias=True, flash=flash,
511
+ **block_kwargs
512
+ )
513
+
514
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
515
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
516
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
517
+ self.mlp = Mlp(
518
+ in_features=hidden_size,
519
+ hidden_features=mlp_hidden_dim,
520
+ act_layer=approx_gelu,
521
+ drop=0,
522
+ )
523
+ def forward(self, x):
524
+ x = x + self.attn(self.norm1(x))
525
+ x = x + self.mlp(self.norm2(x))
526
+ return x
527
+
528
+ class CrossAttnBlock(nn.Module):
529
+ def __init__(self, hidden_size, context_dim, num_heads=1, mlp_ratio=4.0,
530
+ flash=True, **block_kwargs):
531
+ super().__init__()
532
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
533
+ self.norm_context = nn.LayerNorm(hidden_size)
534
+
535
+ self.cross_attn = Attention(
536
+ hidden_size, context_dim=context_dim,
537
+ num_heads=num_heads, qkv_bias=True, **block_kwargs, flash=flash
538
+
539
+ )
540
+
541
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
542
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
543
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
544
+ self.mlp = Mlp(
545
+ in_features=hidden_size,
546
+ hidden_features=mlp_hidden_dim,
547
+ act_layer=approx_gelu,
548
+ drop=0,
549
+ )
550
+
551
+ def forward(self, x, context):
552
+ with autocast():
553
+ x = x + self.cross_attn(
554
+ self.norm1(x), self.norm_context(context)
555
+ )
556
+ x = x + self.mlp(self.norm2(x))
557
+ return x
558
+
559
+
560
+ def bilinear_sampler(img, coords, mode="bilinear", mask=False):
561
+ """Wrapper for grid_sample, uses pixel coordinates"""
562
+ H, W = img.shape[-2:]
563
+ xgrid, ygrid = coords.split([1, 1], dim=-1)
564
+ # go to 0,1 then 0,2 then -1,1
565
+ xgrid = 2 * xgrid / (W - 1) - 1
566
+ ygrid = 2 * ygrid / (H - 1) - 1
567
+
568
+ grid = torch.cat([xgrid, ygrid], dim=-1)
569
+ img = F.grid_sample(img, grid, align_corners=True)
570
+
571
+ if mask:
572
+ mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
573
+ return img, mask.float()
574
+
575
+ return img
576
+
577
+
578
+ class CorrBlock:
579
+ def __init__(self, fmaps, num_levels=4, radius=4, depths_dnG=None):
580
+ B, S, C, H_prev, W_prev = fmaps.shape
581
+ self.S, self.C, self.H, self.W = S, C, H_prev, W_prev
582
+
583
+ self.num_levels = num_levels
584
+ self.radius = radius
585
+ self.fmaps_pyramid = []
586
+ self.depth_pyramid = []
587
+ self.fmaps_pyramid.append(fmaps)
588
+ if depths_dnG is not None:
589
+ self.depth_pyramid.append(depths_dnG)
590
+ for i in range(self.num_levels - 1):
591
+ if depths_dnG is not None:
592
+ depths_dnG_ = depths_dnG.reshape(B * S, 1, H_prev, W_prev)
593
+ depths_dnG_ = F.avg_pool2d(depths_dnG_, 2, stride=2)
594
+ _, _, H, W = depths_dnG_.shape
595
+ depths_dnG = depths_dnG_.reshape(B, S, 1, H, W)
596
+ self.depth_pyramid.append(depths_dnG)
597
+ fmaps_ = fmaps.reshape(B * S, C, H_prev, W_prev)
598
+ fmaps_ = F.avg_pool2d(fmaps_, 2, stride=2)
599
+ _, _, H, W = fmaps_.shape
600
+ fmaps = fmaps_.reshape(B, S, C, H, W)
601
+ H_prev = H
602
+ W_prev = W
603
+ self.fmaps_pyramid.append(fmaps)
604
+
605
+ def sample(self, coords):
606
+ r = self.radius
607
+ B, S, N, D = coords.shape
608
+ assert D == 2
609
+
610
+ H, W = self.H, self.W
611
+ out_pyramid = []
612
+ for i in range(self.num_levels):
613
+ corrs = self.corrs_pyramid[i] # B, S, N, H, W
614
+ _, _, _, H, W = corrs.shape
615
+
616
+ dx = torch.linspace(-r, r, 2 * r + 1)
617
+ dy = torch.linspace(-r, r, 2 * r + 1)
618
+ delta = torch.stack(torch.meshgrid(dy, dx, indexing="ij"), axis=-1).to(
619
+ coords.device
620
+ )
621
+ centroid_lvl = coords.reshape(B * S * N, 1, 1, 2) / 2 ** i
622
+ delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2)
623
+ coords_lvl = centroid_lvl + delta_lvl
624
+ corrs = bilinear_sampler(corrs.reshape(B * S * N, 1, H, W), coords_lvl)
625
+ corrs = corrs.view(B, S, N, -1)
626
+ out_pyramid.append(corrs)
627
+
628
+ out = torch.cat(out_pyramid, dim=-1) # B, S, N, LRR*2
629
+ return out.contiguous().float()
630
+
631
+ def corr(self, targets):
632
+ B, S, N, C = targets.shape
633
+ assert C == self.C
634
+ assert S == self.S
635
+
636
+ fmap1 = targets
637
+
638
+ self.corrs_pyramid = []
639
+ for fmaps in self.fmaps_pyramid:
640
+ _, _, _, H, W = fmaps.shape
641
+ fmap2s = fmaps.view(B, S, C, H * W)
642
+ corrs = torch.matmul(fmap1, fmap2s)
643
+ corrs = corrs.view(B, S, N, H, W)
644
+ corrs = corrs / torch.sqrt(torch.tensor(C).float())
645
+ self.corrs_pyramid.append(corrs)
646
+
647
+ def corr_sample(self, targets, coords, coords_dp=None):
648
+ B, S, N, C = targets.shape
649
+ r = self.radius
650
+ Dim_c = (2*r+1)**2
651
+ assert C == self.C
652
+ assert S == self.S
653
+
654
+ out_pyramid = []
655
+ out_pyramid_dp = []
656
+ for i in range(self.num_levels):
657
+ dx = torch.linspace(-r, r, 2 * r + 1)
658
+ dy = torch.linspace(-r, r, 2 * r + 1)
659
+ delta = torch.stack(torch.meshgrid(dy, dx, indexing="ij"), axis=-1).to(
660
+ coords.device
661
+ )
662
+ centroid_lvl = coords.reshape(B * S * N, 1, 1, 2) / 2 ** i
663
+ delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2)
664
+ coords_lvl = centroid_lvl + delta_lvl
665
+ fmaps = self.fmaps_pyramid[i]
666
+ _, _, _, H, W = fmaps.shape
667
+ fmap2s = fmaps.view(B*S, C, H, W)
668
+ if len(self.depth_pyramid)>0:
669
+ depths_dnG_i = self.depth_pyramid[i]
670
+ depths_dnG_i = depths_dnG_i.view(B*S, 1, H, W)
671
+ dnG_sample = bilinear_sampler(depths_dnG_i, coords_lvl.view(B*S,1,N*Dim_c,2))
672
+ dp_corrs = (dnG_sample.view(B*S,N,-1) - coords_dp[0]).abs()/coords_dp[0]
673
+ out_pyramid_dp.append(dp_corrs)
674
+ fmap2s_sample = bilinear_sampler(fmap2s, coords_lvl.view(B*S,1,N*Dim_c,2))
675
+ fmap2s_sample = fmap2s_sample.permute(0, 3, 1, 2) # B*S, N*Dim_c, C, -1
676
+ corrs = torch.matmul(targets.reshape(B*S*N, 1, -1), fmap2s_sample.reshape(B*S*N, Dim_c, -1).permute(0, 2, 1))
677
+ corrs = corrs / torch.sqrt(torch.tensor(C).float())
678
+ corrs = corrs.view(B, S, N, -1)
679
+ out_pyramid.append(corrs)
680
+
681
+ out = torch.cat(out_pyramid, dim=-1) # B, S, N, LRR*2
682
+ if len(self.depth_pyramid)>0:
683
+ out_dp = torch.cat(out_pyramid_dp, dim=-1)
684
+ self.fcorrD = out_dp.contiguous().float()
685
+ else:
686
+ self.fcorrD = torch.zeros_like(out).contiguous().float()
687
+ return out.contiguous().float()
688
+
689
+
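+ # Illustrative usage (not part of the original file): build the correlation pyramid for
+ # per-point query features, then read it out around the current track positions.
+ # fmaps = torch.rand(1, 8, 128, 46, 64)          # (B, S, C, H, W) feature maps
+ # targets = torch.rand(1, 8, 20, 128)            # (B, S, N, C) query features
+ # coords = torch.rand(1, 8, 20, 2) * 30          # (B, S, N, 2) positions in feature-map pixels
+ # cb = CorrBlock(fmaps, num_levels=4, radius=3)
+ # cb.corr(targets)
+ # fcorrs = cb.sample(coords)                     # (1, 8, 20, 4 * (2*3+1)**2)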
690
+ class EUpdateFormer(nn.Module):
691
+ """
692
+ Transformer model that updates track estimates.
693
+ """
694
+
695
+ def __init__(
696
+ self,
697
+ space_depth=12,
698
+ time_depth=12,
699
+ input_dim=320,
700
+ hidden_size=384,
701
+ num_heads=8,
702
+ output_dim=130,
703
+ mlp_ratio=4.0,
704
+ vq_depth=3,
705
+ add_space_attn=True,
706
+ add_time_attn=True,
707
+ flash=True
708
+ ):
709
+ super().__init__()
710
+ self.out_channels = 2
711
+ self.num_heads = num_heads
712
+ self.hidden_size = hidden_size
713
+ self.add_space_attn = add_space_attn
714
+ self.input_transform = torch.nn.Linear(input_dim, hidden_size, bias=True)
715
+ self.flash = flash
716
+ self.flow_head = nn.Sequential(
717
+ nn.Linear(hidden_size, output_dim, bias=True),
718
+ nn.ReLU(inplace=True),
719
+ nn.Linear(output_dim, output_dim, bias=True),
720
+ nn.ReLU(inplace=True),
721
+ nn.Linear(output_dim, output_dim, bias=True)
722
+ )
723
+
724
+ cross_attn_kwargs = {
725
+ "d_model": 384,
726
+ "nhead": 4,
727
+ "layer_names": ['self', 'cross'] * 3,
728
+ }
729
+ self.gnn = LocalFeatureTransformer(cross_attn_kwargs)
730
+
731
+ # Attention Modules in the temporal dimension
732
+ self.time_blocks = nn.ModuleList(
733
+ [
734
+ AttnBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, flash=flash) if add_time_attn else nn.Identity()
735
+ for _ in range(time_depth)
736
+ ]
737
+ )
738
+
739
+ if add_space_attn:
740
+ self.space_blocks = nn.ModuleList(
741
+ [
742
+ AttnBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, flash=flash)
743
+ for _ in range(space_depth)
744
+ ]
745
+ )
746
+ assert len(self.time_blocks) >= len(self.space_blocks)
747
+
748
+ # Placeholder for the rigid transformation
749
+ self.RigidProj = nn.Linear(self.hidden_size, 128, bias=True)
750
+ self.Proj = nn.Linear(self.hidden_size, 128, bias=True)
751
+
752
+ self.se3_dec = nn.Linear(384, 3, bias=True)
753
+ self.initialize_weights()
754
+
755
+ def initialize_weights(self):
756
+ def _basic_init(module):
757
+ if isinstance(module, nn.Linear):
758
+ torch.nn.init.xavier_uniform_(module.weight)
759
+ if module.bias is not None:
760
+ nn.init.constant_(module.bias, 0)
761
+
762
+ self.apply(_basic_init)
763
+
764
+ def forward(self, input_tensor, se3_feature):
765
+ """ Updating with Transformer
766
+
767
+ Args:
768
+ input_tensor: B, N, T, C
769
+ se3_feature: SE(3) feature tokens, cross-attended with the track tokens via self.gnn
770
+ """
771
+ B, N, T, C = input_tensor.shape
772
+ x = self.input_transform(input_tensor)
773
+ tokens = x
774
+ K = 0
775
+ j = 0
776
+ for i in range(len(self.time_blocks)):
777
+ tokens_time = rearrange(tokens, "b n t c -> (b n) t c", b=B, t=T, n=N+K)
778
+ tokens_time = self.time_blocks[i](tokens_time)
779
+ tokens = rearrange(tokens_time, "(b n) t c -> b n t c ", b=B, t=T, n=N+K)
780
+ if self.add_space_attn and (
781
+ i % (len(self.time_blocks) // len(self.space_blocks)) == 0
782
+ ):
783
+ tokens_space = rearrange(tokens, "b n t c -> (b t) n c ", b=B, t=T, n=N)
784
+ tokens_space = self.space_blocks[j](tokens_space)
785
+ tokens = rearrange(tokens_space, "(b t) n c -> b n t c ", b=B, t=T, n=N)
786
+ j += 1
787
+
788
+ B, N, S, _ = tokens.shape
789
+ feat0, feat1 = self.gnn(tokens.view(B*N*S, -1)[None,...], se3_feature[None, ...])
790
+
791
+ so3 = F.tanh(self.se3_dec(feat0.view(B*N*S, -1)[None,...].view(B, N, S, -1))/100)
792
+ flow = self.flow_head(feat0.view(B,N,S,-1))
793
+
794
+ return flow, _, _, feat1, so3
795
+
796
+
797
+ class FusionFormer(nn.Module):
798
+ """
799
+ Fuse the feature tracks info with the low rank motion tokens
800
+ """
801
+ def __init__(
802
+ self,
803
+ d_model=64,
804
+ nhead=8,
805
+ attn_iters=4,
806
+ mlp_ratio=4.0,
807
+ flash=False,
808
+ input_dim=35,
809
+ output_dim=384+3,
810
+ ):
811
+ super().__init__()
812
+ self.flash = flash
813
+ self.in_proj = nn.ModuleList(
814
+ [
815
+ nn.Linear(input_dim, d_model)
816
+ for _ in range(2)
817
+ ]
818
+ )
819
+ self.out_proj = nn.Linear(d_model, output_dim, bias=True)
820
+ self.time_blocks = nn.ModuleList(
821
+ [
822
+ CrossAttnBlock(d_model, d_model, nhead, mlp_ratio=mlp_ratio)
823
+ for _ in range(attn_iters)
824
+ ]
825
+ )
826
+ self.space_blocks = nn.ModuleList(
827
+ [
828
+ AttnBlock(d_model, nhead, mlp_ratio=mlp_ratio, flash=self.flash)
829
+ for _ in range(attn_iters)
830
+ ]
831
+ )
832
+
833
+ self.initialize_weights()
834
+
835
+ def initialize_weights(self):
836
+ def _basic_init(module):
837
+ if isinstance(module, nn.Linear):
838
+ torch.nn.init.xavier_uniform_(module.weight)
839
+ if module.bias is not None:
840
+ nn.init.constant_(module.bias, 0)
841
+ self.apply(_basic_init)
842
+ self.out_proj.weight.data.fill_(0)
843
+ self.out_proj.bias.data.fill_(0)
844
+
845
+ def forward(self, x, token_cls):
846
+ """ Fuse the feature tracks info with the low rank motion tokens
847
+
848
+ Args:
849
+ x: B, S, N, C
850
+ token_cls: B, T, N, C
851
+
852
+ """
853
+ B, S, N, C = x.shape
854
+ _, T, _, _ = token_cls.shape
855
+ x = self.in_proj[0](x)
856
+ token_cls = self.in_proj[1](token_cls)
857
+ token_cls = rearrange(token_cls, 'b t n c -> (b n) t c')
858
+
859
+ for i in range(len(self.space_blocks)):
860
+ x = rearrange(x, 'b s n c -> (b n) s c')
861
+ x = self.time_blocks[i](x, token_cls)
862
+ x = self.space_blocks[i](x.permute(1,0,2))
863
+ x = rearrange(x, '(b s) n c -> b s n c', b=B, s=S, n=N)
864
+
865
+ x = self.out_proj(x)
866
+ delta_xyz = x[..., :3]
867
+ feat_traj = x[..., 3:]
868
+ return delta_xyz, feat_traj
869
+
870
+ class Lie():
871
+ """
872
+ Lie algebra for SO(3) and SE(3) operations in PyTorch
873
+ """
874
+
875
+ def so3_to_SO3(self,w): # [...,3]
876
+ wx = self.skew_symmetric(w)
877
+ theta = w.norm(dim=-1)[...,None,None]
878
+ I = torch.eye(3,device=w.device,dtype=torch.float32)
879
+ A = self.taylor_A(theta)
880
+ B = self.taylor_B(theta)
881
+ R = I+A*wx+B*wx@wx
882
+ return R
883
+
884
+ def SO3_to_so3(self,R,eps=1e-7): # [...,3,3]
885
+ trace = R[...,0,0]+R[...,1,1]+R[...,2,2]
886
+ theta = ((trace-1)/2).clamp(-1+eps,1-eps).acos_()[...,None,None]%np.pi # ln(R) will explode if theta==pi
887
+ lnR = 1/(2*self.taylor_A(theta)+1e-8)*(R-R.transpose(-2,-1)) # FIXME: wei-chiu finds it weird
888
+ w0,w1,w2 = lnR[...,2,1],lnR[...,0,2],lnR[...,1,0]
889
+ w = torch.stack([w0,w1,w2],dim=-1)
890
+ return w
891
+
892
+ def se3_to_SE3(self,wu): # [...,3]
893
+ w,u = wu.split([3,3],dim=-1)
894
+ wx = self.skew_symmetric(w)
895
+ theta = w.norm(dim=-1)[...,None,None]
896
+ I = torch.eye(3,device=w.device,dtype=torch.float32)
897
+ A = self.taylor_A(theta)
898
+ B = self.taylor_B(theta)
899
+ C = self.taylor_C(theta)
900
+ R = I+A*wx+B*wx@wx
901
+ V = I+B*wx+C*wx@wx
902
+ Rt = torch.cat([R,(V@u[...,None])],dim=-1)
903
+ return Rt
904
+
905
+ def SE3_to_se3(self,Rt,eps=1e-8): # [...,3,4]
906
+ R,t = Rt.split([3,1],dim=-1)
907
+ w = self.SO3_to_so3(R)
908
+ wx = self.skew_symmetric(w)
909
+ theta = w.norm(dim=-1)[...,None,None]
910
+ I = torch.eye(3,device=w.device,dtype=torch.float32)
911
+ A = self.taylor_A(theta)
912
+ B = self.taylor_B(theta)
913
+ invV = I-0.5*wx+(1-A/(2*B))/(theta**2+eps)*wx@wx
914
+ u = (invV@t)[...,0]
915
+ wu = torch.cat([w,u],dim=-1)
916
+ return wu
917
+
918
+ def skew_symmetric(self,w):
919
+ w0,w1,w2 = w.unbind(dim=-1)
920
+ O = torch.zeros_like(w0)
921
+ wx = torch.stack([torch.stack([O,-w2,w1],dim=-1),
922
+ torch.stack([w2,O,-w0],dim=-1),
923
+ torch.stack([-w1,w0,O],dim=-1)],dim=-2)
924
+ return wx
925
+
926
+ def taylor_A(self,x,nth=10):
927
+ # Taylor expansion of sin(x)/x
928
+ ans = torch.zeros_like(x)
929
+ denom = 1.
930
+ for i in range(nth+1):
931
+ if i>0: denom *= (2*i)*(2*i+1)
932
+ ans = ans+(-1)**i*x**(2*i)/denom
933
+ return ans
934
+ def taylor_B(self,x,nth=10):
935
+ # Taylor expansion of (1-cos(x))/x**2
936
+ ans = torch.zeros_like(x)
937
+ denom = 1.
938
+ for i in range(nth+1):
939
+ denom *= (2*i+1)*(2*i+2)
940
+ ans = ans+(-1)**i*x**(2*i)/denom
941
+ return ans
942
+ def taylor_C(self,x,nth=10):
943
+ # Taylor expansion of (x-sin(x))/x**3
944
+ ans = torch.zeros_like(x)
945
+ denom = 1.
946
+ for i in range(nth+1):
947
+ denom *= (2*i+2)*(2*i+3)
948
+ ans = ans+(-1)**i*x**(2*i)/denom
949
+ return ans
950
+
951
+
952
+
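+ # Illustrative round-trip (not in the original file): the exponential and logarithm maps
+ # should invert each other for small rotations (uses the numpy import added above).
+ # lie = Lie()
+ # w = torch.tensor([[0.10, -0.20, 0.05]])        # so(3) axis-angle vector
+ # R = lie.so3_to_SO3(w)                          # (1, 3, 3) rotation matrix
+ # w_back = lie.SO3_to_so3(R)                     # approximately equal to w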
953
+ def pix2cam(coords,
954
+ intr):
955
+ """
956
+ Args:
957
+ coords: [B, T, N, 3]
958
+ intr: [B, T, 3, 3]
959
+ """
960
+ coords=coords.detach()
961
+ B, S, N, _, = coords.shape
962
+ xy_src = coords.reshape(B*S*N, 3)
963
+ intr = intr[:, :, None, ...].repeat(1, 1, N, 1, 1).reshape(B*S*N, 3, 3)
964
+ xy_src = torch.cat([xy_src[..., :2], torch.ones_like(xy_src[..., :1])], dim=-1)
965
+ xyz_src = (torch.inverse(intr)@xy_src[...,None])[...,0]
966
+ dp_pred = coords[..., 2]
967
+ xyz_src_ = (xyz_src*(dp_pred.reshape(S*N, 1)))
968
+ xyz_src_ = xyz_src_.reshape(B, S, N, 3)
969
+ return xyz_src_
970
+
971
+ def cam2pix(coords,
972
+ intr):
973
+ """
974
+ Args:
975
+ coords: [B, T, N, 3]
976
+ intr: [B, T, 3, 3]
977
+ """
978
+ coords=coords.detach()
979
+ B, S, N, _, = coords.shape
980
+ xy_src = coords.reshape(B*S*N, 3).clone()
981
+ intr = intr[:, :, None, ...].repeat(1, 1, N, 1, 1).reshape(B*S*N, 3, 3)
982
+ xy_src = xy_src / (xy_src[..., 2:]+1e-5)
983
+ xyz_src = (intr@xy_src[...,None])[...,0]
984
+ dp_pred = coords[..., 2]
985
+ xyz_src[...,2] *= dp_pred.reshape(S*N)
986
+ xyz_src = xyz_src.reshape(B, S, N, 3)
987
+ return xyz_src
988
+
989
+ def edgeMat(traj3d):
990
+ """
991
+ Args:
992
+ traj3d: [B, T, N, 3]
993
+ """
994
+ B, T, N, _ = traj3d.shape
995
+ traj3d = traj3d
996
+ traj3d = traj3d.view(B, T, N, 3)
997
+ traj3d = traj3d[..., None, :] - traj3d[..., None, :, :] # B, T, N, N, 3
998
+ edgeMat = traj3d.norm(dim=-1) # B, T, N, N
999
+ return edgeMat
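The Lie helper above converts between axis-angle vectors and rotation matrices using Taylor-expanded Rodrigues terms. A minimal round-trip sanity check, assuming the Lie class above is in scope (the tensor values are purely illustrative):

import torch

lie = Lie()
w = torch.tensor([[0.10, -0.20, 0.05]])        # [..., 3] axis-angle vector (small rotation)
R = lie.so3_to_SO3(w)                          # [..., 3, 3] rotation matrix
w_back = lie.SO3_to_so3(R)                     # recover the axis-angle vector
print(torch.allclose(w, w_back, atol=1e-4))    # expected: True for angles well below pi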
models/spatracker/models/core/spatracker/dpt/__init__.py ADDED
File without changes
models/spatracker/models/core/spatracker/dpt/base_model.py ADDED
@@ -0,0 +1,16 @@
1
+ import torch
2
+
3
+
4
+ class BaseModel(torch.nn.Module):
5
+ def load(self, path):
6
+ """Load model from file.
7
+
8
+ Args:
9
+ path (str): file path
10
+ """
11
+ parameters = torch.load(path, map_location=torch.device("cpu"))
12
+
13
+ if "optimizer" in parameters:
14
+ parameters = parameters["model"]
15
+
16
+ self.load_state_dict(parameters)
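BaseModel.load accepts either a bare state_dict or a checkpoint dict that also carries optimizer state (in which case the "model" entry is used). A small usage sketch with a hypothetical subclass (names are illustrative, not from the repo):

import torch
import torch.nn as nn

class TinyHead(BaseModel):                       # hypothetical subclass for illustration
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(4, 2)

model = TinyHead()
torch.save(model.state_dict(), "/tmp/tiny_head.pt")   # a plain state_dict checkpoint
model.load("/tmp/tiny_head.pt")                        # loads on CPU, then load_state_dict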
models/spatracker/models/core/spatracker/dpt/blocks.py ADDED
@@ -0,0 +1,394 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from models.spatracker.models.core.spatracker.dpt.vit import (
5
+ _make_pretrained_vitb_rn50_384,
6
+ _make_pretrained_vitl16_384,
7
+ _make_pretrained_vitb16_384,
8
+ forward_vit,
9
+ _make_pretrained_vit_tiny
10
+ )
11
+
12
+
13
+ def _make_encoder(
14
+ backbone,
15
+ features,
16
+ use_pretrained,
17
+ groups=1,
18
+ expand=False,
19
+ exportable=True,
20
+ hooks=None,
21
+ use_vit_only=False,
22
+ use_readout="ignore",
23
+ enable_attention_hooks=False,
24
+ ):
25
+ if backbone == "vitl16_384":
26
+ pretrained = _make_pretrained_vitl16_384(
27
+ use_pretrained,
28
+ hooks=hooks,
29
+ use_readout=use_readout,
30
+ enable_attention_hooks=enable_attention_hooks,
31
+ )
32
+ scratch = _make_scratch(
33
+ [256, 512, 1024, 1024], features, groups=groups, expand=expand
34
+ ) # ViT-L/16 - 85.0% Top1 (backbone)
35
+ elif backbone == "vitb_rn50_384":
36
+ pretrained = _make_pretrained_vitb_rn50_384(
37
+ use_pretrained,
38
+ hooks=hooks,
39
+ use_vit_only=use_vit_only,
40
+ use_readout=use_readout,
41
+ enable_attention_hooks=enable_attention_hooks,
42
+ )
43
+ scratch = _make_scratch(
44
+ [256, 512, 768, 768], features, groups=groups, expand=expand
45
+ ) # ViT-H/16 - 85.0% Top1 (backbone)
46
+ elif backbone == "vitb16_384":
47
+ pretrained = _make_pretrained_vitb16_384(
48
+ use_pretrained,
49
+ hooks=hooks,
50
+ use_readout=use_readout,
51
+ enable_attention_hooks=enable_attention_hooks,
52
+ )
53
+ scratch = _make_scratch(
54
+ [96, 192, 384, 768], features, groups=groups, expand=expand
55
+ ) # ViT-B/16 - 84.6% Top1 (backbone)
56
+ elif backbone == "resnext101_wsl":
57
+ pretrained = _make_pretrained_resnext101_wsl(use_pretrained)
58
+ scratch = _make_scratch(
59
+ [256, 512, 1024, 2048], features, groups=groups, expand=expand
60
+ ) # efficientnet_lite3
61
+ elif backbone == "vit_tiny_r_s16_p8_384":
62
+ pretrained = _make_pretrained_vit_tiny(
63
+ use_pretrained,
64
+ hooks=hooks,
65
+ use_readout=use_readout,
66
+ enable_attention_hooks=enable_attention_hooks,
67
+ )
68
+ scratch = _make_scratch(
69
+ [96, 192, 384, 768], features, groups=groups, expand=expand
70
+ )
71
+ else:
72
+ print(f"Backbone '{backbone}' not implemented")
73
+ assert False
74
+
75
+ return pretrained, scratch
76
+
77
+
78
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
79
+ scratch = nn.Module()
80
+
81
+ out_shape1 = out_shape
82
+ out_shape2 = out_shape
83
+ out_shape3 = out_shape
84
+ out_shape4 = out_shape
85
+ if expand == True:
86
+ out_shape1 = out_shape
87
+ out_shape2 = out_shape * 2
88
+ out_shape3 = out_shape * 4
89
+ out_shape4 = out_shape * 8
90
+
91
+ scratch.layer1_rn = nn.Conv2d(
92
+ in_shape[0],
93
+ out_shape1,
94
+ kernel_size=3,
95
+ stride=1,
96
+ padding=1,
97
+ bias=False,
98
+ groups=groups,
99
+ )
100
+ scratch.layer2_rn = nn.Conv2d(
101
+ in_shape[1],
102
+ out_shape2,
103
+ kernel_size=3,
104
+ stride=1,
105
+ padding=1,
106
+ bias=False,
107
+ groups=groups,
108
+ )
109
+ scratch.layer3_rn = nn.Conv2d(
110
+ in_shape[2],
111
+ out_shape3,
112
+ kernel_size=3,
113
+ stride=1,
114
+ padding=1,
115
+ bias=False,
116
+ groups=groups,
117
+ )
118
+ scratch.layer4_rn = nn.Conv2d(
119
+ in_shape[3],
120
+ out_shape4,
121
+ kernel_size=3,
122
+ stride=1,
123
+ padding=1,
124
+ bias=False,
125
+ groups=groups,
126
+ )
127
+
128
+ return scratch
129
+
130
+
131
+ def _make_resnet_backbone(resnet):
132
+ pretrained = nn.Module()
133
+ pretrained.layer1 = nn.Sequential(
134
+ resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1
135
+ )
136
+
137
+ pretrained.layer2 = resnet.layer2
138
+ pretrained.layer3 = resnet.layer3
139
+ pretrained.layer4 = resnet.layer4
140
+
141
+ return pretrained
142
+
143
+
144
+ def _make_pretrained_resnext101_wsl(use_pretrained):
145
+ resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl")
146
+ return _make_resnet_backbone(resnet)
147
+
148
+
149
+ class Interpolate(nn.Module):
150
+ """Interpolation module."""
151
+
152
+ def __init__(self, scale_factor, mode, align_corners=False):
153
+ """Init.
154
+
155
+ Args:
156
+ scale_factor (float): scaling
157
+ mode (str): interpolation mode
158
+ """
159
+ super(Interpolate, self).__init__()
160
+
161
+ self.interp = nn.functional.interpolate
162
+ self.scale_factor = scale_factor
163
+ self.mode = mode
164
+ self.align_corners = align_corners
165
+
166
+ def forward(self, x):
167
+ """Forward pass.
168
+
169
+ Args:
170
+ x (tensor): input
171
+
172
+ Returns:
173
+ tensor: interpolated data
174
+ """
175
+
176
+ x = self.interp(
177
+ x,
178
+ scale_factor=self.scale_factor,
179
+ mode=self.mode,
180
+ align_corners=self.align_corners,
181
+ )
182
+
183
+ return x
184
+
185
+
186
+ class ResidualConvUnit(nn.Module):
187
+ """Residual convolution module."""
188
+
189
+ def __init__(self, features):
190
+ """Init.
191
+
192
+ Args:
193
+ features (int): number of features
194
+ """
195
+ super().__init__()
196
+
197
+ self.conv1 = nn.Conv2d(
198
+ features, features, kernel_size=3, stride=1, padding=1, bias=True
199
+ )
200
+
201
+ self.conv2 = nn.Conv2d(
202
+ features, features, kernel_size=3, stride=1, padding=1, bias=True
203
+ )
204
+
205
+ self.relu = nn.ReLU(inplace=True)
206
+
207
+ def forward(self, x):
208
+ """Forward pass.
209
+
210
+ Args:
211
+ x (tensor): input
212
+
213
+ Returns:
214
+ tensor: output
215
+ """
216
+ out = self.relu(x)
217
+ out = self.conv1(out)
218
+ out = self.relu(out)
219
+ out = self.conv2(out)
220
+
221
+ return out + x
222
+
223
+
224
+ class FeatureFusionBlock(nn.Module):
225
+ """Feature fusion block."""
226
+
227
+ def __init__(self, features):
228
+ """Init.
229
+
230
+ Args:
231
+ features (int): number of features
232
+ """
233
+ super(FeatureFusionBlock, self).__init__()
234
+
235
+ self.resConfUnit1 = ResidualConvUnit(features)
236
+ self.resConfUnit2 = ResidualConvUnit(features)
237
+
238
+ def forward(self, *xs):
239
+ """Forward pass.
240
+
241
+ Returns:
242
+ tensor: output
243
+ """
244
+ output = xs[0]
245
+
246
+ if len(xs) == 2:
247
+ output += self.resConfUnit1(xs[1])
248
+
249
+ output = self.resConfUnit2(output)
250
+
251
+ output = nn.functional.interpolate(
252
+ output, scale_factor=2, mode="bilinear", align_corners=True
253
+ )
254
+
255
+ return output
256
+
257
+
258
+ class ResidualConvUnit_custom(nn.Module):
259
+ """Residual convolution module."""
260
+
261
+ def __init__(self, features, activation, bn):
262
+ """Init.
263
+
264
+ Args:
265
+ features (int): number of features
266
+ """
267
+ super().__init__()
268
+
269
+ self.bn = bn
270
+
271
+ self.groups = 1
272
+
273
+ self.conv1 = nn.Conv2d(
274
+ features,
275
+ features,
276
+ kernel_size=3,
277
+ stride=1,
278
+ padding=1,
279
+ bias=not self.bn,
280
+ groups=self.groups,
281
+ )
282
+
283
+ self.conv2 = nn.Conv2d(
284
+ features,
285
+ features,
286
+ kernel_size=3,
287
+ stride=1,
288
+ padding=1,
289
+ bias=not self.bn,
290
+ groups=self.groups,
291
+ )
292
+
293
+ if self.bn == True:
294
+ self.bn1 = nn.BatchNorm2d(features)
295
+ self.bn2 = nn.BatchNorm2d(features)
296
+
297
+ self.activation = activation
298
+
299
+ self.skip_add = nn.quantized.FloatFunctional()
300
+
301
+ def forward(self, x):
302
+ """Forward pass.
303
+
304
+ Args:
305
+ x (tensor): input
306
+
307
+ Returns:
308
+ tensor: output
309
+ """
310
+
311
+ out = self.activation(x)
312
+ out = self.conv1(out)
313
+ if self.bn == True:
314
+ out = self.bn1(out)
315
+
316
+ out = self.activation(out)
317
+ out = self.conv2(out)
318
+ if self.bn == True:
319
+ out = self.bn2(out)
320
+
321
+ if self.groups > 1:
322
+ out = self.conv_merge(out)
323
+
324
+ return self.skip_add.add(out, x)
325
+
326
+ # return out + x
327
+
328
+
329
+ class FeatureFusionBlock_custom(nn.Module):
330
+ """Feature fusion block."""
331
+
332
+ def __init__(
333
+ self,
334
+ features,
335
+ activation,
336
+ deconv=False,
337
+ bn=False,
338
+ expand=False,
339
+ align_corners=True,
340
+ ):
341
+ """Init.
342
+
343
+ Args:
344
+ features (int): number of features
345
+ """
346
+ super(FeatureFusionBlock_custom, self).__init__()
347
+
348
+ self.deconv = deconv
349
+ self.align_corners = align_corners
350
+
351
+ self.groups = 1
352
+
353
+ self.expand = expand
354
+ out_features = features
355
+ if self.expand == True:
356
+ out_features = features // 2
357
+
358
+ self.out_conv = nn.Conv2d(
359
+ features,
360
+ out_features,
361
+ kernel_size=1,
362
+ stride=1,
363
+ padding=0,
364
+ bias=True,
365
+ groups=1,
366
+ )
367
+
368
+ self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
369
+ self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
370
+
371
+ self.skip_add = nn.quantized.FloatFunctional()
372
+
373
+ def forward(self, *xs):
374
+ """Forward pass.
375
+
376
+ Returns:
377
+ tensor: output
378
+ """
379
+ output = xs[0]
380
+
381
+ if len(xs) == 2:
382
+ res = self.resConfUnit1(xs[1])
383
+ output = self.skip_add.add(output, res)
384
+ # output += res
385
+
386
+ output = self.resConfUnit2(output)
387
+
388
+ output = nn.functional.interpolate(
389
+ output, scale_factor=2, mode="bilinear", align_corners=self.align_corners
390
+ )
391
+
392
+ output = self.out_conv(output)
393
+
394
+ return output
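FeatureFusionBlock_custom adds an optional skip connection through a residual conv unit, upsamples the result by 2x, and projects it with a 1x1 conv. A shape-only sketch with dummy tensors (not tied to any checkpoint):

import torch
import torch.nn as nn

fuse = FeatureFusionBlock_custom(features=64, activation=nn.ReLU(False), bn=False)
coarse = torch.randn(1, 64, 24, 24)    # decoder feature map
skip = torch.randn(1, 64, 24, 24)      # encoder skip connection at the same resolution
out = fuse(coarse, skip)
print(out.shape)                        # torch.Size([1, 64, 48, 48]): fused, then upsampled 2x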
models/spatracker/models/core/spatracker/dpt/midas_net.py ADDED
@@ -0,0 +1,77 @@
1
+ """MidashNet: Network for monocular depth estimation trained by mixing several datasets.
2
+ This file contains code that is adapted from
3
+ https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
4
+ """
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from models.spatracker.models.core.spatracker.dpt.base_model import BaseModel
9
+ from models.spatracker.models.core.spatracker.dpt.blocks import FeatureFusionBlock, Interpolate, _make_encoder
10
+
11
+
12
+ class MidasNet_large(BaseModel):
13
+ """Network for monocular depth estimation."""
14
+
15
+ def __init__(self, path=None, features=256, non_negative=True):
16
+ """Init.
17
+
18
+ Args:
19
+ path (str, optional): Path to saved model. Defaults to None.
20
+ features (int, optional): Number of features. Defaults to 256.
21
+ backbone (str, optional): Backbone network for encoder. Defaults to resnet50
22
+ """
23
+ print("Loading weights: ", path)
24
+
25
+ super(MidasNet_large, self).__init__()
26
+
27
+ use_pretrained = False if path is None else True
28
+
29
+ self.pretrained, self.scratch = _make_encoder(
30
+ backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained
31
+ )
32
+
33
+ self.scratch.refinenet4 = FeatureFusionBlock(features)
34
+ self.scratch.refinenet3 = FeatureFusionBlock(features)
35
+ self.scratch.refinenet2 = FeatureFusionBlock(features)
36
+ self.scratch.refinenet1 = FeatureFusionBlock(features)
37
+
38
+ self.scratch.output_conv = nn.Sequential(
39
+ nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
40
+ Interpolate(scale_factor=2, mode="bilinear"),
41
+ nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1),
42
+ nn.ReLU(True),
43
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
44
+ nn.ReLU(True) if non_negative else nn.Identity(),
45
+ )
46
+
47
+ if path:
48
+ self.load(path)
49
+
50
+ def forward(self, x):
51
+ """Forward pass.
52
+
53
+ Args:
54
+ x (tensor): input data (image)
55
+
56
+ Returns:
57
+ tensor: depth
58
+ """
59
+
60
+ layer_1 = self.pretrained.layer1(x)
61
+ layer_2 = self.pretrained.layer2(layer_1)
62
+ layer_3 = self.pretrained.layer3(layer_2)
63
+ layer_4 = self.pretrained.layer4(layer_3)
64
+
65
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
66
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
67
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
68
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
69
+
70
+ path_4 = self.scratch.refinenet4(layer_4_rn)
71
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
72
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
73
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
74
+
75
+ out = self.scratch.output_conv(path_1)
76
+
77
+ return torch.squeeze(out, dim=1)
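MidasNet_large wires the ResNeXt101-WSL encoder into four fusion blocks and a depth head. The sketch below is illustrative only: constructing the model fetches the backbone through torch.hub, so it assumes network access or a cached checkpoint.

import torch

net = MidasNet_large(path=None, features=256, non_negative=True)
net.eval()
with torch.no_grad():
    depth = net(torch.randn(1, 3, 384, 384))    # normalized RGB batch
print(depth.shape)                               # torch.Size([1, 384, 384]): one depth map per image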
models/spatracker/models/core/spatracker/dpt/models.py ADDED
@@ -0,0 +1,231 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from models.spatracker.models.core.spatracker.dpt.base_model import BaseModel
6
+ from models.spatracker.models.core.spatracker.dpt.blocks import (
7
+ FeatureFusionBlock,
8
+ FeatureFusionBlock_custom,
9
+ Interpolate,
10
+ _make_encoder,
11
+ forward_vit,
12
+ )
13
+
14
+
15
+ def _make_fusion_block(features, use_bn):
16
+ return FeatureFusionBlock_custom(
17
+ features,
18
+ nn.ReLU(False),
19
+ deconv=False,
20
+ bn=use_bn,
21
+ expand=False,
22
+ align_corners=True,
23
+ )
24
+
25
+
26
+ class DPT(BaseModel):
27
+ def __init__(
28
+ self,
29
+ head,
30
+ features=256,
31
+ backbone="vitb_rn50_384",
32
+ readout="project",
33
+ channels_last=False,
34
+ use_bn=True,
35
+ enable_attention_hooks=False,
36
+ ):
37
+
38
+ super(DPT, self).__init__()
39
+
40
+ self.channels_last = channels_last
41
+
42
+ hooks = {
43
+ "vitb_rn50_384": [0, 1, 8, 11],
44
+ "vitb16_384": [2, 5, 8, 11],
45
+ "vitl16_384": [5, 11, 17, 23],
46
+ "vit_tiny_r_s16_p8_384": [0, 1, 2, 3],
47
+ }
48
+
49
+ # Instantiate backbone and reassemble blocks
50
+ self.pretrained, self.scratch = _make_encoder(
51
+ backbone,
52
+ features,
53
+ False, # Set to true of you want to train from scratch, uses ImageNet weights
54
+ groups=1,
55
+ expand=False,
56
+ exportable=False,
57
+ hooks=hooks[backbone],
58
+ use_readout=readout,
59
+ enable_attention_hooks=enable_attention_hooks,
60
+ )
61
+
62
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
63
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
64
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
65
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
66
+
67
+ self.scratch.output_conv = head
68
+
69
+ self.proj_out = nn.Sequential(
70
+ nn.Conv2d(
71
+ 256+512+384+384,
72
+ 256,
73
+ kernel_size=3,
74
+ padding=1,
75
+ padding_mode="zeros",
76
+ ),
77
+ nn.BatchNorm2d(128 * 2),
78
+ nn.ReLU(True),
79
+ nn.Conv2d(
80
+ 128 * 2,
81
+ 128,
82
+ kernel_size=3,
83
+ padding=1,
84
+ padding_mode="zeros",
85
+ )
86
+ )
87
+
88
+
89
+ def forward(self, x, only_enc=False):
90
+ if self.channels_last == True:
91
+ x.contiguous(memory_format=torch.channels_last)
92
+ if only_enc:
93
+ layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
94
+ a = (layer_1)
95
+ b = (
96
+ F.interpolate(
97
+ layer_2,
98
+ scale_factor=2,
99
+ mode="bilinear",
100
+ align_corners=True,
101
+ )
102
+ )
103
+ c = (
104
+ F.interpolate(
105
+ layer_3,
106
+ scale_factor=8,
107
+ mode="bilinear",
108
+ align_corners=True,
109
+ )
110
+ )
111
+ d = (
112
+ F.interpolate(
113
+ layer_4,
114
+ scale_factor=16,
115
+ mode="bilinear",
116
+ align_corners=True,
117
+ )
118
+ )
119
+ x = self.proj_out(torch.cat([a, b, c, d], dim=1))
120
+ return x
121
+ else:
122
+ layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
123
+
124
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
125
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
126
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
127
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
128
+
129
+ path_4 = self.scratch.refinenet4(layer_4_rn)
130
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
131
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
132
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
133
+
134
+ _,_,H_out,W_out = path_1.size()
135
+ path_2_up = F.interpolate(path_2, size=(H_out,W_out), mode="bilinear", align_corners=True)
136
+ path_3_up = F.interpolate(path_3, size=(H_out,W_out), mode="bilinear", align_corners=True)
137
+ path_4_up = F.interpolate(path_4, size=(H_out,W_out), mode="bilinear", align_corners=True)
138
+
139
+ out = self.scratch.output_conv(path_1+path_2_up+path_3_up+path_4_up)
140
+
141
+ return out
142
+
143
+
144
+ class DPTDepthModel(DPT):
145
+ def __init__(
146
+ self, path=None, non_negative=True, scale=1.0, shift=0.0, invert=False, **kwargs
147
+ ):
148
+ features = kwargs["features"] if "features" in kwargs else 256
149
+
150
+ self.scale = scale
151
+ self.shift = shift
152
+ self.invert = invert
153
+
154
+ head = nn.Sequential(
155
+ nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
156
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
157
+ nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
158
+ nn.ReLU(True),
159
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
160
+ nn.ReLU(True) if non_negative else nn.Identity(),
161
+ nn.Identity(),
162
+ )
163
+
164
+ super().__init__(head, **kwargs)
165
+
166
+ if path is not None:
167
+ self.load(path)
168
+
169
+ def forward(self, x):
170
+ inv_depth = super().forward(x).squeeze(dim=1)
171
+
172
+ if self.invert:
173
+ depth = self.scale * inv_depth + self.shift
174
+ depth[depth < 1e-8] = 1e-8
175
+ depth = 1.0 / depth
176
+ return depth
177
+ else:
178
+ return inv_depth
179
+
180
+ class DPTEncoder(DPT):
181
+ def __init__(
182
+ self, path=None, non_negative=True, scale=1.0, shift=0.0, invert=False, **kwargs
183
+ ):
184
+ features = kwargs["features"] if "features" in kwargs else 256
185
+
186
+ self.scale = scale
187
+ self.shift = shift
188
+
189
+ head = nn.Sequential(
190
+ nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
191
+ )
192
+
193
+ super().__init__(head, **kwargs)
194
+
195
+ if path is not None:
196
+ self.load(path)
197
+
198
+ def forward(self, x):
199
+ features = super().forward(x, only_enc=True).squeeze(dim=1)
200
+
201
+ return features
202
+
203
+
204
+ class DPTSegmentationModel(DPT):
205
+ def __init__(self, num_classes, path=None, **kwargs):
206
+
207
+ features = kwargs["features"] if "features" in kwargs else 256
208
+
209
+ kwargs["use_bn"] = True
210
+
211
+ head = nn.Sequential(
212
+ nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False),
213
+ nn.BatchNorm2d(features),
214
+ nn.ReLU(True),
215
+ nn.Dropout(0.1, False),
216
+ nn.Conv2d(features, num_classes, kernel_size=1),
217
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
218
+ )
219
+
220
+ super().__init__(head, **kwargs)
221
+
222
+ self.auxlayer = nn.Sequential(
223
+ nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False),
224
+ nn.BatchNorm2d(features),
225
+ nn.ReLU(True),
226
+ nn.Dropout(0.1, False),
227
+ nn.Conv2d(features, num_classes, kernel_size=1),
228
+ )
229
+
230
+ if path is not None:
231
+ self.load(path)
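The encoder-only path (only_enc=True), used by DPTEncoder, fuses the two hooked ResNet stages and the two hooked ViT blocks into a single 1/4-resolution feature map. A shape sketch, assuming timm can build the "vit_small_r26_s32_384" hybrid locally (no pretrained weights are requested):

import torch

enc = DPTEncoder(backbone="vitb_rn50_384", features=256)
enc.eval()
with torch.no_grad():
    feats = enc(torch.randn(1, 3, 384, 384))
print(feats.shape)    # expected: torch.Size([1, 128, 96, 96]), a 1/4-resolution feature map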
models/spatracker/models/core/spatracker/dpt/transforms.py ADDED
@@ -0,0 +1,231 @@
1
+ import numpy as np
2
+ import cv2
3
+ import math
4
+
5
+
6
+ def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
7
+ """Rezise the sample to ensure the given size. Keeps aspect ratio.
8
+
9
+ Args:
10
+ sample (dict): sample
11
+ size (tuple): image size
12
+
13
+ Returns:
14
+ tuple: new size
15
+ """
16
+ shape = list(sample["disparity"].shape)
17
+
18
+ if shape[0] >= size[0] and shape[1] >= size[1]:
19
+ return sample
20
+
21
+ scale = [0, 0]
22
+ scale[0] = size[0] / shape[0]
23
+ scale[1] = size[1] / shape[1]
24
+
25
+ scale = max(scale)
26
+
27
+ shape[0] = math.ceil(scale * shape[0])
28
+ shape[1] = math.ceil(scale * shape[1])
29
+
30
+ # resize
31
+ sample["image"] = cv2.resize(
32
+ sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
33
+ )
34
+
35
+ sample["disparity"] = cv2.resize(
36
+ sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
37
+ )
38
+ sample["mask"] = cv2.resize(
39
+ sample["mask"].astype(np.float32),
40
+ tuple(shape[::-1]),
41
+ interpolation=cv2.INTER_NEAREST,
42
+ )
43
+ sample["mask"] = sample["mask"].astype(bool)
44
+
45
+ return tuple(shape)
46
+
47
+
48
+ class Resize(object):
49
+ """Resize sample to given size (width, height)."""
50
+
51
+ def __init__(
52
+ self,
53
+ width,
54
+ height,
55
+ resize_target=True,
56
+ keep_aspect_ratio=False,
57
+ ensure_multiple_of=1,
58
+ resize_method="lower_bound",
59
+ image_interpolation_method=cv2.INTER_AREA,
60
+ ):
61
+ """Init.
62
+
63
+ Args:
64
+ width (int): desired output width
65
+ height (int): desired output height
66
+ resize_target (bool, optional):
67
+ True: Resize the full sample (image, mask, target).
68
+ False: Resize image only.
69
+ Defaults to True.
70
+ keep_aspect_ratio (bool, optional):
71
+ True: Keep the aspect ratio of the input sample.
72
+ Output sample might not have the given width and height, and
73
+ resize behaviour depends on the parameter 'resize_method'.
74
+ Defaults to False.
75
+ ensure_multiple_of (int, optional):
76
+ Output width and height is constrained to be multiple of this parameter.
77
+ Defaults to 1.
78
+ resize_method (str, optional):
79
+ "lower_bound": Output will be at least as large as the given size.
80
+ "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
81
+ "minimal": Scale as least as possible. (Output size might be smaller than given size.)
82
+ Defaults to "lower_bound".
83
+ """
84
+ self.__width = width
85
+ self.__height = height
86
+
87
+ self.__resize_target = resize_target
88
+ self.__keep_aspect_ratio = keep_aspect_ratio
89
+ self.__multiple_of = ensure_multiple_of
90
+ self.__resize_method = resize_method
91
+ self.__image_interpolation_method = image_interpolation_method
92
+
93
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
94
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
95
+
96
+ if max_val is not None and y > max_val:
97
+ y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
98
+
99
+ if y < min_val:
100
+ y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
101
+
102
+ return y
103
+
104
+ def get_size(self, width, height):
105
+ # determine new height and width
106
+ scale_height = self.__height / height
107
+ scale_width = self.__width / width
108
+
109
+ if self.__keep_aspect_ratio:
110
+ if self.__resize_method == "lower_bound":
111
+ # scale such that output size is lower bound
112
+ if scale_width > scale_height:
113
+ # fit width
114
+ scale_height = scale_width
115
+ else:
116
+ # fit height
117
+ scale_width = scale_height
118
+ elif self.__resize_method == "upper_bound":
119
+ # scale such that output size is upper bound
120
+ if scale_width < scale_height:
121
+ # fit width
122
+ scale_height = scale_width
123
+ else:
124
+ # fit height
125
+ scale_width = scale_height
126
+ elif self.__resize_method == "minimal":
127
+ # scale as little as possible
128
+ if abs(1 - scale_width) < abs(1 - scale_height):
129
+ # fit width
130
+ scale_height = scale_width
131
+ else:
132
+ # fit height
133
+ scale_width = scale_height
134
+ else:
135
+ raise ValueError(
136
+ f"resize_method {self.__resize_method} not implemented"
137
+ )
138
+
139
+ if self.__resize_method == "lower_bound":
140
+ new_height = self.constrain_to_multiple_of(
141
+ scale_height * height, min_val=self.__height
142
+ )
143
+ new_width = self.constrain_to_multiple_of(
144
+ scale_width * width, min_val=self.__width
145
+ )
146
+ elif self.__resize_method == "upper_bound":
147
+ new_height = self.constrain_to_multiple_of(
148
+ scale_height * height, max_val=self.__height
149
+ )
150
+ new_width = self.constrain_to_multiple_of(
151
+ scale_width * width, max_val=self.__width
152
+ )
153
+ elif self.__resize_method == "minimal":
154
+ new_height = self.constrain_to_multiple_of(scale_height * height)
155
+ new_width = self.constrain_to_multiple_of(scale_width * width)
156
+ else:
157
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
158
+
159
+ return (new_width, new_height)
160
+
161
+ def __call__(self, sample):
162
+ width, height = self.get_size(
163
+ sample["image"].shape[1], sample["image"].shape[0]
164
+ )
165
+
166
+ # resize sample
167
+ sample["image"] = cv2.resize(
168
+ sample["image"],
169
+ (width, height),
170
+ interpolation=self.__image_interpolation_method,
171
+ )
172
+
173
+ if self.__resize_target:
174
+ if "disparity" in sample:
175
+ sample["disparity"] = cv2.resize(
176
+ sample["disparity"],
177
+ (width, height),
178
+ interpolation=cv2.INTER_NEAREST,
179
+ )
180
+
181
+ if "depth" in sample:
182
+ sample["depth"] = cv2.resize(
183
+ sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
184
+ )
185
+
186
+ sample["mask"] = cv2.resize(
187
+ sample["mask"].astype(np.float32),
188
+ (width, height),
189
+ interpolation=cv2.INTER_NEAREST,
190
+ )
191
+ sample["mask"] = sample["mask"].astype(bool)
192
+
193
+ return sample
194
+
195
+
196
+ class NormalizeImage(object):
197
+ """Normlize image by given mean and std."""
198
+
199
+ def __init__(self, mean, std):
200
+ self.__mean = mean
201
+ self.__std = std
202
+
203
+ def __call__(self, sample):
204
+ sample["image"] = (sample["image"] - self.__mean) / self.__std
205
+
206
+ return sample
207
+
208
+
209
+ class PrepareForNet(object):
210
+ """Prepare sample for usage as network input."""
211
+
212
+ def __init__(self):
213
+ pass
214
+
215
+ def __call__(self, sample):
216
+ image = np.transpose(sample["image"], (2, 0, 1))
217
+ sample["image"] = np.ascontiguousarray(image).astype(np.float32)
218
+
219
+ if "mask" in sample:
220
+ sample["mask"] = sample["mask"].astype(np.float32)
221
+ sample["mask"] = np.ascontiguousarray(sample["mask"])
222
+
223
+ if "disparity" in sample:
224
+ disparity = sample["disparity"].astype(np.float32)
225
+ sample["disparity"] = np.ascontiguousarray(disparity)
226
+
227
+ if "depth" in sample:
228
+ depth = sample["depth"].astype(np.float32)
229
+ sample["depth"] = np.ascontiguousarray(depth)
230
+
231
+ return sample
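These transforms operate on a plain sample dict holding HWC numpy arrays. A small end-to-end sketch with an illustrative dummy image (the classes above are assumed to be in scope):

import numpy as np

resize = Resize(384, 384, resize_target=False, keep_aspect_ratio=True,
                ensure_multiple_of=32, resize_method="lower_bound")
normalize = NormalizeImage(mean=np.array([0.5, 0.5, 0.5]), std=np.array([0.5, 0.5, 0.5]))
prepare = PrepareForNet()

sample = {"image": np.random.rand(480, 640, 3).astype(np.float32)}  # HWC image in [0, 1]
sample = prepare(normalize(resize(sample)))
print(sample["image"].shape)    # (3, 384, 512): CHW float32, sides snapped to multiples of 32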
models/spatracker/models/core/spatracker/dpt/vit.py ADDED
@@ -0,0 +1,596 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import timm
4
+ import types
5
+ import math
6
+ import torch.nn.functional as F
7
+
8
+
9
+ activations = {}
10
+
11
+
12
+ def get_activation(name):
13
+ def hook(model, input, output):
14
+ activations[name] = output
15
+
16
+ return hook
17
+
18
+
19
+ attention = {}
20
+
21
+
22
+ def get_attention(name):
23
+ def hook(module, input, output):
24
+ x = input[0]
25
+ B, N, C = x.shape
26
+ qkv = (
27
+ module.qkv(x)
28
+ .reshape(B, N, 3, module.num_heads, C // module.num_heads)
29
+ .permute(2, 0, 3, 1, 4)
30
+ )
31
+ q, k, v = (
32
+ qkv[0],
33
+ qkv[1],
34
+ qkv[2],
35
+ ) # make torchscript happy (cannot use tensor as tuple)
36
+
37
+ attn = (q @ k.transpose(-2, -1)) * module.scale
38
+
39
+ attn = attn.softmax(dim=-1) # [:,:,1,1:]
40
+ attention[name] = attn
41
+
42
+ return hook
43
+
44
+
45
+ def get_mean_attention_map(attn, token, shape):
46
+ attn = attn[:, :, token, 1:]
47
+ attn = attn.unflatten(2, torch.Size([shape[2] // 16, shape[3] // 16])).float()
48
+ attn = torch.nn.functional.interpolate(
49
+ attn, size=shape[2:], mode="bicubic", align_corners=False
50
+ ).squeeze(0)
51
+
52
+ all_attn = torch.mean(attn, 0)
53
+
54
+ return all_attn
55
+
56
+
57
+ class Slice(nn.Module):
58
+ def __init__(self, start_index=1):
59
+ super(Slice, self).__init__()
60
+ self.start_index = start_index
61
+
62
+ def forward(self, x):
63
+ return x[:, self.start_index :]
64
+
65
+
66
+ class AddReadout(nn.Module):
67
+ def __init__(self, start_index=1):
68
+ super(AddReadout, self).__init__()
69
+ self.start_index = start_index
70
+
71
+ def forward(self, x):
72
+ if self.start_index == 2:
73
+ readout = (x[:, 0] + x[:, 1]) / 2
74
+ else:
75
+ readout = x[:, 0]
76
+ return x[:, self.start_index :] + readout.unsqueeze(1)
77
+
78
+
79
+ class ProjectReadout(nn.Module):
80
+ def __init__(self, in_features, start_index=1):
81
+ super(ProjectReadout, self).__init__()
82
+ self.start_index = start_index
83
+
84
+ self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())
85
+
86
+ def forward(self, x):
87
+ readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :])
88
+ features = torch.cat((x[:, self.start_index :], readout), -1)
89
+
90
+ return self.project(features)
91
+
92
+
93
+ class Transpose(nn.Module):
94
+ def __init__(self, dim0, dim1):
95
+ super(Transpose, self).__init__()
96
+ self.dim0 = dim0
97
+ self.dim1 = dim1
98
+
99
+ def forward(self, x):
100
+ x = x.transpose(self.dim0, self.dim1)
101
+ return x
102
+
103
+
104
+ def forward_vit(pretrained, x):
105
+ b, c, h, w = x.shape
106
+
107
+ glob = pretrained.model.forward_flex(x)
108
+
109
+ layer_1 = pretrained.activations["1"]
110
+ layer_2 = pretrained.activations["2"]
111
+ layer_3 = pretrained.activations["3"]
112
+ layer_4 = pretrained.activations["4"]
113
+
114
+ layer_1 = pretrained.act_postprocess1[0:2](layer_1)
115
+ layer_2 = pretrained.act_postprocess2[0:2](layer_2)
116
+ layer_3 = pretrained.act_postprocess3[0:2](layer_3)
117
+ layer_4 = pretrained.act_postprocess4[0:2](layer_4)
118
+
119
+ unflatten = nn.Sequential(
120
+ nn.Unflatten(
121
+ 2,
122
+ torch.Size(
123
+ [
124
+ h // pretrained.model.patch_size[1],
125
+ w // pretrained.model.patch_size[0],
126
+ ]
127
+ ),
128
+ )
129
+ )
130
+
131
+ if layer_1.ndim == 3:
132
+ layer_1 = unflatten(layer_1)
133
+ if layer_2.ndim == 3:
134
+ layer_2 = unflatten(layer_2)
135
+ if layer_3.ndim == 3:
136
+ layer_3 = unflatten(layer_3)
137
+ if layer_4.ndim == 3:
138
+ layer_4 = unflatten(layer_4)
139
+
140
+ layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
141
+ layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
142
+ layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
143
+ layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
144
+
145
+ return layer_1, layer_2, layer_3, layer_4
146
+
147
+
148
+ def _resize_pos_embed(self, posemb, gs_h, gs_w):
149
+ posemb_tok, posemb_grid = (
150
+ posemb[:, : self.start_index],
151
+ posemb[0, self.start_index :],
152
+ )
153
+
154
+ gs_old = int(math.sqrt(len(posemb_grid)))
155
+
156
+ posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
157
+ posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
158
+ posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
159
+
160
+ posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
161
+
162
+ return posemb
163
+
164
+
165
+ def forward_flex(self, x):
166
+ b, c, h, w = x.shape
167
+
168
+ pos_embed = self._resize_pos_embed(
169
+ self.pos_embed, h // self.patch_size[1], w // self.patch_size[0]
170
+ )
171
+
172
+ B = x.shape[0]
173
+
174
+ if hasattr(self.patch_embed, "backbone"):
175
+ x = self.patch_embed.backbone(x)
176
+ if isinstance(x, (list, tuple)):
177
+ x = x[-1] # last feature if backbone outputs list/tuple of features
178
+ x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
179
+
180
+ if getattr(self, "dist_token", None) is not None:
181
+ cls_tokens = self.cls_token.expand(
182
+ B, -1, -1
183
+ ) # stole cls_tokens impl from Phil Wang, thanks
184
+ dist_token = self.dist_token.expand(B, -1, -1)
185
+ x = torch.cat((cls_tokens, dist_token, x), dim=1)
186
+ else:
187
+ cls_tokens = self.cls_token.expand(
188
+ B, -1, -1
189
+ ) # stole cls_tokens impl from Phil Wang, thanks
190
+ x = torch.cat((cls_tokens, x), dim=1)
191
+
192
+ x = x + pos_embed
193
+ x = self.pos_drop(x)
194
+
195
+ for blk in self.blocks:
196
+ x = blk(x)
197
+
198
+ x = self.norm(x)
199
+
200
+ return x
201
+
202
+
203
+ def get_readout_oper(vit_features, features, use_readout, start_index=1):
204
+ if use_readout == "ignore":
205
+ readout_oper = [Slice(start_index)] * len(features)
206
+ elif use_readout == "add":
207
+ readout_oper = [AddReadout(start_index)] * len(features)
208
+ elif use_readout == "project":
209
+ readout_oper = [
210
+ ProjectReadout(vit_features, start_index) for out_feat in features
211
+ ]
212
+ else:
213
+ assert (
214
+ False
215
+ ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
216
+
217
+ return readout_oper
218
+
219
+
220
+ def _make_vit_b16_backbone(
221
+ model,
222
+ features=[96, 192, 384, 768],
223
+ size=[384, 384],
224
+ hooks=[2, 5, 8, 11],
225
+ vit_features=768,
226
+ use_readout="ignore",
227
+ start_index=1,
228
+ enable_attention_hooks=False,
229
+ ):
230
+ pretrained = nn.Module()
231
+
232
+ pretrained.model = model
233
+ pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
234
+ pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
235
+ pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
236
+ pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
237
+
238
+ pretrained.activations = activations
239
+
240
+ if enable_attention_hooks:
241
+ pretrained.model.blocks[hooks[0]].attn.register_forward_hook(
242
+ get_attention("attn_1")
243
+ )
244
+ pretrained.model.blocks[hooks[1]].attn.register_forward_hook(
245
+ get_attention("attn_2")
246
+ )
247
+ pretrained.model.blocks[hooks[2]].attn.register_forward_hook(
248
+ get_attention("attn_3")
249
+ )
250
+ pretrained.model.blocks[hooks[3]].attn.register_forward_hook(
251
+ get_attention("attn_4")
252
+ )
253
+ pretrained.attention = attention
254
+
255
+ readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
256
+
257
+ # 32, 48, 136, 384
258
+ pretrained.act_postprocess1 = nn.Sequential(
259
+ readout_oper[0],
260
+ Transpose(1, 2),
261
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
262
+ nn.Conv2d(
263
+ in_channels=vit_features,
264
+ out_channels=features[0],
265
+ kernel_size=1,
266
+ stride=1,
267
+ padding=0,
268
+ ),
269
+ nn.ConvTranspose2d(
270
+ in_channels=features[0],
271
+ out_channels=features[0],
272
+ kernel_size=4,
273
+ stride=4,
274
+ padding=0,
275
+ bias=True,
276
+ dilation=1,
277
+ groups=1,
278
+ ),
279
+ )
280
+
281
+ pretrained.act_postprocess2 = nn.Sequential(
282
+ readout_oper[1],
283
+ Transpose(1, 2),
284
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
285
+ nn.Conv2d(
286
+ in_channels=vit_features,
287
+ out_channels=features[1],
288
+ kernel_size=1,
289
+ stride=1,
290
+ padding=0,
291
+ ),
292
+ nn.ConvTranspose2d(
293
+ in_channels=features[1],
294
+ out_channels=features[1],
295
+ kernel_size=2,
296
+ stride=2,
297
+ padding=0,
298
+ bias=True,
299
+ dilation=1,
300
+ groups=1,
301
+ ),
302
+ )
303
+
304
+ pretrained.act_postprocess3 = nn.Sequential(
305
+ readout_oper[2],
306
+ Transpose(1, 2),
307
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
308
+ nn.Conv2d(
309
+ in_channels=vit_features,
310
+ out_channels=features[2],
311
+ kernel_size=1,
312
+ stride=1,
313
+ padding=0,
314
+ ),
315
+ )
316
+
317
+ pretrained.act_postprocess4 = nn.Sequential(
318
+ readout_oper[3],
319
+ Transpose(1, 2),
320
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
321
+ nn.Conv2d(
322
+ in_channels=vit_features,
323
+ out_channels=features[3],
324
+ kernel_size=1,
325
+ stride=1,
326
+ padding=0,
327
+ ),
328
+ nn.Conv2d(
329
+ in_channels=features[3],
330
+ out_channels=features[3],
331
+ kernel_size=3,
332
+ stride=2,
333
+ padding=1,
334
+ ),
335
+ )
336
+
337
+ pretrained.model.start_index = start_index
338
+ pretrained.model.patch_size = [16, 16]
339
+
340
+ # We inject this function into the VisionTransformer instances so that
341
+ # we can use it with interpolated position embeddings without modifying the library source.
342
+ pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
343
+ pretrained.model._resize_pos_embed = types.MethodType(
344
+ _resize_pos_embed, pretrained.model
345
+ )
346
+
347
+ return pretrained
348
+
349
+
350
+ def _make_vit_b_rn50_backbone(
351
+ model,
352
+ features=[256, 512, 768, 768],
353
+ size=[384, 384],
354
+ hooks=[0, 1, 8, 11],
355
+ vit_features=384,
356
+ use_vit_only=False,
357
+ use_readout="ignore",
358
+ start_index=1,
359
+ enable_attention_hooks=False,
360
+ ):
361
+ pretrained = nn.Module()
362
+ pretrained.model = model
363
+ pretrained.model.patch_size = [32, 32]
364
+ ps = pretrained.model.patch_size[0]
365
+ if use_vit_only == True:
366
+ pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
367
+ pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
368
+ else:
369
+ pretrained.model.patch_embed.backbone.stages[0].register_forward_hook(
370
+ get_activation("1")
371
+ )
372
+ pretrained.model.patch_embed.backbone.stages[1].register_forward_hook(
373
+ get_activation("2")
374
+ )
375
+
376
+ pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
377
+ pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
378
+
379
+ if enable_attention_hooks:
380
+ pretrained.model.blocks[2].attn.register_forward_hook(get_attention("attn_1"))
381
+ pretrained.model.blocks[5].attn.register_forward_hook(get_attention("attn_2"))
382
+ pretrained.model.blocks[8].attn.register_forward_hook(get_attention("attn_3"))
383
+ pretrained.model.blocks[11].attn.register_forward_hook(get_attention("attn_4"))
384
+ pretrained.attention = attention
385
+
386
+ pretrained.activations = activations
387
+
388
+ readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
389
+
390
+ if use_vit_only == True:
391
+ pretrained.act_postprocess1 = nn.Sequential(
392
+ readout_oper[0],
393
+ Transpose(1, 2),
394
+ nn.Unflatten(2, torch.Size([size[0] // ps, size[1] // ps])),
395
+ nn.Conv2d(
396
+ in_channels=vit_features,
397
+ out_channels=features[0],
398
+ kernel_size=1,
399
+ stride=1,
400
+ padding=0,
401
+ ),
402
+ nn.ConvTranspose2d(
403
+ in_channels=features[0],
404
+ out_channels=features[0],
405
+ kernel_size=4,
406
+ stride=4,
407
+ padding=0,
408
+ bias=True,
409
+ dilation=1,
410
+ groups=1,
411
+ ),
412
+ )
413
+
414
+ pretrained.act_postprocess2 = nn.Sequential(
415
+ readout_oper[1],
416
+ Transpose(1, 2),
417
+ nn.Unflatten(2, torch.Size([size[0] // ps, size[1] // ps])),
418
+ nn.Conv2d(
419
+ in_channels=vit_features,
420
+ out_channels=features[1],
421
+ kernel_size=1,
422
+ stride=1,
423
+ padding=0,
424
+ ),
425
+ nn.ConvTranspose2d(
426
+ in_channels=features[1],
427
+ out_channels=features[1],
428
+ kernel_size=2,
429
+ stride=2,
430
+ padding=0,
431
+ bias=True,
432
+ dilation=1,
433
+ groups=1,
434
+ ),
435
+ )
436
+ else:
437
+ pretrained.act_postprocess1 = nn.Sequential(
438
+ nn.Identity(), nn.Identity(), nn.Identity()
439
+ )
440
+ pretrained.act_postprocess2 = nn.Sequential(
441
+ nn.Identity(), nn.Identity(), nn.Identity()
442
+ )
443
+
444
+ pretrained.act_postprocess3 = nn.Sequential(
445
+ readout_oper[2],
446
+ Transpose(1, 2),
447
+ nn.Unflatten(2, torch.Size([size[0] // ps, size[1] // ps])),
448
+ nn.Conv2d(
449
+ in_channels=vit_features,
450
+ out_channels=features[2],
451
+ kernel_size=1,
452
+ stride=1,
453
+ padding=0,
454
+ ),
455
+ )
456
+
457
+ pretrained.act_postprocess4 = nn.Sequential(
458
+ readout_oper[3],
459
+ Transpose(1, 2),
460
+ nn.Unflatten(2, torch.Size([size[0] // ps, size[1] // ps])),
461
+ nn.Conv2d(
462
+ in_channels=vit_features,
463
+ out_channels=features[3],
464
+ kernel_size=1,
465
+ stride=1,
466
+ padding=0,
467
+ ),
468
+ nn.Conv2d(
469
+ in_channels=features[3],
470
+ out_channels=features[3],
471
+ kernel_size=3,
472
+ stride=2,
473
+ padding=1,
474
+ ),
475
+ )
476
+
477
+ pretrained.model.start_index = start_index
478
+ pretrained.model.patch_size = [32, 32]
479
+
480
+ # We inject this function into the VisionTransformer instances so that
481
+ # we can use it with interpolated position embeddings without modifying the library source.
482
+ pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
483
+
484
+ # We inject this function into the VisionTransformer instances so that
485
+ # we can use it with interpolated position embeddings without modifying the library source.
486
+ pretrained.model._resize_pos_embed = types.MethodType(
487
+ _resize_pos_embed, pretrained.model
488
+ )
489
+
490
+ return pretrained
491
+
492
+
493
+ def _make_pretrained_vitb_rn50_384(
494
+ pretrained,
495
+ use_readout="ignore",
496
+ hooks=None,
497
+ use_vit_only=False,
498
+ enable_attention_hooks=False,
499
+ ):
500
+ # model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)
501
+ # model = timm.create_model("vit_tiny_r_s16_p8_384", pretrained=pretrained)
502
+ model = timm.create_model("vit_small_r26_s32_384", pretrained=pretrained)
503
+ hooks = [0, 1, 8, 11] if hooks == None else hooks
504
+ return _make_vit_b_rn50_backbone(
505
+ model,
506
+ features=[128, 256, 384, 384],
507
+ size=[384, 384],
508
+ hooks=hooks,
509
+ use_vit_only=use_vit_only,
510
+ use_readout=use_readout,
511
+ enable_attention_hooks=enable_attention_hooks,
512
+ )
513
+
514
+ def _make_pretrained_vit_tiny(
515
+ pretrained,
516
+ use_readout="ignore",
517
+ hooks=None,
518
+ use_vit_only=False,
519
+ enable_attention_hooks=False,
520
+ ):
521
+ # model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)
522
+ model = timm.create_model("vit_tiny_r_s16_p8_384", pretrained=pretrained)
523
+ import ipdb; ipdb.set_trace()  # NOTE: leftover debugging breakpoint; _make_vit_tiny_backbone called below is not defined in this file
524
+ hooks = [0, 1, 8, 11] if hooks == None else hooks
525
+ return _make_vit_tiny_backbone(
526
+ model,
527
+ features=[256, 512, 768, 768],
528
+ size=[384, 384],
529
+ hooks=hooks,
530
+ use_vit_only=use_vit_only,
531
+ use_readout=use_readout,
532
+ enable_attention_hooks=enable_attention_hooks,
533
+ )
534
+
535
+ def _make_pretrained_vitl16_384(
536
+ pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False
537
+ ):
538
+ model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
539
+
540
+ hooks = [5, 11, 17, 23] if hooks == None else hooks
541
+ return _make_vit_b16_backbone(
542
+ model,
543
+ features=[256, 512, 1024, 1024],
544
+ hooks=hooks,
545
+ vit_features=1024,
546
+ use_readout=use_readout,
547
+ enable_attention_hooks=enable_attention_hooks,
548
+ )
549
+
550
+
551
+ def _make_pretrained_vitb16_384(
552
+ pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False
553
+ ):
554
+ model = timm.create_model("vit_base_patch16_384", pretrained=pretrained)
555
+
556
+ hooks = [2, 5, 8, 11] if hooks == None else hooks
557
+ return _make_vit_b16_backbone(
558
+ model,
559
+ features=[96, 192, 384, 768],
560
+ hooks=hooks,
561
+ use_readout=use_readout,
562
+ enable_attention_hooks=enable_attention_hooks,
563
+ )
564
+
565
+
566
+ def _make_pretrained_deitb16_384(
567
+ pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False
568
+ ):
569
+ model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained)
570
+
571
+ hooks = [2, 5, 8, 11] if hooks == None else hooks
572
+ return _make_vit_b16_backbone(
573
+ model,
574
+ features=[96, 192, 384, 768],
575
+ hooks=hooks,
576
+ use_readout=use_readout,
577
+ enable_attention_hooks=enable_attention_hooks,
578
+ )
579
+
580
+
581
+ def _make_pretrained_deitb16_distil_384(
582
+ pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False
583
+ ):
584
+ model = timm.create_model(
585
+ "vit_deit_base_distilled_patch16_384", pretrained=pretrained
586
+ )
587
+
588
+ hooks = [2, 5, 8, 11] if hooks == None else hooks
589
+ return _make_vit_b16_backbone(
590
+ model,
591
+ features=[96, 192, 384, 768],
592
+ hooks=hooks,
593
+ use_readout=use_readout,
594
+ start_index=2,
595
+ enable_attention_hooks=enable_attention_hooks,
596
+ )
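forward_vit runs the timm backbone once and reads the four hooked activations back through the act_postprocess heads, giving a coarse-to-fine feature pyramid. An illustrative sketch, assuming timm can construct "vit_small_r26_s32_384" without downloading weights (pretrained=False):

import torch

backbone = _make_pretrained_vitb_rn50_384(pretrained=False)
with torch.no_grad():
    l1, l2, l3, l4 = forward_vit(backbone, torch.randn(1, 3, 384, 384))
print(l1.shape, l2.shape, l3.shape, l4.shape)
# roughly: [1, 256, 96, 96], [1, 512, 48, 48], [1, 384, 12, 12], [1, 384, 6, 6]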
models/spatracker/models/core/spatracker/feature_net.py ADDED
@@ -0,0 +1,915 @@
1
+ """
2
+ Adapted from ConvONet
3
+ https://github.com/autonomousvision/convolutional_occupancy_networks/blob/838bea5b2f1314f2edbb68d05ebb0db49f1f3bd2/src/encoder/pointnet.py#L1
4
+ """
5
+
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ # from torch_scatter import scatter_mean, scatter_max
11
+ from models.spatracker.models.core.spatracker.unet import UNet
12
+ from models.spatracker.models.core.model_utils import (
13
+ vis_PCA
14
+ )
15
+ from einops import rearrange
16
+
17
+ def compute_iou(occ1, occ2):
18
+ ''' Computes the Intersection over Union (IoU) value for two sets of
19
+ occupancy values.
20
+
21
+ Args:
22
+ occ1 (tensor): first set of occupancy values
23
+ occ2 (tensor): second set of occupancy values
24
+ '''
25
+ occ1 = np.asarray(occ1)
26
+ occ2 = np.asarray(occ2)
27
+
28
+ # Put all data in second dimension
29
+ # Also works for 1-dimensional data
30
+ if occ1.ndim >= 2:
31
+ occ1 = occ1.reshape(occ1.shape[0], -1)
32
+ if occ2.ndim >= 2:
33
+ occ2 = occ2.reshape(occ2.shape[0], -1)
34
+
35
+ # Convert to boolean values
36
+ occ1 = (occ1 >= 0.5)
37
+ occ2 = (occ2 >= 0.5)
38
+
39
+ # Compute IOU
40
+ area_union = (occ1 | occ2).astype(np.float32).sum(axis=-1)
41
+ area_intersect = (occ1 & occ2).astype(np.float32).sum(axis=-1)
42
+
43
+ iou = (area_intersect / area_union)
44
+
45
+ return iou
46
+
47
+
48
+ def chamfer_distance(points1, points2, use_kdtree=True, give_id=False):
49
+ ''' Returns the chamfer distance for the sets of points.
50
+
51
+ Args:
52
+ points1 (numpy array): first point set
53
+ points2 (numpy array): second point set
54
+ use_kdtree (bool): whether to use a kdtree
55
+ give_id (bool): whether to return the IDs of nearest points
56
+ '''
57
+ if use_kdtree:
58
+ return chamfer_distance_kdtree(points1, points2, give_id=give_id)
59
+ else:
60
+ return chamfer_distance_naive(points1, points2)
61
+
62
+
63
+ def chamfer_distance_naive(points1, points2):
64
+ ''' Naive implementation of the Chamfer distance.
65
+
66
+ Args:
67
+ points1 (numpy array): first point set
68
+ points2 (numpy array): second point set
69
+ '''
70
+ assert(points1.size() == points2.size())
71
+ batch_size, T, _ = points1.size()
72
+
73
+ points1 = points1.view(batch_size, T, 1, 3)
74
+ points2 = points2.view(batch_size, 1, T, 3)
75
+
76
+ distances = (points1 - points2).pow(2).sum(-1)
77
+
78
+ chamfer1 = distances.min(dim=1)[0].mean(dim=1)
79
+ chamfer2 = distances.min(dim=2)[0].mean(dim=1)
80
+
81
+ chamfer = chamfer1 + chamfer2
82
+ return chamfer
83
+
84
+
85
+ def chamfer_distance_kdtree(points1, points2, give_id=False):
86
+ ''' KD-tree based implementation of the Chamfer distance.
87
+
88
+ Args:
89
+ points1 (numpy array): first point set
90
+ points2 (numpy array): second point set
91
+ give_id (bool): whether to return the IDs of the nearest points
92
+ '''
93
+ # Points have size batch_size x T x 3
94
+ batch_size = points1.size(0)
95
+
96
+ # First convert points to numpy
97
+ points1_np = points1.detach().cpu().numpy()
98
+ points2_np = points2.detach().cpu().numpy()
99
+
100
+ # Get list of nearest neighbor indices
101
+ idx_nn_12, _ = get_nearest_neighbors_indices_batch(points1_np, points2_np)
102
+ idx_nn_12 = torch.LongTensor(idx_nn_12).to(points1.device)
103
+ # Expands it as batch_size x 1 x 3
104
+ idx_nn_12_expand = idx_nn_12.view(batch_size, -1, 1).expand_as(points1)
105
+
106
+ # Get list of nearest neighbor indices
107
+ idx_nn_21, _ = get_nearest_neighbors_indices_batch(points2_np, points1_np)
108
+ idx_nn_21 = torch.LongTensor(idx_nn_21).to(points1.device)
109
+ # Expands it as batch_size x T x 3
110
+ idx_nn_21_expand = idx_nn_21.view(batch_size, -1, 1).expand_as(points2)
111
+
112
+ # Compute nearest neighbors in points2 to points in points1
113
+ # points_12[i, j, k] = points2[i, idx_nn_12_expand[i, j, k], k]
114
+ points_12 = torch.gather(points2, dim=1, index=idx_nn_12_expand)
115
+
116
+ # Compute nearest neighbors in points1 to points in points2
117
+ # points_21[i, j, k] = points2[i, idx_nn_21_expand[i, j, k], k]
118
+ points_21 = torch.gather(points1, dim=1, index=idx_nn_21_expand)
119
+
120
+ # Compute chamfer distance
121
+ chamfer1 = (points1 - points_12).pow(2).sum(2).mean(1)
122
+ chamfer2 = (points2 - points_21).pow(2).sum(2).mean(1)
123
+
124
+ # Take sum
125
+ chamfer = chamfer1 + chamfer2
126
+
127
+ # If required, also return nearest neighbors
128
+ if give_id:
129
+ return chamfer1, chamfer2, idx_nn_12, idx_nn_21
130
+
131
+ return chamfer
132
+
133
+
134
+ def get_nearest_neighbors_indices_batch(points_src, points_tgt, k=1):
135
+ ''' Returns the nearest neighbors for point sets batchwise.
136
+
137
+ Args:
138
+ points_src (numpy array): source points
139
+ points_tgt (numpy array): target points
140
+ k (int): number of nearest neighbors to return
141
+ '''
142
+ indices = []
143
+ distances = []
144
+
145
+ for (p1, p2) in zip(points_src, points_tgt):
146
+ raise NotImplementedError()
147
+ # kdtree = KDTree(p2)
148
+ dist, idx = kdtree.query(p1, k=k)
149
+ indices.append(idx)
150
+ distances.append(dist)
151
+
152
+ return indices, distances
153
+
154
+
155
+ def make_3d_grid(bb_min, bb_max, shape):
156
+ ''' Makes a 3D grid.
157
+
158
+ Args:
159
+ bb_min (tuple): bounding box minimum
160
+ bb_max (tuple): bounding box maximum
161
+ shape (tuple): output shape
162
+ '''
163
+ size = shape[0] * shape[1] * shape[2]
164
+
165
+ pxs = torch.linspace(bb_min[0], bb_max[0], shape[0])
166
+ pys = torch.linspace(bb_min[1], bb_max[1], shape[1])
167
+ pzs = torch.linspace(bb_min[2], bb_max[2], shape[2])
168
+
169
+ pxs = pxs.view(-1, 1, 1).expand(*shape).contiguous().view(size)
170
+ pys = pys.view(1, -1, 1).expand(*shape).contiguous().view(size)
171
+ pzs = pzs.view(1, 1, -1).expand(*shape).contiguous().view(size)
172
+ p = torch.stack([pxs, pys, pzs], dim=1)
173
+
174
+ return p
175
+
176
+
177
+ def transform_points(points, transform):
178
+ ''' Transforms points with regard to passed camera information.
179
+
180
+ Args:
181
+ points (tensor): points tensor
182
+ transform (tensor): transformation matrices
183
+ '''
184
+ assert(points.size(2) == 3)
185
+ assert(transform.size(1) == 3)
186
+ assert(points.size(0) == transform.size(0))
187
+
188
+ if transform.size(2) == 4:
189
+ R = transform[:, :, :3]
190
+ t = transform[:, :, 3:]
191
+ points_out = points @ R.transpose(1, 2) + t.transpose(1, 2)
192
+ elif transform.size(2) == 3:
193
+ K = transform
194
+ points_out = points @ K.transpose(1, 2)
195
+
196
+ return points_out
197
+
198
+
199
+ def b_inv(b_mat):
200
+ ''' Performs batch matrix inversion.
201
+
202
+ Arguments:
203
+ b_mat: the batch of matrices that should be inverted
204
+ '''
205
+
206
+ eye = b_mat.new_ones(b_mat.size(-1)).diag().expand_as(b_mat)
+ # torch.gesv was removed from recent PyTorch releases; torch.linalg.solve
+ # computes the same batched solution of b_mat @ X = I, i.e. the inverse.
+ b_inv = torch.linalg.solve(b_mat, eye)
208
+ return b_inv
209
+
210
+ def project_to_camera(points, transform):
211
+ ''' Projects points to the camera plane.
212
+
213
+ Args:
214
+ points (tensor): points tensor
215
+ transform (tensor): transformation matrices
216
+ '''
217
+ p_camera = transform_points(points, transform)
218
+ p_camera = p_camera[..., :2] / p_camera[..., 2:]
219
+ return p_camera
220
+
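+ def _example_project_to_camera():
+     # Illustrative usage sketch of transform_points / project_to_camera;
+     # the identity extrinsics and point counts are placeholders.
+     points = torch.rand(2, 100, 3) + torch.tensor([0.0, 0.0, 1.0])  # keep z > 0
+     Rt = torch.cat([torch.eye(3), torch.zeros(3, 1)], dim=1).expand(2, 3, 4)
+     uv = project_to_camera(points, Rt)  # perspective division by z
+     assert uv.shape == (2, 100, 2)
+     return uv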
221
+
222
+ def fix_Rt_camera(Rt, loc, scale):
223
+ ''' Fixes Rt camera matrix.
224
+
225
+ Args:
226
+ Rt (tensor): Rt camera matrix
227
+ loc (tensor): location
228
+ scale (float): scale
229
+ '''
230
+ # Rt is B x 3 x 4
231
+ # loc is B x 3 and scale is B
232
+ batch_size = Rt.size(0)
233
+ R = Rt[:, :, :3]
234
+ t = Rt[:, :, 3:]
235
+
236
+ scale = scale.view(batch_size, 1, 1)
237
+ R_new = R * scale
238
+ t_new = t + R @ loc.unsqueeze(2)
239
+
240
+ Rt_new = torch.cat([R_new, t_new], dim=2)
241
+
242
+ assert(Rt_new.size() == (batch_size, 3, 4))
243
+ return Rt_new
244
+
245
+ def normalize_coordinate(p, padding=0.1, plane='xz'):
246
+ ''' Normalize coordinate to [0, 1] for unit cube experiments
247
+
248
+ Args:
249
+ p (tensor): point
250
+ padding (float): conventional padding parameter of ONet for unit cube, so [-0.5, 0.5] -> [-0.55, 0.55]
251
+ plane (str): plane feature type, ['xz', 'xy', 'yz']
252
+ '''
253
+ # breakpoint()
254
+ if plane == 'xz':
255
+ xy = p[:, :, [0, 2]]
256
+ elif plane =='xy':
257
+ xy = p[:, :, [0, 1]]
258
+ else:
259
+ xy = p[:, :, [1, 2]]
260
+
261
+ xy = torch.clamp(xy, min=1e-6, max=1. - 1e-6)
262
+
263
+ # xy_new = xy / (1 + padding + 10e-6) # (-0.5, 0.5)
264
+ # xy_new = xy_new + 0.5 # range (0, 1)
265
+
266
+ # # f there are outliers out of the range
267
+ # if xy_new.max() >= 1:
268
+ # xy_new[xy_new >= 1] = 1 - 10e-6
269
+ # if xy_new.min() < 0:
270
+ # xy_new[xy_new < 0] = 0.0
271
+ # xy_new = (xy + 1.) / 2.
272
+ return xy
273
+
274
+ def normalize_3d_coordinate(p, padding=0.1):
275
+ ''' Normalize coordinate to [0, 1] for unit cube experiments.
276
+ Corresponds to our 3D model
277
+
278
+ Args:
279
+ p (tensor): point
280
+ padding (float): conventional padding parameter of ONet for unit cube, so [-0.5, 0.5] -> [-0.55, 0.55]
281
+ '''
282
+
283
+ p_nor = p / (1 + padding + 10e-4) # (-0.5, 0.5)
284
+ p_nor = p_nor + 0.5 # range (0, 1)
285
+ # if there are outliers out of the range
286
+ if p_nor.max() >= 1:
287
+ p_nor[p_nor >= 1] = 1 - 10e-4
288
+ if p_nor.min() < 0:
289
+ p_nor[p_nor < 0] = 0.0
290
+ return p_nor
291
+
292
+ def normalize_coord(p, vol_range, plane='xz'):
293
+ ''' Normalize coordinate to [0, 1] for sliding-window experiments
294
+
295
+ Args:
296
+ p (tensor): point
297
+ vol_range (numpy array): volume boundary
298
+ plane (str): feature type, ['xz', 'xy', 'yz'] - canonical planes; ['grid'] - grid volume
299
+ '''
300
+ p[:, 0] = (p[:, 0] - vol_range[0][0]) / (vol_range[1][0] - vol_range[0][0])
301
+ p[:, 1] = (p[:, 1] - vol_range[0][1]) / (vol_range[1][1] - vol_range[0][1])
302
+ p[:, 2] = (p[:, 2] - vol_range[0][2]) / (vol_range[1][2] - vol_range[0][2])
303
+
304
+ if plane == 'xz':
305
+ x = p[:, [0, 2]]
306
+ elif plane =='xy':
307
+ x = p[:, [0, 1]]
308
+ elif plane =='yz':
309
+ x = p[:, [1, 2]]
310
+ else:
311
+ x = p
312
+ return x
313
+
314
+ def coordinate2index(x, reso, coord_type='2d'):
315
+ ''' Convert normalized coordinates in [0, 1] to flattened cell indices
+ on a plane (coord_type='2d') or grid (coord_type='3d') of the given
+ resolution.
317
+
318
+ Args:
319
+ x (tensor): coordinate
320
+ reso (int): defined resolution
321
+ coord_type (str): coordinate type
322
+ '''
323
+ x = (x * reso).long()
324
+ if coord_type == '2d': # plane
325
+ index = x[:, :, 0] + reso * x[:, :, 1]
326
+ elif coord_type == '3d': # grid
327
+ index = x[:, :, 0] + reso * (x[:, :, 1] + reso * x[:, :, 2])
328
+ index = index[:, None, :]
329
+ return index
330
+
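+ def _example_plane_index():
+     # Illustrative usage sketch: normalize_coordinate + coordinate2index map
+     # points (assumed to already lie in [0, 1]) to flattened cell indices of
+     # a 64 x 64 'xz' feature plane; batch and point counts are placeholders.
+     p = torch.rand(4, 2048, 3)                        # B x T x 3
+     xy = normalize_coordinate(p.clone(), plane='xz')  # B x T x 2, clamped to (0, 1)
+     index = coordinate2index(xy, reso=64)             # B x T, values in [0, 64**2)
+     assert index.max() < 64 ** 2
+     return index
+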
331
+ def coord2index(p, vol_range, reso=None, plane='xz'):
332
+ ''' Normalize coordinates to [0, 1] for sliding-window experiments and
+ convert them to flattened plane or grid indices.
334
+
335
+ Args:
336
+ p (tensor): points
337
+ vol_range (numpy array): volume boundary
338
+ reso (int): defined resolution
339
+ plane (str): feature type, ['xz', 'xy', 'yz'] - canonical planes; ['grid'] - grid volume
340
+ '''
341
+ # normalize to [0, 1]
342
+ x = normalize_coord(p, vol_range, plane=plane)
343
+
344
+ if isinstance(x, np.ndarray):
345
+ x = np.floor(x * reso).astype(int)
346
+ else: #* pytorch tensor
347
+ x = (x * reso).long()
348
+
349
+ if x.shape[1] == 2:
350
+ index = x[:, 0] + reso * x[:, 1]
351
+ index[index > reso**2] = reso**2
352
+ elif x.shape[1] == 3:
353
+ index = x[:, 0] + reso * (x[:, 1] + reso * x[:, 2])
354
+ index[index > reso**3] = reso**3
355
+
356
+ return index[None]
357
+
358
+ def update_reso(reso, depth):
359
+ ''' Update the defined resolution so that UNet can process.
360
+
361
+ Args:
362
+ reso (int): defined resolution
363
+ depth (int): U-Net number of layers
364
+ '''
365
+ base = 2**(int(depth) - 1)
366
+ if not (reso / base).is_integer(): # when this is not an integer, the U-Net spatial dimensions break
367
+ for i in range(base):
368
+ if ((reso + i) / base).is_integer():
369
+ reso = reso + i
370
+ break
371
+ return reso
372
+
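+ def _example_update_reso():
+     # Worked example for update_reso above: a U-Net with depth 4 needs the
+     # resolution to be divisible by 2**(4-1) = 8, so 70 is rounded up to 72
+     # while 64 is already valid.
+     assert update_reso(70, depth=4) == 72
+     assert update_reso(64, depth=4) == 64
+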
373
+ def decide_total_volume_range(query_vol_metric, recep_field, unit_size, unet_depth):
374
+ ''' Update the defined resolution so that UNet can process.
375
+
376
+ Args:
377
+ query_vol_metric (numpy array): query volume size
378
+ recep_field (int): defined the receptive field for U-Net
379
+ unit_size (float): the defined voxel size
380
+ unet_depth (int): U-Net number of layers
381
+ '''
382
+ reso = query_vol_metric / unit_size + recep_field - 1
383
+ reso = update_reso(int(reso), unet_depth) # make sure input reso can be processed by UNet
384
+ input_vol_metric = reso * unit_size
385
+ p_c = np.array([0.0, 0.0, 0.0]).astype(np.float32)
386
+ lb_input_vol, ub_input_vol = p_c - input_vol_metric/2, p_c + input_vol_metric/2
387
+ lb_query_vol, ub_query_vol = p_c - query_vol_metric/2, p_c + query_vol_metric/2
388
+ input_vol = [lb_input_vol, ub_input_vol]
389
+ query_vol = [lb_query_vol, ub_query_vol]
390
+
391
+ # handle the case when resolution is too large
392
+ if reso > 10000:
393
+ reso = 1
394
+
395
+ return input_vol, query_vol, reso
396
+
397
+ def add_key(base, new, base_name, new_name, device=None):
398
+ ''' Add new keys to the given input
399
+
400
+ Args:
401
+ base (tensor): inputs
402
+ new (tensor): new info for the inputs
403
+ base_name (str): name for the input
404
+ new_name (str): name for the new info
405
+ device (device): pytorch device
406
+ '''
407
+ if (new is not None) and (isinstance(new, dict)):
408
+ if device is not None:
409
+ for key in new.keys():
410
+ new[key] = new[key].to(device)
411
+ base = {base_name: base,
412
+ new_name: new}
413
+ return base
414
+
415
+ class map2local(object):
416
+ ''' Map points into their local per-voxel coordinate system, optionally with positional encoding
417
+
418
+ Args:
419
+ s (float): the defined voxel size
420
+ pos_encoding (str): method for the positional encoding, linear|sin_cos
421
+ '''
422
+ def __init__(self, s, pos_encoding='linear'):
423
+ super().__init__()
424
+ self.s = s
425
+ self.pe = positional_encoding(basis_function=pos_encoding)
426
+
427
+ def __call__(self, p):
428
+ p = torch.remainder(p, self.s) / self.s # always positive
429
+ # p = torch.fmod(p, self.s) / self.s # same sign as input p!
430
+ p = self.pe(p)
431
+ return p
432
+
433
+ class positional_encoding(object):
434
+ ''' Positional Encoding (presented in NeRF)
435
+
436
+ Args:
437
+ basis_function (str): basis function
438
+ '''
439
+ def __init__(self, basis_function='sin_cos'):
440
+ super().__init__()
441
+ self.func = basis_function
442
+
443
+ L = 10
444
+ freq_bands = 2.**(np.linspace(0, L-1, L))
445
+ self.freq_bands = freq_bands * math.pi
446
+
447
+ def __call__(self, p):
448
+ if self.func == 'sin_cos':
449
+ out = []
450
+ p = 2.0 * p - 1.0 # change to the range [-1, 1]
451
+ for freq in self.freq_bands:
452
+ out.append(torch.sin(freq * p))
453
+ out.append(torch.cos(freq * p))
454
+ p = torch.cat(out, dim=2)
455
+ return p
456
+
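+ def _example_positional_encoding():
+     # Illustrative sketch: with the default 'sin_cos' basis and L = 10
+     # frequency bands, each input channel becomes 2 * 10 channels, so 3-D
+     # points are lifted to 60-D features (matching the nn.Linear(60, ...)
+     # used below when pos_encoding='sin_cos'); batch sizes are placeholders.
+     pe = positional_encoding(basis_function='sin_cos')
+     out = pe(torch.rand(2, 512, 3))
+     assert out.shape == (2, 512, 60)
+     return out
+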
457
+ # Resnet Blocks
458
+ class ResnetBlockFC(nn.Module):
459
+ ''' Fully connected ResNet Block class.
460
+
461
+ Args:
462
+ size_in (int): input dimension
463
+ size_out (int): output dimension
464
+ size_h (int): hidden dimension
465
+ '''
466
+
467
+ def __init__(self, size_in, size_out=None, size_h=None):
468
+ super().__init__()
469
+ # Attributes
470
+ if size_out is None:
471
+ size_out = size_in
472
+
473
+ if size_h is None:
474
+ size_h = min(size_in, size_out)
475
+
476
+ self.size_in = size_in
477
+ self.size_h = size_h
478
+ self.size_out = size_out
479
+ # Submodules
480
+ self.fc_0 = nn.Linear(size_in, size_h)
481
+ self.fc_1 = nn.Linear(size_h, size_out)
482
+ self.actvn = nn.ReLU()
483
+
484
+ if size_in == size_out:
485
+ self.shortcut = None
486
+ else:
487
+ self.shortcut = nn.Linear(size_in, size_out, bias=False)
488
+ # Initialization
489
+ nn.init.zeros_(self.fc_1.weight)
490
+
491
+ def forward(self, x):
492
+ net = self.fc_0(self.actvn(x))
493
+ dx = self.fc_1(self.actvn(net))
494
+
495
+ if self.shortcut is not None:
496
+ x_s = self.shortcut(x)
497
+ else:
498
+ x_s = x
499
+
500
+ return x_s + dx
501
+
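+ def _example_resnet_block_fc():
+     # Illustrative sketch of ResnetBlockFC: a fully connected residual block
+     # mapping 512-D per-point features to 256-D (sizes are placeholders).
+     block = ResnetBlockFC(size_in=512, size_out=256)
+     y = block(torch.rand(8, 100, 512))
+     assert y.shape == (8, 100, 256)
+     return y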
502
+
503
+
504
+ '''
505
+ ------------------ the key model for Pointnet ----------------------------
506
+ '''
507
+
508
+
509
+ class LocalSoftSplat(nn.Module):
510
+
511
+ def __init__(self, ch=128, dim=3, hidden_dim=128, scatter_type='max',
512
+ unet=True, unet_kwargs=None, unet3d=False, unet3d_kwargs=None,
513
+ hw=None, grid_resolution=None, plane_type='xz', padding=0.1,
514
+ n_blocks=4, splat_func=None):
515
+ super().__init__()
516
+ c_dim = ch
517
+
518
+ self.c_dim = c_dim
519
+
520
+ self.fc_pos = nn.Linear(dim, 2*hidden_dim)
521
+ self.blocks = nn.ModuleList([
522
+ ResnetBlockFC(2*hidden_dim, hidden_dim) for i in range(n_blocks)
523
+ ])
524
+ self.fc_c = nn.Linear(hidden_dim, c_dim)
525
+
526
+ self.actvn = nn.ReLU()
527
+ self.hidden_dim = hidden_dim
528
+
529
+ if unet:
530
+ self.unet = UNet(c_dim, in_channels=c_dim, **unet_kwargs)
531
+ else:
532
+ self.unet = None
533
+
534
+ # get splat func
535
+ self.splat_func = splat_func
536
+
+ def forward(self, img_feat,
537
+ Fxy2xz, Fxy2yz, Dz, gridxy=None):
538
+ """
539
+ Args:
540
+ img_feat (tensor): image features
541
+ Fxy2xz (tensor): flow field that splats xy-plane features onto the xz plane
542
+ Fxy2yz (tensor): flow field that splats xy-plane features onto the yz plane
543
+ """
544
+ B, T, _, H, W = img_feat.shape
545
+ fea_reshp = rearrange(img_feat, 'b t c h w -> (b h w) t c',
546
+ c=img_feat.shape[2], h=H, w=W)
547
+
548
+ gridyz = gridxy + Fxy2yz
549
+ gridxz = gridxy + Fxy2xz
550
+ # normalize
551
+ gridyz[:, 0, ...] = (gridyz[:, 0, ...] / (H - 1) - 0.5) * 2
552
+ gridyz[:, 1, ...] = (gridyz[:, 1, ...] / (Dz - 1) - 0.5) * 2
553
+ gridxz[:, 0, ...] = (gridxz[:, 0, ...] / (W - 1) - 0.5) * 2
554
+ gridxz[:, 1, ...] = (gridxz[:, 1, ...] / (Dz - 1) - 0.5) * 2
555
+ if len(self.blocks) > 0:
556
+ net = self.fc_pos(fea_reshp)
557
+ net = self.blocks[0](net)
558
+ for block in self.blocks[1:]:
559
+ # splat and fusion
560
+ net_plane = rearrange(net, '(b h w) t c -> (b t) c h w', b=B, h=H, w=W)
561
+
562
+ net_planeYZ = self.splat_func(net_plane, Fxy2yz, None,
563
+ strMode="avg", tenoutH=Dz, tenoutW=H)
564
+
565
+ net_planeXZ = self.splat_func(net_plane, Fxy2xz, None,
566
+ strMode="avg", tenoutH=Dz, tenoutW=W)
567
+
568
+ net_plane = net_plane + (
569
+ F.grid_sample(
570
+ net_planeYZ, gridyz.permute(0,2,3,1), mode='bilinear', padding_mode='border') +
571
+ F.grid_sample(
572
+ net_planeXZ, gridxz.permute(0,2,3,1), mode='bilinear', padding_mode='border')
573
+ )
574
+
575
+ pooled = rearrange(net_plane, 't c h w -> (h w) t c',
576
+ c=net_plane.shape[1], h=H, w=W)
577
+
578
+ net = torch.cat([net, pooled], dim=2)
579
+ net = block(net)
580
+
581
+ c = self.fc_c(net)
582
+ net_plane = rearrange(c, '(b h w) t c -> (b t) c h w', b=B, h=H, w=W)
583
+ else:
584
+ net_plane = rearrange(img_feat, 'b t c h w -> (b t) c h w',
585
+ c=img_feat.shape[2], h=H, w=W)
586
+ net_planeYZ = self.splat_func(net_plane, Fxy2yz, None,
587
+ strMode="avg", tenoutH=Dz, tenoutW=H)
588
+ net_planeXZ = self.splat_func(net_plane, Fxy2xz, None,
589
+ strMode="avg", tenoutH=Dz, tenoutW=W)
590
+
591
+ return net_plane[None], net_planeYZ[None], net_planeXZ[None]
592
+
593
+
594
+
595
+ class LocalPoolPointnet(nn.Module):
596
+ ''' PointNet-based encoder network with ResNet blocks for each point.
597
+ The number of input points is fixed.
598
+
599
+ Args:
600
+ c_dim (int): dimension of latent code c
601
+ dim (int): input points dimension
602
+ hidden_dim (int): hidden dimension of the network
603
+ scatter_type (str): feature aggregation when doing local pooling
604
+ unet (bool): whether to use U-Net
605
+ unet_kwargs (str): U-Net parameters
606
+ unet3d (bool): whether to use 3D U-Net
607
+ unet3d_kwargs (str): 3D U-Net parameters
608
+ plane_resolution (int): defined resolution for plane feature
609
+ grid_resolution (int): defined resolution for grid feature
610
+ plane_type (str): feature type, 'xz' - 1-plane, ['xz', 'xy', 'yz'] - 3-plane, ['grid'] - 3D grid volume
611
+ padding (float): conventional padding parameter of ONet for unit cube, so [-0.5, 0.5] -> [-0.55, 0.55]
612
+ n_blocks (int): number of ResnetBlockFC layers
613
+ '''
614
+
615
+ def __init__(self, ch=128, dim=3, hidden_dim=128, scatter_type='max',
616
+ unet=True, unet_kwargs=None, unet3d=False, unet3d_kwargs=None,
617
+ hw=None, grid_resolution=None, plane_type='xz', padding=0.1, n_blocks=5):
618
+ super().__init__()
619
+ c_dim = ch
620
+ unet3d = False
621
+ plane_type = ['xy', 'xz', 'yz']
622
+ plane_resolution = hw
623
+
624
+ self.c_dim = c_dim
625
+
626
+ self.fc_pos = nn.Linear(dim, 2*hidden_dim)
627
+ self.blocks = nn.ModuleList([
628
+ ResnetBlockFC(2*hidden_dim, hidden_dim) for i in range(n_blocks)
629
+ ])
630
+ self.fc_c = nn.Linear(hidden_dim, c_dim)
631
+
632
+ self.actvn = nn.ReLU()
633
+ self.hidden_dim = hidden_dim
634
+
635
+ if unet:
636
+ self.unet = UNet(c_dim, in_channels=c_dim, **unet_kwargs)
637
+ else:
638
+ self.unet = None
639
+
640
+ if unet3d:
641
+ # self.unet3d = UNet3D(**unet3d_kwargs)
642
+ raise NotImplementedError()
643
+ else:
644
+ self.unet3d = None
645
+
646
+ self.reso_plane = plane_resolution
647
+ self.reso_grid = grid_resolution
648
+ self.plane_type = plane_type
649
+ self.padding = padding
650
+
651
+ if scatter_type == 'max':
652
+ self.scatter = scatter_max
653
+ elif scatter_type == 'mean':
654
+ self.scatter = scatter_mean
655
+ else:
656
+ raise ValueError('incorrect scatter type')
657
+
658
+ def generate_plane_features(self, p, c, plane='xz'):
659
+ # acquire indices of features in plane
660
+ xy = normalize_coordinate(p.clone(), plane=plane, padding=self.padding) # normalize to the range of (0, 1)
661
+ index = coordinate2index(xy, self.reso_plane)
662
+
663
+ # scatter plane features from points
664
+ fea_plane = c.new_zeros(p.size(0), self.c_dim, self.reso_plane**2)
665
+ c = c.permute(0, 2, 1) # B x 512 x T
666
+ fea_plane = scatter_mean(c, index, out=fea_plane) # B x 512 x reso^2
667
+ fea_plane = fea_plane.reshape(p.size(0), self.c_dim, self.reso_plane, self.reso_plane) # sparse matrix (B x c_dim x reso x reso)
668
+
669
+ # process the plane features with UNet
670
+ if self.unet is not None:
671
+ fea_plane = self.unet(fea_plane)
672
+
673
+ return fea_plane
674
+
675
+ def generate_grid_features(self, p, c):
676
+ p_nor = normalize_3d_coordinate(p.clone(), padding=self.padding)
677
+ index = coordinate2index(p_nor, self.reso_grid, coord_type='3d')
678
+ # scatter grid features from points
679
+ fea_grid = c.new_zeros(p.size(0), self.c_dim, self.reso_grid**3)
680
+ c = c.permute(0, 2, 1)
681
+ fea_grid = scatter_mean(c, index, out=fea_grid) # B x C x reso^3
682
+ fea_grid = fea_grid.reshape(p.size(0), self.c_dim, self.reso_grid, self.reso_grid, self.reso_grid) # sparse grid volume (B x c_dim x reso x reso x reso)
683
+
684
+ if self.unet3d is not None:
685
+ fea_grid = self.unet3d(fea_grid)
686
+
687
+ return fea_grid
688
+
689
+ def pool_local(self, xy, index, c):
690
+ bs, fea_dim = c.size(0), c.size(2)
691
+ keys = xy.keys()
692
+
693
+ c_out = 0
694
+ for key in keys:
695
+ # scatter plane features from points
696
+ if key == 'grid':
697
+ fea = self.scatter(c.permute(0, 2, 1), index[key], dim_size=self.reso_grid**3)
698
+ else:
699
+ c_permute = c.permute(0, 2, 1)
700
+ fea = self.scatter(c_permute, index[key], dim_size=self.reso_plane**2)
701
+ if self.scatter == scatter_max:
702
+ fea = fea[0]
703
+ # gather feature back to points
704
+ fea = fea.gather(dim=2, index=index[key].expand(-1, fea_dim, -1))
705
+ c_out = c_out + fea
706
+ return c_out.permute(0, 2, 1)
707
+
708
+
709
+ def forward(self, p_input, img_feats=None):
710
+ """
711
+ Args:
712
+ p_input (tensor): input points T 3 H W
713
+ img_feats (tensor): image features T C H W
714
+ """
715
+ T, _, H, W = img_feats.size()
716
+ p = rearrange(p_input, 't c h w -> (h w) t c', c=3, h=H, w=W)
717
+ fea_reshp = rearrange(img_feats, 't c h w -> (h w) t c',
718
+ c=img_feats.shape[1], h=H, w=W)
719
+
720
+ # acquire the index for each point
721
+ coord = {}
722
+ index = {}
723
+ if 'xz' in self.plane_type:
724
+ coord['xz'] = normalize_coordinate(p.clone(), plane='xz', padding=self.padding)
725
+ index['xz'] = coordinate2index(coord['xz'], self.reso_plane)
726
+ if 'xy' in self.plane_type:
727
+ coord['xy'] = normalize_coordinate(p.clone(), plane='xy', padding=self.padding)
728
+ index['xy'] = coordinate2index(coord['xy'], self.reso_plane)
729
+ if 'yz' in self.plane_type:
730
+ coord['yz'] = normalize_coordinate(p.clone(), plane='yz', padding=self.padding)
731
+ index['yz'] = coordinate2index(coord['yz'], self.reso_plane)
732
+ if 'grid' in self.plane_type:
733
+ coord['grid'] = normalize_3d_coordinate(p.clone(), padding=self.padding)
734
+ index['grid'] = coordinate2index(coord['grid'], self.reso_grid, coord_type='3d')
735
+
736
+ net = self.fc_pos(p) + fea_reshp
737
+ net = self.blocks[0](net)
738
+ for block in self.blocks[1:]:
739
+ pooled = self.pool_local(coord, index, net)
740
+ net = torch.cat([net, pooled], dim=2)
741
+ net = block(net)
742
+
743
+ c = self.fc_c(net)
744
+
745
+ fea = {}
746
+
747
+ if 'grid' in self.plane_type:
748
+ fea['grid'] = self.generate_grid_features(p, c)
749
+ if 'xz' in self.plane_type:
750
+ fea['xz'] = self.generate_plane_features(p, c, plane='xz')
751
+ if 'xy' in self.plane_type:
752
+ fea['xy'] = self.generate_plane_features(p, c, plane='xy')
753
+ if 'yz' in self.plane_type:
754
+ fea['yz'] = self.generate_plane_features(p, c, plane='yz')
755
+
756
+ ret = torch.stack([fea['xy'], fea['xz'], fea['yz']]).permute((1, 0, 2, 3, 4))
757
+ return ret
758
+
759
+ class PatchLocalPoolPointnet(nn.Module):
760
+ ''' PointNet-based encoder network with ResNet blocks.
761
+ First transform input points to local system based on the given voxel size.
762
+ Supports a non-fixed number of input points, but the indices need to be precomputed
763
+
764
+ Args:
765
+ c_dim (int): dimension of latent code c
766
+ dim (int): input points dimension
767
+ hidden_dim (int): hidden dimension of the network
768
+ scatter_type (str): feature aggregation when doing local pooling
769
+ unet (bool): whether to use U-Net
770
+ unet_kwargs (str): U-Net parameters
771
+ unet3d (bool): whether to use 3D U-Net
772
+ unet3d_kwargs (str): 3D U-Net parameters
773
+ plane_resolution (int): defined resolution for plane feature
774
+ grid_resolution (int): defined resolution for grid feature
775
+ plane_type (str): feature type, 'xz' - 1-plane, ['xz', 'xy', 'yz'] - 3-plane, ['grid'] - 3D grid volume
776
+ padding (float): conventional padding parameter of ONet for unit cube, so [-0.5, 0.5] -> [-0.55, 0.55]
777
+ n_blocks (int): number of ResnetBlockFC layers
778
+ local_coord (bool): whether to use local coordinate
779
+ pos_encoding (str): method for the positional encoding, linear|sin_cos
780
+ unit_size (float): defined voxel unit size for local system
781
+ '''
782
+
783
+ def __init__(self, c_dim=128, dim=3, hidden_dim=128, scatter_type='max',
784
+ unet=False, unet_kwargs=None, unet3d=False, unet3d_kwargs=None,
785
+ plane_resolution=None, grid_resolution=None, plane_type='xz', padding=0.1, n_blocks=5,
786
+ local_coord=False, pos_encoding='linear', unit_size=0.1):
787
+ super().__init__()
788
+ self.c_dim = c_dim
789
+
790
+ self.blocks = nn.ModuleList([
791
+ ResnetBlockFC(2*hidden_dim, hidden_dim) for i in range(n_blocks)
792
+ ])
793
+ self.fc_c = nn.Linear(hidden_dim, c_dim)
794
+
795
+ self.actvn = nn.ReLU()
796
+ self.hidden_dim = hidden_dim
797
+ self.reso_plane = plane_resolution
798
+ self.reso_grid = grid_resolution
799
+ self.plane_type = plane_type
800
+ self.padding = padding
801
+
802
+ if unet:
803
+ self.unet = UNet(c_dim, in_channels=c_dim, **unet_kwargs)
804
+ else:
805
+ self.unet = None
806
+
807
+ if unet3d:
808
+ # self.unet3d = UNet3D(**unet3d_kwargs)
809
+ raise NotImplementedError()
810
+ else:
811
+ self.unet3d = None
812
+
813
+ if scatter_type == 'max':
814
+ self.scatter = scatter_max
815
+ elif scatter_type == 'mean':
816
+ self.scatter = scatter_mean
817
+ else:
818
+ raise ValueError('incorrect scatter type')
819
+
820
+ if local_coord:
821
+ self.map2local = map2local(unit_size, pos_encoding=pos_encoding)
822
+ else:
823
+ self.map2local = None
824
+
825
+ if pos_encoding == 'sin_cos':
826
+ self.fc_pos = nn.Linear(60, 2*hidden_dim)
827
+ else:
828
+ self.fc_pos = nn.Linear(dim, 2*hidden_dim)
829
+
830
+ def generate_plane_features(self, index, c):
831
+ c = c.permute(0, 2, 1)
832
+ # scatter plane features from points
833
+ if index.max() < self.reso_plane**2:
834
+ fea_plane = c.new_zeros(c.size(0), self.c_dim, self.reso_plane**2)
835
+ fea_plane = scatter_mean(c, index, out=fea_plane) # B x c_dim x reso^2
836
+ else:
837
+ fea_plane = scatter_mean(c, index) # B x c_dim x reso^2
838
+ if fea_plane.shape[-1] > self.reso_plane**2: # deal with outliers
839
+ fea_plane = fea_plane[:, :, :-1]
840
+
841
+ fea_plane = fea_plane.reshape(c.size(0), self.c_dim, self.reso_plane, self.reso_plane)
842
+
843
+ # process the plane features with UNet
844
+ if self.unet is not None:
845
+ fea_plane = self.unet(fea_plane)
846
+
847
+ return fea_plane
848
+
849
+ def generate_grid_features(self, index, c):
850
+ # scatter grid features from points
851
+ c = c.permute(0, 2, 1)
852
+ if index.max() < self.reso_grid**3:
853
+ fea_grid = c.new_zeros(c.size(0), self.c_dim, self.reso_grid**3)
854
+ fea_grid = scatter_mean(c, index, out=fea_grid) # B x c_dim x reso^3
855
+ else:
856
+ fea_grid = scatter_mean(c, index) # B x c_dim x reso^3
857
+ if fea_grid.shape[-1] > self.reso_grid**3: # deal with outliers
858
+ fea_grid = fea_grid[:, :, :-1]
859
+ fea_grid = fea_grid.reshape(c.size(0), self.c_dim, self.reso_grid, self.reso_grid, self.reso_grid)
860
+
861
+ if self.unet3d is not None:
862
+ fea_grid = self.unet3d(fea_grid)
863
+
864
+ return fea_grid
865
+
866
+ def pool_local(self, index, c):
867
+ bs, fea_dim = c.size(0), c.size(2)
868
+ keys = index.keys()
869
+
870
+ c_out = 0
871
+ for key in keys:
872
+ # scatter plane features from points
873
+ if key == 'grid':
874
+ fea = self.scatter(c.permute(0, 2, 1), index[key])
875
+ else:
876
+ fea = self.scatter(c.permute(0, 2, 1), index[key])
877
+ if self.scatter == scatter_max:
878
+ fea = fea[0]
879
+ # gather feature back to points
880
+ fea = fea.gather(dim=2, index=index[key].expand(-1, fea_dim, -1))
881
+ c_out += fea
882
+ return c_out.permute(0, 2, 1)
883
+
884
+
885
+ def forward(self, inputs):
886
+ p = inputs['points']
887
+ index = inputs['index']
888
+
889
+ batch_size, T, D = p.size()
890
+
891
+ if self.map2local:
892
+ pp = self.map2local(p)
893
+ net = self.fc_pos(pp)
894
+ else:
895
+ net = self.fc_pos(p)
896
+
897
+ net = self.blocks[0](net)
898
+ for block in self.blocks[1:]:
899
+ pooled = self.pool_local(index, net)
900
+ net = torch.cat([net, pooled], dim=2)
901
+ net = block(net)
902
+
903
+ c = self.fc_c(net)
904
+
905
+ fea = {}
906
+ if 'grid' in self.plane_type:
907
+ fea['grid'] = self.generate_grid_features(index['grid'], c)
908
+ if 'xz' in self.plane_type:
909
+ fea['xz'] = self.generate_plane_features(index['xz'], c)
910
+ if 'xy' in self.plane_type:
911
+ fea['xy'] = self.generate_plane_features(index['xy'], c)
912
+ if 'yz' in self.plane_type:
913
+ fea['yz'] = self.generate_plane_features(index['yz'], c)
914
+
915
+ return fea
models/spatracker/models/core/spatracker/loftr/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .transformer import LocalFeatureTransformer
models/spatracker/models/core/spatracker/loftr/linear_attention.py ADDED
@@ -0,0 +1,81 @@
1
+ """
2
+ Linear Transformer proposed in "Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention"
3
+ Modified from: https://github.com/idiap/fast-transformers/blob/master/fast_transformers/attention/linear_attention.py
4
+ """
5
+
6
+ import torch
7
+ from torch.nn import Module, Dropout
8
+
9
+
10
+ def elu_feature_map(x):
11
+ return torch.nn.functional.elu(x) + 1
12
+
13
+
14
+ class LinearAttention(Module):
15
+ def __init__(self, eps=1e-6):
16
+ super().__init__()
17
+ self.feature_map = elu_feature_map
18
+ self.eps = eps
19
+
20
+ def forward(self, queries, keys, values, q_mask=None, kv_mask=None):
21
+ """ Multi-Head linear attention proposed in "Transformers are RNNs"
22
+ Args:
23
+ queries: [N, L, H, D]
24
+ keys: [N, S, H, D]
25
+ values: [N, S, H, D]
26
+ q_mask: [N, L]
27
+ kv_mask: [N, S]
28
+ Returns:
29
+ queried_values: (N, L, H, D)
30
+ """
31
+ Q = self.feature_map(queries)
32
+ K = self.feature_map(keys)
33
+
34
+ # set padded position to zero
35
+ if q_mask is not None:
36
+ Q = Q * q_mask[:, :, None, None]
37
+ if kv_mask is not None:
38
+ K = K * kv_mask[:, :, None, None]
39
+ values = values * kv_mask[:, :, None, None]
40
+
41
+ v_length = values.size(1)
42
+ values = values / v_length # prevent fp16 overflow
43
+ KV = torch.einsum("nshd,nshv->nhdv", K, values) # (S,D)' @ S,V
44
+ Z = 1 / (torch.einsum("nlhd,nhd->nlh", Q, K.sum(dim=1)) + self.eps)
45
+ queried_values = torch.einsum("nlhd,nhdv,nlh->nlhv", Q, KV, Z) * v_length
46
+
47
+ return queried_values.contiguous()
48
+
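+ def _example_linear_attention():
+     # Illustrative sketch of LinearAttention with dummy features:
+     # N=2 sequences, L=100 queries, S=120 keys/values, H=4 heads,
+     # D=32 channels per head (all sizes are placeholders).
+     attn = LinearAttention()
+     q = torch.rand(2, 100, 4, 32)
+     k = torch.rand(2, 120, 4, 32)
+     v = torch.rand(2, 120, 4, 32)
+     out = attn(q, k, v)
+     assert out.shape == (2, 100, 4, 32)
+     return out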
49
+
50
+ class FullAttention(Module):
51
+ def __init__(self, use_dropout=False, attention_dropout=0.1):
52
+ super().__init__()
53
+ self.use_dropout = use_dropout
54
+ self.dropout = Dropout(attention_dropout)
55
+
56
+ def forward(self, queries, keys, values, q_mask=None, kv_mask=None):
57
+ """ Multi-head scaled dot-product attention, a.k.a full attention.
58
+ Args:
59
+ queries: [N, L, H, D]
60
+ keys: [N, S, H, D]
61
+ values: [N, S, H, D]
62
+ q_mask: [N, L]
63
+ kv_mask: [N, S]
64
+ Returns:
65
+ queried_values: (N, L, H, D)
66
+ """
67
+
68
+ # Compute the unnormalized attention and apply the masks
69
+ QK = torch.einsum("nlhd,nshd->nlsh", queries, keys)
70
+ if kv_mask is not None:
71
+ QK.masked_fill_(~(q_mask[:, :, None, None] * kv_mask[:, None, :, None]), float('-inf'))
72
+
73
+ # Compute the attention and the weighted average
74
+ softmax_temp = 1. / queries.size(3)**.5 # 1 / sqrt(D)
75
+ A = torch.softmax(softmax_temp * QK, dim=2)
76
+ if self.use_dropout:
77
+ A = self.dropout(A)
78
+
79
+ queried_values = torch.einsum("nlsh,nshd->nlhd", A, values)
80
+
81
+ return queried_values.contiguous()
models/spatracker/models/core/spatracker/loftr/transformer.py ADDED
@@ -0,0 +1,142 @@
1
+ '''
2
+ modified from
3
+ https://github.com/zju3dv/LoFTR/blob/master/src/loftr/loftr_module/transformer.py
4
+ '''
5
+ import torch
6
+ from torch.nn import Module, Dropout
7
+ import copy
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+
12
+ def elu_feature_map(x):
13
+ return torch.nn.functional.elu(x) + 1
14
+
15
+ class FullAttention(Module):
16
+ def __init__(self, use_dropout=False, attention_dropout=0.1):
17
+ super().__init__()
18
+ self.use_dropout = use_dropout
19
+ self.dropout = Dropout(attention_dropout)
20
+
21
+ def forward(self, queries, keys, values, q_mask=None, kv_mask=None):
22
+ """ Multi-head scaled dot-product attention, a.k.a full attention.
23
+ Args:
24
+ queries: [N, L, H, D]
25
+ keys: [N, S, H, D]
26
+ values: [N, S, H, D]
27
+ q_mask: [N, L]
28
+ kv_mask: [N, S]
29
+ Returns:
30
+ queried_values: (N, L, H, D)
31
+ """
32
+
33
+ # Compute the unnormalized attention and apply the masks
34
+ # QK = torch.einsum("nlhd,nshd->nlsh", queries, keys)
35
+ # if kv_mask is not None:
36
+ # QK.masked_fill_(~(q_mask[:, :, None, None] * kv_mask[:, None, :, None]), float(-1e12))
37
+ # softmax_temp = 1. / queries.size(3)**.5 # sqrt(D)
38
+ # A = torch.softmax(softmax_temp * QK, dim=2)
39
+ # if self.use_dropout:
40
+ # A = self.dropout(A)
41
+ # queried_values_ = torch.einsum("nlsh,nshd->nlhd", A, values)
42
+
43
+ # Compute the attention and the weighted average
44
+ input_args = [x.half().contiguous() for x in [queries.permute(0,2,1,3), keys.permute(0,2,1,3), values.permute(0,2,1,3)]]
45
+ queried_values = F.scaled_dot_product_attention(*input_args).permute(0,2,1,3).float() # type: ignore
46
+
47
+
48
+ return queried_values.contiguous()
49
+
50
+ class TransformerEncoderLayer(nn.Module):
51
+ def __init__(self,
52
+ d_model,
53
+ nhead,):
54
+ super(TransformerEncoderLayer, self).__init__()
55
+
56
+ self.dim = d_model // nhead
57
+ self.nhead = nhead
58
+
59
+ # multi-head attention
60
+ self.q_proj = nn.Linear(d_model, d_model, bias=False)
61
+ self.k_proj = nn.Linear(d_model, d_model, bias=False)
62
+ self.v_proj = nn.Linear(d_model, d_model, bias=False)
63
+ self.attention = FullAttention()
64
+ self.merge = nn.Linear(d_model, d_model, bias=False)
65
+
66
+ # feed-forward network
67
+ self.mlp = nn.Sequential(
68
+ nn.Linear(d_model*2, d_model*2, bias=False),
69
+ nn.ReLU(True),
70
+ nn.Linear(d_model*2, d_model, bias=False),
71
+ )
72
+
73
+ # norm and dropout
74
+ self.norm1 = nn.LayerNorm(d_model)
75
+ self.norm2 = nn.LayerNorm(d_model)
76
+
77
+ def forward(self, x, source, x_mask=None, source_mask=None):
78
+ """
79
+ Args:
80
+ x (torch.Tensor): [N, L, C]
81
+ source (torch.Tensor): [N, S, C]
82
+ x_mask (torch.Tensor): [N, L] (optional)
83
+ source_mask (torch.Tensor): [N, S] (optional)
84
+ """
85
+ bs = x.size(0)
86
+ query, key, value = x, source, source
87
+
88
+ # multi-head attention
89
+ query = self.q_proj(query).view(bs, -1, self.nhead, self.dim) # [N, L, (H, D)]
90
+ key = self.k_proj(key).view(bs, -1, self.nhead, self.dim) # [N, S, (H, D)]
91
+ value = self.v_proj(value).view(bs, -1, self.nhead, self.dim)
92
+ message = self.attention(query, key, value, q_mask=x_mask, kv_mask=source_mask) # [N, L, (H, D)]
93
+ message = self.merge(message.view(bs, -1, self.nhead*self.dim)) # [N, L, C]
94
+ message = self.norm1(message)
95
+
96
+ # feed-forward network
97
+ message = self.mlp(torch.cat([x, message], dim=2))
98
+ message = self.norm2(message)
99
+
100
+ return x + message
101
+
102
+ class LocalFeatureTransformer(nn.Module):
103
+ """A Local Feature Transformer module."""
104
+
105
+ def __init__(self, config):
106
+ super(LocalFeatureTransformer, self).__init__()
107
+
108
+ self.config = config
109
+ self.d_model = config['d_model']
110
+ self.nhead = config['nhead']
111
+ self.layer_names = config['layer_names']
112
+ encoder_layer = TransformerEncoderLayer(config['d_model'], config['nhead'])
113
+ self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(len(self.layer_names))])
114
+ self._reset_parameters()
115
+
116
+ def _reset_parameters(self):
117
+ for p in self.parameters():
118
+ if p.dim() > 1:
119
+ nn.init.xavier_uniform_(p)
120
+
121
+ def forward(self, feat0, feat1, mask0=None, mask1=None):
122
+ """
123
+ Args:
124
+ feat0 (torch.Tensor): [N, L, C]
125
+ feat1 (torch.Tensor): [N, S, C]
126
+ mask0 (torch.Tensor): [N, L] (optional)
127
+ mask1 (torch.Tensor): [N, S] (optional)
128
+ """
129
+
130
+ assert self.d_model == feat0.size(2), "the feature number of src and transformer must be equal"
131
+
132
+ for layer, name in zip(self.layers, self.layer_names):
133
+ if name == 'self':
134
+ feat0 = layer(feat0, feat0, mask0, mask0)
135
+ feat1 = layer(feat1, feat1, mask1, mask1)
136
+ elif name == 'cross':
137
+ feat0 = layer(feat0, feat1, mask0, mask1)
138
+ feat1 = layer(feat1, feat0, mask1, mask0)
139
+ else:
140
+ raise KeyError
141
+
142
+ return feat0, feat1
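+
+ def _example_local_feature_transformer():
+     # Illustrative sketch of the config dict expected by the constructor
+     # above; the concrete values are placeholders.  FullAttention casts to
+     # half precision, so running the forward pass end to end is best done
+     # on a CUDA device.
+     config = {
+         'd_model': 256,
+         'nhead': 8,
+         'layer_names': ['self', 'cross', 'self', 'cross'],
+     }
+     model = LocalFeatureTransformer(config)
+     feat0, feat1 = torch.rand(1, 100, 256), torch.rand(1, 120, 256)
+     return model, feat0, feat1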
models/spatracker/models/core/spatracker/losses.py ADDED
@@ -0,0 +1,90 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from models.spatracker.models.core.model_utils import reduce_masked_mean
10
+ from models.spatracker.models.core.spatracker.blocks import (
11
+ pix2cam
12
+ )
13
+ from models.spatracker.models.core.model_utils import (
14
+ bilinear_sample2d
15
+ )
16
+
17
+ EPS = 1e-6
18
+ import torchvision.transforms.functional as TF
19
+
20
+ sigma = 3
21
+ x_grid = torch.arange(-7,8,1)
22
+ y_grid = torch.arange(-7,8,1)
23
+ x_grid, y_grid = torch.meshgrid(x_grid, y_grid, indexing='ij')
24
+ gridxy = torch.stack([x_grid, y_grid], dim=-1).float()
25
+ gs_kernel = torch.exp(-torch.sum(gridxy**2, dim=-1)/(2*sigma**2))
26
+
27
+
28
+ def balanced_ce_loss(pred, gt, valid=None):
29
+ total_balanced_loss = 0.0
30
+ for j in range(len(gt)):
31
+ B, S, N = gt[j].shape
32
+ # pred and gt are the same shape
33
+ for (a, b) in zip(pred[j].size(), gt[j].size()):
34
+ assert a == b # some shape mismatch!
35
+ # if valid is not None:
36
+ for (a, b) in zip(pred[j].size(), valid[j].size()):
37
+ assert a == b # some shape mismatch!
38
+
39
+ pos = (gt[j] > 0.95).float()
40
+ neg = (gt[j] < 0.05).float()
41
+
42
+ label = pos * 2.0 - 1.0
43
+ a = -label * pred[j]
44
+ b = F.relu(a)
45
+ loss = b + torch.log(torch.exp(-b) + torch.exp(a - b))
46
+
47
+ pos_loss = reduce_masked_mean(loss, pos * valid[j])
48
+ neg_loss = reduce_masked_mean(loss, neg * valid[j])
49
+ balanced_loss = pos_loss + neg_loss
50
+ total_balanced_loss += balanced_loss / float(N)
51
52
+ return total_balanced_loss
53
+
54
+
55
+ def sequence_loss(flow_preds, flow_gt, vis, valids, gamma=0.8,
56
+ intr=None, trajs_g_all=None):
57
+ """Loss function defined over sequence of flow predictions"""
58
+ total_flow_loss = 0.0
59
+
60
+ for j in range(len(flow_gt)):
61
+ B, S, N, D = flow_gt[j].shape
62
+ # assert D == 3
63
+ B, S1, N = vis[j].shape
64
+ B, S2, N = valids[j].shape
65
+ assert S == S1
66
+ assert S == S2
67
+ n_predictions = len(flow_preds[j])
68
+ if intr is not None:
69
+ intr_i = intr[j]
70
+ flow_loss = 0.0
71
+ for i in range(n_predictions):
72
+ i_weight = gamma ** (n_predictions - i - 1)
73
+ flow_pred = flow_preds[j][i][..., -N:, :D]
74
+ flow_gt_j = flow_gt[j].clone()
75
+ if intr is not None:
76
+ xyz_j_gt = pix2cam(flow_gt_j, intr_i)
77
+ i_loss = (flow_pred - flow_gt_j).abs() # B, S, N, D
81
+ if D==3:
82
+ i_loss[...,2]*=30
83
+ i_loss = torch.mean(i_loss, dim=3) # B, S, N
84
+ flow_loss += i_weight * (reduce_masked_mean(i_loss, valids[j]))
85
+
86
+ flow_loss = flow_loss / n_predictions
87
+ total_flow_loss += flow_loss / float(N)
88
+
89
+
90
+ return total_flow_loss
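+
+ def _example_sequence_loss_weights(n_predictions=4, gamma=0.8):
+     # Worked example of the exponential weighting used in sequence_loss:
+     # later refinement iterations get larger weights, e.g. for 4 predictions
+     # and gamma=0.8 the weights are [0.512, 0.64, 0.8, 1.0].
+     return [gamma ** (n_predictions - i - 1) for i in range(n_predictions)]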
models/spatracker/models/core/spatracker/softsplat.py ADDED
@@ -0,0 +1,539 @@
1
+ #!/usr/bin/env python
2
+
3
+ """The code of softsplat function is modified from:
4
+ https://github.com/sniklaus/softmax-splatting/blob/master/softsplat.py
5
+
6
+ """
7
+
8
+
9
+ import collections
10
+ import cupy
11
+ import os
12
+ import re
13
+ import torch
14
+ import typing
15
+
16
+
17
+ ##########################################################
18
+
19
+
20
+ objCudacache = {}
21
+
22
+
23
+ def cuda_int32(intIn:int):
24
+ return cupy.int32(intIn)
25
+ # end
26
+
27
+
28
+ def cuda_float32(fltIn:float):
29
+ return cupy.float32(fltIn)
30
+ # end
31
+
32
+
33
+ def cuda_kernel(strFunction:str, strKernel:str, objVariables:typing.Dict):
34
+ if 'device' not in objCudacache:
35
+ objCudacache['device'] = torch.cuda.get_device_name()
36
+ # end
37
+
38
+ strKey = strFunction
39
+
40
+ for strVariable in objVariables:
41
+ objValue = objVariables[strVariable]
42
+
43
+ strKey += strVariable
44
+
45
+ if objValue is None:
46
+ continue
47
+
48
+ elif type(objValue) == int:
49
+ strKey += str(objValue)
50
+
51
+ elif type(objValue) == float:
52
+ strKey += str(objValue)
53
+
54
+ elif type(objValue) == bool:
55
+ strKey += str(objValue)
56
+
57
+ elif type(objValue) == str:
58
+ strKey += objValue
59
+
60
+ elif type(objValue) == torch.Tensor:
61
+ strKey += str(objValue.dtype)
62
+ strKey += str(objValue.shape)
63
+ strKey += str(objValue.stride())
64
+
65
+ elif True:
66
+ print(strVariable, type(objValue))
67
+ assert(False)
68
+
69
+ # end
70
+ # end
71
+
72
+ strKey += objCudacache['device']
73
+
74
+ if strKey not in objCudacache:
75
+ for strVariable in objVariables:
76
+ objValue = objVariables[strVariable]
77
+
78
+ if objValue is None:
79
+ continue
80
+
81
+ elif type(objValue) == int:
82
+ strKernel = strKernel.replace('{{' + strVariable + '}}', str(objValue))
83
+
84
+ elif type(objValue) == float:
85
+ strKernel = strKernel.replace('{{' + strVariable + '}}', str(objValue))
86
+
87
+ elif type(objValue) == bool:
88
+ strKernel = strKernel.replace('{{' + strVariable + '}}', str(objValue))
89
+
90
+ elif type(objValue) == str:
91
+ strKernel = strKernel.replace('{{' + strVariable + '}}', objValue)
92
+
93
+ elif type(objValue) == torch.Tensor and objValue.dtype == torch.uint8:
94
+ strKernel = strKernel.replace('{{type}}', 'unsigned char')
95
+
96
+ elif type(objValue) == torch.Tensor and objValue.dtype == torch.float16:
97
+ strKernel = strKernel.replace('{{type}}', 'half')
98
+
99
+ elif type(objValue) == torch.Tensor and objValue.dtype == torch.float32:
100
+ strKernel = strKernel.replace('{{type}}', 'float')
101
+
102
+ elif type(objValue) == torch.Tensor and objValue.dtype == torch.float64:
103
+ strKernel = strKernel.replace('{{type}}', 'double')
104
+
105
+ elif type(objValue) == torch.Tensor and objValue.dtype == torch.int32:
106
+ strKernel = strKernel.replace('{{type}}', 'int')
107
+
108
+ elif type(objValue) == torch.Tensor and objValue.dtype == torch.int64:
109
+ strKernel = strKernel.replace('{{type}}', 'long')
110
+
111
+ elif type(objValue) == torch.Tensor:
112
+ print(strVariable, objValue.dtype)
113
+ assert(False)
114
+
115
+ elif True:
116
+ print(strVariable, type(objValue))
117
+ assert(False)
118
+
119
+ # end
120
+ # end
121
+
122
+ while True:
123
+ objMatch = re.search(r'(SIZE_)([0-4])(\()([^\)]*)(\))', strKernel)
124
+
125
+ if objMatch is None:
126
+ break
127
+ # end
128
+
129
+ intArg = int(objMatch.group(2))
130
+
131
+ strTensor = objMatch.group(4)
132
+ intSizes = objVariables[strTensor].size()
133
+
134
+ strKernel = strKernel.replace(objMatch.group(), str(intSizes[intArg] if torch.is_tensor(intSizes[intArg]) == False else intSizes[intArg].item()))
135
+ # end
136
+
137
+ while True:
138
+ objMatch = re.search(r'(OFFSET_)([0-4])(\()', strKernel)
139
+
140
+ if objMatch is None:
141
+ break
142
+ # end
143
+
144
+ intStart = objMatch.span()[1]
145
+ intStop = objMatch.span()[1]
146
+ intParentheses = 1
147
+
148
+ while True:
149
+ intParentheses += 1 if strKernel[intStop] == '(' else 0
150
+ intParentheses -= 1 if strKernel[intStop] == ')' else 0
151
+
152
+ if intParentheses == 0:
153
+ break
154
+ # end
155
+
156
+ intStop += 1
157
+ # end
158
+
159
+ intArgs = int(objMatch.group(2))
160
+ strArgs = strKernel[intStart:intStop].split(',')
161
+
162
+ assert(intArgs == len(strArgs) - 1)
163
+
164
+ strTensor = strArgs[0]
165
+ intStrides = objVariables[strTensor].stride()
166
+
167
+ strIndex = []
168
+
169
+ for intArg in range(intArgs):
170
+ strIndex.append('((' + strArgs[intArg + 1].replace('{', '(').replace('}', ')').strip() + ')*' + str(intStrides[intArg] if torch.is_tensor(intStrides[intArg]) == False else intStrides[intArg].item()) + ')')
171
+ # end
172
+
173
+ strKernel = strKernel.replace('OFFSET_' + str(intArgs) + '(' + strKernel[intStart:intStop] + ')', '(' + str.join('+', strIndex) + ')')
174
+ # end
175
+
176
+ while True:
177
+ objMatch = re.search(r'(VALUE_)([0-4])(\()', strKernel)
178
+
179
+ if objMatch is None:
180
+ break
181
+ # end
182
+
183
+ intStart = objMatch.span()[1]
184
+ intStop = objMatch.span()[1]
185
+ intParentheses = 1
186
+
187
+ while True:
188
+ intParentheses += 1 if strKernel[intStop] == '(' else 0
189
+ intParentheses -= 1 if strKernel[intStop] == ')' else 0
190
+
191
+ if intParentheses == 0:
192
+ break
193
+ # end
194
+
195
+ intStop += 1
196
+ # end
197
+
198
+ intArgs = int(objMatch.group(2))
199
+ strArgs = strKernel[intStart:intStop].split(',')
200
+
201
+ assert(intArgs == len(strArgs) - 1)
202
+
203
+ strTensor = strArgs[0]
204
+ intStrides = objVariables[strTensor].stride()
205
+
206
+ strIndex = []
207
+
208
+ for intArg in range(intArgs):
209
+ strIndex.append('((' + strArgs[intArg + 1].replace('{', '(').replace('}', ')').strip() + ')*' + str(intStrides[intArg] if torch.is_tensor(intStrides[intArg]) == False else intStrides[intArg].item()) + ')')
210
+ # end
211
+
212
+ strKernel = strKernel.replace('VALUE_' + str(intArgs) + '(' + strKernel[intStart:intStop] + ')', strTensor + '[' + str.join('+', strIndex) + ']')
213
+ # end
214
+
215
+ objCudacache[strKey] = {
216
+ 'strFunction': strFunction,
217
+ 'strKernel': strKernel
218
+ }
219
+ # end
220
+
221
+ return strKey
222
+ # end
223
+
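+ # For reference, the preprocessing above specialises the kernel source for the
+ # concrete tensors passed in: for a contiguous float32 tensor of shape
+ # (N, C, H, W), SIZE_1(tenIn) is replaced by the literal C, and
+ # VALUE_4(tenIn, intN, intC, intY, intX) is rewritten to the strided read
+ # tenIn[((intN)*C*H*W)+((intC)*H*W)+((intY)*W)+((intX)*1)], so the generated
+ # CUDA code bakes the tensor sizes and strides in as constants.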
224
+
225
+ @cupy.memoize(for_each_device=True)
226
+ def cuda_launch(strKey:str):
227
+ if 'CUDA_HOME' not in os.environ:
228
+ os.environ['CUDA_HOME'] = cupy.cuda.get_cuda_path()
229
+ # end
230
+
231
+ return cupy.RawKernel(objCudacache[strKey]['strKernel'], objCudacache[strKey]['strFunction'])
232
+ # end
233
+
234
+
235
+ ##########################################################
236
+
237
+
238
+ def softsplat(tenIn:torch.Tensor, tenFlow:torch.Tensor,
239
+ tenMetric:torch.Tensor, strMode:str, tenoutH=None, tenoutW=None):
240
+ assert(strMode.split('-')[0] in ['sum', 'avg', 'linear', 'soft'])
241
+
242
+ if strMode == 'sum': assert(tenMetric is None)
243
+ if strMode == 'avg': assert(tenMetric is None)
244
+ if strMode.split('-')[0] == 'linear': assert(tenMetric is not None)
245
+ if strMode.split('-')[0] == 'soft': assert(tenMetric is not None)
246
+
247
+ if strMode == 'avg':
248
+ tenIn = torch.cat([tenIn, tenIn.new_ones([tenIn.shape[0], 1, tenIn.shape[2], tenIn.shape[3]])], 1)
249
+
250
+ elif strMode.split('-')[0] == 'linear':
251
+ tenIn = torch.cat([tenIn * tenMetric, tenMetric], 1)
252
+
253
+ elif strMode.split('-')[0] == 'soft':
254
+ tenIn = torch.cat([tenIn * tenMetric.exp(), tenMetric.exp()], 1)
255
+
256
+ # end
257
+
258
+ tenOut = softsplat_func.apply(tenIn, tenFlow, tenoutH, tenoutW)
259
+
260
+ if strMode.split('-')[0] in ['avg', 'linear', 'soft']:
261
+ tenNormalize = tenOut[:, -1:, :, :]
262
+
263
+ if len(strMode.split('-')) == 1:
264
+ tenNormalize = tenNormalize + 0.0000001
265
+
266
+ elif strMode.split('-')[1] == 'addeps':
267
+ tenNormalize = tenNormalize + 0.0000001
268
+
269
+ elif strMode.split('-')[1] == 'zeroeps':
270
+ tenNormalize[tenNormalize == 0.0] = 1.0
271
+
272
+ elif strMode.split('-')[1] == 'clipeps':
273
+ tenNormalize = tenNormalize.clip(0.0000001, None)
274
+
275
+ # end
276
+ tenOut = tenOut[:, :-1, :, :] / tenNormalize
277
+ # end
278
+
279
+ return tenOut
280
+ # end
281
+
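+ # Illustrative call (requires a CUDA device and cupy; the shapes below are
+ # placeholders, not values used elsewhere):
+ #   feat = torch.rand(1, 64, 48, 64, device='cuda')
+ #   flow = torch.zeros(1, 2, 48, 64, device='cuda')
+ #   out = softsplat(feat, flow, None, strMode='avg')          # (1, 64, 48, 64)
+ #   out2 = softsplat(feat, flow, None, strMode='avg', tenoutH=96, tenoutW=128)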
282
+
283
+ class softsplat_func(torch.autograd.Function):
284
+ @staticmethod
285
+ @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
286
+ def forward(self, tenIn, tenFlow, H=None, W=None):
287
+ if H is None:
288
+ tenOut = tenIn.new_zeros([tenIn.shape[0], tenIn.shape[1], tenIn.shape[2], tenIn.shape[3]])
289
+ else:
290
+ tenOut = tenIn.new_zeros([tenIn.shape[0], tenIn.shape[1], H, W])
291
+
292
+ if tenIn.is_cuda == True:
293
+ cuda_launch(cuda_kernel('softsplat_out', '''
294
+ extern "C" __global__ void __launch_bounds__(512) softsplat_out(
295
+ const int n,
296
+ const {{type}}* __restrict__ tenIn,
297
+ const {{type}}* __restrict__ tenFlow,
298
+ {{type}}* __restrict__ tenOut
299
+ ) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) {
300
+ const int intN = ( intIndex / SIZE_3(tenIn) / SIZE_2(tenIn) / SIZE_1(tenIn) ) % SIZE_0(tenIn);
301
+ const int intC = ( intIndex / SIZE_3(tenIn) / SIZE_2(tenIn) ) % SIZE_1(tenIn);
302
+ const int intY = ( intIndex / SIZE_3(tenIn) ) % SIZE_2(tenIn);
303
+ const int intX = ( intIndex ) % SIZE_3(tenIn);
304
+
305
+ assert(SIZE_1(tenFlow) == 2);
306
+
307
+ {{type}} fltX = ({{type}}) (intX) + VALUE_4(tenFlow, intN, 0, intY, intX);
308
+ {{type}} fltY = ({{type}}) (intY) + VALUE_4(tenFlow, intN, 1, intY, intX);
309
+
310
+ if (isfinite(fltX) == false) { return; }
311
+ if (isfinite(fltY) == false) { return; }
312
+
313
+ {{type}} fltIn = VALUE_4(tenIn, intN, intC, intY, intX);
314
+
315
+ int intNorthwestX = (int) (floor(fltX));
316
+ int intNorthwestY = (int) (floor(fltY));
317
+ int intNortheastX = intNorthwestX + 1;
318
+ int intNortheastY = intNorthwestY;
319
+ int intSouthwestX = intNorthwestX;
320
+ int intSouthwestY = intNorthwestY + 1;
321
+ int intSoutheastX = intNorthwestX + 1;
322
+ int intSoutheastY = intNorthwestY + 1;
323
+
324
+ {{type}} fltNorthwest = (({{type}}) (intSoutheastX) - fltX) * (({{type}}) (intSoutheastY) - fltY);
325
+ {{type}} fltNortheast = (fltX - ({{type}}) (intSouthwestX)) * (({{type}}) (intSouthwestY) - fltY);
326
+ {{type}} fltSouthwest = (({{type}}) (intNortheastX) - fltX) * (fltY - ({{type}}) (intNortheastY));
327
+ {{type}} fltSoutheast = (fltX - ({{type}}) (intNorthwestX)) * (fltY - ({{type}}) (intNorthwestY));
328
+
329
+ if ((intNorthwestX >= 0) && (intNorthwestX < SIZE_3(tenOut)) && (intNorthwestY >= 0) && (intNorthwestY < SIZE_2(tenOut))) {
330
+ atomicAdd(&tenOut[OFFSET_4(tenOut, intN, intC, intNorthwestY, intNorthwestX)], fltIn * fltNorthwest);
331
+ }
332
+
333
+ if ((intNortheastX >= 0) && (intNortheastX < SIZE_3(tenOut)) && (intNortheastY >= 0) && (intNortheastY < SIZE_2(tenOut))) {
334
+ atomicAdd(&tenOut[OFFSET_4(tenOut, intN, intC, intNortheastY, intNortheastX)], fltIn * fltNortheast);
335
+ }
336
+
337
+ if ((intSouthwestX >= 0) && (intSouthwestX < SIZE_3(tenOut)) && (intSouthwestY >= 0) && (intSouthwestY < SIZE_2(tenOut))) {
338
+ atomicAdd(&tenOut[OFFSET_4(tenOut, intN, intC, intSouthwestY, intSouthwestX)], fltIn * fltSouthwest);
339
+ }
340
+
341
+ if ((intSoutheastX >= 0) && (intSoutheastX < SIZE_3(tenOut)) && (intSoutheastY >= 0) && (intSoutheastY < SIZE_2(tenOut))) {
342
+ atomicAdd(&tenOut[OFFSET_4(tenOut, intN, intC, intSoutheastY, intSoutheastX)], fltIn * fltSoutheast);
343
+ }
344
+ } }
345
+ ''', {
346
+ 'tenIn': tenIn,
347
+ 'tenFlow': tenFlow,
348
+ 'tenOut': tenOut
349
+ }))(
350
+ grid=tuple([int((tenIn.nelement() + 512 - 1) / 512), 1, 1]),
351
+ block=tuple([512, 1, 1]),
352
+ args=[cuda_int32(tenOut.nelement()), tenIn.data_ptr(), tenFlow.data_ptr(), tenOut.data_ptr()],
353
+ stream=collections.namedtuple('Stream', 'ptr')(torch.cuda.current_stream().cuda_stream)
354
+ )
355
+
356
+ elif tenIn.is_cuda != True:
357
+ assert(False)
358
+
359
+ # end
360
+
361
+ self.save_for_backward(tenIn, tenFlow)
362
+
363
+ return tenOut
364
+ # end
365
+
366
+ @staticmethod
367
+ @torch.cuda.amp.custom_bwd
368
+ def backward(self, tenOutgrad):
369
+ tenIn, tenFlow = self.saved_tensors
370
+
371
+ tenOutgrad = tenOutgrad.contiguous(); assert(tenOutgrad.is_cuda == True)
372
+
373
+ tenIngrad = tenIn.new_zeros([tenIn.shape[0], tenIn.shape[1], tenIn.shape[2], tenIn.shape[3]]) if self.needs_input_grad[0] == True else None
374
+ tenFlowgrad = tenFlow.new_zeros([tenFlow.shape[0], tenFlow.shape[1], tenFlow.shape[2], tenFlow.shape[3]]) if self.needs_input_grad[1] == True else None
375
+ Hgrad = None
376
+ Wgrad = None
377
+
378
+ if tenIngrad is not None:
379
+ cuda_launch(cuda_kernel('softsplat_ingrad', '''
380
+ extern "C" __global__ void __launch_bounds__(512) softsplat_ingrad(
381
+ const int n,
382
+ const {{type}}* __restrict__ tenIn,
383
+ const {{type}}* __restrict__ tenFlow,
384
+ const {{type}}* __restrict__ tenOutgrad,
385
+ {{type}}* __restrict__ tenIngrad,
386
+ {{type}}* __restrict__ tenFlowgrad
387
+ ) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) {
388
+ const int intN = ( intIndex / SIZE_3(tenIngrad) / SIZE_2(tenIngrad) / SIZE_1(tenIngrad) ) % SIZE_0(tenIngrad);
389
+ const int intC = ( intIndex / SIZE_3(tenIngrad) / SIZE_2(tenIngrad) ) % SIZE_1(tenIngrad);
390
+ const int intY = ( intIndex / SIZE_3(tenIngrad) ) % SIZE_2(tenIngrad);
391
+ const int intX = ( intIndex ) % SIZE_3(tenIngrad);
392
+
393
+ assert(SIZE_1(tenFlow) == 2);
394
+
395
+ {{type}} fltIngrad = 0.0f;
396
+
397
+ {{type}} fltX = ({{type}}) (intX) + VALUE_4(tenFlow, intN, 0, intY, intX);
398
+ {{type}} fltY = ({{type}}) (intY) + VALUE_4(tenFlow, intN, 1, intY, intX);
399
+
400
+ if (isfinite(fltX) == false) { return; }
401
+ if (isfinite(fltY) == false) { return; }
402
+
403
+ int intNorthwestX = (int) (floor(fltX));
404
+ int intNorthwestY = (int) (floor(fltY));
405
+ int intNortheastX = intNorthwestX + 1;
406
+ int intNortheastY = intNorthwestY;
407
+ int intSouthwestX = intNorthwestX;
408
+ int intSouthwestY = intNorthwestY + 1;
409
+ int intSoutheastX = intNorthwestX + 1;
410
+ int intSoutheastY = intNorthwestY + 1;
411
+
412
+ {{type}} fltNorthwest = (({{type}}) (intSoutheastX) - fltX) * (({{type}}) (intSoutheastY) - fltY);
413
+ {{type}} fltNortheast = (fltX - ({{type}}) (intSouthwestX)) * (({{type}}) (intSouthwestY) - fltY);
414
+ {{type}} fltSouthwest = (({{type}}) (intNortheastX) - fltX) * (fltY - ({{type}}) (intNortheastY));
415
+ {{type}} fltSoutheast = (fltX - ({{type}}) (intNorthwestX)) * (fltY - ({{type}}) (intNorthwestY));
416
+
417
+ if ((intNorthwestX >= 0) && (intNorthwestX < SIZE_3(tenOutgrad)) && (intNorthwestY >= 0) && (intNorthwestY < SIZE_2(tenOutgrad))) {
418
+ fltIngrad += VALUE_4(tenOutgrad, intN, intC, intNorthwestY, intNorthwestX) * fltNorthwest;
419
+ }
420
+
421
+ if ((intNortheastX >= 0) && (intNortheastX < SIZE_3(tenOutgrad)) && (intNortheastY >= 0) && (intNortheastY < SIZE_2(tenOutgrad))) {
422
+ fltIngrad += VALUE_4(tenOutgrad, intN, intC, intNortheastY, intNortheastX) * fltNortheast;
423
+ }
424
+
425
+ if ((intSouthwestX >= 0) && (intSouthwestX < SIZE_3(tenOutgrad)) && (intSouthwestY >= 0) && (intSouthwestY < SIZE_2(tenOutgrad))) {
426
+ fltIngrad += VALUE_4(tenOutgrad, intN, intC, intSouthwestY, intSouthwestX) * fltSouthwest;
427
+ }
428
+
429
+ if ((intSoutheastX >= 0) && (intSoutheastX < SIZE_3(tenOutgrad)) && (intSoutheastY >= 0) && (intSoutheastY < SIZE_2(tenOutgrad))) {
430
+ fltIngrad += VALUE_4(tenOutgrad, intN, intC, intSoutheastY, intSoutheastX) * fltSoutheast;
431
+ }
432
+
433
+ tenIngrad[intIndex] = fltIngrad;
434
+ } }
435
+ ''', {
436
+ 'tenIn': tenIn,
437
+ 'tenFlow': tenFlow,
438
+ 'tenOutgrad': tenOutgrad,
439
+ 'tenIngrad': tenIngrad,
440
+ 'tenFlowgrad': tenFlowgrad
441
+ }))(
442
+ grid=tuple([int((tenIngrad.nelement() + 512 - 1) / 512), 1, 1]),
443
+ block=tuple([512, 1, 1]),
444
+ args=[cuda_int32(tenIngrad.nelement()), tenIn.data_ptr(), tenFlow.data_ptr(), tenOutgrad.data_ptr(), tenIngrad.data_ptr(), None],
445
+ stream=collections.namedtuple('Stream', 'ptr')(torch.cuda.current_stream().cuda_stream)
446
+ )
447
+ # end
448
+
449
+ if tenFlowgrad is not None:
450
+ cuda_launch(cuda_kernel('softsplat_flowgrad', '''
451
+ extern "C" __global__ void __launch_bounds__(512) softsplat_flowgrad(
452
+ const int n,
453
+ const {{type}}* __restrict__ tenIn,
454
+ const {{type}}* __restrict__ tenFlow,
455
+ const {{type}}* __restrict__ tenOutgrad,
456
+ {{type}}* __restrict__ tenIngrad,
457
+ {{type}}* __restrict__ tenFlowgrad
458
+ ) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) {
459
+ const int intN = ( intIndex / SIZE_3(tenFlowgrad) / SIZE_2(tenFlowgrad) / SIZE_1(tenFlowgrad) ) % SIZE_0(tenFlowgrad);
460
+ const int intC = ( intIndex / SIZE_3(tenFlowgrad) / SIZE_2(tenFlowgrad) ) % SIZE_1(tenFlowgrad);
461
+ const int intY = ( intIndex / SIZE_3(tenFlowgrad) ) % SIZE_2(tenFlowgrad);
462
+ const int intX = ( intIndex ) % SIZE_3(tenFlowgrad);
463
+
464
+ assert(SIZE_1(tenFlow) == 2);
465
+
466
+ {{type}} fltFlowgrad = 0.0f;
467
+
468
+ {{type}} fltX = ({{type}}) (intX) + VALUE_4(tenFlow, intN, 0, intY, intX);
469
+ {{type}} fltY = ({{type}}) (intY) + VALUE_4(tenFlow, intN, 1, intY, intX);
470
+
471
+ if (isfinite(fltX) == false) { return; }
472
+ if (isfinite(fltY) == false) { return; }
473
+
474
+ int intNorthwestX = (int) (floor(fltX));
475
+ int intNorthwestY = (int) (floor(fltY));
476
+ int intNortheastX = intNorthwestX + 1;
477
+ int intNortheastY = intNorthwestY;
478
+ int intSouthwestX = intNorthwestX;
479
+ int intSouthwestY = intNorthwestY + 1;
480
+ int intSoutheastX = intNorthwestX + 1;
481
+ int intSoutheastY = intNorthwestY + 1;
482
+
483
+ {{type}} fltNorthwest = 0.0f;
484
+ {{type}} fltNortheast = 0.0f;
485
+ {{type}} fltSouthwest = 0.0f;
486
+ {{type}} fltSoutheast = 0.0f;
487
+
488
+ if (intC == 0) {
489
+ fltNorthwest = (({{type}}) (-1.0f)) * (({{type}}) (intSoutheastY) - fltY);
490
+ fltNortheast = (({{type}}) (+1.0f)) * (({{type}}) (intSouthwestY) - fltY);
491
+ fltSouthwest = (({{type}}) (-1.0f)) * (fltY - ({{type}}) (intNortheastY));
492
+ fltSoutheast = (({{type}}) (+1.0f)) * (fltY - ({{type}}) (intNorthwestY));
493
+
494
+ } else if (intC == 1) {
495
+ fltNorthwest = (({{type}}) (intSoutheastX) - fltX) * (({{type}}) (-1.0f));
496
+ fltNortheast = (fltX - ({{type}}) (intSouthwestX)) * (({{type}}) (-1.0f));
497
+ fltSouthwest = (({{type}}) (intNortheastX) - fltX) * (({{type}}) (+1.0f));
498
+ fltSoutheast = (fltX - ({{type}}) (intNorthwestX)) * (({{type}}) (+1.0f));
499
+
500
+ }
501
+
502
+ for (int intChannel = 0; intChannel < SIZE_1(tenOutgrad); intChannel += 1) {
503
+ {{type}} fltIn = VALUE_4(tenIn, intN, intChannel, intY, intX);
504
+
505
+ if ((intNorthwestX >= 0) && (intNorthwestX < SIZE_3(tenOutgrad)) && (intNorthwestY >= 0) && (intNorthwestY < SIZE_2(tenOutgrad))) {
506
+ fltFlowgrad += VALUE_4(tenOutgrad, intN, intChannel, intNorthwestY, intNorthwestX) * fltIn * fltNorthwest;
507
+ }
508
+
509
+ if ((intNortheastX >= 0) && (intNortheastX < SIZE_3(tenOutgrad)) && (intNortheastY >= 0) && (intNortheastY < SIZE_2(tenOutgrad))) {
510
+ fltFlowgrad += VALUE_4(tenOutgrad, intN, intChannel, intNortheastY, intNortheastX) * fltIn * fltNortheast;
511
+ }
512
+
513
+ if ((intSouthwestX >= 0) && (intSouthwestX < SIZE_3(tenOutgrad)) && (intSouthwestY >= 0) && (intSouthwestY < SIZE_2(tenOutgrad))) {
514
+ fltFlowgrad += VALUE_4(tenOutgrad, intN, intChannel, intSouthwestY, intSouthwestX) * fltIn * fltSouthwest;
515
+ }
516
+
517
+ if ((intSoutheastX >= 0) && (intSoutheastX < SIZE_3(tenOutgrad)) && (intSoutheastY >= 0) && (intSoutheastY < SIZE_2(tenOutgrad))) {
518
+ fltFlowgrad += VALUE_4(tenOutgrad, intN, intChannel, intSoutheastY, intSoutheastX) * fltIn * fltSoutheast;
519
+ }
520
+ }
521
+
522
+ tenFlowgrad[intIndex] = fltFlowgrad;
523
+ } }
524
+ ''', {
525
+ 'tenIn': tenIn,
526
+ 'tenFlow': tenFlow,
527
+ 'tenOutgrad': tenOutgrad,
528
+ 'tenIngrad': tenIngrad,
529
+ 'tenFlowgrad': tenFlowgrad
530
+ }))(
531
+ grid=tuple([int((tenFlowgrad.nelement() + 512 - 1) / 512), 1, 1]),
532
+ block=tuple([512, 1, 1]),
533
+ args=[cuda_int32(tenFlowgrad.nelement()), tenIn.data_ptr(), tenFlow.data_ptr(), tenOutgrad.data_ptr(), None, tenFlowgrad.data_ptr()],
534
+ stream=collections.namedtuple('Stream', 'ptr')(torch.cuda.current_stream().cuda_stream)
535
+ )
536
+ # end
537
+ return tenIngrad, tenFlowgrad, Hgrad, Wgrad
538
+ # end
539
+ # end
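For context, a minimal usage sketch of the softsplat operator defined in this file, mirroring the call made in spatracker.py later in this commit; the tensor shapes here are illustrative assumptions, and a CUDA device is required since the kernels are compiled and launched on the GPU at call time.

import torch
from models.spatracker.models.core.spatracker.softsplat import softsplat

feat = torch.randn(8, 128, 32, 32, device="cuda")  # (S, C, H, W) features to forward-splat
flow = torch.randn(8, 2, 32, 32, device="cuda")    # per-pixel (dx, dy) displacement field
# "avg" mode normalizes the accumulated features by the accumulated weights; tenoutH/tenoutW
# let the output plane differ from the input plane, as used for the tri-plane splatting below.
warped = softsplat(feat, flow, None, strMode="avg", tenoutH=32, tenoutW=32)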
models/spatracker/models/core/spatracker/spatracker.py ADDED
@@ -0,0 +1,732 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from easydict import EasyDict as edict
10
+ from einops import rearrange
11
+ from sklearn.cluster import SpectralClustering
12
+ from models.spatracker.models.core.spatracker.blocks import Lie
13
+ import matplotlib.pyplot as plt
14
+ import cv2
15
+
16
+ import torch.nn.functional as F
17
+ from models.spatracker.models.core.spatracker.blocks import (
18
+ BasicEncoder,
19
+ CorrBlock,
20
+ EUpdateFormer,
21
+ FusionFormer,
22
+ pix2cam,
23
+ cam2pix,
24
+ edgeMat,
25
+ VitEncoder,
26
+ DPTEnc,
27
+ Dinov2
28
+ )
29
+
30
+ from models.spatracker.models.core.spatracker.feature_net import (
31
+ LocalSoftSplat
32
+ )
33
+
34
+ from models.spatracker.models.core.model_utils import (
35
+ meshgrid2d, bilinear_sample2d, smart_cat, sample_features5d, vis_PCA
36
+ )
37
+ from models.spatracker.models.core.embeddings import (
38
+ get_2d_embedding,
39
+ get_3d_embedding,
40
+ get_1d_sincos_pos_embed_from_grid,
41
+ get_2d_sincos_pos_embed,
42
+ get_3d_sincos_pos_embed_from_grid,
43
+ Embedder_Fourier,
44
+ )
45
+ import numpy as np
46
+ from models.spatracker.models.core.spatracker.softsplat import softsplat
47
+
48
+ torch.manual_seed(0)
49
+
50
+
51
+ def get_points_on_a_grid(grid_size, interp_shape,
52
+ grid_center=(0, 0), device="cuda"):
53
+ if grid_size == 1:
54
+ return torch.tensor([interp_shape[1] / 2,
55
+ interp_shape[0] / 2], device=device)[
56
+ None, None
57
+ ]
58
+
59
+ grid_y, grid_x = meshgrid2d(
60
+ 1, grid_size, grid_size, stack=False, norm=False, device=device
61
+ )
62
+ step = interp_shape[1] // 64
63
+ if grid_center[0] != 0 or grid_center[1] != 0:
64
+ grid_y = grid_y - grid_size / 2.0
65
+ grid_x = grid_x - grid_size / 2.0
66
+ grid_y = step + grid_y.reshape(1, -1) / float(grid_size - 1) * (
67
+ interp_shape[0] - step * 2
68
+ )
69
+ grid_x = step + grid_x.reshape(1, -1) / float(grid_size - 1) * (
70
+ interp_shape[1] - step * 2
71
+ )
72
+
73
+ grid_y = grid_y + grid_center[0]
74
+ grid_x = grid_x + grid_center[1]
75
+ xy = torch.stack([grid_x, grid_y], dim=-1).to(device)
76
+ return xy
77
+
78
+
79
+ def sample_pos_embed(grid_size, embed_dim, coords):
80
+ if coords.shape[-1] == 2:
81
+ pos_embed = get_2d_sincos_pos_embed(embed_dim=embed_dim,
82
+ grid_size=grid_size)
83
+ pos_embed = (
84
+ torch.from_numpy(pos_embed)
85
+ .reshape(grid_size[0], grid_size[1], embed_dim)
86
+ .float()
87
+ .unsqueeze(0)
88
+ .to(coords.device)
89
+ )
90
+ sampled_pos_embed = bilinear_sample2d(
91
+ pos_embed.permute(0, 3, 1, 2),
92
+ coords[:, 0, :, 0], coords[:, 0, :, 1]
93
+ )
94
+ elif coords.shape[-1] == 3:
95
+ sampled_pos_embed = get_3d_sincos_pos_embed_from_grid(
96
+ embed_dim, coords[:, :1, ...]
97
+ ).float()[:,0,...].permute(0, 2, 1)
98
+
99
+ return sampled_pos_embed
100
+
101
+
102
+ class SpaTracker(nn.Module):
103
+ def __init__(
104
+ self,
105
+ S=8,
106
+ stride=8,
107
+ add_space_attn=True,
108
+ num_heads=8,
109
+ hidden_size=384,
110
+ space_depth=12,
111
+ time_depth=12,
112
+ args=edict({})
113
+ ):
114
+ super(SpaTracker, self).__init__()
115
+
116
+ # step1: config the arch of the model
117
+ self.args=args
118
+ # step1.1: config the default value of the model
119
+ if getattr(args, "depth_color", None) == None:
120
+ self.args.depth_color = False
121
+ if getattr(args, "if_ARAP", None) == None:
122
+ self.args.if_ARAP = True
123
+ if getattr(args, "flash_attn", None) == None:
124
+ self.args.flash_attn = True
125
+ if getattr(args, "backbone", None) == None:
126
+ self.args.backbone = "CNN"
127
+ if getattr(args, "Nblock", None) == None:
128
+ self.args.Nblock = 0
129
+ if getattr(args, "Embed3D", None) == None:
130
+ self.args.Embed3D = True
131
+
132
+ # step1.2: config the model parameters
133
+ self.S = S
134
+ self.stride = stride
135
+ self.hidden_dim = 256
136
+ self.latent_dim = latent_dim = 128
137
+ self.b_latent_dim = self.latent_dim//3
138
+ self.corr_levels = 4
139
+ self.corr_radius = 3
140
+ self.add_space_attn = add_space_attn
141
+ self.lie = Lie()
142
+
143
+ # step2: config the model components
144
+ # @Encoder
145
+ self.fnet = BasicEncoder(input_dim=3,
146
+ output_dim=self.latent_dim, norm_fn="instance", dropout=0,
147
+ stride=stride, Embed3D=False
148
+ )
149
+
150
+ # conv head for the tri-plane features
151
+ self.headyz = nn.Sequential(
152
+ nn.Conv2d(self.latent_dim, self.latent_dim, 3, padding=1),
153
+ nn.ReLU(inplace=True),
154
+ nn.Conv2d(self.latent_dim, self.latent_dim, 3, padding=1))
155
+
156
+ self.headxz = nn.Sequential(
157
+ nn.Conv2d(self.latent_dim, self.latent_dim, 3, padding=1),
158
+ nn.ReLU(inplace=True),
159
+ nn.Conv2d(self.latent_dim, self.latent_dim, 3, padding=1))
160
+
161
+ # @UpdateFormer
162
+ self.updateformer = EUpdateFormer(
163
+ space_depth=space_depth,
164
+ time_depth=time_depth,
165
+ input_dim=456,
166
+ hidden_size=hidden_size,
167
+ num_heads=num_heads,
168
+ output_dim=latent_dim + 3,
169
+ mlp_ratio=4.0,
170
+ add_space_attn=add_space_attn,
171
+ flash=getattr(self.args, "flash_attn", True)
172
+ )
173
+ self.support_features = torch.zeros(100, 384).to("cuda") + 0.1
174
+
175
+ self.norm = nn.GroupNorm(1, self.latent_dim)
176
+
177
+ self.ffeat_updater = nn.Sequential(
178
+ nn.Linear(self.latent_dim, self.latent_dim),
179
+ nn.GELU(),
180
+ )
181
+ self.ffeatyz_updater = nn.Sequential(
182
+ nn.Linear(self.latent_dim, self.latent_dim),
183
+ nn.GELU(),
184
+ )
185
+ self.ffeatxz_updater = nn.Sequential(
186
+ nn.Linear(self.latent_dim, self.latent_dim),
187
+ nn.GELU(),
188
+ )
189
+
190
+ #TODO @NeuralArap: optimize the arap
191
+ self.embed_traj = Embedder_Fourier(
192
+ input_dim=5, max_freq_log2=5.0, N_freqs=3, include_input=True
193
+ )
194
+ self.embed3d = Embedder_Fourier(
195
+ input_dim=3, max_freq_log2=10.0, N_freqs=10, include_input=True
196
+ )
197
+ self.embedConv = nn.Conv2d(self.latent_dim+63,
198
+ self.latent_dim, 3, padding=1)
199
+
200
+ # @Vis_predictor
201
+ self.vis_predictor = nn.Sequential(
202
+ nn.Linear(128, 1),
203
+ )
204
+
205
+ self.embedProj = nn.Linear(63, 456)
206
+ self.zeroMLPflow = nn.Linear(195, 130)
207
+
208
+ def prepare_track(self, rgbds, queries):
209
+ """
210
+ NOTE:
211
+ Normalize the rgbs and sort the queries by the time at which they first appear
212
+ Args:
213
+ rgbds: the input rgbd images (B T 4 H W)
214
+ queries: the input queries (B N 4)
215
+ Return:
216
+ rgbds: the normalized rgbds (B T 4 H W)
217
+ queries: the sorted queries (B N 4)
218
+ track_mask:
219
+ """
220
+ assert (rgbds.shape[2]==4) and (queries.shape[2]==4)
221
+ #Step1: normalize the rgbs input
222
+ device = rgbds.device
223
+ rgbds[:, :, :3, ...] = 2 * (rgbds[:, :, :3, ...] / 255.0) - 1.0
224
+ B, T, C, H, W = rgbds.shape
225
+ B, N, __ = queries.shape
226
+ self.traj_e = torch.zeros((B, T, N, 3), device=device)
227
+ self.vis_e = torch.zeros((B, T, N), device=device)
228
+
229
+ #Step2: sort the points by the time at which they first appear
230
+ first_positive_inds = queries[0, :, 0].long()
231
+ __, sort_inds = torch.sort(first_positive_inds, dim=0, descending=False)
232
+ inv_sort_inds = torch.argsort(sort_inds, dim=0)
233
+ first_positive_sorted_inds = first_positive_inds[sort_inds]
234
+ # check that the sorting can be inverted
235
+ assert torch.allclose(
236
+ first_positive_inds, first_positive_inds[sort_inds][inv_sort_inds]
237
+ )
238
+
239
+ # filter out points that never appear during frames 1..T
240
+ ind_array = torch.arange(T, device=device)
241
+ ind_array = ind_array[None, :, None].repeat(B, 1, N)
242
+ track_mask = (ind_array >=
243
+ first_positive_inds[None, None, :]).unsqueeze(-1)
244
+
245
+ # scale the coords_init
246
+ coords_init = queries[:, :, 1:].reshape(B, 1, N, 3).repeat(
247
+ 1, self.S, 1, 1
248
+ )
249
+ coords_init[..., :2] /= float(self.stride)
250
+
251
+ #Step3: initialize the regular grid
252
+ gridx = torch.linspace(0, W//self.stride - 1, W//self.stride)
253
+ gridy = torch.linspace(0, H//self.stride - 1, H//self.stride)
254
+ gridx, gridy = torch.meshgrid(gridx, gridy)
255
+ gridxy = torch.stack([gridx, gridy], dim=-1).to(rgbds.device).permute(
256
+ 2, 1, 0
257
+ )
258
+ vis_init = torch.ones((B, self.S, N, 1), device=device).float() * 10
259
+
260
+ # Step4: initialize the trajectories for neural ARAP
261
+ T_series = torch.linspace(0, 5, T).reshape(1, T, 1 , 1).cuda() # 1 T 1 1
262
+ T_series = T_series.repeat(B, 1, N, 1)
263
+ # get the 3d traj in the camera coordinates
264
+ intr_init = self.intrs[:,queries[0,:,0].long()]
265
+ Traj_series = pix2cam(queries[:,:,None,1:].double(), intr_init.double())
266
+ #torch.inverse(intr_init.double())@queries[:,:,1:,None].double() # B N 3 1
267
+ Traj_series = Traj_series.repeat(1, 1, T, 1).permute(0, 2, 1, 3).float()
268
+ Traj_series = torch.cat([T_series, Traj_series], dim=-1)
269
+ # get the indicator for the neural arap
270
+ Traj_mask = -1e2*torch.ones_like(T_series)
271
+ Traj_series = torch.cat([Traj_series, Traj_mask], dim=-1)
272
+
273
+ return (
274
+ rgbds,
275
+ first_positive_inds,
276
+ first_positive_sorted_inds,
277
+ sort_inds, inv_sort_inds,
278
+ track_mask, gridxy, coords_init[..., sort_inds, :].clone(),
279
+ vis_init, Traj_series[..., sort_inds, :].clone()
280
+ )
281
+
282
+ def sample_trifeat(self, t,
283
+ coords,
284
+ featMapxy,
285
+ featMapyz,
286
+ featMapxz):
287
+ """
288
+ Sample the features from the 5D triplane feature map 3*(B S C H W)
289
+ Args:
290
+ t: the time index
291
+ coords: the coordinates of the points B S N 3
292
+ featMapxy: the feature map B S C Hx Wy
293
+ featMapyz: the feature map B S C Hy Wz
294
+ featMapxz: the feature map B S C Hx Wz
295
+ """
296
+ # get xy_t yz_t xz_t
297
+ queried_t = t.reshape(1, 1, -1, 1)
298
+ xy_t = torch.cat(
299
+ [queried_t, coords[..., [0,1]]],
300
+ dim=-1
301
+ )
302
+ yz_t = torch.cat(
303
+ [queried_t, coords[..., [1, 2]]],
304
+ dim=-1
305
+ )
306
+ xz_t = torch.cat(
307
+ [queried_t, coords[..., [0, 2]]],
308
+ dim=-1
309
+ )
310
+ featxy_init = sample_features5d(featMapxy, xy_t)
311
+
312
+ featyz_init = sample_features5d(featMapyz, yz_t)
313
+ featxz_init = sample_features5d(featMapxz, xz_t)
314
+
315
+ featxy_init = featxy_init.repeat(1, self.S, 1, 1)
316
+ featyz_init = featyz_init.repeat(1, self.S, 1, 1)
317
+ featxz_init = featxz_init.repeat(1, self.S, 1, 1)
318
+
319
+ return featxy_init, featyz_init, featxz_init
320
+
321
+ def neural_arap(self, coords, Traj_arap, intrs_S, T_mark):
322
+ """ calculate the ARAP embedding and offset
323
+ Args:
324
+ coords: the coordinates of the current points 1 S N' 3
325
+ Traj_arap: the trajectory of the points 1 T N' 5
326
+ intrs_S: the camera intrinsics B S 3 3
327
+
328
+ """
329
+ coords_out = coords.clone()
330
+ coords_out[..., :2] *= float(self.stride)
331
+ coords_out[..., 2] = coords_out[..., 2]/self.Dz
332
+ coords_out[..., 2] = coords_out[..., 2]*(self.d_far-self.d_near) + self.d_near
333
+ intrs_S = intrs_S[:, :, None, ...].repeat(1, 1, coords_out.shape[2], 1, 1)
334
+ B, S, N, D = coords_out.shape
335
+ if S != intrs_S.shape[1]:
336
+ intrs_S = torch.cat(
337
+ [intrs_S, intrs_S[:, -1:].repeat(1, S - intrs_S.shape[1],1,1,1)], dim=1
338
+ )
339
+ T_mark = torch.cat(
340
+ [T_mark, T_mark[:, -1:].repeat(1, S - T_mark.shape[1],1)], dim=1
341
+ )
342
+ xyz_ = pix2cam(coords_out.double(), intrs_S.double()[:,:,0])
343
+ xyz_ = xyz_.float()
344
+ xyz_embed = torch.cat([T_mark[...,None], xyz_,
345
+ torch.zeros_like(T_mark[...,None])], dim=-1)
346
+
347
+ xyz_embed = self.embed_traj(xyz_embed)
348
+ Traj_arap_embed = self.embed_traj(Traj_arap)
349
+ d_xyz,traj_feat = self.arapFormer(xyz_embed, Traj_arap_embed)
350
+ # update in camera coordinate
351
+ xyz_ = xyz_ + d_xyz.clamp(-5, 5)
352
+ # project back to the image plane
353
+ coords_out = cam2pix(xyz_.double(), intrs_S[:,:,0].double()).float()
354
+ # resize back
355
+ coords_out[..., :2] /= float(self.stride)
356
+ coords_out[..., 2] = (coords_out[..., 2] - self.d_near)/(self.d_far-self.d_near)
357
+ coords_out[..., 2] *= self.Dz
358
+
359
+ return xyz_, coords_out, traj_feat
360
+
361
+ def gradient_arap(self, coords, aff_avg=None, aff_std=None, aff_f_sg=None,
362
+ iter=0, iter_num=4, neigh_idx=None, intr=None, msk_track=None):
363
+ with torch.enable_grad():
364
+ coords.requires_grad_(True)
365
+ y = self.ARAP_ln(coords, aff_f_sg=aff_f_sg, neigh_idx=neigh_idx,
366
+ iter=iter, iter_num=iter_num, intr=intr,msk_track=msk_track)
367
+ d_output = torch.ones_like(y, requires_grad=False, device=y.device)
368
+ gradients = torch.autograd.grad(
369
+ outputs=y,
370
+ inputs=coords,
371
+ grad_outputs=d_output,
372
+ create_graph=True,
373
+ retain_graph=True,
374
+ only_inputs=True, allow_unused=True)[0]
375
+
376
+ return gradients.detach()
377
+
378
+ def forward_iteration(
379
+ self,
380
+ fmapXY,
381
+ fmapYZ,
382
+ fmapXZ,
383
+ coords_init,
384
+ feat_init=None,
385
+ vis_init=None,
386
+ track_mask=None,
387
+ iters=4,
388
+ intrs_S=None,
389
+ ):
390
+ B, S_init, N, D = coords_init.shape
391
+ assert D == 3
392
+ assert B == 1
393
+ B, S, __, H8, W8 = fmapXY.shape
394
+ device = fmapXY.device
395
+
396
+ if S_init < S:
397
+ coords = torch.cat(
398
+ [coords_init, coords_init[:, -1].repeat(1, S - S_init, 1, 1)],
399
+ dim=1
400
+ )
401
+ vis_init = torch.cat(
402
+ [vis_init, vis_init[:, -1].repeat(1, S - S_init, 1, 1)], dim=1
403
+ )
404
+ intrs_S = torch.cat(
405
+ [intrs_S, intrs_S[:, -1].repeat(1, S - S_init, 1, 1)], dim=1
406
+ )
407
+ else:
408
+ coords = coords_init.clone()
409
+
410
+ fcorr_fnXY = CorrBlock(
411
+ fmapXY, num_levels=self.corr_levels, radius=self.corr_radius
412
+ )
413
+ fcorr_fnYZ = CorrBlock(
414
+ fmapYZ, num_levels=self.corr_levels, radius=self.corr_radius
415
+ )
416
+ fcorr_fnXZ = CorrBlock(
417
+ fmapXZ, num_levels=self.corr_levels, radius=self.corr_radius
418
+ )
419
+
420
+ ffeats = torch.split(feat_init.clone(), dim=-1, split_size_or_sections=1)
421
+ ffeats = [f.squeeze(-1) for f in ffeats]
422
+
423
+ times_ = torch.linspace(0, S - 1, S).reshape(1, S, 1)
424
+ pos_embed = sample_pos_embed(
425
+ grid_size=(H8, W8),
426
+ embed_dim=456,
427
+ coords=coords[..., :2],
428
+ )
429
+ pos_embed = rearrange(pos_embed, "b e n -> (b n) e").unsqueeze(1)
430
+
431
+ times_embed = (
432
+ torch.from_numpy(get_1d_sincos_pos_embed_from_grid(456, times_[0]))[None]
433
+ .repeat(B, 1, 1)
434
+ .float()
435
+ .to(device)
436
+ )
437
+ coord_predictions = []
438
+ attn_predictions = []
439
+ Rot_ln = 0
440
+ support_feat = self.support_features
441
+
442
+ for __ in range(iters):
443
+ coords = coords.detach()
444
+ # if self.args.if_ARAP == True:
445
+ # # refine the track with arap
446
+ # xyz_pred, coords, flows_cat0 = self.neural_arap(coords.detach(),
447
+ # Traj_arap.detach(),
448
+ # intrs_S, T_mark)
449
+ with torch.no_grad():
450
+ fcorrsXY = fcorr_fnXY.corr_sample(ffeats[0], coords[..., :2])
451
+ fcorrsYZ = fcorr_fnYZ.corr_sample(ffeats[1], coords[..., [1,2]])
452
+ fcorrsXZ = fcorr_fnXZ.corr_sample(ffeats[2], coords[..., [0,2]])
453
+ # fcorrs = fcorrsXY
454
+ fcorrs = fcorrsXY + fcorrsYZ + fcorrsXZ
455
+ LRR = fcorrs.shape[3]
456
+ fcorrs_ = fcorrs.permute(0, 2, 1, 3).reshape(B * N, S, LRR)
457
+
458
+ flows_ = (coords - coords[:, 0:1]).permute(0, 2, 1, 3).reshape(B * N, S, 3)
459
+ flows_cat = get_3d_embedding(flows_, 64, cat_coords=True)
460
+ flows_cat = self.zeroMLPflow(flows_cat)
461
+
462
+
463
+ ffeats_xy = ffeats[0].permute(0,
464
+ 2, 1, 3).reshape(B * N, S, self.latent_dim)
465
+ ffeats_yz = ffeats[1].permute(0,
466
+ 2, 1, 3).reshape(B * N, S, self.latent_dim)
467
+ ffeats_xz = ffeats[2].permute(0,
468
+ 2, 1, 3).reshape(B * N, S, self.latent_dim)
469
+ ffeats_ = ffeats_xy + ffeats_yz + ffeats_xz
470
+
471
+ if track_mask.shape[1] < vis_init.shape[1]:
472
+ track_mask = torch.cat(
473
+ [
474
+ track_mask,
475
+ torch.zeros_like(track_mask[:, 0]).repeat(
476
+ 1, vis_init.shape[1] - track_mask.shape[1], 1, 1
477
+ ),
478
+ ],
479
+ dim=1,
480
+ )
481
+ concat = (
482
+ torch.cat([track_mask, vis_init], dim=2)
483
+ .permute(0, 2, 1, 3)
484
+ .reshape(B * N, S, 2)
485
+ )
486
+
487
+ transformer_input = torch.cat([flows_cat, fcorrs_, ffeats_, concat], dim=2)
488
+
489
+ if transformer_input.shape[-1] < pos_embed.shape[-1]:
490
+ # padding the transformer_input to the same dimension as pos_embed
491
+ transformer_input = F.pad(
492
+ transformer_input, (0, pos_embed.shape[-1] - transformer_input.shape[-1]),
493
+ "constant", 0
494
+ )
495
+
496
+ x = transformer_input + pos_embed + times_embed
497
+ x = rearrange(x, "(b n) t d -> b n t d", b=B)
498
+
499
+ delta, AttnMap, so3_dist, delta_se3F, so3 = self.updateformer(x, support_feat)
500
+ support_feat = support_feat + delta_se3F[0]/100
501
+ delta = rearrange(delta, " b n t d -> (b n) t d")
502
+ d_coord = delta[:, :, :3]
503
+ d_feats = delta[:, :, 3:]
504
+
505
+ ffeats_xy = self.ffeat_updater(self.norm(d_feats.view(-1, self.latent_dim))) + ffeats_xy.reshape(-1, self.latent_dim)
506
+ ffeats_yz = self.ffeatyz_updater(self.norm(d_feats.view(-1, self.latent_dim))) + ffeats_yz.reshape(-1, self.latent_dim)
507
+ ffeats_xz = self.ffeatxz_updater(self.norm(d_feats.view(-1, self.latent_dim))) + ffeats_xz.reshape(-1, self.latent_dim)
508
+ ffeats[0] = ffeats_xy.reshape(B, N, S, self.latent_dim).permute(
509
+ 0, 2, 1, 3
510
+ ) # B,S,N,C
511
+ ffeats[1] = ffeats_yz.reshape(B, N, S, self.latent_dim).permute(
512
+ 0, 2, 1, 3
513
+ ) # B,S,N,C
514
+ ffeats[2] = ffeats_xz.reshape(B, N, S, self.latent_dim).permute(
515
+ 0, 2, 1, 3
516
+ ) # B,S,N,C
517
+ coords = coords + d_coord.reshape(B, N, S, 3).permute(0, 2, 1, 3)
518
+ if torch.isnan(coords).any():
519
+ import ipdb; ipdb.set_trace()
520
+
521
+ coords_out = coords.clone()
522
+ coords_out[..., :2] *= float(self.stride)
523
+
524
+ coords_out[..., 2] = coords_out[..., 2]/self.Dz
525
+ coords_out[..., 2] = coords_out[..., 2]*(self.d_far-self.d_near) + self.d_near
526
+
527
+ coord_predictions.append(coords_out)
528
+ attn_predictions.append(AttnMap)
529
+
530
+ ffeats_f = ffeats[0] + ffeats[1] + ffeats[2]
531
+ vis_e = self.vis_predictor(ffeats_f.reshape(B * S * N, self.latent_dim)).reshape(
532
+ B, S, N
533
+ )
534
+ self.support_features = support_feat.detach()
535
+ return coord_predictions, attn_predictions, vis_e, feat_init, Rot_ln
536
+
537
+
538
+ def forward(self, rgbds, queries, iters=4, feat_init=None,
539
+ is_train=False, intrs=None, wind_S=None):
540
+ self.support_features = torch.zeros(100, 384).to("cuda") + 0.1
541
+ self.is_train=is_train
542
+ B, T, C, H, W = rgbds.shape
543
+ # set the intrinsics, or initialize a default from the image size
544
+ if intrs is None:
545
+ intrs = torch.from_numpy(np.array([[W, 0.0, W//2],
546
+ [0.0, W, H//2],
547
+ [0.0, 0.0, 1.0]]))
548
+ intrs = intrs[None,
549
+ None,...].repeat(B, T, 1, 1).float().to(rgbds.device)
550
+ self.intrs = intrs
551
+
552
+ # prepare the input for tracking
553
+ (
554
+ rgbds,
555
+ first_positive_inds,
556
+ first_positive_sorted_inds, sort_inds,
557
+ inv_sort_inds, track_mask, gridxy,
558
+ coords_init, vis_init, Traj_arap
559
+ ) = self.prepare_track(rgbds.clone(), queries)
560
+ coords_init_ = coords_init.clone()
561
+ vis_init_ = vis_init[:, :, sort_inds].clone()
562
+
563
+ depth_all = rgbds[:, :, 3,...]
564
+ d_near = self.d_near = depth_all[depth_all>0.01].min().item()
565
+ d_far = self.d_far = depth_all[depth_all>0.01].max().item()
566
+
567
+ if wind_S is not None:
568
+ self.S = wind_S
569
+
570
+ B, N, __ = queries.shape
571
+ self.Dz = Dz = W//self.stride
572
+ w_idx_start = 0
573
+ p_idx_end = 0
574
+ p_idx_start = 0
575
+ fmaps_ = None
576
+ vis_predictions = []
577
+ coord_predictions = []
578
+ attn_predictions = []
579
+ p_idx_end_list = []
580
+ Rigid_ln_total = 0
581
+ while w_idx_start < T - self.S // 2:
582
+ curr_wind_points = torch.nonzero(
583
+ first_positive_sorted_inds < w_idx_start + self.S)
584
+ if curr_wind_points.shape[0] == 0:
585
+ w_idx_start = w_idx_start + self.S // 2
586
+ continue
587
+ p_idx_end = curr_wind_points[-1] + 1
588
+ p_idx_end_list.append(p_idx_end)
589
+ # T may not be divisible by self.S
590
+ rgbds_seq = rgbds[:, w_idx_start:w_idx_start + self.S].clone()
591
+ S = S_local = rgbds_seq.shape[1]
592
+ if S < self.S:
593
+ rgbds_seq = torch.cat(
594
+ [rgbds_seq,
595
+ rgbds_seq[:, -1, None].repeat(1, self.S - S, 1, 1, 1)],
596
+ dim=1,
597
+ )
598
+ S = rgbds_seq.shape[1]
599
+
600
+ rgbs_ = rgbds_seq.reshape(B * S, C, H, W)[:, :3]
601
+ depths = rgbds_seq.reshape(B * S, C, H, W)[:, 3:].clone()
602
+ # open the mask
603
+ # Traj_arap[:, w_idx_start:w_idx_start + self.S, :p_idx_end, -1] = 0
604
+ #step1: normalize the depth map
605
+
606
+ depths = (depths - d_near)/(d_far-d_near)
607
+ depths_dn = nn.functional.interpolate(
608
+ depths, scale_factor=1.0 / self.stride, mode="nearest")
609
+ depths_dnG = depths_dn*Dz
610
+
611
+ #step2: normalize the coordinate
612
+ coords_init_[:, :, p_idx_start:p_idx_end, 2] = (
613
+ coords_init[:, :, p_idx_start:p_idx_end, 2] - d_near
614
+ )/(d_far-d_near)
615
+ coords_init_[:, :, p_idx_start:p_idx_end, 2] *= Dz
616
+
617
+ # efficient triplane splatting
618
+ gridxyz = torch.cat([gridxy[None,...].repeat(
619
+ depths_dn.shape[0],1,1,1), depths_dnG], dim=1)
620
+ Fxy2yz = gridxyz[:,[1, 2], ...] - gridxyz[:,:2]
621
+ Fxy2xz = gridxyz[:,[0, 2], ...] - gridxyz[:,:2]
622
+ if getattr(self.args, "Embed3D", None) == True:
623
+ gridxyz_nm = gridxyz.clone()
624
+ gridxyz_nm[:,0,...] = (gridxyz_nm[:,0,...]-gridxyz_nm[:,0,...].min())/(gridxyz_nm[:,0,...].max()-gridxyz_nm[:,0,...].min())
625
+ gridxyz_nm[:,1,...] = (gridxyz_nm[:,1,...]-gridxyz_nm[:,1,...].min())/(gridxyz_nm[:,1,...].max()-gridxyz_nm[:,1,...].min())
626
+ gridxyz_nm[:,2,...] = (gridxyz_nm[:,2,...]-gridxyz_nm[:,2,...].min())/(gridxyz_nm[:,2,...].max()-gridxyz_nm[:,2,...].min())
627
+ gridxyz_nm = 2*(gridxyz_nm-0.5)
628
+ _,_,h4,w4 = gridxyz_nm.shape
629
+ gridxyz_nm = gridxyz_nm.permute(0,2,3,1).reshape(S*h4*w4, 3)
630
+ featPE = self.embed3d(gridxyz_nm).view(S, h4, w4, -1).permute(0,3,1,2)
631
+ if fmaps_ is None:
632
+ fmaps_ = torch.cat([self.fnet(rgbs_),featPE], dim=1)
633
+ fmaps_ = self.embedConv(fmaps_)
634
+ else:
635
+ fmaps_new = torch.cat([self.fnet(rgbs_[self.S // 2 :]),featPE[self.S // 2 :]], dim=1)
636
+ fmaps_new = self.embedConv(fmaps_new)
637
+ fmaps_ = torch.cat(
638
+ [fmaps_[self.S // 2 :], fmaps_new], dim=0
639
+ )
640
+ else:
641
+ if fmaps_ is None:
642
+ fmaps_ = self.fnet(rgbs_)
643
+ else:
644
+ fmaps_ = torch.cat(
645
+ [fmaps_[self.S // 2 :], self.fnet(rgbs_[self.S // 2 :])], dim=0
646
+ )
647
+
648
+ fmapXY = fmaps_[:, :self.latent_dim].reshape(
649
+ B, S, self.latent_dim, H // self.stride, W // self.stride
650
+ )
651
+
652
+ fmapYZ = softsplat(fmapXY[0], Fxy2yz, None,
653
+ strMode="avg", tenoutH=self.Dz, tenoutW=H//self.stride)
654
+ fmapXZ = softsplat(fmapXY[0], Fxy2xz, None,
655
+ strMode="avg", tenoutH=self.Dz, tenoutW=W//self.stride)
656
+
657
+ fmapYZ = self.headyz(fmapYZ)[None, ...]
658
+ fmapXZ = self.headxz(fmapXZ)[None, ...]
659
+
660
+ if p_idx_end - p_idx_start > 0:
661
+ queried_t = (first_positive_sorted_inds[p_idx_start:p_idx_end]
662
+ - w_idx_start)
663
+ (featxy_init,
664
+ featyz_init,
665
+ featxz_init) = self.sample_trifeat(
666
+ t=queried_t,featMapxy=fmapXY,
667
+ featMapyz=fmapYZ,featMapxz=fmapXZ,
668
+ coords=coords_init_[:, :1, p_idx_start:p_idx_end]
669
+ )
670
+ # T, S, N, C, 3
671
+ feat_init_curr = torch.stack([featxy_init,
672
+ featyz_init, featxz_init], dim=-1)
673
+ feat_init = smart_cat(feat_init, feat_init_curr, dim=2)
674
+
675
+ if p_idx_start > 0:
676
+ # preprocess the coordinates of last windows
677
+ last_coords = coords[-1][:, self.S // 2 :].clone()
678
+ last_coords[..., :2] /= float(self.stride)
679
+ last_coords[..., 2:] = (last_coords[..., 2:]-d_near)/(d_far-d_near)
680
+ last_coords[..., 2:] = last_coords[..., 2:]*Dz
681
+
682
+ coords_init_[:, : self.S // 2, :p_idx_start] = last_coords
683
+ coords_init_[:, self.S // 2 :, :p_idx_start] = last_coords[
684
+ :, -1
685
+ ].repeat(1, self.S // 2, 1, 1)
686
+
687
+ last_vis = vis[:, self.S // 2 :].unsqueeze(-1)
688
+ vis_init_[:, : self.S // 2, :p_idx_start] = last_vis
689
+ vis_init_[:, self.S // 2 :, :p_idx_start] = last_vis[:, -1].repeat(
690
+ 1, self.S // 2, 1, 1
691
+ )
692
+
693
+ coords, attns, vis, __, Rigid_ln = self.forward_iteration(
694
+ fmapXY=fmapXY,
695
+ fmapYZ=fmapYZ,
696
+ fmapXZ=fmapXZ,
697
+ coords_init=coords_init_[:, :, :p_idx_end],
698
+ feat_init=feat_init[:, :, :p_idx_end],
699
+ vis_init=vis_init_[:, :, :p_idx_end],
700
+ track_mask=track_mask[:, w_idx_start : w_idx_start + self.S, :p_idx_end],
701
+ iters=iters,
702
+ intrs_S=self.intrs[:, w_idx_start : w_idx_start + self.S],
703
+ )
704
+
705
+ Rigid_ln_total+=Rigid_ln
706
+
707
+ if is_train:
708
+ vis_predictions.append(torch.sigmoid(vis[:, :S_local]))
709
+ coord_predictions.append([coord[:, :S_local] for coord in coords])
710
+ attn_predictions.append(attns)
711
+
712
+ self.traj_e[:, w_idx_start:w_idx_start+self.S, :p_idx_end] = coords[-1][:, :S_local]
713
+ self.vis_e[:, w_idx_start:w_idx_start+self.S, :p_idx_end] = vis[:, :S_local]
714
+
715
+ track_mask[:, : w_idx_start + self.S, :p_idx_end] = 0.0
716
+ w_idx_start = w_idx_start + self.S // 2
717
+
718
+ p_idx_start = p_idx_end
719
+
720
+ self.traj_e = self.traj_e[:, :, inv_sort_inds]
721
+ self.vis_e = self.vis_e[:, :, inv_sort_inds]
722
+
723
+ self.vis_e = torch.sigmoid(self.vis_e)
724
+ train_data = (
725
+ (vis_predictions, coord_predictions, attn_predictions,
726
+ p_idx_end_list, sort_inds, Rigid_ln_total)
727
+ )
728
+ if self.is_train:
729
+ return self.traj_e, feat_init, self.vis_e, train_data
730
+ else:
731
+ return self.traj_e, feat_init, self.vis_e
732
+
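A hedged sketch of driving SpaTracker.forward directly; the shapes follow the prepare_track docstring above (rgbds: B T 4 H W RGB-D stack, queries: B N 4 as (t, x, y, z)), while the concrete sizes and random inputs are purely illustrative. In this commit the model is normally built and invoked through build_spatracker / SpaTrackerPredictor (see predictor.py below), and a CUDA device is required because several buffers are allocated on "cuda".

import torch
from easydict import EasyDict as edict
from models.spatracker.models.core.spatracker.spatracker import SpaTracker

model = SpaTracker(S=8, stride=8, args=edict({"flash_attn": False})).cuda().eval()
rgbds = torch.rand(1, 16, 4, 256, 256, device="cuda")    # B T 4 H W
rgbds[:, :, :3] *= 255.0                                  # RGB channels in [0, 255]
rgbds[:, :, 3] += 0.5                                     # depth channel kept > 0.01 for d_near/d_far
queries = torch.zeros(1, 64, 4, device="cuda")            # B N 4: (t, x, y, z) per query point
queries[:, :, 1:3] = torch.rand(1, 64, 2, device="cuda") * 255.0
queries[:, :, 3] = 0.7
with torch.no_grad():
    traj_e, feat_init, vis_e = model(rgbds, queries, iters=4)
# traj_e: (B, T, N, 3) estimated trajectories; vis_e: (B, T, N) visibilities in [0, 1]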
models/spatracker/models/core/spatracker/unet.py ADDED
@@ -0,0 +1,258 @@
1
+ '''
2
+ Codes are from:
3
+ https://github.com/jaxony/unet-pytorch/blob/master/model.py
4
+ '''
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from torch.autograd import Variable
10
+ from collections import OrderedDict
11
+ from torch.nn import init
12
+ import numpy as np
13
+
14
+ def conv3x3(in_channels, out_channels, stride=1,
15
+ padding=1, bias=True, groups=1):
16
+ return nn.Conv2d(
17
+ in_channels,
18
+ out_channels,
19
+ kernel_size=3,
20
+ stride=stride,
21
+ padding=padding,
22
+ bias=bias,
23
+ groups=groups)
24
+
25
+ def upconv2x2(in_channels, out_channels, mode='transpose'):
26
+ if mode == 'transpose':
27
+ return nn.ConvTranspose2d(
28
+ in_channels,
29
+ out_channels,
30
+ kernel_size=2,
31
+ stride=2)
32
+ else:
33
+ # out_channels is always going to be the same
34
+ # as in_channels
35
+ return nn.Sequential(
36
+ nn.Upsample(mode='bilinear', scale_factor=2),
37
+ conv1x1(in_channels, out_channels))
38
+
39
+ def conv1x1(in_channels, out_channels, groups=1):
40
+ return nn.Conv2d(
41
+ in_channels,
42
+ out_channels,
43
+ kernel_size=1,
44
+ groups=groups,
45
+ stride=1)
46
+
47
+
48
+ class DownConv(nn.Module):
49
+ """
50
+ A helper Module that performs 2 convolutions and 1 MaxPool.
51
+ A ReLU activation follows each convolution.
52
+ """
53
+ def __init__(self, in_channels, out_channels, pooling=True):
54
+ super(DownConv, self).__init__()
55
+
56
+ self.in_channels = in_channels
57
+ self.out_channels = out_channels
58
+ self.pooling = pooling
59
+
60
+ self.conv1 = conv3x3(self.in_channels, self.out_channels)
61
+ self.conv2 = conv3x3(self.out_channels, self.out_channels)
62
+
63
+ if self.pooling:
64
+ self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
65
+
66
+ def forward(self, x):
67
+ x = F.relu(self.conv1(x))
68
+ x = F.relu(self.conv2(x))
69
+ before_pool = x
70
+ if self.pooling:
71
+ x = self.pool(x)
72
+ return x, before_pool
73
+
74
+
75
+ class UpConv(nn.Module):
76
+ """
77
+ A helper Module that performs 2 convolutions and 1 UpConvolution.
78
+ A ReLU activation follows each convolution.
79
+ """
80
+ def __init__(self, in_channels, out_channels,
81
+ merge_mode='concat', up_mode='transpose'):
82
+ super(UpConv, self).__init__()
83
+
84
+ self.in_channels = in_channels
85
+ self.out_channels = out_channels
86
+ self.merge_mode = merge_mode
87
+ self.up_mode = up_mode
88
+
89
+ self.upconv = upconv2x2(self.in_channels, self.out_channels,
90
+ mode=self.up_mode)
91
+
92
+ if self.merge_mode == 'concat':
93
+ self.conv1 = conv3x3(
94
+ 2*self.out_channels, self.out_channels)
95
+ else:
96
+ # num of input channels to conv2 is same
97
+ self.conv1 = conv3x3(self.out_channels, self.out_channels)
98
+ self.conv2 = conv3x3(self.out_channels, self.out_channels)
99
+
100
+
101
+ def forward(self, from_down, from_up):
102
+ """ Forward pass
103
+ Arguments:
104
+ from_down: tensor from the encoder pathway
105
+ from_up: upconv'd tensor from the decoder pathway
106
+ """
107
+ from_up = self.upconv(from_up)
108
+ if self.merge_mode == 'concat':
109
+ x = torch.cat((from_up, from_down), 1)
110
+ else:
111
+ x = from_up + from_down
112
+ x = F.relu(self.conv1(x))
113
+ x = F.relu(self.conv2(x))
114
+ return x
115
+
116
+
117
+ class UNet(nn.Module):
118
+ """ `UNet` class is based on https://arxiv.org/abs/1505.04597
119
+
120
+ The U-Net is a convolutional encoder-decoder neural network.
121
+ Contextual spatial information (from the decoding,
122
+ expansive pathway) about an input tensor is merged with
123
+ information representing the localization of details
124
+ (from the encoding, compressive pathway).
125
+
126
+ Modifications to the original paper:
127
+ (1) padding is used in 3x3 convolutions to prevent loss
128
+ of border pixels
129
+ (2) merging outputs does not require cropping due to (1)
130
+ (3) residual connections can be used by specifying
131
+ UNet(merge_mode='add')
132
+ (4) if non-parametric upsampling is used in the decoder
133
+ pathway (specified by upmode='upsample'), then an
134
+ additional 1x1 2d convolution occurs after upsampling
135
+ to reduce channel dimensionality by a factor of 2.
136
+ This channel halving happens with the convolution in
137
+ the transpose convolution (specified by upmode='transpose')
138
+ """
139
+
140
+ def __init__(self, num_classes, in_channels=3, depth=5,
141
+ start_filts=64, up_mode='transpose',
142
+ merge_mode='concat', **kwargs):
143
+ """
144
+ Arguments:
145
+ in_channels: int, number of channels in the input tensor.
146
+ Default is 3 for RGB images.
147
+ depth: int, number of MaxPools in the U-Net.
148
+ start_filts: int, number of convolutional filters for the
149
+ first conv.
150
+ up_mode: string, type of upconvolution. Choices: 'transpose'
151
+ for transpose convolution or 'upsample' for nearest neighbour
152
+ upsampling.
153
+ """
154
+ super(UNet, self).__init__()
155
+
156
+ if up_mode in ('transpose', 'upsample'):
157
+ self.up_mode = up_mode
158
+ else:
159
+ raise ValueError("\"{}\" is not a valid mode for "
160
+ "upsampling. Only \"transpose\" and "
161
+ "\"upsample\" are allowed.".format(up_mode))
162
+
163
+ if merge_mode in ('concat', 'add'):
164
+ self.merge_mode = merge_mode
165
+ else:
166
+ raise ValueError("\"{}\" is not a valid mode for"
167
+ "merging up and down paths. "
168
+ "Only \"concat\" and "
169
+ "\"add\" are allowed.".format(up_mode))
170
+
171
+ # NOTE: up_mode 'upsample' is incompatible with merge_mode 'add'
172
+ if self.up_mode == 'upsample' and self.merge_mode == 'add':
173
+ raise ValueError("up_mode \"upsample\" is incompatible "
174
+ "with merge_mode \"add\" at the moment "
175
+ "because it doesn't make sense to use "
176
+ "nearest neighbour to reduce "
177
+ "depth channels (by half).")
178
+
179
+ self.num_classes = num_classes
180
+ self.in_channels = in_channels
181
+ self.start_filts = start_filts
182
+ self.depth = depth
183
+
184
+ self.down_convs = []
185
+ self.up_convs = []
186
+
187
+ # create the encoder pathway and add to a list
188
+ for i in range(depth):
189
+ ins = self.in_channels if i == 0 else outs
190
+ outs = self.start_filts*(2**i)
191
+ pooling = True if i < depth-1 else False
192
+
193
+ down_conv = DownConv(ins, outs, pooling=pooling)
194
+ self.down_convs.append(down_conv)
195
+
196
+ # create the decoder pathway and add to a list
197
+ # - careful! decoding only requires depth-1 blocks
198
+ for i in range(depth-1):
199
+ ins = outs
200
+ outs = ins // 2
201
+ up_conv = UpConv(ins, outs, up_mode=up_mode,
202
+ merge_mode=merge_mode)
203
+ self.up_convs.append(up_conv)
204
+
205
+ # add the list of modules to current module
206
+ self.down_convs = nn.ModuleList(self.down_convs)
207
+ self.up_convs = nn.ModuleList(self.up_convs)
208
+
209
+ self.conv_final = conv1x1(outs, self.num_classes)
210
+
211
+ self.reset_params()
212
+
213
+ @staticmethod
214
+ def weight_init(m):
215
+ if isinstance(m, nn.Conv2d):
216
+ init.xavier_normal_(m.weight)
217
+ init.constant_(m.bias, 0)
218
+
219
+
220
+ def reset_params(self):
221
+ for i, m in enumerate(self.modules()):
222
+ self.weight_init(m)
223
+
224
+
225
+ def forward(self, x):
226
+ encoder_outs = []
227
+ # encoder pathway, save outputs for merging
228
+ for i, module in enumerate(self.down_convs):
229
+ x, before_pool = module(x)
230
+ encoder_outs.append(before_pool)
231
+ for i, module in enumerate(self.up_convs):
232
+ before_pool = encoder_outs[-(i+2)]
233
+ x = module(before_pool, x)
234
+
235
+ # No softmax is used. This means you need to use
236
+ # nn.CrossEntropyLoss in your training script,
237
+ # as that loss applies the softmax internally.
238
+ x = self.conv_final(x)
239
+ return x
240
+
241
+ if __name__ == "__main__":
242
+ """
243
+ testing
244
+ """
245
+ model = UNet(1, depth=5, merge_mode='concat', in_channels=1, start_filts=32)
246
+ print(model)
247
+ print(sum(p.numel() for p in model.parameters()))
248
+
249
+ reso = 176
250
+ x = np.zeros((1, 1, reso, reso))
251
+ x[:,:,int(reso/2-1), int(reso/2-1)] = np.nan
252
+ x = torch.FloatTensor(x)
253
+
254
+ out = model(x)
255
+ print('%f'%(torch.sum(torch.isnan(out)).detach().cpu().numpy()/(reso*reso)))
256
+
257
+ # loss = torch.sum(out)
258
+ # loss.backward()
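The class docstring above also mentions a residual variant via merge_mode='add'; below is a minimal hedged instantiation of that mode (the channel and image sizes are illustrative, not values used elsewhere in this commit).

import torch
from models.spatracker.models.core.spatracker.unet import UNet

model = UNet(num_classes=2, in_channels=3, depth=4, start_filts=32,
             up_mode='transpose', merge_mode='add')
x = torch.randn(1, 3, 128, 128)
logits = model(x)  # (1, 2, 128, 128); no softmax is applied, so pair with nn.CrossEntropyLoss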
models/spatracker/models/core/spatracker/vit/__init__.py ADDED
File without changes
models/spatracker/models/core/spatracker/vit/common.py ADDED
@@ -0,0 +1,43 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+ from typing import Type
11
+
12
+
13
+ class MLPBlock(nn.Module):
14
+ def __init__(
15
+ self,
16
+ embedding_dim: int,
17
+ mlp_dim: int,
18
+ act: Type[nn.Module] = nn.GELU,
19
+ ) -> None:
20
+ super().__init__()
21
+ self.lin1 = nn.Linear(embedding_dim, mlp_dim)
22
+ self.lin2 = nn.Linear(mlp_dim, embedding_dim)
23
+ self.act = act()
24
+
25
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
26
+ return self.lin2(self.act(self.lin1(x)))
27
+
28
+
29
+ # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
30
+ # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
31
+ class LayerNorm2d(nn.Module):
32
+ def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
33
+ super().__init__()
34
+ self.weight = nn.Parameter(torch.ones(num_channels))
35
+ self.bias = nn.Parameter(torch.zeros(num_channels))
36
+ self.eps = eps
37
+
38
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
39
+ u = x.mean(1, keepdim=True)
40
+ s = (x - u).pow(2).mean(1, keepdim=True)
41
+ x = (x - u) / torch.sqrt(s + self.eps)
42
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
43
+ return x
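A tiny hedged sketch of the two helpers above (shapes are illustrative): LayerNorm2d normalizes a channels-first feature map over its channel dimension at every spatial location, and MLPBlock is the standard Linear -> activation -> Linear token MLP.

import torch
from models.spatracker.models.core.spatracker.vit.common import LayerNorm2d, MLPBlock

feat = torch.randn(2, 256, 32, 32)                          # (B, C, H, W)
feat = LayerNorm2d(256)(feat)                               # per-location channel normalization
tokens = torch.randn(2, 196, 768)                           # (B, N, C) token embeddings
tokens = MLPBlock(embedding_dim=768, mlp_dim=3072)(tokens)  # projected back to embedding_dim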
models/spatracker/models/core/spatracker/vit/encoder.py ADDED
@@ -0,0 +1,397 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+ from typing import Optional, Tuple, Type
12
+
13
+ from models.spatracker.models.core.spatracker.vit.common import (
14
+ LayerNorm2d, MLPBlock
15
+ )
16
+
17
+ # This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
18
+ class ImageEncoderViT(nn.Module):
19
+ def __init__(
20
+ self,
21
+ img_size: int = 1024,
22
+ patch_size: int = 16,
23
+ in_chans: int = 3,
24
+ embed_dim: int = 768,
25
+ depth: int = 12,
26
+ num_heads: int = 12,
27
+ mlp_ratio: float = 4.0,
28
+ out_chans: int = 256,
29
+ qkv_bias: bool = True,
30
+ norm_layer: Type[nn.Module] = nn.LayerNorm,
31
+ act_layer: Type[nn.Module] = nn.GELU,
32
+ use_abs_pos: bool = True,
33
+ use_rel_pos: bool = False,
34
+ rel_pos_zero_init: bool = True,
35
+ window_size: int = 0,
36
+ global_attn_indexes: Tuple[int, ...] = (),
37
+ ) -> None:
38
+ """
39
+ Args:
40
+ img_size (int): Input image size.
41
+ patch_size (int): Patch size.
42
+ in_chans (int): Number of input image channels.
43
+ embed_dim (int): Patch embedding dimension.
44
+ depth (int): Depth of ViT.
45
+ num_heads (int): Number of attention heads in each ViT block.
46
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
47
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
48
+ norm_layer (nn.Module): Normalization layer.
49
+ act_layer (nn.Module): Activation layer.
50
+ use_abs_pos (bool): If True, use absolute positional embeddings.
51
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
52
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
53
+ window_size (int): Window size for window attention blocks.
54
+ global_attn_indexes (list): Indexes for blocks using global attention.
55
+ """
56
+ super().__init__()
57
+ self.img_size = img_size
58
+
59
+ self.patch_embed = PatchEmbed(
60
+ kernel_size=(patch_size, patch_size),
61
+ stride=(patch_size, patch_size),
62
+ in_chans=in_chans,
63
+ embed_dim=embed_dim,
64
+ )
65
+
66
+ self.pos_embed: Optional[nn.Parameter] = None
67
+ if use_abs_pos:
68
+ # Initialize absolute positional embedding with pretrain image size.
69
+ self.pos_embed = nn.Parameter(
70
+ torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim)
71
+ )
72
+
73
+ self.blocks = nn.ModuleList()
74
+ for i in range(depth):
75
+ block = Block(
76
+ dim=embed_dim,
77
+ num_heads=num_heads,
78
+ mlp_ratio=mlp_ratio,
79
+ qkv_bias=qkv_bias,
80
+ norm_layer=norm_layer,
81
+ act_layer=act_layer,
82
+ use_rel_pos=use_rel_pos,
83
+ rel_pos_zero_init=rel_pos_zero_init,
84
+ window_size=window_size if i not in global_attn_indexes else 0,
85
+ input_size=(img_size // patch_size, img_size // patch_size),
86
+ )
87
+ self.blocks.append(block)
88
+
89
+ self.neck = nn.Sequential(
90
+ nn.Conv2d(
91
+ embed_dim,
92
+ out_chans,
93
+ kernel_size=1,
94
+ bias=False,
95
+ ),
96
+ LayerNorm2d(out_chans),
97
+ nn.Conv2d(
98
+ out_chans,
99
+ out_chans,
100
+ kernel_size=3,
101
+ padding=1,
102
+ bias=False,
103
+ ),
104
+ LayerNorm2d(out_chans),
105
+ )
106
+
107
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
108
+
109
+ x = self.patch_embed(x)
110
+ if self.pos_embed is not None:
111
+ x = x + self.pos_embed
112
+
113
+ for blk in self.blocks:
114
+ x = blk(x)
115
+
116
+ x = self.neck(x.permute(0, 3, 1, 2))
117
+
118
+ return x
119
+
120
+
121
+ class Block(nn.Module):
122
+ """Transformer blocks with support of window attention and residual propagation blocks"""
123
+
124
+ def __init__(
125
+ self,
126
+ dim: int,
127
+ num_heads: int,
128
+ mlp_ratio: float = 4.0,
129
+ qkv_bias: bool = True,
130
+ norm_layer: Type[nn.Module] = nn.LayerNorm,
131
+ act_layer: Type[nn.Module] = nn.GELU,
132
+ use_rel_pos: bool = False,
133
+ rel_pos_zero_init: bool = True,
134
+ window_size: int = 0,
135
+ input_size: Optional[Tuple[int, int]] = None,
136
+ ) -> None:
137
+ """
138
+ Args:
139
+ dim (int): Number of input channels.
140
+ num_heads (int): Number of attention heads in each ViT block.
141
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
142
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
143
+ norm_layer (nn.Module): Normalization layer.
144
+ act_layer (nn.Module): Activation layer.
145
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
146
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
147
+ window_size (int): Window size for window attention blocks. If it equals 0, then
148
+ use global attention.
149
+ input_size (tuple(int, int) or None): Input resolution for calculating the relative
150
+ positional parameter size.
151
+ """
152
+ super().__init__()
153
+ self.norm1 = norm_layer(dim)
154
+ self.attn = Attention(
155
+ dim,
156
+ num_heads=num_heads,
157
+ qkv_bias=qkv_bias,
158
+ use_rel_pos=use_rel_pos,
159
+ rel_pos_zero_init=rel_pos_zero_init,
160
+ input_size=input_size if window_size == 0 else (window_size, window_size),
161
+ )
162
+
163
+ self.norm2 = norm_layer(dim)
164
+ self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)
165
+
166
+ self.window_size = window_size
167
+
168
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
169
+ shortcut = x
170
+ x = self.norm1(x)
171
+ # Window partition
172
+ if self.window_size > 0:
173
+ H, W = x.shape[1], x.shape[2]
174
+ x, pad_hw = window_partition(x, self.window_size)
175
+
176
+ x = self.attn(x)
177
+ # Reverse window partition
178
+ if self.window_size > 0:
179
+ x = window_unpartition(x, self.window_size, pad_hw, (H, W))
180
+
181
+ x = shortcut + x
182
+ x = x + self.mlp(self.norm2(x))
183
+
184
+ return x
185
+
186
+
187
+ class Attention(nn.Module):
188
+ """Multi-head Attention block with relative position embeddings."""
189
+
190
+ def __init__(
191
+ self,
192
+ dim: int,
193
+ num_heads: int = 8,
194
+ qkv_bias: bool = True,
195
+ use_rel_pos: bool = False,
196
+ rel_pos_zero_init: bool = True,
197
+ input_size: Optional[Tuple[int, int]] = None,
198
+ ) -> None:
199
+ """
200
+ Args:
201
+ dim (int): Number of input channels.
202
+ num_heads (int): Number of attention heads.
203
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
204
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
205
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
206
+ input_size (tuple(int, int) or None): Input resolution for calculating the relative
207
+ positional parameter size.
208
+ """
209
+ super().__init__()
210
+ self.num_heads = num_heads
211
+ head_dim = dim // num_heads
212
+ self.scale = head_dim**-0.5
213
+
214
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
215
+ self.proj = nn.Linear(dim, dim)
216
+
217
+ self.use_rel_pos = use_rel_pos
218
+ if self.use_rel_pos:
219
+ assert (
220
+ input_size is not None
221
+ ), "Input size must be provided if using relative positional encoding."
222
+ # initialize relative positional embeddings
223
+ self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
224
+ self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
225
+
226
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
227
+ B, H, W, _ = x.shape
228
+ # qkv with shape (3, B, nHead, H * W, C)
229
+ qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
230
+ # q, k, v with shape (B * nHead, H * W, C)
231
+ q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
232
+
233
+ attn = (q * self.scale) @ k.transpose(-2, -1)
234
+
235
+ if self.use_rel_pos:
236
+ attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
237
+
238
+ attn = attn.softmax(dim=-1)
239
+ x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
240
+ x = self.proj(x)
241
+
242
+ return x
243
+
244
+
245
+ def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
246
+ """
247
+ Partition into non-overlapping windows with padding if needed.
248
+ Args:
249
+ x (tensor): input tokens with [B, H, W, C].
250
+ window_size (int): window size.
251
+
252
+ Returns:
253
+ windows: windows after partition with [B * num_windows, window_size, window_size, C].
254
+ (Hp, Wp): padded height and width before partition
255
+ """
256
+ B, H, W, C = x.shape
257
+
258
+ pad_h = (window_size - H % window_size) % window_size
259
+ pad_w = (window_size - W % window_size) % window_size
260
+ if pad_h > 0 or pad_w > 0:
261
+ x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
262
+ Hp, Wp = H + pad_h, W + pad_w
263
+
264
+ x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
265
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
266
+ return windows, (Hp, Wp)
267
+
268
+
269
+ def window_unpartition(
270
+ windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]
271
+ ) -> torch.Tensor:
272
+ """
273
+ Reverse the window partition back into the original sequence and remove padding.
274
+ Args:
275
+ windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
276
+ window_size (int): window size.
277
+ pad_hw (Tuple): padded height and width (Hp, Wp).
278
+ hw (Tuple): original height and width (H, W) before padding.
279
+
280
+ Returns:
281
+ x: unpartitioned sequences with [B, H, W, C].
282
+ """
283
+ Hp, Wp = pad_hw
284
+ H, W = hw
285
+ B = windows.shape[0] // (Hp * Wp // window_size // window_size)
286
+ x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
287
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
288
+
289
+ if Hp > H or Wp > W:
290
+ x = x[:, :H, :W, :].contiguous()
291
+ return x
292
+
293
+
294
+ def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
295
+ """
296
+ Get relative positional embeddings according to the relative positions of
297
+ query and key sizes.
298
+ Args:
299
+ q_size (int): size of query q.
300
+ k_size (int): size of key k.
301
+ rel_pos (Tensor): relative position embeddings (L, C).
302
+
303
+ Returns:
304
+ Extracted positional embeddings according to relative positions.
305
+ """
306
+ max_rel_dist = int(2 * max(q_size, k_size) - 1)
307
+ # Interpolate rel pos if needed.
308
+ if rel_pos.shape[0] != max_rel_dist:
309
+ # Interpolate rel pos.
310
+ rel_pos_resized = F.interpolate(
311
+ rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
312
+ size=max_rel_dist,
313
+ mode="linear",
314
+ )
315
+ rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
316
+ else:
317
+ rel_pos_resized = rel_pos
318
+
319
+ # Scale the coords with short length if shapes for q and k are different.
320
+ q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
321
+ k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
322
+ relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
323
+
324
+ return rel_pos_resized[relative_coords.long()]
325
+
326
+
327
+ def add_decomposed_rel_pos(
328
+ attn: torch.Tensor,
329
+ q: torch.Tensor,
330
+ rel_pos_h: torch.Tensor,
331
+ rel_pos_w: torch.Tensor,
332
+ q_size: Tuple[int, int],
333
+ k_size: Tuple[int, int],
334
+ ) -> torch.Tensor:
335
+ """
336
+ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
337
+ https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
338
+ Args:
339
+ attn (Tensor): attention map.
340
+ q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
341
+ rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
342
+ rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
343
+ q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
344
+ k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
345
+
346
+ Returns:
347
+ attn (Tensor): attention map with added relative positional embeddings.
348
+ """
349
+ q_h, q_w = q_size
350
+ k_h, k_w = k_size
351
+ Rh = get_rel_pos(q_h, k_h, rel_pos_h)
352
+ Rw = get_rel_pos(q_w, k_w, rel_pos_w)
353
+
354
+ B, _, dim = q.shape
355
+ r_q = q.reshape(B, q_h, q_w, dim)
356
+ rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
357
+ rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
358
+
359
+ attn = (
360
+ attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
361
+ ).view(B, q_h * q_w, k_h * k_w)
362
+
363
+ return attn
364
+
365
+
366
+ class PatchEmbed(nn.Module):
367
+ """
368
+ Image to Patch Embedding.
369
+ """
370
+
371
+ def __init__(
372
+ self,
373
+ kernel_size: Tuple[int, int] = (16, 16),
374
+ stride: Tuple[int, int] = (16, 16),
375
+ padding: Tuple[int, int] = (0, 0),
376
+ in_chans: int = 3,
377
+ embed_dim: int = 768,
378
+ ) -> None:
379
+ """
380
+ Args:
381
+ kernel_size (Tuple): kernel size of the projection layer.
382
+ stride (Tuple): stride of the projection layer.
383
+ padding (Tuple): padding size of the projection layer.
384
+ in_chans (int): Number of input image channels.
385
+ embed_dim (int): Patch embedding dimension.
386
+ """
387
+ super().__init__()
388
+
389
+ self.proj = nn.Conv2d(
390
+ in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
391
+ )
392
+
393
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
394
+ x = self.proj(x)
395
+ # B C H W -> B H W C
396
+ x = x.permute(0, 2, 3, 1)
397
+ return x
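A minimal hedged instantiation of the ViTDet-style image encoder above; the hyperparameters here are deliberately tiny and illustrative rather than the values used by any checkpoint in this commit.

import torch
from models.spatracker.models.core.spatracker.vit.encoder import ImageEncoderViT

enc = ImageEncoderViT(img_size=224, patch_size=16, embed_dim=192, depth=2, num_heads=3,
                      window_size=7, global_attn_indexes=(1,), out_chans=256)
x = torch.randn(1, 3, 224, 224)
feat = enc(x)  # (1, 256, 14, 14): an (img_size // patch_size) grid with the neck's out_chans channels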
models/spatracker/predictor.py ADDED
@@ -0,0 +1,284 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import time
10
+
11
+ from tqdm import tqdm
12
+ from models.spatracker.models.core.spatracker.spatracker import get_points_on_a_grid
13
+ from models.spatracker.models.core.model_utils import smart_cat
14
+ from models.spatracker.models.build_spatracker import (
15
+ build_spatracker,
16
+ )
17
+ from models.spatracker.models.core.model_utils import (
18
+ meshgrid2d, bilinear_sample2d, smart_cat
19
+ )
20
+
21
+
22
+ class SpaTrackerPredictor(torch.nn.Module):
23
+ def __init__(
24
+ self, checkpoint="cotracker/checkpoints/cotracker_stride_4_wind_8.pth",
25
+ interp_shape=(384, 512),
26
+ seq_length=16
27
+ ):
28
+ super().__init__()
29
+ self.interp_shape = interp_shape
30
+ self.support_grid_size = 6
31
+ model = build_spatracker(checkpoint, seq_length=seq_length)
32
+
33
+ self.model = model
34
+ self.model.eval()
35
+
36
+ @torch.no_grad()
37
+ def forward(
38
+ self,
39
+ video, # (1, T, 3, H, W)
40
+ video_depth = None, # (T, 1, H, W)
41
+ # input prompt types:
42
+ # - None. Dense tracks are computed in this case. You can adjust *query_frame* to compute tracks starting from a specific frame.
43
+ # *backward_tracking=True* will compute tracks in both directions.
44
+ # - queries. Queried points of shape (1, N, 3) in format (t, x, y) for frame index and pixel coordinates.
45
+ # - grid_size. Grid of N*N points from the first frame. if segm_mask is provided, then computed only for the mask.
46
+ # You can adjust *query_frame* and *backward_tracking* for the regular grid in the same way as for dense tracks.
47
+ queries: torch.Tensor = None,
48
+ segm_mask: torch.Tensor = None, # Segmentation mask of shape (B, 1, H, W)
49
+ grid_size: int = 0,
50
+ grid_query_frame: int = 0, # only for dense and regular grid tracks
51
+ backward_tracking: bool = False,
52
+ depth_predictor=None,
53
+ wind_length: int = 8,
54
+ progressive_tracking: bool = False,
55
+ ):
56
+ if queries is None and grid_size == 0:
57
+ tracks, visibilities, T_Firsts = self._compute_dense_tracks(
58
+ video,
59
+ grid_query_frame=grid_query_frame,
60
+ backward_tracking=backward_tracking,
61
+ video_depth=video_depth,
62
+ depth_predictor=depth_predictor,
63
+ wind_length=wind_length,
64
+ )
65
+ else:
66
+ tracks, visibilities, T_Firsts = self._compute_sparse_tracks(
67
+ video,
68
+ queries,
69
+ segm_mask,
70
+ grid_size,
71
+ add_support_grid=False, #(grid_size == 0 or segm_mask is not None),
72
+ grid_query_frame=grid_query_frame,
73
+ backward_tracking=backward_tracking,
74
+ video_depth=video_depth,
75
+ depth_predictor=depth_predictor,
76
+ wind_length=wind_length,
77
+ )
78
+
79
+ return tracks, visibilities, T_Firsts
80
+
81
+ def _compute_dense_tracks(
82
+ self, video, grid_query_frame, grid_size=30, backward_tracking=False,
83
+ depth_predictor=None, video_depth=None, wind_length=8
84
+ ):
85
+ *_, H, W = video.shape
86
+ grid_step = W // grid_size
87
+ grid_width = W // grid_step
88
+ grid_height = H // grid_step
89
+ tracks = visibilities = T_Firsts = None
90
+ grid_pts = torch.zeros((1, grid_width * grid_height, 3)).to(video.device)
91
+ grid_pts[0, :, 0] = grid_query_frame
92
+ for offset in tqdm(range(grid_step * grid_step)):
93
+ ox = offset % grid_step
94
+ oy = offset // grid_step
95
+ grid_pts[0, :, 1] = (
96
+ torch.arange(grid_width).repeat(grid_height) * grid_step + ox
97
+ )
98
+ grid_pts[0, :, 2] = (
99
+ torch.arange(grid_height).repeat_interleave(grid_width) * grid_step + oy
100
+ )
101
+ tracks_step, visibilities_step, T_First_step = self._compute_sparse_tracks(
102
+ video=video,
103
+ queries=grid_pts,
104
+ backward_tracking=backward_tracking,
105
+ wind_length=wind_length,
106
+ video_depth=video_depth,
107
+ depth_predictor=depth_predictor,
108
+ )
109
+ tracks = smart_cat(tracks, tracks_step, dim=2)
110
+ visibilities = smart_cat(visibilities, visibilities_step, dim=2)
111
+ T_Firsts = smart_cat(T_Firsts, T_First_step, dim=1)
112
+
113
+
114
+ return tracks, visibilities, T_Firsts
115
+
116
+ def _compute_sparse_tracks(
117
+ self,
118
+ video,
119
+ queries,
120
+ segm_mask=None,
121
+ grid_size=0,
122
+ add_support_grid=False,
123
+ grid_query_frame=0,
124
+ backward_tracking=False,
125
+ depth_predictor=None,
126
+ video_depth=None,
127
+ wind_length=8,
128
+ ):
129
+ B, T, C, H, W = video.shape
130
+ assert B == 1
131
+
132
+ video = video.reshape(B * T, C, H, W)
133
+ video = F.interpolate(video, tuple(self.interp_shape), mode="bilinear")
134
+ video = video.reshape(B, T, 3, self.interp_shape[0], self.interp_shape[1])
135
+
136
+ if queries is not None:
137
+ queries = queries.clone()
138
+ B, N, D = queries.shape
139
+ assert D == 3
140
+ queries[:, :, 1] *= self.interp_shape[1] / W
141
+ queries[:, :, 2] *= self.interp_shape[0] / H
142
+ elif grid_size > 0:
143
+ grid_pts = get_points_on_a_grid(grid_size, self.interp_shape, device=video.device)
144
+ if segm_mask is not None:
145
+ segm_mask = F.interpolate(
146
+ segm_mask, tuple(self.interp_shape), mode="nearest"
147
+ )
148
+ point_mask = segm_mask[0, 0][
149
+ (grid_pts[0, :, 1]).round().long().cpu(),
150
+ (grid_pts[0, :, 0]).round().long().cpu(),
151
+ ].bool()
152
+ grid_pts_extra = grid_pts[:, point_mask]
153
+ else:
154
+ grid_pts_extra = None
155
+ if grid_pts_extra is not None:
156
+ total_num = int(grid_pts_extra.shape[1])
157
+ total_num = min(800, total_num)
158
+ pick_idx = torch.randperm(grid_pts_extra.shape[1])[:total_num]
159
+ grid_pts_extra = grid_pts_extra[:, pick_idx]
160
+ queries_extra = torch.cat(
161
+ [
162
+ torch.ones_like(grid_pts_extra[:, :, :1]) * grid_query_frame,
163
+ grid_pts_extra,
164
+ ],
165
+ dim=2,
166
+ )
167
+
168
+ queries = torch.cat(
169
+ [torch.zeros_like(grid_pts[:, :, :1]), grid_pts],
170
+ dim=2,
171
+ )
172
+
173
+ if add_support_grid:
174
+ grid_pts = get_points_on_a_grid(self.support_grid_size, self.interp_shape, device=video.device)
175
+ grid_pts = torch.cat(
176
+ [torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2
177
+ )
178
+ queries = torch.cat([queries, grid_pts], dim=1)
179
+
180
+ ## ----------- estimate the video depth -----------##
181
+ if video_depth is None:
182
+ with torch.no_grad():
183
+ if video[0].shape[0]>30:
184
+ vidDepths = []
185
+ for i in range(video[0].shape[0]//30+1):
186
+ if (i+1)*30 > video[0].shape[0]:
187
+ end_idx = video[0].shape[0]
188
+ else:
189
+ end_idx = (i+1)*30
190
+ if end_idx == i*30:
191
+ break
192
+ video_ = video[0][i*30:end_idx]
193
+ vidDepths.append(depth_predictor.infer(video_/255))
194
+
195
+ video_depth = torch.cat(vidDepths, dim=0)
196
+
197
+ else:
198
+ video_depth = depth_predictor.infer(video[0]/255)
199
+ video_depth = F.interpolate(video_depth,
200
+ tuple(self.interp_shape), mode="nearest")
201
+
202
+ # from PIL import Image
203
+ # import numpy
204
+ # depth_frame = video_depth[0].detach().cpu()
205
+ # depth_frame = depth_frame.squeeze(0)
206
+ # print(depth_frame)
207
+ # print(depth_frame.min(), depth_frame.max())
208
+ # depth_img = (depth_frame * 255).numpy().astype(numpy.uint8)
209
+ # depth_img = Image.fromarray(depth_img, mode='L')
210
+ # depth_img.save('outputs/depth_map.png')
211
+
212
+ # frame = video[0, 0].detach().cpu()
213
+ # frame = frame.permute(1, 2, 0)
214
+ # frame = (frame * 255).numpy().astype(numpy.uint8)
215
+ # frame = Image.fromarray(frame, mode='RGB')
216
+ # frame.save('outputs/frame.png')
217
+
218
+ depths = video_depth
219
+ rgbds = torch.cat([video, depths[None,...]], dim=2)
220
+ # get the 3D queries
221
+ depth_interp=[]
222
+ for i in range(queries.shape[1]):
223
+ depth_interp_i = bilinear_sample2d(video_depth[queries[:, i:i+1, 0].long()],
224
+ queries[:, i:i+1, 1], queries[:, i:i+1, 2])
225
+ depth_interp.append(depth_interp_i)
226
+
227
+ depth_interp = torch.cat(depth_interp, dim=1)
228
+ queries = smart_cat(queries, depth_interp,dim=-1)
229
+
230
+ #NOTE: free the memory of depth_predictor
231
+ del depth_predictor
232
+ torch.cuda.empty_cache()
233
+ t0 = time.time()
234
+ tracks, __, visibilities = self.model(rgbds=rgbds, queries=queries, iters=6, wind_S=wind_length)
235
+ print("Time taken for inference: ", time.time()-t0)
236
+
237
+ if backward_tracking:
238
+ tracks, visibilities = self._compute_backward_tracks(
239
+ rgbds, queries, tracks, visibilities
240
+ )
241
+ if add_support_grid:
242
+ queries[:, -self.support_grid_size ** 2 :, 0] = T - 1
243
+ if add_support_grid:
244
+ tracks = tracks[:, :, : -self.support_grid_size ** 2]
245
+ visibilities = visibilities[:, :, : -self.support_grid_size ** 2]
246
+ thr = 0.9
247
+ visibilities = visibilities > thr
248
+
249
+ # correct query-point predictions
250
+ # see https://github.com/facebookresearch/co-tracker/issues/28
251
+
252
+ # TODO: batchify
253
+ for i in range(len(queries)):
254
+ queries_t = queries[i, :tracks.size(2), 0].to(torch.int64)
255
+ arange = torch.arange(0, len(queries_t))
256
+
257
+ # overwrite the predictions with the query points
258
+ tracks[i, queries_t, arange] = queries[i, :tracks.size(2), 1:]
259
+
260
+ # correct visibilities, the query points should be visible
261
+ visibilities[i, queries_t, arange] = True
262
+
263
+ T_First = queries[..., :tracks.size(2), 0].to(torch.uint8)
264
+ tracks[:, :, :, 0] *= W / float(self.interp_shape[1])
265
+ tracks[:, :, :, 1] *= H / float(self.interp_shape[0])
266
+ return tracks, visibilities, T_First
267
+
268
+ def _compute_backward_tracks(self, video, queries, tracks, visibilities):
269
+ inv_video = video.flip(1).clone()
270
+ inv_queries = queries.clone()
271
+ inv_queries[:, :, 0] = inv_video.shape[1] - inv_queries[:, :, 0] - 1
272
+
273
+ inv_tracks, __, inv_visibilities = self.model(
274
+ rgbds=inv_video, queries=inv_queries, iters=6  # track the time-flipped queries built above
275
+ )
276
+
277
+ inv_tracks = inv_tracks.flip(1)
278
+ inv_visibilities = inv_visibilities.flip(1)
279
+
280
+ mask = tracks == 0
281
+
282
+ tracks[mask] = inv_tracks[mask]
283
+ visibilities[mask[:, :, :, 0]] = inv_visibilities[mask[:, :, :, 0]]
284
+ return tracks, visibilities
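For reference, a minimal usage sketch of SpaTrackerPredictor as defined above. The checkpoint path is hypothetical, the tensors are random, and the shapes are chosen to match interp_shape so no extra resizing is exercised; this is an untested sketch, not the project's documented API:

import torch

tracker = SpaTrackerPredictor(
    checkpoint="checkpoints/spatracker_model.pth",  # hypothetical path
    interp_shape=(384, 512),
    seq_length=16,
).to("cuda")

video = torch.randint(0, 256, (1, 24, 3, 384, 512), device="cuda").float()  # (1, T, 3, H, W), 0-255
depth = torch.rand(24, 1, 384, 512, device="cuda")                          # (T, 1, H, W), precomputed depth

tracks, visibilities, t_firsts = tracker(
    video,
    video_depth=depth,
    grid_size=20,            # 20x20 query grid on frame 0
    grid_query_frame=0,
    backward_tracking=False,
    wind_length=8,
)
# tracks: (1, T, N, ...) with x/y rescaled back to the input resolution;
# visibilities: boolean, thresholded at 0.9 in _compute_sparse_tracks.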
models/spatracker/utils/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
models/spatracker/utils/basic.py ADDED
@@ -0,0 +1,397 @@
1
+ import os
2
+ import numpy as np
3
+ from os.path import isfile
4
+ import torch
5
+ import torch.nn.functional as F
6
+ EPS = 1e-6
7
+ import copy
8
+
9
+ def sub2ind(height, width, y, x):
10
+ return y*width + x
11
+
12
+ def ind2sub(height, width, ind):
13
+ y = ind // width
14
+ x = ind % width
15
+ return y, x
16
+
17
+ def get_lr_str(lr):
18
+ lrn = "%.1e" % lr # e.g., 5.0e-04
19
+ lrn = lrn[0] + lrn[3:5] + lrn[-1] # e.g., 5e-4
20
+ return lrn
21
+
22
+ def strnum(x):
23
+ s = '%g' % x
24
+ if '.' in s:
25
+ if x < 1.0:
26
+ s = s[s.index('.'):]
27
+ s = s[:min(len(s),4)]
28
+ return s
29
+
30
+ def assert_same_shape(t1, t2):
31
+ for (x, y) in zip(list(t1.shape), list(t2.shape)):
32
+ assert(x==y)
33
+
34
+ def print_stats(name, tensor):
35
+ shape = tensor.shape
36
+ tensor = tensor.detach().cpu().numpy()
37
+ print('%s (%s) min = %.2f, mean = %.2f, max = %.2f' % (name, tensor.dtype, np.min(tensor), np.mean(tensor), np.max(tensor)), shape)
38
+
39
+ def print_stats_py(name, tensor):
40
+ shape = tensor.shape
41
+ print('%s (%s) min = %.2f, mean = %.2f, max = %.2f' % (name, tensor.dtype, np.min(tensor), np.mean(tensor), np.max(tensor)), shape)
42
+
43
+ def print_(name, tensor):
44
+ tensor = tensor.detach().cpu().numpy()
45
+ print(name, tensor, tensor.shape)
46
+
47
+ def mkdir(path):
48
+ if not os.path.exists(path):
49
+ os.makedirs(path)
50
+
51
+ def normalize_single(d):
52
+ # d is a whatever shape torch tensor
53
+ dmin = torch.min(d)
54
+ dmax = torch.max(d)
55
+ d = (d-dmin)/(EPS+(dmax-dmin))
56
+ return d
57
+
58
+ def normalize(d):
59
+ # d is B x whatever. normalize within each element of the batch
60
+ out = torch.zeros(d.size())
61
+ if d.is_cuda:
62
+ out = out.cuda()
63
+ B = list(d.size())[0]
64
+ for b in list(range(B)):
65
+ out[b] = normalize_single(d[b])
66
+ return out
67
+
68
+ def hard_argmax2d(tensor):
69
+ B, C, Y, X = list(tensor.shape)
70
+ assert(C==1)
71
+
72
+ # flatten the Tensor along the height and width axes
73
+ flat_tensor = tensor.reshape(B, -1)
74
+ # argmax of the flat tensor
75
+ argmax = torch.argmax(flat_tensor, dim=1)
76
+
77
+ # convert the indices into 2d coordinates
78
+ argmax_y = torch.floor(argmax / X) # row
79
+ argmax_x = argmax % X # col
80
+
81
+ argmax_y = argmax_y.reshape(B)
82
+ argmax_x = argmax_x.reshape(B)
83
+ return argmax_y, argmax_x
84
+
85
+ def argmax2d(heat, hard=True):
86
+ B, C, Y, X = list(heat.shape)
87
+ assert(C==1)
88
+
89
+ if hard:
90
+ # hard argmax
91
+ loc_y, loc_x = hard_argmax2d(heat)
92
+ loc_y = loc_y.float()
93
+ loc_x = loc_x.float()
94
+ else:
95
+ heat = heat.reshape(B, Y*X)
96
+ prob = torch.nn.functional.softmax(heat, dim=1)
97
+
98
+ grid_y, grid_x = meshgrid2d(B, Y, X)
99
+
100
+ grid_y = grid_y.reshape(B, -1)
101
+ grid_x = grid_x.reshape(B, -1)
102
+
103
+ loc_y = torch.sum(grid_y*prob, dim=1)
104
+ loc_x = torch.sum(grid_x*prob, dim=1)
105
+ # these are B
106
+
107
+ return loc_y, loc_x
108
+
109
+ def reduce_masked_mean(x, mask, dim=None, keepdim=False):
110
+ # x and mask are the same shape, or at least broadcastably so < actually it's safer if you disallow broadcasting
111
+ # returns shape-1
112
+ # axis can be a list of axes
113
+ for (a,b) in zip(x.size(), mask.size()):
114
+ # if not b==1:
115
+ assert(a==b) # some shape mismatch!
116
+ # assert(x.size() == mask.size())
117
+ prod = x*mask
118
+ if dim is None:
119
+ numer = torch.sum(prod)
120
+ denom = EPS+torch.sum(mask)
121
+ else:
122
+ numer = torch.sum(prod, dim=dim, keepdim=keepdim)
123
+ denom = EPS+torch.sum(mask, dim=dim, keepdim=keepdim)
124
+
125
+ mean = numer/denom
126
+ return mean
127
+
128
+ def reduce_masked_median(x, mask, keep_batch=False):
129
+ # x and mask are the same shape
130
+ assert(x.size() == mask.size())
131
+ device = x.device
132
+
133
+ B = list(x.shape)[0]
134
+ x = x.detach().cpu().numpy()
135
+ mask = mask.detach().cpu().numpy()
136
+
137
+ if keep_batch:
138
+ x = np.reshape(x, [B, -1])
139
+ mask = np.reshape(mask, [B, -1])
140
+ meds = np.zeros([B], np.float32)
141
+ for b in list(range(B)):
142
+ xb = x[b]
143
+ mb = mask[b]
144
+ if np.sum(mb) > 0:
145
+ xb = xb[mb > 0]
146
+ meds[b] = np.median(xb)
147
+ else:
148
+ meds[b] = np.nan
149
+ meds = torch.from_numpy(meds).to(device)
150
+ return meds.float()
151
+ else:
152
+ x = np.reshape(x, [-1])
153
+ mask = np.reshape(mask, [-1])
154
+ if np.sum(mask) > 0:
155
+ x = x[mask > 0]
156
+ med = np.median(x)
157
+ else:
158
+ med = np.nan
159
+ med = np.array([med], np.float32)
160
+ med = torch.from_numpy(med).to(device)
161
+ return med.float()
162
+
163
+ def pack_seqdim(tensor, B):
164
+ shapelist = list(tensor.shape)
165
+ B_, S = shapelist[:2]
166
+ assert(B==B_)
167
+ otherdims = shapelist[2:]
168
+ tensor = torch.reshape(tensor, [B*S]+otherdims)
169
+ return tensor
170
+
171
+ def unpack_seqdim(tensor, B):
172
+ shapelist = list(tensor.shape)
173
+ BS = shapelist[0]
174
+ assert(BS%B==0)
175
+ otherdims = shapelist[1:]
176
+ S = int(BS/B)
177
+ tensor = torch.reshape(tensor, [B,S]+otherdims)
178
+ return tensor
179
+
180
+ def meshgrid2d(B, Y, X, stack=False, norm=False, device='cuda', on_chans=False):
181
+ # returns a meshgrid sized B x Y x X
182
+
183
+ grid_y = torch.linspace(0.0, Y-1, Y, device=torch.device(device))
184
+ grid_y = torch.reshape(grid_y, [1, Y, 1])
185
+ grid_y = grid_y.repeat(B, 1, X)
186
+
187
+ grid_x = torch.linspace(0.0, X-1, X, device=torch.device(device))
188
+ grid_x = torch.reshape(grid_x, [1, 1, X])
189
+ grid_x = grid_x.repeat(B, Y, 1)
190
+
191
+ if norm:
192
+ grid_y, grid_x = normalize_grid2d(
193
+ grid_y, grid_x, Y, X)
194
+
195
+ if stack:
196
+ # note we stack in xy order
197
+ # (see https://pytorch.org/docs/stable/nn.functional.html#torch.nn.functional.grid_sample)
198
+ if on_chans:
199
+ grid = torch.stack([grid_x, grid_y], dim=1)
200
+ else:
201
+ grid = torch.stack([grid_x, grid_y], dim=-1)
202
+ return grid
203
+ else:
204
+ return grid_y, grid_x
205
+
206
+ def meshgrid3d(B, Z, Y, X, stack=False, norm=False, device='cuda'):
207
+ # returns a meshgrid sized B x Z x Y x X
208
+
209
+ grid_z = torch.linspace(0.0, Z-1, Z, device=device)
210
+ grid_z = torch.reshape(grid_z, [1, Z, 1, 1])
211
+ grid_z = grid_z.repeat(B, 1, Y, X)
212
+
213
+ grid_y = torch.linspace(0.0, Y-1, Y, device=device)
214
+ grid_y = torch.reshape(grid_y, [1, 1, Y, 1])
215
+ grid_y = grid_y.repeat(B, Z, 1, X)
216
+
217
+ grid_x = torch.linspace(0.0, X-1, X, device=device)
218
+ grid_x = torch.reshape(grid_x, [1, 1, 1, X])
219
+ grid_x = grid_x.repeat(B, Z, Y, 1)
220
+
221
+ # if cuda:
222
+ # grid_z = grid_z.cuda()
223
+ # grid_y = grid_y.cuda()
224
+ # grid_x = grid_x.cuda()
225
+
226
+ if norm:
227
+ grid_z, grid_y, grid_x = normalize_grid3d(
228
+ grid_z, grid_y, grid_x, Z, Y, X)
229
+
230
+ if stack:
231
+ # note we stack in xyz order
232
+ # (see https://pytorch.org/docs/stable/nn.functional.html#torch.nn.functional.grid_sample)
233
+ grid = torch.stack([grid_x, grid_y, grid_z], dim=-1)
234
+ return grid
235
+ else:
236
+ return grid_z, grid_y, grid_x
237
+
238
+ def normalize_grid2d(grid_y, grid_x, Y, X, clamp_extreme=True):
239
+ # make things in [-1,1]
240
+ grid_y = 2.0*(grid_y / float(Y-1)) - 1.0
241
+ grid_x = 2.0*(grid_x / float(X-1)) - 1.0
242
+
243
+ if clamp_extreme:
244
+ grid_y = torch.clamp(grid_y, min=-2.0, max=2.0)
245
+ grid_x = torch.clamp(grid_x, min=-2.0, max=2.0)
246
+
247
+ return grid_y, grid_x
248
+
249
+ def normalize_grid3d(grid_z, grid_y, grid_x, Z, Y, X, clamp_extreme=True):
250
+ # make things in [-1,1]
251
+ grid_z = 2.0*(grid_z / float(Z-1)) - 1.0
252
+ grid_y = 2.0*(grid_y / float(Y-1)) - 1.0
253
+ grid_x = 2.0*(grid_x / float(X-1)) - 1.0
254
+
255
+ if clamp_extreme:
256
+ grid_z = torch.clamp(grid_z, min=-2.0, max=2.0)
257
+ grid_y = torch.clamp(grid_y, min=-2.0, max=2.0)
258
+ grid_x = torch.clamp(grid_x, min=-2.0, max=2.0)
259
+
260
+ return grid_z, grid_y, grid_x
261
+
262
+ def gridcloud2d(B, Y, X, norm=False, device='cuda'):
263
+ # we want to sample for each location in the grid
264
+ grid_y, grid_x = meshgrid2d(B, Y, X, norm=norm, device=device)
265
+ x = torch.reshape(grid_x, [B, -1])
266
+ y = torch.reshape(grid_y, [B, -1])
267
+ # these are B x N
268
+ xy = torch.stack([x, y], dim=2)
269
+ # this is B x N x 2
270
+ return xy
271
+
272
+ def gridcloud3d(B, Z, Y, X, norm=False, device='cuda'):
273
+ # we want to sample for each location in the grid
274
+ grid_z, grid_y, grid_x = meshgrid3d(B, Z, Y, X, norm=norm, device=device)
275
+ x = torch.reshape(grid_x, [B, -1])
276
+ y = torch.reshape(grid_y, [B, -1])
277
+ z = torch.reshape(grid_z, [B, -1])
278
+ # these are B x N
279
+ xyz = torch.stack([x, y, z], dim=2)
280
+ # this is B x N x 3
281
+ return xyz
282
+
283
+ import re
284
+ def readPFM(file):
285
+ file = open(file, 'rb')
286
+
287
+ color = None
288
+ width = None
289
+ height = None
290
+ scale = None
291
+ endian = None
292
+
293
+ header = file.readline().rstrip()
294
+ if header == b'PF':
295
+ color = True
296
+ elif header == b'Pf':
297
+ color = False
298
+ else:
299
+ raise Exception('Not a PFM file.')
300
+
301
+ dim_match = re.match(rb'^(\d+)\s(\d+)\s$', file.readline())
302
+ if dim_match:
303
+ width, height = map(int, dim_match.groups())
304
+ else:
305
+ raise Exception('Malformed PFM header.')
306
+
307
+ scale = float(file.readline().rstrip())
308
+ if scale < 0: # little-endian
309
+ endian = '<'
310
+ scale = -scale
311
+ else:
312
+ endian = '>' # big-endian
313
+
314
+ data = np.fromfile(file, endian + 'f')
315
+ shape = (height, width, 3) if color else (height, width)
316
+
317
+ data = np.reshape(data, shape)
318
+ data = np.flipud(data)
319
+ return data
320
+
321
+ def normalize_boxlist2d(boxlist2d, H, W):
322
+ boxlist2d = boxlist2d.clone()
323
+ ymin, xmin, ymax, xmax = torch.unbind(boxlist2d, dim=2)
324
+ ymin = ymin / float(H)
325
+ ymax = ymax / float(H)
326
+ xmin = xmin / float(W)
327
+ xmax = xmax / float(W)
328
+ boxlist2d = torch.stack([ymin, xmin, ymax, xmax], dim=2)
329
+ return boxlist2d
330
+
331
+ def unnormalize_boxlist2d(boxlist2d, H, W):
332
+ boxlist2d = boxlist2d.clone()
333
+ ymin, xmin, ymax, xmax = torch.unbind(boxlist2d, dim=2)
334
+ ymin = ymin * float(H)
335
+ ymax = ymax * float(H)
336
+ xmin = xmin * float(W)
337
+ xmax = xmax * float(W)
338
+ boxlist2d = torch.stack([ymin, xmin, ymax, xmax], dim=2)
339
+ return boxlist2d
340
+
341
+ def unnormalize_box2d(box2d, H, W):
342
+ return unnormalize_boxlist2d(box2d.unsqueeze(1), H, W).squeeze(1)
343
+
344
+ def normalize_box2d(box2d, H, W):
345
+ return normalize_boxlist2d(box2d.unsqueeze(1), H, W).squeeze(1)
346
+
347
+ def get_gaussian_kernel_2d(channels, kernel_size=3, sigma=2.0, mid_one=False):
348
+ C = channels
349
+ xy_grid = gridcloud2d(C, kernel_size, kernel_size) # C x N x 2
350
+
351
+ mean = (kernel_size - 1)/2.0
352
+ variance = sigma**2.0
353
+
354
+ gaussian_kernel = (1.0/(2.0*np.pi*variance)**1.5) * torch.exp(-torch.sum((xy_grid - mean)**2.0, dim=-1) / (2.0*variance)) # C X N
355
+ gaussian_kernel = gaussian_kernel.view(C, 1, kernel_size, kernel_size) # C x 1 x 3 x 3
356
+ kernel_sum = torch.sum(gaussian_kernel, dim=(2,3), keepdim=True)
357
+
358
+ gaussian_kernel = gaussian_kernel / kernel_sum # normalize
359
+
360
+ if mid_one:
361
+ # normalize so that the middle element is 1
362
+ maxval = gaussian_kernel[:,:,(kernel_size//2),(kernel_size//2)].reshape(C, 1, 1, 1)
363
+ gaussian_kernel = gaussian_kernel / maxval
364
+
365
+ return gaussian_kernel
366
+
367
+ def gaussian_blur_2d(input, kernel_size=3, sigma=2.0, reflect_pad=False, mid_one=False):
368
+ B, C, Z, X = input.shape
369
+ kernel = get_gaussian_kernel_2d(C, kernel_size, sigma, mid_one=mid_one)
370
+ if reflect_pad:
371
+ pad = (kernel_size - 1)//2
372
+ out = F.pad(input, (pad, pad, pad, pad), mode='reflect')
373
+ out = F.conv2d(out, kernel, padding=0, groups=C)
374
+ else:
375
+ out = F.conv2d(input, kernel, padding=(kernel_size - 1)//2, groups=C)
376
+ return out
377
+
378
+ def gradient2d(x, absolute=False, square=False, return_sum=False):
379
+ # x should be B x C x H x W
380
+ dh = x[:, :, 1:, :] - x[:, :, :-1, :]
381
+ dw = x[:, :, :, 1:] - x[:, :, :, :-1]
382
+
383
+ zeros = torch.zeros_like(x)
384
+ zero_h = zeros[:, :, 0:1, :]
385
+ zero_w = zeros[:, :, :, 0:1]
386
+ dh = torch.cat([dh, zero_h], axis=2)
387
+ dw = torch.cat([dw, zero_w], axis=3)
388
+ if absolute:
389
+ dh = torch.abs(dh)
390
+ dw = torch.abs(dw)
391
+ if square:
392
+ dh = dh ** 2
393
+ dw = dw ** 2
394
+ if return_sum:
395
+ return dh+dw
396
+ else:
397
+ return dh, dw
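Two small sanity checks for the helpers above, assuming they are called from this module's namespace; the values are illustrative:

import torch

x = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
mask = torch.tensor([[1.0, 1.0, 0.0, 0.0]])
print(reduce_masked_mean(x, mask))        # ~1.5: mean over the masked-in entries only

xy = gridcloud2d(1, 2, 3, device='cpu')   # (1, 6, 2) pixel coordinates in (x, y) order
print(xy[0])
# tensor([[0., 0.], [1., 0.], [2., 0.],
#         [0., 1.], [1., 1.], [2., 1.]])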
models/spatracker/utils/geom.py ADDED
@@ -0,0 +1,547 @@
1
+ import torch
2
+ import models.spatracker.utils.basic
+ import models.spatracker.utils as utils  # bind `utils` so the utils.basic.* calls below resolve
3
+ import numpy as np
4
+ import torchvision.ops as ops
5
+ from models.spatracker.utils.basic import print_
6
+
7
+ def matmul2(mat1, mat2):
8
+ return torch.matmul(mat1, mat2)
9
+
10
+ def matmul3(mat1, mat2, mat3):
11
+ return torch.matmul(mat1, torch.matmul(mat2, mat3))
12
+
13
+ def eye_3x3(B, device='cuda'):
14
+ rt = torch.eye(3, device=torch.device(device)).view(1,3,3).repeat([B, 1, 1])
15
+ return rt
16
+
17
+ def eye_4x4(B, device='cuda'):
18
+ rt = torch.eye(4, device=torch.device(device)).view(1,4,4).repeat([B, 1, 1])
19
+ return rt
20
+
21
+ def safe_inverse(a): #parallel version
22
+ B, _, _ = list(a.shape)
23
+ inv = a.clone()
24
+ r_transpose = a[:, :3, :3].transpose(1,2) #inverse of rotation matrix
25
+
26
+ inv[:, :3, :3] = r_transpose
27
+ inv[:, :3, 3:4] = -torch.matmul(r_transpose, a[:, :3, 3:4])
28
+
29
+ return inv
30
+
31
+ def safe_inverse_single(a):
32
+ r, t = split_rt_single(a)
33
+ t = t.view(3,1)
34
+ r_transpose = r.t()
35
+ inv = torch.cat([r_transpose, -torch.matmul(r_transpose, t)], 1)
36
+ bottom_row = a[3:4, :] # this is [0, 0, 0, 1]
37
+ # bottom_row = torch.tensor([0.,0.,0.,1.]).view(1,4)
38
+ inv = torch.cat([inv, bottom_row], 0)
39
+ return inv
40
+
41
+ def split_intrinsics(K):
42
+ # K is B x 3 x 3 or B x 4 x 4
43
+ fx = K[:,0,0]
44
+ fy = K[:,1,1]
45
+ x0 = K[:,0,2]
46
+ y0 = K[:,1,2]
47
+ return fx, fy, x0, y0
48
+
49
+ def apply_pix_T_cam(pix_T_cam, xyz):
50
+
51
+ fx, fy, x0, y0 = split_intrinsics(pix_T_cam)
52
+
53
+ # xyz is shaped B x H*W x 3
54
+ # returns xy, shaped B x H*W x 2
55
+
56
+ B, N, C = list(xyz.shape)
57
+ assert(C==3)
58
+
59
+ x, y, z = torch.unbind(xyz, axis=-1)
60
+
61
+ fx = torch.reshape(fx, [B, 1])
62
+ fy = torch.reshape(fy, [B, 1])
63
+ x0 = torch.reshape(x0, [B, 1])
64
+ y0 = torch.reshape(y0, [B, 1])
65
+
66
+ EPS = 1e-4
67
+ z = torch.clamp(z, min=EPS)
68
+ x = (x*fx)/(z)+x0
69
+ y = (y*fy)/(z)+y0
70
+ xy = torch.stack([x, y], axis=-1)
71
+ return xy
72
+
73
+ def apply_pix_T_cam_py(pix_T_cam, xyz):
74
+
75
+ fx, fy, x0, y0 = split_intrinsics(pix_T_cam)
76
+
77
+ # xyz is shaped B x H*W x 3
78
+ # returns xy, shaped B x H*W x 2
79
+
80
+ B, N, C = list(xyz.shape)
81
+ assert(C==3)
82
+
83
+ x, y, z = xyz[:,:,0], xyz[:,:,1], xyz[:,:,2]
84
+
85
+ fx = np.reshape(fx, [B, 1])
86
+ fy = np.reshape(fy, [B, 1])
87
+ x0 = np.reshape(x0, [B, 1])
88
+ y0 = np.reshape(y0, [B, 1])
89
+
90
+ EPS = 1e-4
91
+ z = np.clip(z, EPS, None)
92
+ x = (x*fx)/(z)+x0
93
+ y = (y*fy)/(z)+y0
94
+ xy = np.stack([x, y], axis=-1)
95
+ return xy
96
+
97
+ def get_camM_T_camXs(origin_T_camXs, ind=0):
98
+ B, S = list(origin_T_camXs.shape)[0:2]
99
+ camM_T_camXs = torch.zeros_like(origin_T_camXs)
100
+ for b in list(range(B)):
101
+ camM_T_origin = safe_inverse_single(origin_T_camXs[b,ind])
102
+ for s in list(range(S)):
103
+ camM_T_camXs[b,s] = torch.matmul(camM_T_origin, origin_T_camXs[b,s])
104
+ return camM_T_camXs
105
+
106
+ def apply_4x4(RT, xyz):
107
+ B, N, _ = list(xyz.shape)
108
+ ones = torch.ones_like(xyz[:,:,0:1])
109
+ xyz1 = torch.cat([xyz, ones], 2)
110
+ xyz1_t = torch.transpose(xyz1, 1, 2)
111
+ # this is B x 4 x N
112
+ xyz2_t = torch.matmul(RT, xyz1_t)
113
+ xyz2 = torch.transpose(xyz2_t, 1, 2)
114
+ xyz2 = xyz2[:,:,:3]
115
+ return xyz2
116
+
117
+ def apply_4x4_py(RT, xyz):
118
+ # print('RT', RT.shape)
119
+ B, N, _ = list(xyz.shape)
120
+ ones = np.ones_like(xyz[:,:,0:1])
121
+ xyz1 = np.concatenate([xyz, ones], 2)
122
+ # print('xyz1', xyz1.shape)
123
+ xyz1_t = xyz1.transpose(0,2,1)
124
+ # print('xyz1_t', xyz1_t.shape)
125
+ # this is B x 4 x N
126
+ xyz2_t = np.matmul(RT, xyz1_t)
127
+ # print('xyz2_t', xyz2_t.shape)
128
+ xyz2 = xyz2_t.transpose(0,2,1)
129
+ # print('xyz2', xyz2.shape)
130
+ xyz2 = xyz2[:,:,:3]
131
+ return xyz2
132
+
133
+ def apply_3x3(RT, xy):
134
+ B, N, _ = list(xy.shape)
135
+ ones = torch.ones_like(xy[:,:,0:1])
136
+ xy1 = torch.cat([xy, ones], 2)
137
+ xy1_t = torch.transpose(xy1, 1, 2)
138
+ # this is B x 4 x N
139
+ xy2_t = torch.matmul(RT, xy1_t)
140
+ xy2 = torch.transpose(xy2_t, 1, 2)
141
+ xy2 = xy2[:,:,:2]
142
+ return xy2
143
+
144
+ def generate_polygon(ctr_x, ctr_y, avg_r, irregularity, spikiness, num_verts):
145
+ '''
146
+ Start with the center of the polygon at ctr_x, ctr_y,
147
+ Then creates the polygon by sampling points on a circle around the center.
148
+ Random noise is added by varying the angular spacing between sequential points,
149
+ and by varying the radial distance of each point from the centre.
150
+
151
+ Params:
152
+ ctr_x, ctr_y - coordinates of the "centre" of the polygon
153
+ avg_r - in px, the average radius of this polygon, this roughly controls how large the polygon is, really only useful for order of magnitude.
154
+ irregularity - [0,1] indicating how much variance there is in the angular spacing of vertices. [0,1] will map to [0, 2pi/numberOfVerts]
155
+ spikiness - [0,1] indicating how much variance there is in each vertex from the circle of radius avg_r. [0,1] will map to [0, avg_r]
156
+ num_verts - number of vertices of the polygon
157
+
158
+ Returns:
159
+ np.array [num_verts, 2] - CCW order.
160
+ '''
161
+ # spikiness
162
+ spikiness = np.clip(spikiness, 0, 1) * avg_r
163
+
164
+ # generate n angle steps
165
+ irregularity = np.clip(irregularity, 0, 1) * 2 * np.pi / num_verts
166
+ lower = (2*np.pi / num_verts) - irregularity
167
+ upper = (2*np.pi / num_verts) + irregularity
168
+
169
+ # angle steps
170
+ angle_steps = np.random.uniform(lower, upper, num_verts)
171
+ sc = (2 * np.pi) / angle_steps.sum()
172
+ angle_steps *= sc
173
+
174
+ # get all radii
175
+ angle = np.random.uniform(0, 2*np.pi)
176
+ radii = np.clip(np.random.normal(avg_r, spikiness, num_verts), 0, 2 * avg_r)
177
+
178
+ # compute all points
179
+ points = []
180
+ for i in range(num_verts):
181
+ x = ctr_x + radii[i] * np.cos(angle)
182
+ y = ctr_y + radii[i] * np.sin(angle)
183
+ points.append([x, y])
184
+ angle += angle_steps[i]
185
+
186
+ return np.array(points).astype(int)
187
+
188
+
189
+ def get_random_affine_2d(B, rot_min=-5.0, rot_max=5.0, tx_min=-0.1, tx_max=0.1, ty_min=-0.1, ty_max=0.1, sx_min=-0.05, sx_max=0.05, sy_min=-0.05, sy_max=0.05, shx_min=-0.05, shx_max=0.05, shy_min=-0.05, shy_max=0.05):
190
+ '''
191
+ Params:
192
+ rot_min: rotation amount min
193
+ rot_max: rotation amount max
194
+
195
+ tx_min: translation x min
196
+ tx_max: translation x max
197
+
198
+ ty_min: translation y min
199
+ ty_max: translation y max
200
+
201
+ sx_min: scaling x min
202
+ sx_max: scaling x max
203
+
204
+ sy_min: scaling y min
205
+ sy_max: scaling y max
206
+
207
+ shx_min: shear x min
208
+ shx_max: shear x max
209
+
210
+ shy_min: shear y min
211
+ shy_max: shear y max
212
+
213
+ Returns:
214
+ transformation matrix: (B, 3, 3)
215
+ '''
216
+ # rotation
217
+ if rot_max - rot_min != 0:
218
+ rot_amount = np.random.uniform(low=rot_min, high=rot_max, size=B)
219
+ rot_amount = np.pi/180.0*rot_amount
220
+ else:
221
+ rot_amount = rot_min
222
+ rotation = np.zeros((B, 3, 3)) # B, 3, 3
223
+ rotation[:, 2, 2] = 1
224
+ rotation[:, 0, 0] = np.cos(rot_amount)
225
+ rotation[:, 0, 1] = -np.sin(rot_amount)
226
+ rotation[:, 1, 0] = np.sin(rot_amount)
227
+ rotation[:, 1, 1] = np.cos(rot_amount)
228
+
229
+ # translation
230
+ translation = np.zeros((B, 3, 3)) # B, 3, 3
231
+ translation[:, [0,1,2], [0,1,2]] = 1
232
+ if (tx_max - tx_min) > 0:
233
+ trans_x = np.random.uniform(low=tx_min, high=tx_max, size=B)
234
+ translation[:, 0, 2] = trans_x
235
+ # else:
236
+ # translation[:, 0, 2] = tx_max
237
+ if ty_max - ty_min != 0:
238
+ trans_y = np.random.uniform(low=ty_min, high=ty_max, size=B)
239
+ translation[:, 1, 2] = trans_y
240
+ # else:
241
+ # translation[:, 1, 2] = ty_max
242
+
243
+ # scaling
244
+ scaling = np.zeros((B, 3, 3)) # B, 3, 3
245
+ scaling[:, [0,1,2], [0,1,2]] = 1
246
+ if (sx_max - sx_min) > 0:
247
+ scale_x = 1 + np.random.uniform(low=sx_min, high=sx_max, size=B)
248
+ scaling[:, 0, 0] = scale_x
249
+ # else:
250
+ # scaling[:, 0, 0] = sx_max
251
+ if (sy_max - sy_min) > 0:
252
+ scale_y = 1 + np.random.uniform(low=sy_min, high=sy_max, size=B)
253
+ scaling[:, 1, 1] = scale_y
254
+ # else:
255
+ # scaling[:, 1, 1] = sy_max
256
+
257
+ # shear
258
+ shear = np.zeros((B, 3, 3)) # B, 3, 3
259
+ shear[:, [0,1,2], [0,1,2]] = 1
260
+ if (shx_max - shx_min) > 0:
261
+ shear_x = np.random.uniform(low=shx_min, high=shx_max, size=B)
262
+ shear[:, 0, 1] = shear_x
263
+ # else:
264
+ # shear[:, 0, 1] = shx_max
265
+ if (shy_max - shy_min) > 0:
266
+ shear_y = np.random.uniform(low=shy_min, high=shy_max, size=B)
267
+ shear[:, 1, 0] = shear_y
268
+ # else:
269
+ # shear[:, 1, 0] = shy_max
270
+
271
+ # compose all those
272
+ rt = np.einsum("ijk,ikl->ijl", rotation, translation)
273
+ ss = np.einsum("ijk,ikl->ijl", scaling, shear)
274
+ trans = np.einsum("ijk,ikl->ijl", rt, ss)
275
+
276
+ return trans
277
+
278
+ def get_centroid_from_box2d(box2d):
279
+ ymin = box2d[:,0]
280
+ xmin = box2d[:,1]
281
+ ymax = box2d[:,2]
282
+ xmax = box2d[:,3]
283
+ x = (xmin+xmax)/2.0
284
+ y = (ymin+ymax)/2.0
285
+ return y, x
286
+
287
+ def normalize_boxlist2d(boxlist2d, H, W):
288
+ boxlist2d = boxlist2d.clone()
289
+ ymin, xmin, ymax, xmax = torch.unbind(boxlist2d, dim=2)
290
+ ymin = ymin / float(H)
291
+ ymax = ymax / float(H)
292
+ xmin = xmin / float(W)
293
+ xmax = xmax / float(W)
294
+ boxlist2d = torch.stack([ymin, xmin, ymax, xmax], dim=2)
295
+ return boxlist2d
296
+
297
+ def unnormalize_boxlist2d(boxlist2d, H, W):
298
+ boxlist2d = boxlist2d.clone()
299
+ ymin, xmin, ymax, xmax = torch.unbind(boxlist2d, dim=2)
300
+ ymin = ymin * float(H)
301
+ ymax = ymax * float(H)
302
+ xmin = xmin * float(W)
303
+ xmax = xmax * float(W)
304
+ boxlist2d = torch.stack([ymin, xmin, ymax, xmax], dim=2)
305
+ return boxlist2d
306
+
307
+ def unnormalize_box2d(box2d, H, W):
308
+ return unnormalize_boxlist2d(box2d.unsqueeze(1), H, W).squeeze(1)
309
+
310
+ def normalize_box2d(box2d, H, W):
311
+ return normalize_boxlist2d(box2d.unsqueeze(1), H, W).squeeze(1)
312
+
313
+ def get_size_from_box2d(box2d):
314
+ ymin = box2d[:,0]
315
+ xmin = box2d[:,1]
316
+ ymax = box2d[:,2]
317
+ xmax = box2d[:,3]
318
+ height = ymax-ymin
319
+ width = xmax-xmin
320
+ return height, width
321
+
322
+ def crop_and_resize(im, boxlist, PH, PW, boxlist_is_normalized=False):
323
+ B, C, H, W = im.shape
324
+ B2, N, D = boxlist.shape
325
+ assert(B==B2)
326
+ assert(D==4)
327
+ # PH, PW is the size to resize to
328
+
329
+ # output is B,N,C,PH,PW
330
+
331
+ # pt wants xy xy, unnormalized
332
+ if boxlist_is_normalized:
333
+ boxlist_unnorm = unnormalize_boxlist2d(boxlist, H, W)
334
+ else:
335
+ boxlist_unnorm = boxlist
336
+
337
+ ymin, xmin, ymax, xmax = boxlist_unnorm.unbind(2)
338
+ # boxlist_pt = torch.stack([boxlist_unnorm[:,1], boxlist_unnorm[:,0], boxlist_unnorm[:,3], boxlist_unnorm[:,2]], dim=1)
339
+ boxlist_pt = torch.stack([xmin, ymin, xmax, ymax], dim=2)
340
+ # we want a B-len list of K x 4 arrays
341
+
342
+ # print('im', im.shape)
343
+ # print('boxlist', boxlist.shape)
344
+ # print('boxlist_pt', boxlist_pt.shape)
345
+
346
+ # boxlist_pt = list(boxlist_pt.unbind(0))
347
+
348
+ crops = []
349
+ for b in range(B):
350
+ crops_b = ops.roi_align(im[b:b+1], [boxlist_pt[b]], output_size=(PH, PW))
351
+ crops.append(crops_b)
352
+ # # crops = im
353
+
354
+ # print('crops', crops.shape)
355
+ # crops = crops.reshape(B,N,C,PH,PW)
356
+
357
+
358
+ # crops = []
359
+ # for b in range(B):
360
+ # crop_b = ops.roi_align(im[b:b+1], [boxlist_pt[b]], output_size=(PH, PW))
361
+ # print('crop_b', crop_b.shape)
362
+ # crops.append(crop_b)
363
+ crops = torch.stack(crops, dim=0)
364
+
365
+ # print('crops', crops.shape)
366
+ # boxlist_list = boxlist_pt.unbind(0)
367
+ # print('rgb_crop', rgb_crop.shape)
368
+
369
+ return crops
370
+
371
+
372
+ # def get_boxlist_from_centroid_and_size(cy, cx, h, w, clip=True):
373
+ # # cy,cx are both B,N
374
+ # ymin = cy - h/2
375
+ # ymax = cy + h/2
376
+ # xmin = cx - w/2
377
+ # xmax = cx + w/2
378
+
379
+ # box = torch.stack([ymin, xmin, ymax, xmax], dim=-1)
380
+ # if clip:
381
+ # box = torch.clamp(box, 0, 1)
382
+ # return box
383
+
384
+
385
+ def get_boxlist_from_centroid_and_size(cy, cx, h, w):#, clip=False):
386
+ # cy,cx are the same shape
387
+ ymin = cy - h/2
388
+ ymax = cy + h/2
389
+ xmin = cx - w/2
390
+ xmax = cx + w/2
391
+
392
+ # if clip:
393
+ # ymin = torch.clamp(ymin, 0, H-1)
394
+ # ymax = torch.clamp(ymax, 0, H-1)
395
+ # xmin = torch.clamp(xmin, 0, W-1)
396
+ # xmax = torch.clamp(xmax, 0, W-1)
397
+
398
+ box = torch.stack([ymin, xmin, ymax, xmax], dim=-1)
399
+ return box
400
+
401
+
402
+ def get_box2d_from_mask(mask, normalize=False):
403
+ # mask is B, 1, H, W
404
+
405
+ B, C, H, W = mask.shape
406
+ assert(C==1)
407
+ xy = utils.basic.gridcloud2d(B, H, W, norm=False, device=mask.device) # B, H*W, 2
408
+
409
+ box = torch.zeros((B, 4), dtype=torch.float32, device=mask.device)
410
+ for b in range(B):
411
+ xy_b = xy[b] # H*W, 2
412
+ mask_b = mask[b].reshape(H*W)
413
+ xy_ = xy_b[mask_b > 0]
414
+ x_ = xy_[:,0]
415
+ y_ = xy_[:,1]
416
+ ymin = torch.min(y_)
417
+ ymax = torch.max(y_)
418
+ xmin = torch.min(x_)
419
+ xmax = torch.max(x_)
420
+ box[b] = torch.stack([ymin, xmin, ymax, xmax], dim=0)
421
+ if normalize:
422
+ box = normalize_boxlist2d(box.unsqueeze(1), H, W).squeeze(1)
423
+ return box
424
+
425
+ def convert_box2d_to_intrinsics(box2d, pix_T_cam, H, W, use_image_aspect_ratio=True, mult_padding=1.0):
426
+ # box2d is B x 4, with ymin, xmin, ymax, xmax in normalized coords
427
+ # ymin, xmin, ymax, xmax = torch.unbind(box2d, dim=1)
428
+ # H, W is the original size of the image
429
+ # mult_padding is relative to object size in pixels
430
+
431
+ # i assume we're rendering an image the same size as the original (H, W)
432
+
433
+ if not mult_padding==1.0:
434
+ y, x = get_centroid_from_box2d(box2d)
435
+ h, w = get_size_from_box2d(box2d)
436
+ box2d = get_box2d_from_centroid_and_size(
437
+ y, x, h*mult_padding, w*mult_padding, clip=False)
438
+
439
+ if use_image_aspect_ratio:
440
+ h, w = get_size_from_box2d(box2d)
441
+ y, x = get_centroid_from_box2d(box2d)
442
+
443
+ # note h,w are relative right now
444
+ # we need to undo this, to see the real ratio
445
+
446
+ h = h*float(H)
447
+ w = w*float(W)
448
+ box_ratio = h/w
449
+ im_ratio = H/float(W)
450
+
451
+ # print('box_ratio:', box_ratio)
452
+ # print('im_ratio:', im_ratio)
453
+
454
+ if box_ratio >= im_ratio:
455
+ w = h/im_ratio
456
+ # print('setting w:', h/im_ratio)
457
+ else:
458
+ h = w*im_ratio
459
+ # print('setting h:', w*im_ratio)
460
+
461
+ box2d = get_box2d_from_centroid_and_size(
462
+ y, x, h/float(H), w/float(W), clip=False)
463
+
464
+ assert(h > 1e-4)
465
+ assert(w > 1e-4)
466
+
467
+ ymin, xmin, ymax, xmax = torch.unbind(box2d, dim=1)
468
+
469
+ fx, fy, x0, y0 = split_intrinsics(pix_T_cam)
470
+
471
+ # the topleft of the new image will now have a different offset from the center of projection
472
+
473
+ new_x0 = x0 - xmin*W
474
+ new_y0 = y0 - ymin*H
475
+
476
+ pix_T_cam = pack_intrinsics(fx, fy, new_x0, new_y0)
477
+ # this alone will give me an image in original resolution,
478
+ # with its topleft at the box corner
479
+
480
+ box_h, box_w = get_size_from_box2d(box2d)
481
+ # these are normalized, and shaped B. (e.g., [0.4], [0.3])
482
+
483
+ # we are going to scale the image by the inverse of this,
484
+ # since we are zooming into this area
485
+
486
+ sy = 1./box_h
487
+ sx = 1./box_w
488
+
489
+ pix_T_cam = scale_intrinsics(pix_T_cam, sx, sy)
490
+ return pix_T_cam, box2d
491
+
492
+ def pixels2camera(x,y,z,fx,fy,x0,y0):
493
+ # x and y are locations in pixel coordinates, z is a depth in meters
494
+ # they can be images or pointclouds
495
+ # fx, fy, x0, y0 are camera intrinsics
496
+ # returns xyz, sized B x N x 3
497
+
498
+ B = x.shape[0]
499
+
500
+ fx = torch.reshape(fx, [B,1])
501
+ fy = torch.reshape(fy, [B,1])
502
+ x0 = torch.reshape(x0, [B,1])
503
+ y0 = torch.reshape(y0, [B,1])
504
+
505
+ x = torch.reshape(x, [B,-1])
506
+ y = torch.reshape(y, [B,-1])
507
+ z = torch.reshape(z, [B,-1])
508
+
509
+ # unproject
510
+ x = (z/fx)*(x-x0)
511
+ y = (z/fy)*(y-y0)
512
+
513
+ xyz = torch.stack([x,y,z], dim=2)
514
+ # B x N x 3
515
+ return xyz
516
+
517
+ def camera2pixels(xyz, pix_T_cam):
518
+ # xyz is shaped B x H*W x 3
519
+ # returns xy, shaped B x H*W x 2
520
+
521
+ fx, fy, x0, y0 = split_intrinsics(pix_T_cam)
522
+ x, y, z = torch.unbind(xyz, dim=-1)
523
+ B = list(z.shape)[0]
524
+
525
+ fx = torch.reshape(fx, [B,1])
526
+ fy = torch.reshape(fy, [B,1])
527
+ x0 = torch.reshape(x0, [B,1])
528
+ y0 = torch.reshape(y0, [B,1])
529
+ x = torch.reshape(x, [B,-1])
530
+ y = torch.reshape(y, [B,-1])
531
+ z = torch.reshape(z, [B,-1])
532
+
533
+ EPS = 1e-4
534
+ z = torch.clamp(z, min=EPS)
535
+ x = (x*fx)/z + x0
536
+ y = (y*fy)/z + y0
537
+ xy = torch.stack([x, y], dim=-1)
538
+ return xy
539
+
540
+ def depth2pointcloud(z, pix_T_cam):
541
+ B, C, H, W = list(z.shape)
542
+ device = z.device
543
+ y, x = utils.basic.meshgrid2d(B, H, W, device=device)
544
+ z = torch.reshape(z, [B, H, W])
545
+ fx, fy, x0, y0 = split_intrinsics(pix_T_cam)
546
+ xyz = pixels2camera(x, y, z, fx, fy, x0, y0)
547
+ return xyz
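A round-trip sketch with the camera helpers above; the intrinsics are made-up values, and it assumes the utils.basic import at the top of this file resolves:

import torch

B, H, W = 1, 4, 6
pix_T_cam = torch.tensor([[[100.0,   0.0, W / 2.0],
                           [  0.0, 100.0, H / 2.0],
                           [  0.0,   0.0,     1.0]]])   # (1, 3, 3)

depth = torch.full((B, 1, H, W), 2.0)        # constant depth of 2.0
xyz = depth2pointcloud(depth, pix_T_cam)     # (1, H*W, 3) camera-frame points
xy = camera2pixels(xyz, pix_T_cam)           # (1, H*W, 2) back to pixel coordinates
# xy reproduces the original pixel grid up to floating-point error.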
models/spatracker/utils/improc.py ADDED
@@ -0,0 +1,1447 @@
1
+ import torch
2
+ import numpy as np
3
+ import models.spatracker.utils.basic
+ import models.spatracker.utils as utils  # bind `utils` so the utils.basic.* calls below resolve
4
+ from sklearn.decomposition import PCA
5
+ from matplotlib import cm
6
+ import matplotlib.pyplot as plt
7
+ import cv2
8
+ import torch.nn.functional as F
9
+ import torchvision
10
+ EPS = 1e-6
11
+
12
+ from skimage.color import (
13
+ rgb2lab, rgb2yuv, rgb2ycbcr, lab2rgb, yuv2rgb, ycbcr2rgb,
14
+ rgb2hsv, hsv2rgb, rgb2xyz, xyz2rgb, rgb2hed, hed2rgb)
15
+
16
+ def _convert(input_, type_):
17
+ return {
18
+ 'float': input_.float(),
19
+ 'double': input_.double(),
20
+ }.get(type_, input_)
21
+
22
+ def _generic_transform_sk_3d(transform, in_type='', out_type=''):
23
+ def apply_transform_individual(input_):
24
+ device = input_.device
25
+ input_ = input_.cpu()
26
+ input_ = _convert(input_, in_type)
27
+
28
+ input_ = input_.permute(1, 2, 0).detach().numpy()
29
+ transformed = transform(input_)
30
+ output = torch.from_numpy(transformed).float().permute(2, 0, 1)
31
+ output = _convert(output, out_type)
32
+ return output.to(device)
33
+
34
+ def apply_transform(input_):
35
+ to_stack = []
36
+ for image in input_:
37
+ to_stack.append(apply_transform_individual(image))
38
+ return torch.stack(to_stack)
39
+ return apply_transform
40
+
41
+ hsv_to_rgb = _generic_transform_sk_3d(hsv2rgb)
42
+
43
+ def preprocess_color_tf(x):
44
+ import tensorflow as tf
45
+ return tf.cast(x,tf.float32) * 1./255 - 0.5
46
+
47
+ def preprocess_color(x):
48
+ if isinstance(x, np.ndarray):
49
+ return x.astype(np.float32) * 1./255 - 0.5
50
+ else:
51
+ return x.float() * 1./255 - 0.5
52
+
53
+ def pca_embed(emb, keep, valid=None):
54
+ ## emb -- [S,H/2,W/2,C]
55
+ ## keep is the number of principal components to keep
56
+ ## Helper function for reduce_emb.
57
+ emb = emb + EPS
58
+ #emb is B x C x H x W
59
+ emb = emb.permute(0, 2, 3, 1).cpu().detach().numpy() #this is B x H x W x C
60
+
61
+ if valid:
62
+ valid = valid.cpu().detach().numpy().reshape((H*W))
63
+
64
+ emb_reduced = list()
65
+
66
+ B, H, W, C = np.shape(emb)
67
+ for img in emb:
68
+ if np.isnan(img).any():
69
+ emb_reduced.append(np.zeros([H, W, keep]))
70
+ continue
71
+
72
+ pixels_kd = np.reshape(img, (H*W, C))
73
+
74
+ if valid:
75
+ pixels_kd_pca = pixels_kd[valid]
76
+ else:
77
+ pixels_kd_pca = pixels_kd
78
+
79
+ P = PCA(keep)
80
+ P.fit(pixels_kd_pca)
81
+
82
+ if valid:
83
+ pixels3d = P.transform(pixels_kd)*valid
84
+ else:
85
+ pixels3d = P.transform(pixels_kd)
86
+
87
+ out_img = np.reshape(pixels3d, [H,W,keep]).astype(np.float32)
88
+ if np.isnan(out_img).any():
89
+ emb_reduced.append(np.zeros([H, W, keep]))
90
+ continue
91
+
92
+ emb_reduced.append(out_img)
93
+
94
+ emb_reduced = np.stack(emb_reduced, axis=0).astype(np.float32)
95
+
96
+ return torch.from_numpy(emb_reduced).permute(0, 3, 1, 2)
97
+
98
+ def pca_embed_together(emb, keep):
99
+ ## emb -- [S,H/2,W/2,C]
100
+ ## keep is the number of principal components to keep
101
+ ## Helper function for reduce_emb.
102
+ emb = emb + EPS
103
+ #emb is B x C x H x W
104
+ emb = emb.permute(0, 2, 3, 1).cpu().detach().numpy() #this is B x H x W x C
105
+
106
+ B, H, W, C = np.shape(emb)
107
+ if np.isnan(emb).any():
108
+ return torch.zeros(B, keep, H, W)
109
+
110
+ pixelskd = np.reshape(emb, (B*H*W, C))
111
+ P = PCA(keep)
112
+ P.fit(pixelskd)
113
+ pixels3d = P.transform(pixelskd)
114
+ out_img = np.reshape(pixels3d, [B,H,W,keep]).astype(np.float32)
115
+
116
+ if np.isnan(out_img).any():
117
+ return torch.zeros(B, keep, H, W)
118
+
119
+ return torch.from_numpy(out_img).permute(0, 3, 1, 2)
120
+
121
+ def reduce_emb(emb, valid=None, inbound=None, together=False):
122
+ ## emb -- [S,C,H/2,W/2], inbound -- [S,1,H/2,W/2]
123
+ ## Reduce number of chans to 3 with PCA. For vis.
124
+ # S,H,W,C = emb.shape.as_list()
125
+ S, C, H, W = list(emb.size())
126
+ keep = 3
127
+
128
+ if together:
129
+ reduced_emb = pca_embed_together(emb, keep)
130
+ else:
131
+ reduced_emb = pca_embed(emb, keep, valid) #not im
132
+
133
+ reduced_emb = utils.basic.normalize(reduced_emb) - 0.5
134
+ if inbound is not None:
135
+ emb_inbound = emb*inbound
136
+ else:
137
+ emb_inbound = None
138
+
139
+ return reduced_emb, emb_inbound
140
+
141
+ def get_feat_pca(feat, valid=None):
142
+ B, C, D, W = list(feat.size())
143
+ # feat is B x C x D x W. If 3D input, average it through Height dimension before passing into this function.
144
+
145
+ pca, _ = reduce_emb(feat, valid=valid,inbound=None, together=True)
146
+ # pca is B x 3 x W x D
147
+ return pca
148
+
149
+ def gif_and_tile(ims, just_gif=False):
150
+ S = len(ims)
151
+ # each im is B x H x W x C
152
+ # i want a gif in the left, and the tiled frames on the right
153
+ # for the gif tool, this means making a B x S x H x W tensor
154
+ # where the leftmost part is sequential and the rest is tiled
155
+ gif = torch.stack(ims, dim=1)
156
+ if just_gif:
157
+ return gif
158
+ til = torch.cat(ims, dim=2)
159
+ til = til.unsqueeze(dim=1).repeat(1, S, 1, 1, 1)
160
+ im = torch.cat([gif, til], dim=3)
161
+ return im
162
+
163
+ def back2color(i, blacken_zeros=False):
164
+ if blacken_zeros:
165
+ const = torch.tensor([-0.5])
166
+ i = torch.where(i==0.0, const.cuda() if i.is_cuda else const, i)
167
+ return back2color(i)
168
+ else:
169
+ return ((i+0.5)*255).type(torch.ByteTensor)
170
+
171
+ def convert_occ_to_height(occ, reduce_axis=3):
172
+ B, C, D, H, W = list(occ.shape)
173
+ assert(C==1)
174
+ # note that height increases DOWNWARD in the tensor
175
+ # (like pixel/camera coordinates)
176
+
177
+ G = list(occ.shape)[reduce_axis]
178
+ values = torch.linspace(float(G), 1.0, steps=G, dtype=torch.float32, device=occ.device)
179
+ if reduce_axis==2:
180
+ # fro view
181
+ values = values.view(1, 1, G, 1, 1)
182
+ elif reduce_axis==3:
183
+ # top view
184
+ values = values.view(1, 1, 1, G, 1)
185
+ elif reduce_axis==4:
186
+ # lateral view
187
+ values = values.view(1, 1, 1, 1, G)
188
+ else:
189
+ assert(False) # you have to reduce one of the spatial dims (2-4)
190
+ values = torch.max(occ*values, dim=reduce_axis)[0]/float(G)
191
+ # values = values.view([B, C, D, W])
192
+ return values
193
+
194
+ def xy2heatmap(xy, sigma, grid_xs, grid_ys, norm=False):
195
+ # xy is B x N x 2, containing float x and y coordinates of N things
196
+ # grid_xs and grid_ys are B x N x Y x X
197
+
198
+ B, N, Y, X = list(grid_xs.shape)
199
+
200
+ mu_x = xy[:,:,0].clone()
201
+ mu_y = xy[:,:,1].clone()
202
+
203
+ x_valid = (mu_x>-0.5) & (mu_x<float(X+0.5))
204
+ y_valid = (mu_y>-0.5) & (mu_y<float(Y+0.5))
205
+ not_valid = ~(x_valid & y_valid)
206
+
207
+ mu_x[not_valid] = -10000
208
+ mu_y[not_valid] = -10000
209
+
210
+ mu_x = mu_x.reshape(B, N, 1, 1).repeat(1, 1, Y, X)
211
+ mu_y = mu_y.reshape(B, N, 1, 1).repeat(1, 1, Y, X)
212
+
213
+ sigma_sq = sigma*sigma
214
+ # sigma_sq = (sigma*sigma).reshape(B, N, 1, 1)
215
+ sq_diff_x = (grid_xs - mu_x)**2
216
+ sq_diff_y = (grid_ys - mu_y)**2
217
+
218
+ term1 = 1./2.*np.pi*sigma_sq
219
+ term2 = torch.exp(-(sq_diff_x+sq_diff_y)/(2.*sigma_sq))
220
+ gauss = term1*term2
221
+
222
+ if norm:
223
+ # normalize so each gaussian peaks at 1
224
+ gauss_ = gauss.reshape(B*N, Y, X)
225
+ gauss_ = utils.basic.normalize(gauss_)
226
+ gauss = gauss_.reshape(B, N, Y, X)
227
+
228
+ return gauss
229
+
230
+ def xy2heatmaps(xy, Y, X, sigma=30.0, norm=True):
231
+ # xy is B x N x 2
232
+
233
+ B, N, D = list(xy.shape)
234
+ assert(D==2)
235
+
236
+ device = xy.device
237
+
238
+ grid_y, grid_x = utils.basic.meshgrid2d(B, Y, X, device=device)
239
+ # grid_x and grid_y are B x Y x X
240
+ grid_xs = grid_x.unsqueeze(1).repeat(1, N, 1, 1)
241
+ grid_ys = grid_y.unsqueeze(1).repeat(1, N, 1, 1)
242
+ heat = xy2heatmap(xy, sigma, grid_xs, grid_ys, norm=norm)
243
+ return heat
244
+
245
+ def draw_circles_at_xy(xy, Y, X, sigma=12.5, round=False):
246
+ B, N, D = list(xy.shape)
247
+ assert(D==2)
248
+ prior = xy2heatmaps(xy, Y, X, sigma=sigma)
249
+ # prior is B x N x Y x X
250
+ if round:
251
+ prior = (prior > 0.5).float()
252
+ return prior
253
+
254
+ def seq2color(im, norm=True, colormap='coolwarm'):
255
+ B, S, H, W = list(im.shape)
256
+ # S is sequential
257
+
258
+ # prep a mask of the valid pixels, so we can blacken the invalids later
259
+ mask = torch.max(im, dim=1, keepdim=True)[0]
260
+
261
+ # turn the S dim into an explicit sequence
262
+ coeffs = np.linspace(1.0, float(S), S).astype(np.float32)/float(S)
263
+
264
+ # # increase the spacing from the center
265
+ # coeffs[:int(S/2)] -= 2.0
266
+ # coeffs[int(S/2)+1:] += 2.0
267
+
268
+ coeffs = torch.from_numpy(coeffs).float().cuda()
269
+ coeffs = coeffs.reshape(1, S, 1, 1).repeat(B, 1, H, W)
270
+ # scale each channel by the right coeff
271
+ im = im * coeffs
272
+ # now im is in [1/S, 1], except for the invalid parts which are 0
273
+ # keep the highest valid coeff at each pixel
274
+ im = torch.max(im, dim=1, keepdim=True)[0]
275
+
276
+ out = []
277
+ for b in range(B):
278
+ im_ = im[b]
279
+ # move channels out to last dim_
280
+ im_ = im_.detach().cpu().numpy()
281
+ im_ = np.squeeze(im_)
282
+ # im_ is H x W
283
+ if colormap=='coolwarm':
284
+ im_ = cm.coolwarm(im_)[:, :, :3]
285
+ elif colormap=='PiYG':
286
+ im_ = cm.PiYG(im_)[:, :, :3]
287
+ elif colormap=='winter':
288
+ im_ = cm.winter(im_)[:, :, :3]
289
+ elif colormap=='spring':
290
+ im_ = cm.spring(im_)[:, :, :3]
291
+ elif colormap=='onediff':
292
+ im_ = np.reshape(im_, (-1))
293
+ im0_ = cm.spring(im_)[:, :3]
294
+ im1_ = cm.winter(im_)[:, :3]
295
+ im1_[im_==1/float(S)] = im0_[im_==1/float(S)]
296
+ im_ = np.reshape(im1_, (H, W, 3))
297
+ else:
298
+ assert(False) # invalid colormap
299
+ # move channels into dim 0
300
+ im_ = np.transpose(im_, [2, 0, 1])
301
+ im_ = torch.from_numpy(im_).float().cuda()
302
+ out.append(im_)
303
+ out = torch.stack(out, dim=0)
304
+
305
+ # blacken the invalid pixels, instead of using the 0-color
306
+ out = out*mask
307
+ # out = out*255.0
308
+
309
+ # put it in [-0.5, 0.5]
310
+ out = out - 0.5
311
+
312
+ return out
313
+
314
+ def colorize(d):
315
+ # this is actually just grayscale right now
316
+
317
+ if d.ndim==2:
318
+ d = d.unsqueeze(dim=0)
319
+ else:
320
+ assert(d.ndim==3)
321
+
322
+ # color_map = cm.get_cmap('plasma')
323
+ color_map = cm.get_cmap('inferno')
324
+ # S1, D = traj.shape
325
+
326
+ # print('d1', d.shape)
327
+ C,H,W = d.shape
328
+ assert(C==1)
329
+ d = d.reshape(-1)
330
+ d = d.detach().cpu().numpy()
331
+ # print('d2', d.shape)
332
+ color = np.array(color_map(d)) * 255 # rgba
333
+ # print('color1', color.shape)
334
+ color = np.reshape(color[:,:3], [H*W, 3])
335
+ # print('color2', color.shape)
336
+ color = torch.from_numpy(color).permute(1,0).reshape(3,H,W)
337
+ # # gather
338
+ # cm = matplotlib.cm.get_cmap(cmap if cmap is not None else 'gray')
339
+ # if cmap=='RdBu' or cmap=='RdYlGn':
340
+ # colors = cm(np.arange(256))[:, :3]
341
+ # else:
342
+ # colors = cm.colors
343
+ # colors = np.array(colors).astype(np.float32)
344
+ # colors = np.reshape(colors, [-1, 3])
345
+ # colors = tf.constant(colors, dtype=tf.float32)
346
+
347
+ # value = tf.gather(colors, indices)
348
+ # colorize(value, normalize=True, vmin=None, vmax=None, cmap=None, vals=255)
349
+
350
+ # copy to the three chans
351
+ # d = d.repeat(3, 1, 1)
352
+ return color
353
+
354
+
355
+ def oned2inferno(d, norm=True, do_colorize=False):
356
+ # convert a 1chan input to a 3chan image output
357
+
358
+ # if it's just B x H x W, add a C dim
359
+ if d.ndim==3:
360
+ d = d.unsqueeze(dim=1)
361
+ # d should be B x C x H x W, where C=1
362
+ B, C, H, W = list(d.shape)
363
+ assert(C==1)
364
+
365
+ if norm:
366
+ d = utils.basic.normalize(d)
367
+
368
+ if do_colorize:
369
+ rgb = torch.zeros(B, 3, H, W)
370
+ for b in list(range(B)):
371
+ rgb[b] = colorize(d[b])
372
+ else:
373
+ rgb = d.repeat(1, 3, 1, 1)*255.0
374
+ # rgb = (255.0*rgb).type(torch.ByteTensor)
375
+ rgb = rgb.type(torch.ByteTensor)
376
+
377
+ # rgb = tf.cast(255.0*rgb, tf.uint8)
378
+ # rgb = tf.reshape(rgb, [-1, hyp.H, hyp.W, 3])
379
+ # rgb = tf.expand_dims(rgb, axis=0)
380
+ return rgb
381
+
382
+ def oned2gray(d, norm=True):
383
+ # convert a 1chan input to a 3chan image output
384
+
385
+ # if it's just B x H x W, add a C dim
386
+ if d.ndim==3:
387
+ d = d.unsqueeze(dim=1)
388
+ # d should be B x C x H x W, where C=1
389
+ B, C, H, W = list(d.shape)
390
+ assert(C==1)
391
+
392
+ if norm:
393
+ d = utils.basic.normalize(d)
394
+
395
+ rgb = d.repeat(1,3,1,1)
396
+ rgb = (255.0*rgb).type(torch.ByteTensor)
397
+ return rgb
398
+
399
+
400
+ def draw_frame_id_on_vis(vis, frame_id, scale=0.5, left=5, top=20):
401
+
402
+ rgb = vis.detach().cpu().numpy()[0]
403
+ rgb = np.transpose(rgb, [1, 2, 0]) # put channels last
404
+ rgb = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
405
+ color = (255, 255, 255)
406
+ # print('putting frame id', frame_id)
407
+
408
+ frame_str = utils.basic.strnum(frame_id)
409
+
410
+ text_color_bg = (0,0,0)
411
+ font = cv2.FONT_HERSHEY_SIMPLEX
412
+ text_size, _ = cv2.getTextSize(frame_str, font, scale, 1)
413
+ text_w, text_h = text_size
414
+ cv2.rectangle(rgb, (left, top-text_h), (left + text_w, top+1), text_color_bg, -1)
415
+
416
+ cv2.putText(
417
+ rgb,
418
+ frame_str,
419
+ (left, top), # from left, from top
420
+ font,
421
+ scale, # font scale (float)
422
+ color,
423
+ 1) # font thickness (int)
424
+ rgb = cv2.cvtColor(rgb.astype(np.uint8), cv2.COLOR_BGR2RGB)
425
+ vis = torch.from_numpy(rgb).permute(2, 0, 1).unsqueeze(0)
426
+ return vis
427
+
428
+ COLORMAP_FILE = "./utils/bremm.png"
429
+ class ColorMap2d:
430
+ def __init__(self, filename=None):
431
+ self._colormap_file = filename or COLORMAP_FILE
432
+ self._img = plt.imread(self._colormap_file)
433
+
434
+ self._height = self._img.shape[0]
435
+ self._width = self._img.shape[1]
436
+
437
+ def __call__(self, X):
438
+ assert len(X.shape) == 2
439
+ output = np.zeros((X.shape[0], 3))
440
+ for i in range(X.shape[0]):
441
+ x, y = X[i, :]
442
+ xp = int((self._width-1) * x)
443
+ yp = int((self._height-1) * y)
444
+ xp = np.clip(xp, 0, self._width-1)
445
+ yp = np.clip(yp, 0, self._height-1)
446
+ output[i, :] = self._img[yp, xp]
447
+ return output
448
+
449
+ def get_n_colors(N, sequential=False):
450
+ label_colors = []
451
+ for ii in range(N):
452
+ if sequential:
453
+ rgb = cm.winter(ii/(N-1))
454
+ rgb = (np.array(rgb) * 255).astype(np.uint8)[:3]
455
+ else:
456
+ rgb = np.zeros(3)
457
+ while np.sum(rgb) < 128: # ensure min brightness
458
+ rgb = np.random.randint(0,256,3)
459
+ label_colors.append(rgb)
460
+ return label_colors
461
+
462
+ class Summ_writer(object):
463
+ def __init__(self, writer, global_step, log_freq=10, fps=8, scalar_freq=100, just_gif=False):
464
+ self.writer = writer
465
+ self.global_step = global_step
466
+ self.log_freq = log_freq
467
+ self.fps = fps
468
+ self.just_gif = just_gif
469
+ self.maxwidth = 10000
470
+ self.save_this = (self.global_step % self.log_freq == 0)
471
+ self.scalar_freq = max(scalar_freq,1)
472
+
473
+
474
+ def summ_gif(self, name, tensor, blacken_zeros=False):
475
+ # tensor should be in B x S x C x H x W
476
+
477
+ assert tensor.dtype in {torch.uint8,torch.float32}
478
+ shape = list(tensor.shape)
479
+
480
+ if tensor.dtype == torch.float32:
481
+ tensor = back2color(tensor, blacken_zeros=blacken_zeros)
482
+
483
+ video_to_write = tensor[0:1]
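+ # keep only the first batch element for logging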
484
+
485
+ S = video_to_write.shape[1]
486
+ if S==1:
487
+ # video_to_write is 1 x 1 x C x H x W
488
+ self.writer.add_image(name, video_to_write[0,0], global_step=self.global_step)
489
+ else:
490
+ self.writer.add_video(name, video_to_write, fps=self.fps, global_step=self.global_step)
491
+
492
+ return video_to_write
493
+
494
+ def draw_boxlist2d_on_image(self, rgb, boxlist, scores=None, tids=None, linewidth=1):
495
+ B, C, H, W = list(rgb.shape)
496
+ assert(C==3)
497
+ B2, N, D = list(boxlist.shape)
498
+ assert(B2==B)
499
+ assert(D==4) # ymin, xmin, ymax, xmax
500
+
501
+ rgb = back2color(rgb)
502
+ if scores is None:
503
+ scores = torch.ones(B2, N).float()
504
+ if tids is None:
505
+ tids = torch.arange(N).reshape(1,N).repeat(B2,1).long()
506
+ # tids = torch.zeros(B2, N).long()
507
+ out = self.draw_boxlist2d_on_image_py(
508
+ rgb[0].cpu().detach().numpy(),
509
+ boxlist[0].cpu().detach().numpy(),
510
+ scores[0].cpu().detach().numpy(),
511
+ tids[0].cpu().detach().numpy(),
512
+ linewidth=linewidth)
513
+ out = torch.from_numpy(out).type(torch.ByteTensor).permute(2, 0, 1)
514
+ out = torch.unsqueeze(out, dim=0)
515
+ out = preprocess_color(out)
516
+ out = torch.reshape(out, [1, C, H, W])
517
+ return out
518
+
519
+ def draw_boxlist2d_on_image_py(self, rgb, boxlist, scores, tids, linewidth=1):
520
+ # all inputs are numpy tensors
521
+ # rgb is H x W x 3
522
+ # boxlist is N x 4
523
+ # scores is N
524
+ # tids is N
525
+
526
+ rgb = np.transpose(rgb, [1, 2, 0]) # put channels last
527
+ # rgb = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
528
+
529
+ rgb = rgb.astype(np.uint8).copy()
530
+
531
+
532
+ H, W, C = rgb.shape
533
+ assert(C==3)
534
+ N, D = boxlist.shape
535
+ assert(D==4)
536
+
537
+ # color_map = cm.get_cmap('tab20')
538
+ # color_map = cm.get_cmap('set1')
539
+ color_map = cm.get_cmap('Accent')
540
+ color_map = color_map.colors
541
+ # print('color_map', color_map)
542
+
543
+ # draw
544
+ for ind, box in enumerate(boxlist):
545
+ # box is 4
546
+ if not np.isclose(scores[ind], 0.0):
547
+ # box = utils.geom.scale_box2d(box, H, W)
548
+ ymin, xmin, ymax, xmax = box
549
+
550
+ # ymin, ymax = ymin*H, ymax*H
551
+ # xmin, xmax = xmin*W, xmax*W
552
+
553
+ # print 'score = %.2f' % scores[ind]
554
+ # color_id = tids[ind] % 20
555
+ color_id = tids[ind]
556
+ color = color_map[color_id]
557
+ color = np.array(color)*255.0
558
+ color = color.round()
559
+ # color = color.astype(np.uint8)
560
+ # color = color[::-1]
561
+ # print('color', color)
562
+
563
+ # print 'tid = %d; score = %.3f' % (tids[ind], scores[ind])
564
+
565
+ # if False:
566
+ if scores[ind] < 1.0: # not gt
567
+ cv2.putText(rgb,
568
+ # '%d (%.2f)' % (tids[ind], scores[ind]),
569
+ '%.2f' % (scores[ind]),
570
+ (int(xmin), int(ymin)),
571
+ cv2.FONT_HERSHEY_SIMPLEX,
572
+ 0.5, # font size
573
+ color,
574
+ 1) # font thickness (int)
575
+
576
+ xmin = np.clip(int(xmin), 0, W-1)
577
+ xmax = np.clip(int(xmax), 0, W-1)
578
+ ymin = np.clip(int(ymin), 0, H-1)
579
+ ymax = np.clip(int(ymax), 0, H-1)
580
+
581
+ cv2.line(rgb, (xmin, ymin), (xmin, ymax), color, linewidth, cv2.LINE_AA)
582
+ cv2.line(rgb, (xmin, ymin), (xmax, ymin), color, linewidth, cv2.LINE_AA)
583
+ cv2.line(rgb, (xmax, ymin), (xmax, ymax), color, linewidth, cv2.LINE_AA)
584
+ cv2.line(rgb, (xmax, ymax), (xmin, ymax), color, linewidth, cv2.LINE_AA)
585
+
586
+ # rgb = cv2.cvtColor(rgb.astype(np.uint8), cv2.COLOR_BGR2RGB)
587
+ return rgb
588
+
589
+ def summ_boxlist2d(self, name, rgb, boxlist, scores=None, tids=None, frame_id=None, only_return=False, linewidth=2):
590
+ B, C, H, W = list(rgb.shape)
591
+ boxlist_vis = self.draw_boxlist2d_on_image(rgb, boxlist, scores=scores, tids=tids, linewidth=linewidth)
592
+ return self.summ_rgb(name, boxlist_vis, frame_id=frame_id, only_return=only_return)
593
+
594
+ def summ_rgbs(self, name, ims, frame_ids=None, blacken_zeros=False, only_return=False):
595
+ if self.save_this:
596
+
597
+ ims = gif_and_tile(ims, just_gif=self.just_gif)
598
+ vis = ims
599
+
600
+ assert vis.dtype in {torch.uint8,torch.float32}
601
+
602
+ if vis.dtype == torch.float32:
603
+ vis = back2color(vis, blacken_zeros)
604
+
605
+ B, S, C, H, W = list(vis.shape)
606
+
607
+ if frame_ids is not None:
608
+ assert(len(frame_ids)==S)
609
+ for s in range(S):
610
+ vis[:,s] = draw_frame_id_on_vis(vis[:,s], frame_ids[s])
611
+
612
+ if int(W) > self.maxwidth:
613
+ vis = vis[:,:,:,:self.maxwidth]
614
+
615
+ if only_return:
616
+ return vis
617
+ else:
618
+ return self.summ_gif(name, vis, blacken_zeros)
619
+
620
+ def summ_rgb(self, name, ims, blacken_zeros=False, frame_id=None, only_return=False, halfres=False):
621
+ if self.save_this:
622
+ assert ims.dtype in {torch.uint8,torch.float32}
623
+
624
+ if ims.dtype == torch.float32:
625
+ ims = back2color(ims, blacken_zeros)
626
+
627
+ #ims is B x C x H x W
628
+ vis = ims[0:1] # just the first one
629
+ B, C, H, W = list(vis.shape)
630
+
631
+ if halfres:
632
+ vis = F.interpolate(vis, scale_factor=0.5)
633
+
634
+ if frame_id is not None:
635
+ vis = draw_frame_id_on_vis(vis, frame_id)
636
+
637
+ if int(W) > self.maxwidth:
638
+ vis = vis[:,:,:,:self.maxwidth]
639
+
640
+ if only_return:
641
+ return vis
642
+ else:
643
+ return self.summ_gif(name, vis.unsqueeze(1), blacken_zeros)
644
+
645
+ def flow2color(self, flow, clip=50.0):
646
+ """
647
+ :param flow: Optical flow tensor.
648
+ :return: RGB image normalized between 0 and 1.
649
+ """
650
+
651
+ # flow is B x C x H x W
652
+
653
+ B, C, H, W = list(flow.size())
654
+
655
+ flow = flow.clone().detach()
656
+
657
+ abs_image = torch.abs(flow)
658
+ flow_mean = abs_image.mean(dim=[1,2,3])
659
+ flow_std = abs_image.std(dim=[1,2,3])
660
+
661
+ if clip:
662
+ flow = torch.clamp(flow, -clip, clip)/clip
663
+ else:
664
+ # Apply some kind of normalization. Divide by the perceived maximum (mean + std*2)
665
+ flow_max = flow_mean + flow_std*2 + 1e-10
666
+ for b in range(B):
667
+ flow[b] = flow[b].clamp(-flow_max[b].item(), flow_max[b].item()) / flow_max[b].clamp(min=1)
668
+
669
+ radius = torch.sqrt(torch.sum(flow**2, dim=1, keepdim=True)) #B x 1 x H x W
670
+ radius_clipped = torch.clamp(radius, 0.0, 1.0)
671
+
672
+ angle = torch.atan2(flow[:, 1:], flow[:, 0:1]) / np.pi #B x 1 x H x W
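+ # encode flow in HSV: direction -> hue, clipped magnitude -> value, fixed saturation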
673
+
674
+ hue = torch.clamp((angle + 1.0) / 2.0, 0.0, 1.0)
675
+ saturation = torch.ones_like(hue) * 0.75
676
+ value = radius_clipped
677
+ hsv = torch.cat([hue, saturation, value], dim=1) #B x 3 x H x W
678
+
679
+ #flow = tf.image.hsv_to_rgb(hsv)
680
+ flow = hsv_to_rgb(hsv)
681
+ flow = (flow*255.0).type(torch.ByteTensor)
682
+ return flow
683
+
684
+ def summ_flow(self, name, im, clip=0.0, only_return=False, frame_id=None):
685
+ # flow is B x C x D x W
686
+ if self.save_this:
687
+ return self.summ_rgb(name, self.flow2color(im, clip=clip), only_return=only_return, frame_id=frame_id)
688
+ else:
689
+ return None
690
+
691
+ def summ_oneds(self, name, ims, frame_ids=None, bev=False, fro=False, logvis=False, reduce_max=False, max_val=0.0, norm=True, only_return=False, do_colorize=False):
692
+ if self.save_this:
693
+ if bev:
694
+ B, C, H, _, W = list(ims[0].shape)
695
+ if reduce_max:
696
+ ims = [torch.max(im, dim=3)[0] for im in ims]
697
+ else:
698
+ ims = [torch.mean(im, dim=3) for im in ims]
699
+ elif fro:
700
+ B, C, _, H, W = list(ims[0].shape)
701
+ if reduce_max:
702
+ ims = [torch.max(im, dim=2)[0] for im in ims]
703
+ else:
704
+ ims = [torch.mean(im, dim=2) for im in ims]
705
+
706
+
707
+ if len(ims) != 1: # sequence
708
+ im = gif_and_tile(ims, just_gif=self.just_gif)
709
+ else:
710
+ im = torch.stack(ims, dim=1) # single frame
711
+
712
+ B, S, C, H, W = list(im.shape)
713
+
714
+ if logvis and max_val:
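+ # log-compress the values, then normalize by log(max_val) so the result lies in [0, 1]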
715
+ max_val = np.log(max_val)
716
+ im = torch.log(torch.clamp(im, 0)+1.0)
717
+ im = torch.clamp(im, 0, max_val)
718
+ im = im/max_val
719
+ norm = False
720
+ elif max_val:
721
+ im = torch.clamp(im, 0, max_val)
722
+ im = im/max_val
723
+ norm = False
724
+
725
+ if norm:
726
+ # normalize before oned2inferno,
727
+ # so that the ranges are similar within B across S
728
+ im = utils.basic.normalize(im)
729
+
730
+ im = im.view(B*S, C, H, W)
731
+ vis = oned2inferno(im, norm=norm, do_colorize=do_colorize)
732
+ vis = vis.view(B, S, 3, H, W)
733
+
734
+ if frame_ids is not None:
735
+ assert(len(frame_ids)==S)
736
+ for s in range(S):
737
+ vis[:,s] = draw_frame_id_on_vis(vis[:,s], frame_ids[s])
738
+
739
+ if W > self.maxwidth:
740
+ vis = vis[...,:self.maxwidth]
741
+
742
+ if only_return:
743
+ return vis
744
+ else:
745
+ self.summ_gif(name, vis)
746
+
747
+ def summ_oned(self, name, im, bev=False, fro=False, logvis=False, max_val=0, max_along_y=False, norm=True, frame_id=None, only_return=False):
748
+ if self.save_this:
749
+
750
+ if bev:
751
+ B, C, H, _, W = list(im.shape)
752
+ if max_along_y:
753
+ im = torch.max(im, dim=3)[0]
754
+ else:
755
+ im = torch.mean(im, dim=3)
756
+ elif fro:
757
+ B, C, _, H, W = list(im.shape)
758
+ if max_along_y:
759
+ im = torch.max(im, dim=2)[0]
760
+ else:
761
+ im = torch.mean(im, dim=2)
762
+ else:
763
+ B, C, H, W = list(im.shape)
764
+
765
+ im = im[0:1] # just the first one
766
+ assert(C==1)
767
+
768
+ if logvis and max_val:
769
+ max_val = np.log(max_val)
770
+ im = torch.log(im)
771
+ im = torch.clamp(im, 0, max_val)
772
+ im = im/max_val
773
+ norm = False
774
+ elif max_val:
775
+ im = torch.clamp(im, 0, max_val)/max_val
776
+ norm = False
777
+
778
+ vis = oned2inferno(im, norm=norm)
779
+ if W > self.maxwidth:
780
+ vis = vis[...,:self.maxwidth]
781
+ return self.summ_rgb(name, vis, blacken_zeros=False, frame_id=frame_id, only_return=only_return)
782
+
783
+ def summ_feats(self, name, feats, valids=None, pca=True, fro=False, only_return=False, frame_ids=None):
784
+ if self.save_this:
785
+ if valids is not None:
786
+ valids = torch.stack(valids, dim=1)
787
+
788
+ feats = torch.stack(feats, dim=1)
789
+ # feats leads with B x S x C
790
+
791
+ if feats.ndim==6:
792
+
793
+ # feats is B x S x C x D x H x W
794
+ if fro:
795
+ reduce_dim = 3
796
+ else:
797
+ reduce_dim = 4
798
+
799
+ if valids is None:
800
+ feats = torch.mean(feats, dim=reduce_dim)
801
+ else:
802
+ valids = valids.repeat(1, 1, feats.size()[2], 1, 1, 1)
803
+ feats = utils.basic.reduce_masked_mean(feats, valids, dim=reduce_dim)
804
+
805
+ B, S, C, D, W = list(feats.size())
806
+
807
+ if not pca:
808
+ # feats leads with B x S x C
809
+ feats = torch.mean(torch.abs(feats), dim=2, keepdims=True)
810
+ # feats leads with B x S x 1
811
+ feats = torch.unbind(feats, dim=1)
812
+ return self.summ_oneds(name=name, ims=feats, norm=True, only_return=only_return, frame_ids=frame_ids)
813
+
814
+ else:
815
+ __p = lambda x: utils.basic.pack_seqdim(x, B)
816
+ __u = lambda x: utils.basic.unpack_seqdim(x, B)
817
+
818
+ feats_ = __p(feats)
819
+
820
+ if valids is None:
821
+ feats_pca_ = get_feat_pca(feats_)
822
+ else:
823
+ valids_ = __p(valids)
824
+ feats_pca_ = get_feat_pca(feats_, valids_)
825
+
826
+ feats_pca = __u(feats_pca_)
827
+
828
+ return self.summ_rgbs(name=name, ims=torch.unbind(feats_pca, dim=1), only_return=only_return, frame_ids=frame_ids)
829
+
830
+ def summ_feat(self, name, feat, valid=None, pca=True, only_return=False, bev=False, fro=False, frame_id=None):
831
+ if self.save_this:
832
+ if feat.ndim==5: # B x C x D x H x W
833
+
834
+ if bev:
835
+ reduce_axis = 3
836
+ elif fro:
837
+ reduce_axis = 2
838
+ else:
839
+ # default to bev
840
+ reduce_axis = 3
841
+
842
+ if valid is None:
843
+ feat = torch.mean(feat, dim=reduce_axis)
844
+ else:
845
+ valid = valid.repeat(1, feat.size()[1], 1, 1, 1)
846
+ feat = utils.basic.reduce_masked_mean(feat, valid, dim=reduce_axis)
847
+
848
+ B, C, D, W = list(feat.shape)
849
+
850
+ if not pca:
851
+ feat = torch.mean(torch.abs(feat), dim=1, keepdims=True)
852
+ # feat is B x 1 x D x W
853
+ return self.summ_oned(name=name, im=feat, norm=True, only_return=only_return, frame_id=frame_id)
854
+ else:
855
+ feat_pca = get_feat_pca(feat, valid)
856
+ return self.summ_rgb(name, feat_pca, only_return=only_return, frame_id=frame_id)
857
+
858
+ def summ_scalar(self, name, value):
859
+ if (not (isinstance(value, int) or isinstance(value, float) or isinstance(value, np.float32))) and ('torch' in value.type()):
860
+ value = value.detach().cpu().numpy()
861
+ if not np.isnan(value):
862
+ if (self.log_freq == 1):
863
+ self.writer.add_scalar(name, value, global_step=self.global_step)
864
+ elif self.save_this or np.mod(self.global_step, self.scalar_freq)==0:
865
+ self.writer.add_scalar(name, value, global_step=self.global_step)
866
+
867
+ def summ_seg(self, name, seg, only_return=False, frame_id=None, colormap='tab20', label_colors=None):
868
+ if not self.save_this:
869
+ return
870
+
871
+ B,H,W = seg.shape
872
+
873
+ if label_colors is None:
874
+ custom_label_colors = False
875
+ # label_colors = get_n_colors(int(torch.max(seg).item()), sequential=True)
876
+ label_colors = cm.get_cmap(colormap).colors
877
+ label_colors = [[int(i*255) for i in l] for l in label_colors]
878
+ else:
879
+ custom_label_colors = True
880
+ # label_colors = matplotlib.cm.get_cmap(colormap).colors
881
+ # label_colors = [[int(i*255) for i in l] for l in label_colors]
882
+ # print('label_colors', label_colors)
883
+
884
+ # label_colors = [
885
+ # (0, 0, 0), # None
886
+ # (70, 70, 70), # Buildings
887
+ # (190, 153, 153), # Fences
888
+ # (72, 0, 90), # Other
889
+ # (220, 20, 60), # Pedestrians
890
+ # (153, 153, 153), # Poles
891
+ # (157, 234, 50), # RoadLines
892
+ # (128, 64, 128), # Roads
893
+ # (244, 35, 232), # Sidewalks
894
+ # (107, 142, 35), # Vegetation
895
+ # (0, 0, 255), # Vehicles
896
+ # (102, 102, 156), # Walls
897
+ # (220, 220, 0) # TrafficSigns
898
+ # ]
899
+
900
+ r = torch.zeros_like(seg,dtype=torch.uint8)
901
+ g = torch.zeros_like(seg,dtype=torch.uint8)
902
+ b = torch.zeros_like(seg,dtype=torch.uint8)
903
+
904
+ for label in range(0,len(label_colors)):
905
+ if (not custom_label_colors):# and (N > 20):
906
+ label_ = label % 20
907
+ else:
908
+ label_ = label
909
+
910
+ idx = (seg == label+1)
911
+ r[idx] = label_colors[label_][0]
912
+ g[idx] = label_colors[label_][1]
913
+ b[idx] = label_colors[label_][2]
914
+
915
+ rgb = torch.stack([r,g,b],axis=1)
916
+ return self.summ_rgb(name,rgb,only_return=only_return, frame_id=frame_id)
917
+
918
+ def summ_pts_on_rgb(self, name, trajs, rgb, valids=None, frame_id=None, only_return=False, show_dots=True, cmap='coolwarm', linewidth=1):
919
+ # trajs is B, S, N, 2
920
+ # rgbs is B, S, C, H, W
921
+ B, C, H, W = rgb.shape
922
+ B, S, N, D = trajs.shape
923
+
924
+ rgb = rgb[0] # C, H, W
925
+ trajs = trajs[0] # S, N, 2
926
+ if valids is None:
927
+ valids = torch.ones_like(trajs[:,:,0]) # S, N
928
+ else:
929
+ valids = valids[0]
930
+ # print('trajs', trajs.shape)
931
+ # print('valids', valids.shape)
932
+
933
+ rgb = back2color(rgb).detach().cpu().numpy()
934
+ rgb = np.transpose(rgb, [1, 2, 0]) # put channels last
935
+
936
+ trajs = trajs.long().detach().cpu().numpy() # S, N, 2
937
+ valids = valids.long().detach().cpu().numpy() # S, N
938
+
939
+ rgb = rgb.astype(np.uint8).copy()
940
+
941
+ for i in range(N):
942
+ if cmap=='onediff' and i==0:
943
+ cmap_ = 'spring'
944
+ elif cmap=='onediff':
945
+ cmap_ = 'winter'
946
+ else:
947
+ cmap_ = cmap
948
+ traj = trajs[:,i] # S,2
949
+ valid = valids[:,i] # S
950
+
951
+ color_map = cm.get_cmap(cmap_)
952
+ color = np.array(color_map(i)[:3]) * 255 # rgb
953
+ for s in range(S):
954
+ if valid[s]:
955
+ cv2.circle(rgb, (int(traj[s,0]), int(traj[s,1])), linewidth, color, -1)
956
+ rgb = torch.from_numpy(rgb).permute(2,0,1).unsqueeze(0)
957
+ rgb = preprocess_color(rgb)
958
+ return self.summ_rgb(name, rgb, only_return=only_return, frame_id=frame_id)
959
+
960
+ def summ_pts_on_rgbs(self, name, trajs, rgbs, valids=None, frame_ids=None, only_return=False, show_dots=True, cmap='coolwarm', linewidth=1):
961
+ # trajs is B, S, N, 2
962
+ # rgbs is B, S, C, H, W
963
+ B, S, C, H, W = rgbs.shape
964
+ B, S2, N, D = trajs.shape
965
+ assert(S==S2)
966
+
967
+ rgbs = rgbs[0] # S, C, H, W
968
+ trajs = trajs[0] # S, N, 2
969
+ if valids is None:
970
+ valids = torch.ones_like(trajs[:,:,0]) # S, N
971
+ else:
972
+ valids = valids[0]
973
+ # print('trajs', trajs.shape)
974
+ # print('valids', valids.shape)
975
+
976
+ rgbs_color = []
977
+ for rgb in rgbs:
978
+ rgb = back2color(rgb).detach().cpu().numpy()
979
+ rgb = np.transpose(rgb, [1, 2, 0]) # put channels last
980
+ rgbs_color.append(rgb) # each element 3 x H x W
981
+
982
+ trajs = trajs.long().detach().cpu().numpy() # S, N, 2
983
+ valids = valids.long().detach().cpu().numpy() # S, N
984
+
985
+ rgbs_color = [rgb.astype(np.uint8).copy() for rgb in rgbs_color]
986
+
987
+ for i in range(N):
988
+ traj = trajs[:,i] # S,2
989
+ valid = valids[:,i] # S
990
+
991
+ color_map = cm.get_cmap(cmap)
992
+ color = np.array(color_map(0)[:3]) * 255 # rgb
993
+ for s in range(S):
994
+ if valid[s]:
995
+ cv2.circle(rgbs_color[s], (traj[s,0], traj[s,1]), linewidth, color, -1)
996
+ rgbs = []
997
+ for rgb in rgbs_color:
998
+ rgb = torch.from_numpy(rgb).permute(2, 0, 1).unsqueeze(0)
999
+ rgbs.append(preprocess_color(rgb))
1000
+
1001
+ return self.summ_rgbs(name, rgbs, only_return=only_return, frame_ids=frame_ids)
1002
+
1003
+
1004
+ def summ_traj2ds_on_rgbs(self, name, trajs, rgbs, valids=None, frame_ids=None, only_return=False, show_dots=False, cmap='coolwarm', vals=None, linewidth=1):
1005
+ # trajs is B, S, N, 2
1006
+ # rgbs is B, S, C, H, W
1007
+ B, S, C, H, W = rgbs.shape
1008
+ B, S2, N, D = trajs.shape
1009
+ assert(S==S2)
1010
+
1011
+ rgbs = rgbs[0] # S, C, H, W
1012
+ trajs = trajs[0] # S, N, 2
1013
+ if valids is None:
1014
+ valids = torch.ones_like(trajs[:,:,0]) # S, N
1015
+ else:
1016
+ valids = valids[0]
1017
+
1018
+ # print('trajs', trajs.shape)
1019
+ # print('valids', valids.shape)
1020
+
1021
+ if vals is not None:
1022
+ vals = vals[0] # N
1023
+ # print('vals', vals.shape)
1024
+
1025
+ rgbs_color = []
1026
+ for rgb in rgbs:
1027
+ rgb = back2color(rgb).detach().cpu().numpy()
1028
+ rgb = np.transpose(rgb, [1, 2, 0]) # put channels last
1029
+ rgbs_color.append(rgb) # each element 3 x H x W
1030
+
1031
+ for i in range(N):
1032
+ if cmap=='onediff' and i==0:
1033
+ cmap_ = 'spring'
1034
+ elif cmap=='onediff':
1035
+ cmap_ = 'winter'
1036
+ else:
1037
+ cmap_ = cmap
1038
+ traj = trajs[:,i].long().detach().cpu().numpy() # S, 2
1039
+ valid = valids[:,i].long().detach().cpu().numpy() # S
1040
+
1041
+ # print('traj', traj.shape)
1042
+ # print('valid', valid.shape)
1043
+
1044
+ if vals is not None:
1045
+ # val = vals[:,i].float().detach().cpu().numpy() # []
1046
+ val = vals[i].float().detach().cpu().numpy() # []
1047
+ # print('val', val.shape)
1048
+ else:
1049
+ val = None
1050
+
1051
+ for t in range(S):
1052
+ # if valid[t]:
1053
+ # traj_seq = traj[max(t-16,0):t+1]
1054
+ traj_seq = traj[max(t-8,0):t+1]
1055
+ val_seq = np.linspace(0,1,len(traj_seq))
1056
+ # if t<2:
1057
+ # val_seq = np.zeros_like(val_seq)
1058
+ # print('val_seq', val_seq)
1059
+ # val_seq = 1.0
1060
+ # val_seq = np.arange(8)/8.0
1061
+ # val_seq = val_seq[-len(traj_seq):]
1062
+ # rgbs_color[t] = self.draw_traj_on_image_py(rgbs_color[t], traj_seq, S=S, show_dots=show_dots, cmap=cmap_, val=val_seq, linewidth=linewidth)
1063
+ rgbs_color[t] = self.draw_traj_on_image_py(rgbs_color[t], traj_seq, S=S, show_dots=show_dots, cmap=cmap_, val=val_seq, linewidth=linewidth)
1064
+ # input()
1065
+
1066
+ for i in range(N):
1067
+ if cmap=='onediff' and i==0:
1068
+ cmap_ = 'spring'
1069
+ elif cmap=='onediff':
1070
+ cmap_ = 'winter'
1071
+ else:
1072
+ cmap_ = cmap
1073
+ traj = trajs[:,i] # S,2
1074
+ # vis = visibles[:,i] # S
1075
+ vis = torch.ones_like(traj[:,0]) # S
1076
+ valid = valids[:,i] # S
1077
+ rgbs_color = self.draw_circ_on_images_py(rgbs_color, traj, vis, S=0, show_dots=show_dots, cmap=cmap_, linewidth=linewidth)
1078
+
1079
+ rgbs = []
1080
+ for rgb in rgbs_color:
1081
+ rgb = torch.from_numpy(rgb).permute(2, 0, 1).unsqueeze(0)
1082
+ rgbs.append(preprocess_color(rgb))
1083
+
1084
+ return self.summ_rgbs(name, rgbs, only_return=only_return, frame_ids=frame_ids)
1085
+
1086
+ def summ_traj2ds_on_rgbs2(self, name, trajs, visibles, rgbs, valids=None, frame_ids=None, only_return=False, show_dots=True, cmap=None, linewidth=1):
1087
+ # trajs is B, S, N, 2
1088
+ # rgbs is B, S, C, H, W
1089
+ B, S, C, H, W = rgbs.shape
1090
+ B, S2, N, D = trajs.shape
1091
+ assert(S==S2)
1092
+
1093
+ rgbs = rgbs[0] # S, C, H, W
1094
+ trajs = trajs[0] # S, N, 2
1095
+ visibles = visibles[0] # S, N
1096
+ if valids is None:
1097
+ valids = torch.ones_like(trajs[:,:,0]) # S, N
1098
+ else:
1099
+ valids = valids[0]
1100
+ # print('trajs', trajs.shape)
1101
+ # print('valids', valids.shape)
1102
+
1103
+ rgbs_color = []
1104
+ for rgb in rgbs:
1105
+ rgb = back2color(rgb).detach().cpu().numpy()
1106
+ rgb = np.transpose(rgb, [1, 2, 0]) # put channels last
1107
+ rgbs_color.append(rgb) # each element 3 x H x W
1108
+
1109
+ trajs = trajs.long().detach().cpu().numpy() # S, N, 2
1110
+ visibles = visibles.float().detach().cpu().numpy() # S, N
1111
+ valids = valids.long().detach().cpu().numpy() # S, N
1112
+
1113
+ for i in range(N):
1114
+ if cmap=='onediff' and i==0:
1115
+ cmap_ = 'spring'
1116
+ elif cmap=='onediff':
1117
+ cmap_ = 'winter'
1118
+ else:
1119
+ cmap_ = cmap
1120
+ traj = trajs[:,i] # S,2
1121
+ vis = visibles[:,i] # S
1122
+ valid = valids[:,i] # S
1123
+ rgbs_color = self.draw_traj_on_images_py(rgbs_color, traj, S=S, show_dots=show_dots, cmap=cmap_, linewidth=linewidth)
1124
+
1125
+ for i in range(N):
1126
+ if cmap=='onediff' and i==0:
1127
+ cmap_ = 'spring'
1128
+ elif cmap=='onediff':
1129
+ cmap_ = 'winter'
1130
+ else:
1131
+ cmap_ = cmap
1132
+ traj = trajs[:,i] # S,2
1133
+ vis = visibles[:,i] # S
1134
+ valid = valids[:,i] # S
1135
+ if valid[0]:
1136
+ rgbs_color = self.draw_circ_on_images_py(rgbs_color, traj, vis, S=S, show_dots=show_dots, cmap=None, linewidth=linewidth)
1137
+
1138
+ rgbs = []
1139
+ for rgb in rgbs_color:
1140
+ rgb = torch.from_numpy(rgb).permute(2, 0, 1).unsqueeze(0)
1141
+ rgbs.append(preprocess_color(rgb))
1142
+
1143
+ return self.summ_rgbs(name, rgbs, only_return=only_return, frame_ids=frame_ids)
1144
+
1145
+ def summ_traj2ds_on_rgb(self, name, trajs, rgb, valids=None, show_dots=False, show_lines=True, frame_id=None, only_return=False, cmap='coolwarm', linewidth=1):
1146
+ # trajs is B, S, N, 2
1147
+ # rgb is B, C, H, W
1148
+ B, C, H, W = rgb.shape
1149
+ B, S, N, D = trajs.shape
1150
+
1151
+ rgb = rgb[0] # S, C, H, W
1152
+ trajs = trajs[0] # S, N, 2
1153
+
1154
+ if valids is None:
1155
+ valids = torch.ones_like(trajs[:,:,0])
1156
+ else:
1157
+ valids = valids[0]
1158
+
1159
+ rgb_color = back2color(rgb).detach().cpu().numpy()
1160
+ rgb_color = np.transpose(rgb_color, [1, 2, 0]) # put channels last
1161
+
1162
+ # using maxdist will dampen the colors for short motions
1163
+ norms = torch.sqrt(1e-4 + torch.sum((trajs[-1] - trajs[0])**2, dim=1)) # N
1164
+ maxdist = torch.quantile(norms, 0.95).detach().cpu().numpy()
1165
+ maxdist = None
1166
+ trajs = trajs.long().detach().cpu().numpy() # S, N, 2
1167
+ valids = valids.long().detach().cpu().numpy() # S, N
1168
+
1169
+ for i in range(N):
1170
+ if cmap=='onediff' and i==0:
1171
+ cmap_ = 'spring'
1172
+ elif cmap=='onediff':
1173
+ cmap_ = 'winter'
1174
+ else:
1175
+ cmap_ = cmap
1176
+ traj = trajs[:,i] # S, 2
1177
+ valid = valids[:,i] # S
1178
+ if valid[0]==1:
1179
+ traj = traj[valid>0]
1180
+ rgb_color = self.draw_traj_on_image_py(
1181
+ rgb_color, traj, S=S, show_dots=show_dots, show_lines=show_lines, cmap=cmap_, maxdist=maxdist, linewidth=linewidth)
1182
+
1183
+ rgb_color = torch.from_numpy(rgb_color).permute(2, 0, 1).unsqueeze(0)
1184
+ rgb = preprocess_color(rgb_color)
1185
+ return self.summ_rgb(name, rgb, only_return=only_return, frame_id=frame_id)
1186
+
1187
+ def draw_traj_on_image_py(self, rgb, traj, S=50, linewidth=1, show_dots=False, show_lines=True, cmap='coolwarm', val=None, maxdist=None):
1188
+ # all inputs are numpy tensors
1189
+ # rgb is 3 x H x W
1190
+ # traj is S x 2
1191
+
1192
+ H, W, C = rgb.shape
1193
+ assert(C==3)
1194
+
1195
+ rgb = rgb.astype(np.uint8).copy()
1196
+
1197
+ S1, D = traj.shape
1198
+ assert(D==2)
1199
+
1200
+ color_map = cm.get_cmap(cmap)
1201
+ S1, D = traj.shape
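+ # per-segment color: use the provided val if given, else distance from the start point (when maxdist is set), else the time index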
1202
+
1203
+ for s in range(S1):
1204
+ if val is not None:
1205
+ # if len(val) == S1:
1206
+ color = np.array(color_map(val[s])[:3]) * 255 # rgb
1207
+ # else:
1208
+ # color = np.array(color_map(val)[:3]) * 255 # rgb
1209
+ else:
1210
+ if maxdist is not None:
1211
+ val = (np.sqrt(np.sum((traj[s]-traj[0])**2))/maxdist).clip(0,1)
1212
+ color = np.array(color_map(val)[:3]) * 255 # rgb
1213
+ else:
1214
+ color = np.array(color_map((s)/max(1,float(S-2)))[:3]) * 255 # rgb
1215
+
1216
+ if show_lines and s<(S1-1):
1217
+ cv2.line(rgb,
1218
+ (int(traj[s,0]), int(traj[s,1])),
1219
+ (int(traj[s+1,0]), int(traj[s+1,1])),
1220
+ color,
1221
+ linewidth,
1222
+ cv2.LINE_AA)
1223
+ if show_dots:
1224
+ cv2.circle(rgb, (int(traj[s,0]), int(traj[s,1])), linewidth, np.array(color_map(1)[:3])*255, -1)
1225
+
1226
+ # if maxdist is not None:
1227
+ # val = (np.sqrt(np.sum((traj[-1]-traj[0])**2))/maxdist).clip(0,1)
1228
+ # color = np.array(color_map(val)[:3]) * 255 # rgb
1229
+ # else:
1230
+ # # draw the endpoint of traj, using the next color (which may be the last color)
1231
+ # color = np.array(color_map((S1-1)/max(1,float(S-2)))[:3]) * 255 # rgb
1232
+
1233
+ # # emphasize endpoint
1234
+ # cv2.circle(rgb, (traj[-1,0], traj[-1,1]), linewidth*2, color, -1)
1235
+
1236
+ return rgb
1237
+
1238
+
1239
+
1240
+ def draw_traj_on_images_py(self, rgbs, traj, S=50, linewidth=1, show_dots=False, cmap='coolwarm', maxdist=None):
1241
+ # all inputs are numpy tensors
1242
+ # rgbs is a list of H,W,3
1243
+ # traj is S,2
1244
+ H, W, C = rgbs[0].shape
1245
+ assert(C==3)
1246
+
1247
+ rgbs = [rgb.astype(np.uint8).copy() for rgb in rgbs]
1248
+
1249
+ S1, D = traj.shape
1250
+ assert(D==2)
1251
+
1252
+ x = int(np.clip(traj[0,0], 0, W-1))
1253
+ y = int(np.clip(traj[0,1], 0, H-1))
1254
+ color = rgbs[0][y,x]
1255
+ color = (int(color[0]),int(color[1]),int(color[2]))
1256
+ for s in range(S):
1257
+ # bak_color = np.array(color_map(1.0)[:3]) * 255 # rgb
1258
+ # cv2.circle(rgbs[s], (traj[s,0], traj[s,1]), linewidth*4, bak_color, -1)
1259
+ cv2.polylines(rgbs[s],
1260
+ [traj[:s+1]],
1261
+ False,
1262
+ color,
1263
+ linewidth,
1264
+ cv2.LINE_AA)
1265
+ return rgbs
1266
+
1267
+ def draw_circs_on_image_py(self, rgb, xy, colors=None, linewidth=10, radius=3, show_dots=False, maxdist=None):
1268
+ # all inputs are numpy tensors
1269
+ # rgbs is a list of 3,H,W
1270
+ # xy is N,2
1271
+ H, W, C = rgb.shape
1272
+ assert(C==3)
1273
+
1274
+ rgb = rgb.astype(np.uint8).copy()
1275
+
1276
+ N, D = xy.shape
1277
+ assert(D==2)
1278
+
1279
+
1280
+ xy = xy.astype(np.float32)
1281
+ xy[:,0] = np.clip(xy[:,0], 0, W-1)
1282
+ xy[:,1] = np.clip(xy[:,1], 0, H-1)
1283
+ xy = xy.astype(np.int32)
1284
+
1285
+
1286
+
1287
+ if colors is None:
1288
+ colors = get_n_colors(N)
1289
+
1290
+ for n in range(N):
1291
+ color = colors[n]
1292
+ # print('color', color)
1293
+ # color = (color[0]*255).astype(np.uint8)
1294
+ color = (int(color[0]),int(color[1]),int(color[2]))
1295
+
1296
+ # x = int(np.clip(xy[0,0], 0, W-1))
1297
+ # y = int(np.clip(xy[0,1], 0, H-1))
1298
+ # color_ = rgbs[0][y,x]
1299
+ # color_ = (int(color_[0]),int(color_[1]),int(color_[2]))
1300
+ # color_ = (int(color_[0]),int(color_[1]),int(color_[2]))
1301
+
1302
+ cv2.circle(rgb, (xy[n,0], xy[n,1]), linewidth, color, 3)
1303
+ # vis_color = int(np.squeeze(vis[s])*255)
1304
+ # vis_color = (vis_color,vis_color,vis_color)
1305
+ # cv2.circle(rgbs[s], (traj[s,0], traj[s,1]), linewidth+1, vis_color, -1)
1306
+ return rgb
1307
+
1308
+ def draw_circ_on_images_py(self, rgbs, traj, vis, S=50, linewidth=1, show_dots=False, cmap=None, maxdist=None):
1309
+ # all inputs are numpy tensors
1310
+ # rgbs is a list of 3,H,W
1311
+ # traj is S,2
1312
+ H, W, C = rgbs[0].shape
1313
+ assert(C==3)
1314
+
1315
+ rgbs = [rgb.astype(np.uint8).copy() for rgb in rgbs]
1316
+
1317
+ S1, D = traj.shape
1318
+ assert(D==2)
1319
+
1320
+ if cmap is None:
1321
+ bremm = ColorMap2d()
1322
+ traj_ = traj[0:1].astype(np.float32)
1323
+ traj_[:,0] /= float(W)
1324
+ traj_[:,1] /= float(H)
1325
+ color = bremm(traj_)
1326
+ # print('color', color)
1327
+ color = (color[0]*255).astype(np.uint8)
1328
+ # color = (int(color[0]),int(color[1]),int(color[2]))
1329
+ color = (int(color[2]),int(color[1]),int(color[0]))
1330
+
1331
+ for s in range(S1):
1332
+ if cmap is not None:
1333
+ color_map = cm.get_cmap(cmap)
1334
+ # color = np.array(color_map(s/(S-1))[:3]) * 255 # rgb
1335
+ color = np.array(color_map((s+1)/max(1,float(S-1)))[:3]) * 255 # rgb
1336
+ # color = color.astype(np.uint8)
1337
+ # color = (color[0], color[1], color[2])
1338
+ # print('color', color)
1339
+ # import ipdb; ipdb.set_trace()
1340
+
1341
+ cv2.circle(rgbs[s], (int(traj[s,0]), int(traj[s,1])), linewidth+1, color, -1)
1342
+ # vis_color = int(np.squeeze(vis[s])*255)
1343
+ # vis_color = (vis_color,vis_color,vis_color)
1344
+ # cv2.circle(rgbs[s], (int(traj[s,0]), int(traj[s,1])), linewidth+1, vis_color, -1)
1345
+
1346
+ return rgbs
1347
+
1348
+ def summ_traj_as_crops(self, name, trajs_e, rgbs, frame_id=None, only_return=False, show_circ=False, trajs_g=None, is_g=False):
1349
+ B, S, N, D = trajs_e.shape
1350
+ assert(N==1)
1351
+ assert(D==2)
1352
+
1353
+ rgbs_vis = []
1354
+ n = 0
1355
+ pad_amount = 100
1356
+ trajs_e_py = trajs_e[0].detach().cpu().numpy()
1357
+ # trajs_e_py = np.clip(trajs_e_py, min=pad_amount/2, max=pad_amoun
1358
+ trajs_e_py = trajs_e_py + pad_amount
1359
+
1360
+ if trajs_g is not None:
1361
+ trajs_g_py = trajs_g[0].detach().cpu().numpy()
1362
+ trajs_g_py = trajs_g_py + pad_amount
1363
+
1364
+ for s in range(S):
1365
+ rgb = rgbs[0,s].detach().cpu().numpy()
1366
+ # print('orig rgb', rgb.shape)
1367
+ rgb = np.transpose(rgb,(1,2,0)) # H, W, 3
1368
+
1369
+ rgb = np.pad(rgb, ((pad_amount,pad_amount),(pad_amount,pad_amount),(0,0)))
1370
+ # print('pad rgb', rgb.shape)
1371
+ H, W, C = rgb.shape
1372
+
1373
+ if trajs_g is not None:
1374
+ xy_g = trajs_g_py[s,n]
1375
+ xy_g[0] = np.clip(xy_g[0], pad_amount, W-pad_amount)
1376
+ xy_g[1] = np.clip(xy_g[1], pad_amount, H-pad_amount)
1377
+ rgb = self.draw_circs_on_image_py(rgb, xy_g.reshape(1,2), colors=[(0,255,0)], linewidth=2, radius=3)
1378
+
1379
+ xy_e = trajs_e_py[s,n]
1380
+ xy_e[0] = np.clip(xy_e[0], pad_amount, W-pad_amount)
1381
+ xy_e[1] = np.clip(xy_e[1], pad_amount, H-pad_amount)
1382
+
1383
+ if show_circ:
1384
+ if is_g:
1385
+ rgb = self.draw_circs_on_image_py(rgb, xy_e.reshape(1,2), colors=[(0,255,0)], linewidth=2, radius=3)
1386
+ else:
1387
+ rgb = self.draw_circs_on_image_py(rgb, xy_e.reshape(1,2), colors=[(255,0,255)], linewidth=2, radius=3)
1388
+
1389
+
1390
+ xmin = int(xy_e[0])-pad_amount//2
1391
+ xmax = int(xy_e[0])+pad_amount//2
1392
+ ymin = int(xy_e[1])-pad_amount//2
1393
+ ymax = int(xy_e[1])+pad_amount//2
1394
+
1395
+ rgb_ = rgb[ymin:ymax, xmin:xmax]
1396
+
1397
+ H_, W_ = rgb_.shape[:2]
1398
+ # if np.any(rgb_.shape==0):
1399
+ # input()
1400
+ if H_==0 or W_==0:
1401
+ import ipdb; ipdb.set_trace()
1402
+
1403
+ rgb_ = rgb_.transpose(2,0,1)
1404
+ rgb_ = torch.from_numpy(rgb_)
1405
+
1406
+ rgbs_vis.append(rgb_)
1407
+
1408
+ # nrow = int(np.sqrt(S)*(16.0/9)/2.0)
1409
+ nrow = int(np.sqrt(S)*1.5)
1410
+ grid_img = torchvision.utils.make_grid(torch.stack(rgbs_vis, dim=0), nrow=nrow).unsqueeze(0)
1411
+ # print('grid_img', grid_img.shape)
1412
+ return self.summ_rgb(name, grid_img.byte(), frame_id=frame_id, only_return=only_return)
1413
+
1414
+ def summ_occ(self, name, occ, reduce_axes=[3], bev=False, fro=False, pro=False, frame_id=None, only_return=False):
1415
+ if self.save_this:
1416
+ B, C, D, H, W = list(occ.shape)
1417
+ if bev:
1418
+ reduce_axes = [3]
1419
+ elif fro:
1420
+ reduce_axes = [2]
1421
+ elif pro:
1422
+ reduce_axes = [4]
1423
+ for reduce_axis in reduce_axes:
1424
+ height = convert_occ_to_height(occ, reduce_axis=reduce_axis)
1425
+ if reduce_axis == reduce_axes[-1]:
1426
+ return self.summ_oned(name=('%s_ax%d' % (name, reduce_axis)), im=height, norm=False, frame_id=frame_id, only_return=only_return)
1427
+ else:
1428
+ self.summ_oned(name=('%s_ax%d' % (name, reduce_axis)), im=height, norm=False, frame_id=frame_id, only_return=only_return)
1429
+
1430
+ def erode2d(im, times=1, device='cuda'):
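+ # morphological erosion via convolution: a pixel stays 1 only if its full 3x3 neighborhood is 1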
1431
+ weights2d = torch.ones(1, 1, 3, 3, device=device)
1432
+ for time in range(times):
1433
+ im = 1.0 - F.conv2d(1.0 - im, weights2d, padding=1).clamp(0, 1)
1434
+ return im
1435
+
1436
+ def dilate2d(im, times=1, device='cuda', mode='square'):
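+ # morphological dilation: a pixel becomes 1 if any pixel in its 3x3 (square or cross) neighborhood is 1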
1437
+ weights2d = torch.ones(1, 1, 3, 3, device=device)
1438
+ if mode=='cross':
1439
+ weights2d[:,:,0,0] = 0.0
1440
+ weights2d[:,:,0,2] = 0.0
1441
+ weights2d[:,:,2,0] = 0.0
1442
+ weights2d[:,:,2,2] = 0.0
1443
+ for time in range(times):
1444
+ im = F.conv2d(im, weights2d, padding=1).clamp(0, 1)
1445
+ return im
1446
+
1447
+
models/spatracker/utils/misc.py ADDED
@@ -0,0 +1,166 @@
1
+ import torch
2
+ import numpy as np
3
+ import math
4
+ from prettytable import PrettyTable
5
+
6
+ def count_parameters(model):
7
+ table = PrettyTable(["Modules", "Parameters"])
8
+ total_params = 0
9
+ for name, parameter in model.named_parameters():
10
+ if not parameter.requires_grad:
11
+ continue
12
+ param = parameter.numel()
13
+ if param > 100000:
14
+ table.add_row([name, param])
15
+ total_params+=param
16
+ print(table)
17
+ print('total params: %.2f M' % (total_params/1000000.0))
18
+ return total_params
19
+
20
+ def posemb_sincos_2d_xy(xy, C, temperature=10000, dtype=torch.float32, cat_coords=False):
21
+ device = xy.device
22
+ dtype = xy.dtype
23
+ B, S, D = xy.shape
24
+ assert(D==2)
25
+ x = xy[:,:,0]
26
+ y = xy[:,:,1]
27
+ assert (C % 4) == 0, 'feature dimension must be multiple of 4 for sincos emb'
28
+ omega = torch.arange(C // 4, device=device) / (C // 4 - 1)
29
+ omega = 1. / (temperature ** omega)
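+ # x and y each get C/4 sine and C/4 cosine channels, giving a C-dim embedding per point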
30
+
31
+ y = y.flatten()[:, None] * omega[None, :]
32
+ x = x.flatten()[:, None] * omega[None, :]
33
+ pe = torch.cat((x.sin(), x.cos(), y.sin(), y.cos()), dim=1)
34
+ pe = pe.reshape(B,S,C).type(dtype)
35
+ if cat_coords:
36
+ pe = torch.cat([pe, xy], dim=2) # B,N,C+2
37
+ return pe
38
+
39
+ class SimplePool():
40
+ def __init__(self, pool_size, version='pt'):
41
+ self.pool_size = pool_size
42
+ self.version = version
43
+ self.items = []
44
+
45
+ if not (version=='pt' or version=='np'):
46
+ print('version = %s; please choose pt or np' % version)
47
+ assert(False) # please choose pt or np
48
+
49
+ def __len__(self):
50
+ return len(self.items)
51
+
52
+ def mean(self, min_size=1):
53
+ if min_size=='half':
54
+ pool_size_thresh = self.pool_size/2
55
+ else:
56
+ pool_size_thresh = min_size
57
+
58
+ if self.version=='np':
59
+ if len(self.items) >= pool_size_thresh:
60
+ return np.sum(self.items)/float(len(self.items))
61
+ else:
62
+ return np.nan
63
+ if self.version=='pt':
64
+ if len(self.items) >= pool_size_thresh:
65
+ return torch.stack(self.items).sum()/float(len(self.items))
66
+ else:
67
+ return torch.tensor(float('nan'))
68
+
69
+ def sample(self, with_replacement=True):
70
+ idx = np.random.randint(len(self.items))
71
+ if with_replacement:
72
+ return self.items[idx]
73
+ else:
74
+ return self.items.pop(idx)
75
+
76
+ def fetch(self, num=None):
77
+ if self.version=='pt':
78
+ item_array = torch.stack(self.items)
79
+ elif self.version=='np':
80
+ item_array = np.stack(self.items)
81
+ if num is not None:
82
+ # there better be some items
83
+ assert(len(self.items) >= num)
84
+
85
+ # if there are not that many elements just return however many there are
86
+ if len(self.items) < num:
87
+ return item_array
88
+ else:
89
+ idxs = np.random.randint(len(self.items), size=num)
90
+ return item_array[idxs]
91
+ else:
92
+ return item_array
93
+
94
+ def is_full(self):
95
+ full = len(self.items)==self.pool_size
96
+ return full
97
+
98
+ def empty(self):
99
+ self.items = []
100
+
101
+ def update(self, items):
102
+ for item in items:
103
+ if len(self.items) < self.pool_size:
104
+ # the pool is not full, so let's add this in
105
+ self.items.append(item)
106
+ else:
107
+ # the pool is full
108
+ # pop from the front
109
+ self.items.pop(0)
110
+ # add to the back
111
+ self.items.append(item)
112
+ return self.items
113
+
114
+ def farthest_point_sample(xyz, npoint, include_ends=False, deterministic=False):
115
+ """
116
+ Input:
117
+ xyz: pointcloud data, [B, N, C], where C is probably 3
118
+ npoint: number of samples
119
+ Return:
120
+ inds: sampled pointcloud index, [B, npoint]
121
+ """
122
+ device = xyz.device
123
+ B, N, C = xyz.shape
124
+ xyz = xyz.float()
125
+ inds = torch.zeros(B, npoint, dtype=torch.long).to(device)
126
+ distance = torch.ones(B, N).to(device) * 1e10
127
+ if deterministic:
128
+ farthest = torch.randint(0, 1, (B,), dtype=torch.long).to(device)
129
+ else:
130
+ farthest = torch.randint(0, N, (B,), dtype=torch.long).to(device)
131
+ batch_indices = torch.arange(B, dtype=torch.long).to(device)
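+ # greedy FPS: 'distance' holds each point's squared distance to its nearest selected centroid; pick the argmax each iteration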
132
+ for i in range(npoint):
133
+ if include_ends:
134
+ if i==0:
135
+ farthest = 0
136
+ elif i==1:
137
+ farthest = N-1
138
+ inds[:, i] = farthest
139
+ centroid = xyz[batch_indices, farthest, :].view(B, 1, C)
140
+ dist = torch.sum((xyz - centroid) ** 2, -1)
141
+ mask = dist < distance
142
+ distance[mask] = dist[mask]
143
+ farthest = torch.max(distance, -1)[1]
144
+
145
+ if npoint > N:
146
+ # if we need more samples, make them random
147
+ distance += torch.randn_like(distance)
148
+ return inds
149
+
150
+ def farthest_point_sample_py(xyz, npoint):
151
+ N,C = xyz.shape
152
+ inds = np.zeros(npoint, dtype=np.int32)
153
+ distance = np.ones(N) * 1e10
154
+ farthest = np.random.randint(0, N, dtype=np.int32)
155
+ for i in range(npoint):
156
+ inds[i] = farthest
157
+ centroid = xyz[farthest, :].reshape(1,C)
158
+ dist = np.sum((xyz - centroid) ** 2, -1)
159
+ mask = dist < distance
160
+ distance[mask] = dist[mask]
161
+ farthest = np.argmax(distance, -1)
162
+ if npoint > N:
163
+ # if we need more samples, make them random
164
+ distance += np.random.randn(*distance.shape)
165
+ return inds
166
+
models/spatracker/utils/samp.py ADDED
@@ -0,0 +1,152 @@
1
+ import torch
2
+ import utils.basic
3
+ import torch.nn.functional as F
4
+
5
+ def bilinear_sample2d(im, x, y, return_inbounds=False):
6
+ # x and y are each B, N
7
+ # output is B, C, N
8
+ B, C, H, W = list(im.shape)
9
+ N = list(x.shape)[1]
10
+
11
+ x = x.float()
12
+ y = y.float()
13
+ H_f = torch.tensor(H, dtype=torch.float32)
14
+ W_f = torch.tensor(W, dtype=torch.float32)
15
+
16
+ # inbound_mask = (x>-0.5).float()*(y>-0.5).float()*(x<W_f+0.5).float()*(y<H_f+0.5).float()
17
+
18
+ max_y = (H_f - 1).int()
19
+ max_x = (W_f - 1).int()
20
+
21
+ x0 = torch.floor(x).int()
22
+ x1 = x0 + 1
23
+ y0 = torch.floor(y).int()
24
+ y1 = y0 + 1
25
+
26
+ x0_clip = torch.clamp(x0, 0, max_x)
27
+ x1_clip = torch.clamp(x1, 0, max_x)
28
+ y0_clip = torch.clamp(y0, 0, max_y)
29
+ y1_clip = torch.clamp(y1, 0, max_y)
30
+ dim2 = W
31
+ dim1 = W * H
32
+
33
+ base = torch.arange(0, B, dtype=torch.int64, device=x.device)*dim1
34
+ base = torch.reshape(base, [B, 1]).repeat([1, N])
35
+
36
+ base_y0 = base + y0_clip * dim2
37
+ base_y1 = base + y1_clip * dim2
38
+
39
+ idx_y0_x0 = base_y0 + x0_clip
40
+ idx_y0_x1 = base_y0 + x1_clip
41
+ idx_y1_x0 = base_y1 + x0_clip
42
+ idx_y1_x1 = base_y1 + x1_clip
43
+
44
+ # use the indices to lookup pixels in the flat image
45
+ # im is B x C x H x W
46
+ # move C out to last dim
47
+ im_flat = (im.permute(0, 2, 3, 1)).reshape(B*H*W, C)
48
+ i_y0_x0 = im_flat[idx_y0_x0.long()]
49
+ i_y0_x1 = im_flat[idx_y0_x1.long()]
50
+ i_y1_x0 = im_flat[idx_y1_x0.long()]
51
+ i_y1_x1 = im_flat[idx_y1_x1.long()]
52
+
53
+ # Finally calculate interpolated values.
54
+ x0_f = x0.float()
55
+ x1_f = x1.float()
56
+ y0_f = y0.float()
57
+ y1_f = y1.float()
58
+
59
+ w_y0_x0 = ((x1_f - x) * (y1_f - y)).unsqueeze(2)
60
+ w_y0_x1 = ((x - x0_f) * (y1_f - y)).unsqueeze(2)
61
+ w_y1_x0 = ((x1_f - x) * (y - y0_f)).unsqueeze(2)
62
+ w_y1_x1 = ((x - x0_f) * (y - y0_f)).unsqueeze(2)
63
+
64
+ output = w_y0_x0 * i_y0_x0 + w_y0_x1 * i_y0_x1 + \
65
+ w_y1_x0 * i_y1_x0 + w_y1_x1 * i_y1_x1
66
+ # output is B*N x C
67
+ output = output.view(B, -1, C)
68
+ output = output.permute(0, 2, 1)
69
+ # output is B x C x N
70
+
71
+ if return_inbounds:
72
+ x_valid = (x > -0.5).byte() & (x < float(W_f - 0.5)).byte()
73
+ y_valid = (y > -0.5).byte() & (y < float(H_f - 0.5)).byte()
74
+ inbounds = (x_valid & y_valid).float()
75
+ inbounds = inbounds.reshape(B, N) # something seems wrong here for B>1; i'm getting an error here (or downstream if i put -1)
76
+ return output, inbounds
77
+
78
+ return output # B, C, N
79
+
80
+ def paste_crop_on_canvas(crop, box2d_unnorm, H, W, fast=True, mask=None, canvas=None):
81
+ # this is the inverse of crop_and_resize_box2d
82
+ B, C, Y, X = list(crop.shape)
83
+ B2, D = list(box2d_unnorm.shape)
84
+ assert(B == B2)
85
+ assert(D == 4)
86
+
87
+ # here, we want to place the crop into a bigger image,
88
+ # at the location specified by the box2d.
89
+
90
+ if canvas is None:
91
+ canvas = torch.zeros((B, C, H, W), device=crop.device)
92
+ else:
93
+ B2, C2, H2, W2 = canvas.shape
94
+ assert(B==B2)
95
+ assert(C==C2)
96
+ assert(H==H2)
97
+ assert(W==W2)
98
+
99
+ # box2d_unnorm = utils.geom.unnormalize_box2d(box2d, H, W)
100
+
101
+ if fast:
102
+ ymin = box2d_unnorm[:, 0].long()
103
+ xmin = box2d_unnorm[:, 1].long()
104
+ ymax = box2d_unnorm[:, 2].long()
105
+ xmax = box2d_unnorm[:, 3].long()
106
+ w = (xmax - xmin).float()
107
+ h = (ymax - ymin).float()
108
+
109
+ grids = utils.basic.gridcloud2d(B, H, W)
110
+ grids_flat = grids.reshape(B, -1, 2)
111
+ # grids_flat[:, :, 0] = (grids_flat[:, :, 0] - xmin.float().unsqueeze(1)) / w.unsqueeze(1) * X
112
+ # grids_flat[:, :, 1] = (grids_flat[:, :, 1] - ymin.float().unsqueeze(1)) / h.unsqueeze(1) * Y
113
+
114
+ # for each pixel in the main image,
115
+ # grids_flat tells us where to sample in the crop image
116
+
117
+ # print('grids_flat', grids_flat.shape)
118
+ # print('crop', crop.shape)
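+ # map every canvas pixel into the crop's normalized [-1, 1] coordinate frame expected by F.grid_sample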
119
+
120
+ grids_flat[:, :, 0] = (grids_flat[:, :, 0] - xmin.float().unsqueeze(1)) / w.unsqueeze(1) * 2.0 - 1.0
121
+ grids_flat[:, :, 1] = (grids_flat[:, :, 1] - ymin.float().unsqueeze(1)) / h.unsqueeze(1) * 2.0 - 1.0
122
+
123
+ grid = grids_flat.reshape(B,H,W,2)
124
+
125
+ canvas = F.grid_sample(crop, grid, align_corners=False)
126
+ # print('canvas', canvas.shape)
127
+
128
+ # if mask is None:
129
+ # crop_resamp, inb = bilinear_sample2d(crop, grids_flat[:, :, 0], grids_flat[:, :, 1], return_inbounds=True)
130
+ # crop_resamp = crop_resamp.reshape(B, C, H, W)
131
+ # inb = inb.reshape(B, 1, H, W)
132
+ # canvas = canvas * (1 - inb) + crop_resamp * inb
133
+ # else:
134
+ # full_resamp = bilinear_sample2d(torch.cat([crop, mask], dim=1), grids_flat[:, :, 0], grids_flat[:, :, 1])
135
+ # full_resamp = full_resamp.reshape(B, C+1, H, W)
136
+ # crop_resamp = full_resamp[:,:3]
137
+ # mask_resamp = full_resamp[:,3:4]
138
+ # canvas = canvas * (1 - mask_resamp) + crop_resamp * mask_resamp
139
+ else:
140
+ for b in range(B):
141
+ ymin = box2d_unnorm[b, 0].long()
142
+ xmin = box2d_unnorm[b, 1].long()
143
+ ymax = box2d_unnorm[b, 2].long()
144
+ xmax = box2d_unnorm[b, 3].long()
145
+
146
+ crop_b = F.interpolate(crop[b:b + 1], (ymax - ymin, xmax - xmin)).squeeze(0)
147
+
148
+ # print('canvas[b,:,...', canvas[b,:,ymin:ymax,xmin:xmax].shape)
149
+ # print('crop_b', crop_b.shape)
150
+
151
+ canvas[b, :, ymin:ymax, xmin:xmax] = crop_b
152
+ return canvas
models/spatracker/utils/visualizer.py ADDED
@@ -0,0 +1,409 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+ import numpy as np
9
+ import cv2
10
+ import torch
11
+ import flow_vis
12
+
13
+ from matplotlib import cm
14
+ import torch.nn.functional as F
15
+ import torchvision.transforms as transforms
16
+ from moviepy.editor import ImageSequenceClip
17
+ import matplotlib.pyplot as plt
18
+ from tqdm import tqdm
19
+
20
+ def read_video_from_path(path):
21
+ cap = cv2.VideoCapture(path)
22
+ if not cap.isOpened():
23
+ print("Error opening video file")
24
+ else:
25
+ frames = []
26
+ while cap.isOpened():
27
+ ret, frame = cap.read()
28
+ if ret == True:
29
+ frames.append(np.array(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
30
+ else:
31
+ break
32
+ cap.release()
33
+ return np.stack(frames)
34
+
35
+
36
+ class Visualizer:
37
+ def __init__(
38
+ self,
39
+ save_dir: str = "./results",
40
+ grayscale: bool = False,
41
+ pad_value: int = 0,
42
+ fps: int = 10,
43
+ mode: str = "rainbow", # 'cool', 'optical_flow'
44
+ linewidth: int = 1,
45
+ show_first_frame: int = 10,
46
+ tracks_leave_trace: int = 0, # -1 for infinite
47
+ ):
48
+ self.mode = mode
49
+ self.save_dir = save_dir
50
+ self.vtxt_path = os.path.join(save_dir, "videos.txt")
51
+ self.ttxt_path = os.path.join(save_dir, "trackings.txt")
52
+ if mode == "rainbow":
53
+ self.color_map = cm.get_cmap("gist_rainbow")
54
+ elif mode == "cool":
55
+ self.color_map = cm.get_cmap(mode)
56
+ self.show_first_frame = show_first_frame
57
+ self.grayscale = grayscale
58
+ self.tracks_leave_trace = tracks_leave_trace
59
+ self.pad_value = pad_value
60
+ self.linewidth = linewidth
61
+ self.fps = fps
62
+
63
+ def visualize(
64
+ self,
65
+ video: torch.Tensor, # (B,T,C,H,W)
66
+ tracks: torch.Tensor, # (B,T,N,3), xy pixel coords plus depth
67
+ visibility: torch.Tensor = None, # (B, T, N, 1) bool
68
+ gt_tracks: torch.Tensor = None, # (B,T,N,2)
69
+ segm_mask: torch.Tensor = None, # (B,1,H,W)
70
+ filename: str = "video",
71
+ writer=None, # tensorboard Summary Writer, used for visualization during training
72
+ step: int = 0,
73
+ query_frame: int = 0,
74
+ save_video: bool = True,
75
+ compensate_for_camera_motion: bool = False,
76
+ rigid_part = None,
77
+ video_depth = None # (B,T,C,H,W)
78
+ ):
79
+ if compensate_for_camera_motion:
80
+ assert segm_mask is not None
81
+ if segm_mask is not None:
82
+ coords = tracks[0, query_frame].round().long()
83
+ segm_mask = segm_mask[0, query_frame][coords[:, 1], coords[:, 0]].long()
84
+
85
+ video = F.pad(
86
+ video,
87
+ (self.pad_value, self.pad_value, self.pad_value, self.pad_value),
88
+ "constant",
89
+ 255,
90
+ )
91
+
92
+ if video_depth is not None:
93
+ video_depth = (video_depth*255).cpu().numpy().astype(np.uint8)
94
+ video_depth = ([cv2.applyColorMap(video_depth[0,i,0], cv2.COLORMAP_INFERNO)
95
+ for i in range(video_depth.shape[1])])
96
+ video_depth = np.stack(video_depth, axis=0)
97
+ video_depth = torch.from_numpy(video_depth).permute(0, 3, 1, 2)[None]
98
+
99
+ tracks = tracks + self.pad_value
100
+
101
+ if self.grayscale:
102
+ transform = transforms.Grayscale()
103
+ video = transform(video)
104
+ video = video.repeat(1, 1, 3, 1, 1)
105
+
106
+ tracking_video = self.draw_tracks_on_video(
107
+ video=video,
108
+ tracks=tracks,
109
+ visibility=visibility,
110
+ segm_mask=segm_mask,
111
+ gt_tracks=gt_tracks,
112
+ query_frame=query_frame,
113
+ compensate_for_camera_motion=compensate_for_camera_motion,
114
+ rigid_part=rigid_part
115
+ )
116
+
117
+ if save_video:
118
+ # import ipdb; ipdb.set_trace()
119
+ tracking_dir = os.path.join(self.save_dir, "tracking")
120
+ if not os.path.exists(tracking_dir):
121
+ os.makedirs(tracking_dir)
122
+ self.save_video(tracking_video, filename=filename+"_tracking",
123
+ savedir=tracking_dir, writer=writer, step=step)
124
+ # with open(self.ttxt_path, 'a') as file:
125
+ # file.write(f"tracking/{filename}_tracking.mp4\n")
126
+
127
+ videos_dir = os.path.join(self.save_dir, "videos")
128
+ if not os.path.exists(videos_dir):
129
+ os.makedirs(videos_dir)
130
+ self.save_video(video, filename=filename,
131
+ savedir=videos_dir, writer=writer, step=step)
132
+ # with open(self.vtxt_path, 'a') as file:
133
+ # file.write(f"videos/{filename}.mp4\n")
134
+ if video_depth is not None:
135
+ self.save_video(video_depth, filename=filename+"_depth",
136
+ savedir=os.path.join(self.save_dir, "depth"), writer=writer, step=step)
137
+ return tracking_video
138
+
139
+ def save_video(self, video, filename, savedir=None, writer=None, step=0):
140
+ if writer is not None:
141
+ writer.add_video(
142
+ f"{filename}",
143
+ video.to(torch.uint8),
144
+ global_step=step,
145
+ fps=self.fps,
146
+ )
147
+ else:
148
+ os.makedirs(self.save_dir, exist_ok=True)
149
+ wide_list = list(video.unbind(1))
150
+ wide_list = [wide[0].permute(1, 2, 0).cpu().numpy() for wide in wide_list]
151
+ # clip = ImageSequenceClip(wide_list[2:-1], fps=self.fps)
152
+ clip = ImageSequenceClip(wide_list, fps=self.fps)
153
+
154
+ # Write the video file
155
+ if savedir is None:
156
+ save_path = os.path.join(self.save_dir, f"{filename}.mp4")
157
+ else:
158
+ save_path = os.path.join(savedir, f"{filename}.mp4")
159
+ clip.write_videofile(save_path, codec="libx264", fps=self.fps, logger=None)
160
+
161
+ print(f"Video saved to {save_path}")
162
+
163
+ def draw_tracks_on_video(
164
+ self,
165
+ video: torch.Tensor,
166
+ tracks: torch.Tensor,
167
+ visibility: torch.Tensor = None,
168
+ segm_mask: torch.Tensor = None,
169
+ gt_tracks=None,
170
+ query_frame: int = 0,
171
+ compensate_for_camera_motion=False,
172
+ rigid_part=None,
173
+ ):
174
+ B, T, C, H, W = video.shape
175
+ _, _, N, D = tracks.shape
176
+
177
+ assert D == 3
178
+ assert C == 3
179
+ video = video[0].permute(0, 2, 3, 1).byte().detach().cpu().numpy() # S, H, W, C
180
+ tracks = tracks[0].detach().cpu().numpy() # S, N, 3
181
+ if gt_tracks is not None:
182
+ gt_tracks = gt_tracks[0].detach().cpu().numpy()
183
+
184
+ res_video = []
185
+
186
+ # process input video
187
+ # for rgb in video:
188
+ # res_video.append(rgb.copy())
189
+
190
+ # create a blank tensor with the same shape as the video
191
+ for rgb in video:
192
+ black_frame = np.zeros_like(rgb.copy(), dtype=rgb.dtype)
193
+ res_video.append(black_frame)
194
+
195
+ vector_colors = np.zeros((T, N, 3))
196
+
197
+ if self.mode == "optical_flow":
198
+
199
+ vector_colors = flow_vis.flow_to_color(tracks - tracks[query_frame][None])
200
+
201
+ elif segm_mask is None:
202
+ if self.mode == "rainbow":
203
+ x_min, x_max = tracks[0, :, 0].min(), tracks[0, :, 0].max()
204
+ y_min, y_max = tracks[0, :, 1].min(), tracks[0, :, 1].max()
205
+
206
+ z_inv = 1/tracks[0, :, 2]
207
+ z_min, z_max = np.percentile(z_inv, [2, 98])
208
+
209
+ norm_x = plt.Normalize(x_min, x_max)
210
+ norm_y = plt.Normalize(y_min, y_max)
211
+ norm_z = plt.Normalize(z_min, z_max)
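+ # color each track by its first-frame position: normalized x -> R, y -> G, inverse depth -> B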
212
+
213
+ for n in range(N):
214
+ r = norm_x(tracks[0, n, 0])
215
+ g = norm_y(tracks[0, n, 1])
216
+ # r = 0
217
+ # g = 0
218
+ b = norm_z(1/tracks[0, n, 2])
219
+ color = np.array([r, g, b])[None] * 255
220
+ vector_colors[:, n] = np.repeat(color, T, axis=0)
221
+ else:
222
+ # color changes with time
223
+ for t in range(T):
224
+ color = np.array(self.color_map(t / T)[:3])[None] * 255
225
+ vector_colors[t] = np.repeat(color, N, axis=0)
226
+ else:
227
+ if self.mode == "rainbow":
228
+ vector_colors[:, segm_mask <= 0, :] = 255
229
+
230
+ x_min, x_max = tracks[0, :, 0].min(), tracks[0, :, 0].max()
231
+ y_min, y_max = tracks[0, :, 1].min(), tracks[0, :, 1].max()
232
+ z_min, z_max = tracks[0, :, 2].min(), tracks[0, :, 2].max()
233
+
234
+ norm_x = plt.Normalize(x_min, x_max)
235
+ norm_y = plt.Normalize(y_min, y_max)
236
+ norm_z = plt.Normalize(z_min, z_max)
237
+
238
+ for n in range(N):
239
+ r = norm_x(tracks[0, n, 0])
240
+ g = norm_y(tracks[0, n, 1])
241
+ b = norm_z(tracks[0, n, 2])
242
+ color = np.array([r, g, b])[None] * 255
243
+ vector_colors[:, n] = np.repeat(color, T, axis=0)
244
+
245
+ else:
246
+ # color changes with segm class
247
+ segm_mask = segm_mask.cpu()
248
+ color = np.zeros((segm_mask.shape[0], 3), dtype=np.float32)
249
+ color[segm_mask > 0] = np.array(self.color_map(1.0)[:3]) * 255.0
250
+ color[segm_mask <= 0] = np.array(self.color_map(0.0)[:3]) * 255.0
251
+ vector_colors = np.repeat(color[None], T, axis=0)
252
+
253
+ # Draw tracks
254
+ if self.tracks_leave_trace != 0:
255
+ for t in range(1, T):
256
+ first_ind = (
257
+ max(0, t - self.tracks_leave_trace)
258
+ if self.tracks_leave_trace >= 0
259
+ else 0
260
+ )
261
+ curr_tracks = tracks[first_ind : t + 1]
262
+ curr_colors = vector_colors[first_ind : t + 1]
263
+ if compensate_for_camera_motion:
264
+ diff = (
265
+ tracks[first_ind : t + 1, segm_mask <= 0]
266
+ - tracks[t : t + 1, segm_mask <= 0]
267
+ ).mean(1)[:, None]
268
+
269
+ curr_tracks = curr_tracks - diff
270
+ curr_tracks = curr_tracks[:, segm_mask > 0]
271
+ curr_colors = curr_colors[:, segm_mask > 0]
272
+
273
+ res_video[t] = self._draw_pred_tracks(
274
+ res_video[t],
275
+ curr_tracks,
276
+ curr_colors,
277
+ )
278
+ if gt_tracks is not None:
279
+ res_video[t] = self._draw_gt_tracks(
280
+ res_video[t], gt_tracks[first_ind : t + 1]
281
+ )
282
+
283
+ if rigid_part is not None:
284
+ cls_label = torch.unique(rigid_part)
285
+ cls_num = len(torch.unique(rigid_part))
286
+ # visualize the clustering results
287
+ cmap = plt.get_cmap('jet') # get the color mapping
288
+ colors = cmap(np.linspace(0, 1, cls_num))
289
+ colors = (colors[:, :3] * 255)
290
+ color_map = {lable.item(): color for lable, color in zip(cls_label, colors)}
291
+
292
+ # Draw points
293
+ for t in tqdm(range(T)):
294
+ # Create a list to store information for each point
295
+ points_info = []
296
+ for i in range(N):
297
+ coord = (int(tracks[t, i, 0]), int(tracks[t, i, 1]))
298
+ depth = tracks[t, i, 2] # assume the third dimension is depth
299
+ visibile = True
300
+ if visibility is not None:
301
+ visibile = visibility[0, t, i]
302
+ if coord[0] != 0 and coord[1] != 0:
303
+ if not compensate_for_camera_motion or (
304
+ compensate_for_camera_motion and segm_mask[i] > 0
305
+ ):
306
+ points_info.append((i, coord, depth, visibile))
307
+
308
+ # Sort points by depth, points with smaller depth (closer) will be drawn later
309
+ points_info.sort(key=lambda x: x[2], reverse=True)
310
+
311
+ for i, coord, _, visibile in points_info:
312
+ if rigid_part is not None:
313
+ color = color_map[rigid_part.squeeze()[i].item()]
314
+ cv2.circle(
315
+ res_video[t],
316
+ coord,
317
+ int(self.linewidth * 2),
318
+ color.tolist(),
319
+ thickness=-1 if visibile else 2 - 1,  # filled (-1) when visible, thin outline otherwise
321
+ )
322
+ else:
323
+ # Determine rectangle width based on the distance between adjacent tracks in the first frame
324
+ if t == 0:
325
+ distances = np.linalg.norm(tracks[0] - tracks[0, i], axis=1)
326
+ distances = distances[distances > 0]
327
+ rect_size = int(np.min(distances))/2
328
+
329
+ # Define coordinates for top-left and bottom-right corners of the rectangle
330
+ top_left = (int(coord[0] - rect_size), int(coord[1] - rect_size/1.5)) # rectangle is 1.5x wider than tall, matching the 1.5:1 video aspect ratio
331
+ bottom_right = (int(coord[0] + rect_size), int(coord[1] + rect_size/1.5))
332
+
333
+ # Draw rectangle
334
+ cv2.rectangle(
335
+ res_video[t],
336
+ top_left,
337
+ bottom_right,
338
+ vector_colors[t, i].tolist(),
339
+ thickness=-1 if visibile else 0 - 1,  # evaluates to -1 either way: the rectangle is always filled
341
+ )
342
+
343
+ # Construct the final rgb sequence
344
+ return torch.from_numpy(np.stack(res_video)).permute(0, 3, 1, 2)[None].byte()
345
+
346
+ def _draw_pred_tracks(
347
+ self,
348
+ rgb: np.ndarray, # H x W x 3
349
+ tracks: np.ndarray, # T x N x 2
350
+ vector_colors: np.ndarray,
351
+ alpha: float = 0.5,
352
+ ):
353
+ T, N, _ = tracks.shape
354
+
355
+ for s in range(T - 1):
356
+ vector_color = vector_colors[s]
357
+ original = rgb.copy()
358
+ alpha = (s / T) ** 2
359
+ for i in range(N):
360
+ coord_y = (int(tracks[s, i, 0]), int(tracks[s, i, 1]))
361
+ coord_x = (int(tracks[s + 1, i, 0]), int(tracks[s + 1, i, 1]))
362
+ if coord_y[0] != 0 and coord_y[1] != 0:
363
+ cv2.line(
364
+ rgb,
365
+ coord_y,
366
+ coord_x,
367
+ vector_color[i].tolist(),
368
+ self.linewidth,
369
+ cv2.LINE_AA,
370
+ )
371
+ if self.tracks_leave_trace > 0:
372
+ rgb = cv2.addWeighted(rgb, alpha, original, 1 - alpha, 0)
373
+ return rgb
374
+
375
+ def _draw_gt_tracks(
376
+ self,
377
+ rgb: np.ndarray, # H x W x 3,
378
+ gt_tracks: np.ndarray, # T x N x 2
379
+ ):
380
+ T, N, _ = gt_tracks.shape
381
+ color = np.array((211.0, 0.0, 0.0))
382
+
383
+ for t in range(T):
384
+ for i in range(N):
385
+ gt_track = gt_tracks[t][i] # a single 2D point; keep gt_tracks intact for later iterations
386
+ # draw a red cross
387
+ if gt_track[0] > 0 and gt_track[1] > 0:
388
+ length = self.linewidth * 3
389
+ coord_y = (int(gt_track[0]) + length, int(gt_track[1]) + length)
390
+ coord_x = (int(gt_track[0]) - length, int(gt_track[1]) - length)
391
+ cv2.line(
392
+ rgb,
393
+ coord_y,
394
+ coord_x,
395
+ color,
396
+ self.linewidth,
397
+ cv2.LINE_AA,
398
+ )
399
+ coord_y = (int(gt_track[0]) - length, int(gt_track[1]) + length)
400
+ coord_x = (int(gt_track[0]) + length, int(gt_track[1]) - length)
401
+ cv2.line(
402
+ rgb,
403
+ coord_y,
404
+ coord_x,
405
+ color,
406
+ self.linewidth,
407
+ cv2.LINE_AA,
408
+ )
409
+ return rgb
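Note: the point-drawing loop above sorts points far-to-near so that closer points are rendered on top. A minimal, self-contained sketch of that trick (the frame size, points, and colors below are made up for illustration; this is not repo code):

```python
import numpy as np
import cv2

frame = np.zeros((240, 320, 3), dtype=np.uint8)       # stand-in video frame
pts = np.array([[100.0, 120.0, 2.0],                   # (x, y, depth)
                [104.0, 122.0, 0.5],
                [200.0,  60.0, 1.0]])
colors = [(0, 0, 255), (0, 255, 0), (255, 0, 0)]

# far-to-near ordering: larger depth first, so nearer points overwrite farther ones
for idx in np.argsort(-pts[:, 2]):
    x, y, _ = pts[idx]
    cv2.circle(frame, (int(x), int(y)), 4, colors[idx], thickness=-1)
```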
models/spatracker/utils/vox.py ADDED
@@ -0,0 +1,500 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+
5
+ import utils.geom
+ import utils.basic # gridcloud3d / meshgrid / normalize_grid helpers used below
+ import utils.samp # only referenced by the disabled handwritten sampling branch
6
+
7
+ class Vox_util(object):
8
+ def __init__(self, Z, Y, X, scene_centroid, bounds, pad=None, assert_cube=False):
9
+ self.XMIN, self.XMAX, self.YMIN, self.YMAX, self.ZMIN, self.ZMAX = bounds
10
+ B, D = list(scene_centroid.shape)
11
+ self.Z, self.Y, self.X = Z, Y, X
12
+
13
+ scene_centroid = scene_centroid.detach().cpu().numpy()
14
+ x_centroid, y_centroid, z_centroid = scene_centroid[0]
15
+ self.XMIN += x_centroid
16
+ self.XMAX += x_centroid
17
+ self.YMIN += y_centroid
18
+ self.YMAX += y_centroid
19
+ self.ZMIN += z_centroid
20
+ self.ZMAX += z_centroid
21
+
22
+ self.default_vox_size_X = (self.XMAX-self.XMIN)/float(X)
23
+ self.default_vox_size_Y = (self.YMAX-self.YMIN)/float(Y)
24
+ self.default_vox_size_Z = (self.ZMAX-self.ZMIN)/float(Z)
25
+
26
+ if pad:
27
+ Z_pad, Y_pad, X_pad = pad
28
+ self.ZMIN -= self.default_vox_size_Z * Z_pad
29
+ self.ZMAX += self.default_vox_size_Z * Z_pad
30
+ self.YMIN -= self.default_vox_size_Y * Y_pad
31
+ self.YMAX += self.default_vox_size_Y * Y_pad
32
+ self.XMIN -= self.default_vox_size_X * X_pad
33
+ self.XMAX += self.default_vox_size_X * X_pad
34
+
35
+ if assert_cube:
36
+ # we assume cube voxels
37
+ if (not np.isclose(self.default_vox_size_X, self.default_vox_size_Y)) or (not np.isclose(self.default_vox_size_X, self.default_vox_size_Z)):
38
+ print('Z, Y, X', Z, Y, X)
39
+ print('bounds for this iter:',
40
+ 'X = %.2f to %.2f' % (self.XMIN, self.XMAX),
41
+ 'Y = %.2f to %.2f' % (self.YMIN, self.YMAX),
42
+ 'Z = %.2f to %.2f' % (self.ZMIN, self.ZMAX),
43
+ )
44
+ print('self.default_vox_size_X', self.default_vox_size_X)
45
+ print('self.default_vox_size_Y', self.default_vox_size_Y)
46
+ print('self.default_vox_size_Z', self.default_vox_size_Z)
47
+ assert(np.isclose(self.default_vox_size_X, self.default_vox_size_Y))
48
+ assert(np.isclose(self.default_vox_size_X, self.default_vox_size_Z))
49
+
50
+ def Ref2Mem(self, xyz, Z, Y, X, assert_cube=False):
51
+ # xyz is B x N x 3, in ref coordinates
52
+ # transforms ref coordinates into mem coordinates
53
+ B, N, C = list(xyz.shape)
54
+ device = xyz.device
55
+ assert(C==3)
56
+ mem_T_ref = self.get_mem_T_ref(B, Z, Y, X, assert_cube=assert_cube, device=device)
57
+ xyz = utils.geom.apply_4x4(mem_T_ref, xyz)
58
+ return xyz
59
+
60
+ def Mem2Ref(self, xyz_mem, Z, Y, X, assert_cube=False):
61
+ # xyz is B x N x 3, in mem coordinates
62
+ # transforms mem coordinates into ref coordinates
63
+ B, N, C = list(xyz_mem.shape)
64
+ ref_T_mem = self.get_ref_T_mem(B, Z, Y, X, assert_cube=assert_cube, device=xyz_mem.device)
65
+ xyz_ref = utils.geom.apply_4x4(ref_T_mem, xyz_mem)
66
+ return xyz_ref
67
+
68
+ def get_mem_T_ref(self, B, Z, Y, X, assert_cube=False, device='cuda'):
69
+ vox_size_X = (self.XMAX-self.XMIN)/float(X)
70
+ vox_size_Y = (self.YMAX-self.YMIN)/float(Y)
71
+ vox_size_Z = (self.ZMAX-self.ZMIN)/float(Z)
72
+
73
+ if assert_cube:
74
+ if (not np.isclose(vox_size_X, vox_size_Y)) or (not np.isclose(vox_size_X, vox_size_Z)):
75
+ print('Z, Y, X', Z, Y, X)
76
+ print('bounds for this iter:',
77
+ 'X = %.2f to %.2f' % (self.XMIN, self.XMAX),
78
+ 'Y = %.2f to %.2f' % (self.YMIN, self.YMAX),
79
+ 'Z = %.2f to %.2f' % (self.ZMIN, self.ZMAX),
80
+ )
81
+ print('vox_size_X', vox_size_X)
82
+ print('vox_size_Y', vox_size_Y)
83
+ print('vox_size_Z', vox_size_Z)
84
+ assert(np.isclose(vox_size_X, vox_size_Y))
85
+ assert(np.isclose(vox_size_X, vox_size_Z))
86
+
87
+ # translation
88
+ # (this makes the left edge of the leftmost voxel correspond to XMIN)
89
+ center_T_ref = utils.geom.eye_4x4(B, device=device)
90
+ center_T_ref[:,0,3] = -self.XMIN-vox_size_X/2.0
91
+ center_T_ref[:,1,3] = -self.YMIN-vox_size_Y/2.0
92
+ center_T_ref[:,2,3] = -self.ZMIN-vox_size_Z/2.0
93
+
94
+ # scaling
95
+ # (this makes the right edge of the rightmost voxel correspond to XMAX)
96
+ mem_T_center = utils.geom.eye_4x4(B, device=device)
97
+ mem_T_center[:,0,0] = 1./vox_size_X
98
+ mem_T_center[:,1,1] = 1./vox_size_Y
99
+ mem_T_center[:,2,2] = 1./vox_size_Z
100
+ mem_T_ref = utils.geom.matmul2(mem_T_center, center_T_ref)
101
+
102
+ return mem_T_ref
103
+
104
+ def get_ref_T_mem(self, B, Z, Y, X, assert_cube=False, device='cuda'):
105
+ mem_T_ref = self.get_mem_T_ref(B, Z, Y, X, assert_cube=assert_cube, device=device)
106
+ # note safe_inverse is inapplicable here,
107
+ # since the transform is nonrigid
108
+ ref_T_mem = mem_T_ref.inverse()
109
+ return ref_T_mem
110
+
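The two transforms above compose a translation with a per-axis scaling, so a reference coordinate x maps to memory coordinate (x - XMIN)/vox_size - 0.5. A quick 1D sanity check with made-up bounds (illustrative only, not repo code):

```python
# ref -> mem along one axis: XMIN lands at -0.5, XMAX at X - 0.5
XMIN, XMAX, X = -2.0, 2.0, 8
vox_size = (XMAX - XMIN) / X                      # 0.5

def ref_to_mem_x(ref_x):
    return (ref_x - XMIN - vox_size / 2.0) / vox_size

print(ref_to_mem_x(XMIN))   # -0.5 (left edge of the leftmost voxel)
print(ref_to_mem_x(XMAX))   #  7.5 (right edge of the rightmost voxel)
print(ref_to_mem_x(0.0))    #  3.5 (scene centroid sits mid-grid)
```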
111
+ def get_inbounds(self, xyz, Z, Y, X, already_mem=False, padding=0.0, assert_cube=False):
112
+ # xyz is B x N x 3
113
+ # padding should be 0 unless you are trying to account for some later cropping
114
+ if not already_mem:
115
+ xyz = self.Ref2Mem(xyz, Z, Y, X, assert_cube=assert_cube)
116
+
117
+ x = xyz[:,:,0]
118
+ y = xyz[:,:,1]
119
+ z = xyz[:,:,2]
120
+
121
+ x_valid = ((x-padding)>-0.5).byte() & ((x+padding)<float(X-0.5)).byte()
122
+ y_valid = ((y-padding)>-0.5).byte() & ((y+padding)<float(Y-0.5)).byte()
123
+ z_valid = ((z-padding)>-0.5).byte() & ((z+padding)<float(Z-0.5)).byte()
124
+ nonzero = (~(z==0.0)).byte()
125
+
126
+ inbounds = x_valid & y_valid & z_valid & nonzero
127
+ return inbounds.bool()
128
+
129
+ def voxelize_xyz(self, xyz_ref, Z, Y, X, already_mem=False, assert_cube=False, clean_eps=0):
130
+ B, N, D = list(xyz_ref.shape)
131
+ assert(D==3)
132
+ if already_mem:
133
+ xyz_mem = xyz_ref
134
+ else:
135
+ xyz_mem = self.Ref2Mem(xyz_ref, Z, Y, X, assert_cube=assert_cube)
136
+ xyz_zero = self.Ref2Mem(xyz_ref[:,0:1]*0, Z, Y, X, assert_cube=assert_cube)
137
+ vox = self.get_occupancy(xyz_mem, Z, Y, X, clean_eps=clean_eps, xyz_zero=xyz_zero)
138
+ return vox
139
+
140
+ def voxelize_xyz_and_feats(self, xyz_ref, feats, Z, Y, X, already_mem=False, assert_cube=False, clean_eps=0):
141
+ B, N, D = list(xyz_ref.shape)
142
+ B2, N2, D2 = list(feats.shape)
143
+ assert(D==3)
144
+ assert(B==B2)
145
+ assert(N==N2)
146
+ if already_mem:
147
+ xyz_mem = xyz_ref
148
+ else:
149
+ xyz_mem = self.Ref2Mem(xyz_ref, Z, Y, X, assert_cube=assert_cube)
150
+ xyz_zero = self.Ref2Mem(xyz_ref[:,0:1]*0, Z, Y, X, assert_cube=assert_cube)
151
+ feats = self.get_feat_occupancy(xyz_mem, feats, Z, Y, X, clean_eps=clean_eps, xyz_zero=xyz_zero)
152
+ return feats
153
+
154
+ def get_occupancy(self, xyz, Z, Y, X, clean_eps=0, xyz_zero=None):
155
+ # xyz is B x N x 3 and in mem coords
156
+ # we want to fill a voxel tensor with 1's at these inds
157
+ B, N, C = list(xyz.shape)
158
+ assert(C==3)
159
+
160
+ # these papers say simple 1/0 occupancy is ok:
161
+ # http://openaccess.thecvf.com/content_cvpr_2018/papers/Yang_PIXOR_Real-Time_3d_CVPR_2018_paper.pdf
162
+ # http://openaccess.thecvf.com/content_cvpr_2018/papers/Luo_Fast_and_Furious_CVPR_2018_paper.pdf
163
+ # cont fusion says they do 8-neighbor interp
164
+ # voxelnet does occupancy but with a bit of randomness in terms of the reflectance value i think
165
+
166
+ inbounds = self.get_inbounds(xyz, Z, Y, X, already_mem=True)
167
+ x, y, z = xyz[:,:,0], xyz[:,:,1], xyz[:,:,2]
168
+ mask = torch.zeros_like(x)
169
+ mask[inbounds] = 1.0
170
+
171
+ if xyz_zero is not None:
172
+ # only take points that are beyond a thresh of zero
173
+ dist = torch.norm(xyz_zero-xyz, dim=2)
174
+ mask[dist < 0.1] = 0
175
+
176
+ if clean_eps > 0:
177
+ # only take points that are already near centers
178
+ xyz_round = torch.round(xyz) # B, N, 3
179
+ dist = torch.norm(xyz_round - xyz, dim=2)
180
+ mask[dist > clean_eps] = 0
181
+
182
+ # set the invalid guys to zero
183
+ # we then need to zero out 0,0,0
184
+ # (this method seems a bit clumsy)
185
+ x = x*mask
186
+ y = y*mask
187
+ z = z*mask
188
+
189
+ x = torch.round(x)
190
+ y = torch.round(y)
191
+ z = torch.round(z)
192
+ x = torch.clamp(x, 0, X-1).int()
193
+ y = torch.clamp(y, 0, Y-1).int()
194
+ z = torch.clamp(z, 0, Z-1).int()
195
+
196
+ x = x.view(B*N)
197
+ y = y.view(B*N)
198
+ z = z.view(B*N)
199
+
200
+ dim3 = X
201
+ dim2 = X * Y
202
+ dim1 = X * Y * Z
203
+
204
+ base = torch.arange(0, B, dtype=torch.int32, device=xyz.device)*dim1
205
+ base = torch.reshape(base, [B, 1]).repeat([1, N]).view(B*N)
206
+
207
+ vox_inds = base + z * dim2 + y * dim3 + x
208
+ voxels = torch.zeros(B*Z*Y*X, device=xyz.device).float()
209
+ voxels[vox_inds.long()] = 1.0
210
+ # zero out the singularity
211
+ voxels[base.long()] = 0.0
212
+ voxels = voxels.reshape(B, 1, Z, Y, X)
213
+ # B x 1 x Z x Y x X
214
+ return voxels
215
+
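To make the indexing in get_occupancy concrete: a point at integer memory coordinates (x, y, z) in batch b writes a 1 at flat index b*Z*Y*X + z*X*Y + y*X + x, and the per-batch index b*Z*Y*X (the 0,0,0 cell) is zeroed afterwards to drop invalid points that were masked to the origin. A small standalone sketch with toy shapes (not repo code):

```python
import torch

B, Z, Y, X = 2, 4, 4, 4
# integer memory coordinates, ordered (x, y, z); the (0,0,0) entry mimics a masked-out point
xyz = torch.tensor([[[1, 2, 3], [0, 0, 0]],
                    [[3, 1, 2], [2, 2, 2]]])
x, y, z = xyz[..., 0], xyz[..., 1], xyz[..., 2]

base = (torch.arange(B) * Z * Y * X).unsqueeze(1)      # per-batch offset, B x 1
flat = (base + z * (X * Y) + y * X + x).reshape(-1)

vox = torch.zeros(B * Z * Y * X)
vox[flat.long()] = 1.0
vox[(torch.arange(B) * Z * Y * X).long()] = 0.0        # zero out the 0,0,0 "singularity"
vox = vox.reshape(B, 1, Z, Y, X)                       # B x 1 x Z x Y x X occupancy
```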
216
+ def get_feat_occupancy(self, xyz, feat, Z, Y, X, clean_eps=0, xyz_zero=None):
217
+ # xyz is B x N x 3 and in mem coords
218
+ # feat is B x N x D
219
+ # we want to fill a voxel tensor with 1's at these inds
220
+ B, N, C = list(xyz.shape)
221
+ B2, N2, D2 = list(feat.shape)
222
+ assert(C==3)
223
+ assert(B==B2)
224
+ assert(N==N2)
225
+
226
+ # these papers say simple 1/0 occupancy is ok:
227
+ # http://openaccess.thecvf.com/content_cvpr_2018/papers/Yang_PIXOR_Real-Time_3d_CVPR_2018_paper.pdf
228
+ # http://openaccess.thecvf.com/content_cvpr_2018/papers/Luo_Fast_and_Furious_CVPR_2018_paper.pdf
229
+ # cont fusion says they do 8-neighbor interp
230
+ # voxelnet does occupancy but with a bit of randomness in terms of the reflectance value i think
231
+
232
+ inbounds = self.get_inbounds(xyz, Z, Y, X, already_mem=True)
233
+ x, y, z = xyz[:,:,0], xyz[:,:,1], xyz[:,:,2]
234
+ mask = torch.zeros_like(x)
235
+ mask[inbounds] = 1.0
236
+
237
+ if xyz_zero is not None:
238
+ # only take points that are beyond a thresh of zero
239
+ dist = torch.norm(xyz_zero-xyz, dim=2)
240
+ mask[dist < 0.1] = 0
241
+
242
+ if clean_eps > 0:
243
+ # only take points that are already near centers
244
+ xyz_round = torch.round(xyz) # B, N, 3
245
+ dist = torch.norm(xyz_round - xyz, dim=2)
246
+ mask[dist > clean_eps] = 0
247
+
248
+ # set the invalid guys to zero
249
+ # we then need to zero out 0,0,0
250
+ # (this method seems a bit clumsy)
251
+ x = x*mask # B, N
252
+ y = y*mask
253
+ z = z*mask
254
+ feat = feat*mask.unsqueeze(-1) # B, N, D
255
+
256
+ x = torch.round(x)
257
+ y = torch.round(y)
258
+ z = torch.round(z)
259
+ x = torch.clamp(x, 0, X-1).int()
260
+ y = torch.clamp(y, 0, Y-1).int()
261
+ z = torch.clamp(z, 0, Z-1).int()
262
+
263
+ # permute point orders
264
+ perm = torch.randperm(N)
265
+ x = x[:, perm]
266
+ y = y[:, perm]
267
+ z = z[:, perm]
268
+ feat = feat[:, perm]
269
+
270
+ x = x.view(B*N)
271
+ y = y.view(B*N)
272
+ z = z.view(B*N)
273
+ feat = feat.view(B*N, -1)
274
+
275
+ dim3 = X
276
+ dim2 = X * Y
277
+ dim1 = X * Y * Z
278
+
279
+ base = torch.arange(0, B, dtype=torch.int32, device=xyz.device)*dim1
280
+ base = torch.reshape(base, [B, 1]).repeat([1, N]).view(B*N)
281
+
282
+ vox_inds = base + z * dim2 + y * dim3 + x
283
+ feat_voxels = torch.zeros((B*Z*Y*X, D2), device=xyz.device).float()
284
+ feat_voxels[vox_inds.long()] = feat
285
+ # zero out the singularity
286
+ feat_voxels[base.long()] = 0.0
287
+ feat_voxels = feat_voxels.reshape(B, Z, Y, X, D2).permute(0, 4, 1, 2, 3)
288
+ # B x C x Z x Y x X
289
+ return feat_voxels
290
+
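The random permutation above matters because scatter-style assignment keeps only the last write per index: when several points fall into the same voxel, shuffling them first makes the surviving feature a random one rather than always the last-listed point. A tiny illustration (not repo code):

```python
import torch

feats = torch.zeros(4)
idx = torch.tensor([2, 2, 3])                 # two points collide in cell 2
feats[idx] = torch.tensor([10.0, 20.0, 5.0])
print(feats)                                  # tensor([ 0.,  0., 20.,  5.]) -- 20.0 overwrote 10.0
```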
291
+ def unproject_image_to_mem(self, rgb_camB, pixB_T_camA, camB_T_camA, Z, Y, X, assert_cube=False, xyz_camA=None):
292
+ # rgb_camB is B x C x H x W
293
+ # pixB_T_camA is B x 4 x 4
294
+
295
+ # rgb lives in B pixel coords
296
+ # we want everything in A memory coords
297
+
298
+ # this puts each C-dim pixel in the rgb_camB
299
+ # along a ray in the voxelgrid
300
+ B, C, H, W = list(rgb_camB.shape)
301
+
302
+ if xyz_camA is None:
303
+ xyz_memA = utils.basic.gridcloud3d(B, Z, Y, X, norm=False, device=pixB_T_camA.device)
304
+ xyz_camA = self.Mem2Ref(xyz_memA, Z, Y, X, assert_cube=assert_cube)
305
+
306
+ xyz_camB = utils.geom.apply_4x4(camB_T_camA, xyz_camA)
307
+ z = xyz_camB[:,:,2]
308
+
309
+ xyz_pixB = utils.geom.apply_4x4(pixB_T_camA, xyz_camA)
310
+ normalizer = torch.unsqueeze(xyz_pixB[:,:,2], 2)
311
+ EPS=1e-6
312
+ # z = xyz_pixB[:,:,2]
313
+ xy_pixB = xyz_pixB[:,:,:2]/torch.clamp(normalizer, min=EPS)
314
+ # this is B x N x 2
315
+ # this is the (floating point) pixel coordinate of each voxel
316
+ x, y = xy_pixB[:,:,0], xy_pixB[:,:,1]
317
+ # these are B x N
318
+
319
+ x_valid = (x>-0.5).bool() & (x<float(W-0.5)).bool()
320
+ y_valid = (y>-0.5).bool() & (y<float(H-0.5)).bool()
321
+ z_valid = (z>0.0).bool()
322
+ valid_mem = (x_valid & y_valid & z_valid).reshape(B, 1, Z, Y, X).float()
323
+
324
+ if (0):
325
+ # handwritten version
326
+ values = torch.zeros([B, C, Z*Y*X], dtype=torch.float32)
327
+ for b in list(range(B)):
328
+ values[b] = utils.samp.bilinear_sample_single(rgb_camB[b], x_pixB[b], y_pixB[b])
329
+ else:
330
+ # native pytorch version
331
+ y_pixB, x_pixB = utils.basic.normalize_grid2d(y, x, H, W)
332
+ # since we want a 3d output, we need 5d tensors
333
+ z_pixB = torch.zeros_like(x)
334
+ xyz_pixB = torch.stack([x_pixB, y_pixB, z_pixB], axis=2)
335
+ rgb_camB = rgb_camB.unsqueeze(2)
336
+ xyz_pixB = torch.reshape(xyz_pixB, [B, Z, Y, X, 3])
337
+ values = F.grid_sample(rgb_camB, xyz_pixB, align_corners=False)
338
+
339
+ values = torch.reshape(values, (B, C, Z, Y, X))
340
+ values = values * valid_mem
341
+ return values
342
+
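The sampling step above boils down to: project every voxel center into the image, normalize the pixel coordinates to [-1, 1], and let F.grid_sample do the bilinear lookup. A standalone sketch of that idea with made-up sizes (it uses align_corners=True with a matching normalization; the method above relies on its own normalize_grid2d helper and align_corners=False):

```python
import torch
import torch.nn.functional as F

B, C, H, W = 1, 3, 8, 8
img = torch.rand(B, C, H, W)

# pretend these are pixel coordinates of N projected voxel centers, B x N x 2 as (x, y)
pix = torch.tensor([[[1.0, 1.0], [6.5, 3.0]]])

grid = torch.empty_like(pix)
grid[..., 0] = pix[..., 0] / (W - 1) * 2 - 1           # x -> [-1, 1]
grid[..., 1] = pix[..., 1] / (H - 1) * 2 - 1           # y -> [-1, 1]

samples = F.grid_sample(img, grid.unsqueeze(2), align_corners=True)  # B x C x N x 1
```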
343
+ def warp_tiled_to_mem(self, rgb_tileB, pixB_T_camA, camB_T_camA, Z, Y, X, DMIN, DMAX, assert_cube=False):
344
+ # rgb_tileB is B,C,D,H,W
345
+ # pixB_T_camA is B,4,4
346
+ # camB_T_camA is B,4,4
347
+
348
+ # rgb_tileB lives in B pixel coords but it has been tiled across the Z dimension
349
+ # we want everything in A memory coords
350
+
351
+ # this resamples rgb_tileB so that each C-dim pixel
352
+ # is put into its correct place in the voxelgrid
353
+ # (using the pinhole camera model)
354
+
355
+ B, C, D, H, W = list(rgb_tileB.shape)
356
+
357
+ xyz_memA = utils.basic.gridcloud3d(B, Z, Y, X, norm=False, device=pixB_T_camA.device)
358
+
359
+ xyz_camA = self.Mem2Ref(xyz_memA, Z, Y, X, assert_cube=assert_cube)
360
+
361
+ xyz_camB = utils.geom.apply_4x4(camB_T_camA, xyz_camA)
362
+ z_camB = xyz_camB[:,:,2]
363
+
364
+ # rgb_tileB has depth=DMIN in tile 0, and depth=DMAX in tile D-1
365
+ z_tileB = (D-1.0) * (z_camB-float(DMIN)) / float(DMAX-DMIN)
366
+
367
+ xyz_pixB = utils.geom.apply_4x4(pixB_T_camA, xyz_camA)
368
+ normalizer = torch.unsqueeze(xyz_pixB[:,:,2], 2)
369
+ EPS=1e-6
370
+ # z = xyz_pixB[:,:,2]
371
+ xy_pixB = xyz_pixB[:,:,:2]/torch.clamp(normalizer, min=EPS)
372
+ # this is B x N x 2
373
+ # this is the (floating point) pixel coordinate of each voxel
374
+ x, y = xy_pixB[:,:,0], xy_pixB[:,:,1]
375
+ # these are B x N
376
+
377
+ x_valid = (x>-0.5).bool() & (x<float(W-0.5)).bool()
378
+ y_valid = (y>-0.5).bool() & (y<float(H-0.5)).bool()
379
+ z_valid = (z_camB>0.0).bool()
380
+ valid_mem = (x_valid & y_valid & z_valid).reshape(B, 1, Z, Y, X).float()
381
+
382
+ z_tileB, y_pixB, x_pixB = utils.basic.normalize_grid3d(z_tileB, y, x, D, H, W)
383
+ xyz_pixB = torch.stack([x_pixB, y_pixB, z_tileB], axis=2)
384
+ xyz_pixB = torch.reshape(xyz_pixB, [B, Z, Y, X, 3])
385
+ values = F.grid_sample(rgb_tileB, xyz_pixB, align_corners=False)
386
+
387
+ values = torch.reshape(values, (B, C, Z, Y, X))
388
+ values = values * valid_mem
389
+ return values
390
+
391
+
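The only new ingredient relative to unproject_image_to_mem is the depth-to-tile mapping z_tile = (D-1) * (z - DMIN) / (DMAX - DMIN), which sends metric depth DMIN to tile 0 and DMAX to tile D-1. A quick check with arbitrary numbers (illustrative only, not repo code):

```python
D, DMIN, DMAX = 64, 2.0, 50.0

def depth_to_tile(z):
    return (D - 1.0) * (z - DMIN) / (DMAX - DMIN)

print(depth_to_tile(2.0))    # 0.0  (nearest plane -> first tile)
print(depth_to_tile(50.0))   # 63.0 (farthest plane -> last tile)
print(depth_to_tile(26.0))   # 31.5 (halfway in depth -> middle of the stack)
```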
392
+ def apply_mem_T_ref_to_lrtlist(self, lrtlist_cam, Z, Y, X, assert_cube=False):
393
+ # lrtlist is B x N x 19, in cam coordinates
394
+ # transforms them into mem coordinates, including a scale change for the lengths
395
+ B, N, C = list(lrtlist_cam.shape)
396
+ assert(C==19)
397
+ mem_T_cam = self.get_mem_T_ref(B, Z, Y, X, assert_cube=assert_cube, device=lrtlist_cam.device)
398
+
399
+ def xyz2circles(self, xyz, radius, Z, Y, X, soft=True, already_mem=True, also_offset=False, grid=None):
400
+ # xyz is B x N x 3
401
+ # radius is B x N or broadcastably so
402
+ # output is B x N x Z x Y x X
403
+ B, N, D = list(xyz.shape)
404
+ assert(D==3)
405
+ if not already_mem:
406
+ xyz = self.Ref2Mem(xyz, Z, Y, X)
407
+
408
+ if grid is None:
409
+ grid_z, grid_y, grid_x = utils.basic.meshgrid3d(B, Z, Y, X, stack=False, norm=False, device=xyz.device)
410
+ # note the default stack is on -1
411
+ grid = torch.stack([grid_x, grid_y, grid_z], dim=1)
412
+ # this is B x 3 x Z x Y x X
413
+
414
+ xyz = xyz.reshape(B, N, 3, 1, 1, 1)
415
+ grid = grid.reshape(B, 1, 3, Z, Y, X)
416
+ # these broadcast against each other to B x N x 3 x Z x Y x X below
417
+
418
+ # round the xyzs, so that at least one value matches the grid perfectly,
419
+ # and we get a value of 1 there (since exp(0)==1)
420
+ xyz = xyz.round()
421
+
422
+ if torch.is_tensor(radius):
423
+ radius = radius.clamp(min=0.01)
424
+
425
+ if soft:
426
+ off = grid - xyz # B,N,3,Z,Y,X
427
+ # interpret radius as sigma
428
+ dist_grid = torch.sum(off**2, dim=2, keepdim=False)
429
+ # this is B x N x Z x Y x X
430
+ if torch.is_tensor(radius):
431
+ radius = radius.reshape(B, N, 1, 1, 1)
432
+ mask = torch.exp(-dist_grid/(2*radius*radius))
433
+ # zero out near zero
434
+ mask[mask < 0.001] = 0.0
435
+ # h = np.exp(-(x * x + y * y) / (2 * sigma * sigma))
436
+ # h[h < np.finfo(h.dtype).eps * h.max()] = 0
437
+ # return h
438
+ if also_offset:
439
+ return mask, off
440
+ else:
441
+ return mask
442
+ else:
443
+ assert(False) # something is wrong with this. come back later to debug
444
+
445
+ dist_grid = torch.norm(grid - xyz, dim=2, keepdim=False)
446
+ # this is 0 at/near the xyz, and increases by 1 for each voxel away
447
+
448
+ radius = radius.reshape(B, N, 1, 1, 1)
449
+
450
+ within_radius_mask = (dist_grid < radius).float()
451
+ within_radius_mask = torch.sum(within_radius_mask, dim=1, keepdim=True).clamp(0, 1)
452
+ return within_radius_mask
453
+
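In the soft branch, each (rounded) point becomes a Gaussian blob exp(-d^2 / (2 r^2)) over the grid, with values below 0.001 clamped to zero. A minimal 2D standalone version on a made-up grid (not repo code):

```python
import torch

Y, X = 16, 16
gy, gx = torch.meshgrid(torch.arange(Y).float(), torch.arange(X).float(), indexing="ij")

cx, cy = 8.0, 8.0            # point already rounded onto the grid
radius = 2.0                 # interpreted as sigma

d2 = (gx - cx) ** 2 + (gy - cy) ** 2
mask = torch.exp(-d2 / (2 * radius * radius))
mask[mask < 0.001] = 0.0     # same near-zero cutoff as above; the peak value is exactly 1 at the center
```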
454
+ def xyz2circles_bev(self, xyz, radius, Z, Y, X, already_mem=True, also_offset=False):
455
+ # xyz is B x N x 3
456
+ # radius is B x N or broadcastably so
457
+ # output is B x N x Z x Y x X
458
+ B, N, D = list(xyz.shape)
459
+ assert(D==3)
460
+ if not already_mem:
461
+ xyz = self.Ref2Mem(xyz, Z, Y, X)
462
+
463
+ xz = torch.stack([xyz[:,:,0], xyz[:,:,2]], dim=2)
464
+
465
+ grid_z, grid_x = utils.basic.meshgrid2d(B, Z, X, stack=False, norm=False, device=xyz.device)
466
+ # note the default stack is on -1
467
+ grid = torch.stack([grid_x, grid_z], dim=1)
468
+ # this is B x 2 x Z x X
469
+
470
+ xz = xz.reshape(B, N, 2, 1, 1)
471
+ grid = grid.reshape(B, 1, 2, Z, X)
472
+ # these are ready to broadcast to B x N x Z x X
473
+
474
+ # round the points, so that at least one value matches the grid perfectly,
475
+ # and we get a value of 1 there (since exp(0)==1)
476
+ xz = xz.round()
477
+
478
+ if torch.is_tensor(radius):
479
+ radius = radius.clamp(min=0.01)
480
+
481
+ off = grid - xz # B,N,2,Z,X
482
+ # interpret radius as sigma
483
+ dist_grid = torch.sum(off**2, dim=2, keepdim=False)
484
+ # this is B x N x Z x X
485
+ if torch.is_tensor(radius):
486
+ radius = radius.reshape(B, N, 1, 1, 1)
487
+ mask = torch.exp(-dist_grid/(2*radius*radius))
488
+ # zero out near zero
489
+ mask[mask < 0.001] = 0.0
490
+
491
+ # add a Y dim
492
+ mask = mask.unsqueeze(-2)
493
+ off = off.unsqueeze(-2)
494
+ # # B,N,2,Z,1,X
495
+
496
+ if also_offset:
497
+ return mask, off
498
+ else:
499
+ return mask
500
+
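A hedged usage sketch of the class defined above. The import path follows this repo's layout and is an assumption; note that vox.py itself does `import utils.geom`, so `models/spatracker` must be on `sys.path` (or the imports adjusted) for it to resolve. Sizes and bounds are arbitrary illustration values.

```python
import torch
from models.spatracker.utils.vox import Vox_util   # path assumed from this repo layout

Z, Y, X = 32, 32, 32
scene_centroid = torch.zeros(1, 3)                              # B x 3
bounds = (-4.0, 4.0, -4.0, 4.0, -4.0, 4.0)                      # XMIN, XMAX, YMIN, YMAX, ZMIN, ZMAX
vox = Vox_util(Z, Y, X, scene_centroid, bounds, assert_cube=True)

xyz = torch.rand(1, 1024, 3) * 8.0 - 4.0                        # B x N x 3 points in ref coords
occ = vox.voxelize_xyz(xyz, Z, Y, X)                            # B x 1 x Z x Y x X occupancy grid
```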
requirements.txt ADDED
@@ -0,0 +1,32 @@
1
+ # spatrack
2
+ easydict==1.13
3
+ opencv-python==4.9.0.80
4
+ moviepy==1.0.3
5
+ flow-vis==0.1
6
+ matplotlib==3.8.3
7
+ einops==0.7.0
8
+ timm==0.6.7
9
+ scikit-image==0.22.0
10
+ scikit-learn==1.4.1.post1
11
+ cupy-cuda11x
12
+ accelerate
13
+ yt-dlp
14
+ pandas
15
+
16
+ # cogvideox
17
+ bitsandbytes
18
+ diffusers>=0.31.2
19
+ transformers>=4.45.2
20
+ hf_transfer>=0.1.8
21
+ peft>=0.12.0
22
+ decord>=0.6.0
23
+ wandb
24
+ torchao>=0.5.0
25
+ sentencepiece>=0.2.0
26
+ imageio-ffmpeg>=0.5.1
27
+ numpy>=1.26.4
28
+ git+https://github.com/asomoza/image_gen_aux.git
29
+ deepspeed
30
+
31
+ # submodules
32
+ -r submodules/MoGe/requirements.txt
submodules/MoGe/.gitignore ADDED
@@ -0,0 +1,425 @@
1
+ ## Ignore Visual Studio temporary files, build results, and
2
+ ## files generated by popular Visual Studio add-ons.
3
+ ##
4
+ ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
5
+
6
+ # User-specific files
7
+ *.rsuser
8
+ *.suo
9
+ *.user
10
+ *.userosscache
11
+ *.sln.docstates
12
+
13
+ # User-specific files (MonoDevelop/Xamarin Studio)
14
+ *.userprefs
15
+
16
+ # Mono auto generated files
17
+ mono_crash.*
18
+
19
+ # Build results
20
+ [Dd]ebug/
21
+ [Dd]ebugPublic/
22
+ [Rr]elease/
23
+ [Rr]eleases/
24
+ x64/
25
+ x86/
26
+ [Ww][Ii][Nn]32/
27
+ [Aa][Rr][Mm]/
28
+ [Aa][Rr][Mm]64/
29
+ bld/
30
+ [Bb]in/
31
+ [Oo]bj/
32
+ [Ll]og/
33
+ [Ll]ogs/
34
+
35
+ # Visual Studio 2015/2017 cache/options directory
36
+ .vs/
37
+ # Uncomment if you have tasks that create the project's static files in wwwroot
38
+ #wwwroot/
39
+
40
+ # Visual Studio 2017 auto generated files
41
+ Generated\ Files/
42
+
43
+ # MSTest test Results
44
+ [Tt]est[Rr]esult*/
45
+ [Bb]uild[Ll]og.*
46
+
47
+ # NUnit
48
+ *.VisualState.xml
49
+ TestResult.xml
50
+ nunit-*.xml
51
+
52
+ # Build Results of an ATL Project
53
+ [Dd]ebugPS/
54
+ [Rr]eleasePS/
55
+ dlldata.c
56
+
57
+ # Benchmark Results
58
+ BenchmarkDotNet.Artifacts/
59
+
60
+ # .NET Core
61
+ project.lock.json
62
+ project.fragment.lock.json
63
+ artifacts/
64
+
65
+ # ASP.NET Scaffolding
66
+ ScaffoldingReadMe.txt
67
+
68
+ # StyleCop
69
+ StyleCopReport.xml
70
+
71
+ # Files built by Visual Studio
72
+ *_i.c
73
+ *_p.c
74
+ *_h.h
75
+ *.ilk
76
+ *.meta
77
+ *.obj
78
+ *.iobj
79
+ *.pch
80
+ *.pdb
81
+ *.ipdb
82
+ *.pgc
83
+ *.pgd
84
+ *.rsp
85
+ *.sbr
86
+ *.tlb
87
+ *.tli
88
+ *.tlh
89
+ *.tmp
90
+ *.tmp_proj
91
+ *_wpftmp.csproj
92
+ *.log
93
+ *.tlog
94
+ *.vspscc
95
+ *.vssscc
96
+ .builds
97
+ *.pidb
98
+ *.svclog
99
+ *.scc
100
+
101
+ # Chutzpah Test files
102
+ _Chutzpah*
103
+
104
+ # Visual C++ cache files
105
+ ipch/
106
+ *.aps
107
+ *.ncb
108
+ *.opendb
109
+ *.opensdf
110
+ *.sdf
111
+ *.cachefile
112
+ *.VC.db
113
+ *.VC.VC.opendb
114
+
115
+ # Visual Studio profiler
116
+ *.psess
117
+ *.vsp
118
+ *.vspx
119
+ *.sap
120
+
121
+ # Visual Studio Trace Files
122
+ *.e2e
123
+
124
+ # TFS 2012 Local Workspace
125
+ $tf/
126
+
127
+ # Guidance Automation Toolkit
128
+ *.gpState
129
+
130
+ # ReSharper is a .NET coding add-in
131
+ _ReSharper*/
132
+ *.[Rr]e[Ss]harper
133
+ *.DotSettings.user
134
+
135
+ # TeamCity is a build add-in
136
+ _TeamCity*
137
+
138
+ # DotCover is a Code Coverage Tool
139
+ *.dotCover
140
+
141
+ # AxoCover is a Code Coverage Tool
142
+ .axoCover/*
143
+ !.axoCover/settings.json
144
+
145
+ # Coverlet is a free, cross platform Code Coverage Tool
146
+ coverage*.json
147
+ coverage*.xml
148
+ coverage*.info
149
+
150
+ # Visual Studio code coverage results
151
+ *.coverage
152
+ *.coveragexml
153
+
154
+ # NCrunch
155
+ _NCrunch_*
156
+ .*crunch*.local.xml
157
+ nCrunchTemp_*
158
+
159
+ # MightyMoose
160
+ *.mm.*
161
+ AutoTest.Net/
162
+
163
+ # Web workbench (sass)
164
+ .sass-cache/
165
+
166
+ # Installshield output folder
167
+ [Ee]xpress/
168
+
169
+ # DocProject is a documentation generator add-in
170
+ DocProject/buildhelp/
171
+ DocProject/Help/*.HxT
172
+ DocProject/Help/*.HxC
173
+ DocProject/Help/*.hhc
174
+ DocProject/Help/*.hhk
175
+ DocProject/Help/*.hhp
176
+ DocProject/Help/Html2
177
+ DocProject/Help/html
178
+
179
+ # Click-Once directory
180
+ publish/
181
+
182
+ # Publish Web Output
183
+ *.[Pp]ublish.xml
184
+ *.azurePubxml
185
+ # Note: Comment the next line if you want to checkin your web deploy settings,
186
+ # but database connection strings (with potential passwords) will be unencrypted
187
+ *.pubxml
188
+ *.publishproj
189
+
190
+ # Microsoft Azure Web App publish settings. Comment the next line if you want to
191
+ # checkin your Azure Web App publish settings, but sensitive information contained
192
+ # in these scripts will be unencrypted
193
+ PublishScripts/
194
+
195
+ # NuGet Packages
196
+ *.nupkg
197
+ # NuGet Symbol Packages
198
+ *.snupkg
199
+ # The packages folder can be ignored because of Package Restore
200
+ **/[Pp]ackages/*
201
+ # except build/, which is used as an MSBuild target.
202
+ !**/[Pp]ackages/build/
203
+ # Uncomment if necessary however generally it will be regenerated when needed
204
+ #!**/[Pp]ackages/repositories.config
205
+ # NuGet v3's project.json files produces more ignorable files
206
+ *.nuget.props
207
+ *.nuget.targets
208
+
209
+ # Microsoft Azure Build Output
210
+ csx/
211
+ *.build.csdef
212
+
213
+ # Microsoft Azure Emulator
214
+ ecf/
215
+ rcf/
216
+
217
+ # Windows Store app package directories and files
218
+ AppPackages/
219
+ BundleArtifacts/
220
+ Package.StoreAssociation.xml
221
+ _pkginfo.txt
222
+ *.appx
223
+ *.appxbundle
224
+ *.appxupload
225
+
226
+ # Visual Studio cache files
227
+ # files ending in .cache can be ignored
228
+ *.[Cc]ache
229
+ # but keep track of directories ending in .cache
230
+ !?*.[Cc]ache/
231
+
232
+ # Others
233
+ ClientBin/
234
+ ~$*
235
+ *~
236
+ *.dbmdl
237
+ *.dbproj.schemaview
238
+ *.jfm
239
+ *.pfx
240
+ *.publishsettings
241
+ orleans.codegen.cs
242
+
243
+ # Including strong name files can present a security risk
244
+ # (https://github.com/github/gitignore/pull/2483#issue-259490424)
245
+ #*.snk
246
+
247
+ # Since there are multiple workflows, uncomment next line to ignore bower_components
248
+ # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
249
+ #bower_components/
250
+
251
+ # RIA/Silverlight projects
252
+ Generated_Code/
253
+
254
+ # Backup & report files from converting an old project file
255
+ # to a newer Visual Studio version. Backup files are not needed,
256
+ # because we have git ;-)
257
+ _UpgradeReport_Files/
258
+ Backup*/
259
+ UpgradeLog*.XML
260
+ UpgradeLog*.htm
261
+ ServiceFabricBackup/
262
+ *.rptproj.bak
263
+
264
+ # SQL Server files
265
+ *.mdf
266
+ *.ldf
267
+ *.ndf
268
+
269
+ # Business Intelligence projects
270
+ *.rdl.data
271
+ *.bim.layout
272
+ *.bim_*.settings
273
+ *.rptproj.rsuser
274
+ *- [Bb]ackup.rdl
275
+ *- [Bb]ackup ([0-9]).rdl
276
+ *- [Bb]ackup ([0-9][0-9]).rdl
277
+
278
+ # Microsoft Fakes
279
+ FakesAssemblies/
280
+
281
+ # GhostDoc plugin setting file
282
+ *.GhostDoc.xml
283
+
284
+ # Node.js Tools for Visual Studio
285
+ .ntvs_analysis.dat
286
+ node_modules/
287
+
288
+ # Visual Studio 6 build log
289
+ *.plg
290
+
291
+ # Visual Studio 6 workspace options file
292
+ *.opt
293
+
294
+ # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
295
+ *.vbw
296
+
297
+ # Visual Studio 6 auto-generated project file (contains which files were open etc.)
298
+ *.vbp
299
+
300
+ # Visual Studio 6 workspace and project file (working project files containing files to include in project)
301
+ *.dsw
302
+ *.dsp
303
+
304
+ # Visual Studio 6 technical files
305
+ *.ncb
306
+ *.aps
307
+
308
+ # Visual Studio LightSwitch build output
309
+ **/*.HTMLClient/GeneratedArtifacts
310
+ **/*.DesktopClient/GeneratedArtifacts
311
+ **/*.DesktopClient/ModelManifest.xml
312
+ **/*.Server/GeneratedArtifacts
313
+ **/*.Server/ModelManifest.xml
314
+ _Pvt_Extensions
315
+
316
+ # Paket dependency manager
317
+ .paket/paket.exe
318
+ paket-files/
319
+
320
+ # FAKE - F# Make
321
+ .fake/
322
+
323
+ # CodeRush personal settings
324
+ .cr/personal
325
+
326
+ # Python Tools for Visual Studio (PTVS)
327
+ __pycache__/
328
+ *.pyc
329
+
330
+ # Cake - Uncomment if you are using it
331
+ # tools/**
332
+ # !tools/packages.config
333
+
334
+ # Tabs Studio
335
+ *.tss
336
+
337
+ # Telerik's JustMock configuration file
338
+ *.jmconfig
339
+
340
+ # BizTalk build output
341
+ *.btp.cs
342
+ *.btm.cs
343
+ *.odx.cs
344
+ *.xsd.cs
345
+
346
+ # OpenCover UI analysis results
347
+ OpenCover/
348
+
349
+ # Azure Stream Analytics local run output
350
+ ASALocalRun/
351
+
352
+ # MSBuild Binary and Structured Log
353
+ *.binlog
354
+
355
+ # NVidia Nsight GPU debugger configuration file
356
+ *.nvuser
357
+
358
+ # MFractors (Xamarin productivity tool) working folder
359
+ .mfractor/
360
+
361
+ # Local History for Visual Studio
362
+ .localhistory/
363
+
364
+ # Visual Studio History (VSHistory) files
365
+ .vshistory/
366
+
367
+ # BeatPulse healthcheck temp database
368
+ healthchecksdb
369
+
370
+ # Backup folder for Package Reference Convert tool in Visual Studio 2017
371
+ MigrationBackup/
372
+
373
+ # Ionide (cross platform F# VS Code tools) working folder
374
+ .ionide/
375
+
376
+ # Fody - auto-generated XML schema
377
+ FodyWeavers.xsd
378
+
379
+ # VS Code files for those working on multiple tools
380
+ .vscode/*
381
+ !.vscode/settings.json
382
+ !.vscode/tasks.json
383
+ !.vscode/launch.json
384
+ !.vscode/extensions.json
385
+ *.code-workspace
386
+
387
+ # Local History for Visual Studio Code
388
+ .history/
389
+
390
+ # Windows Installer files from build outputs
391
+ *.cab
392
+ *.msi
393
+ *.msix
394
+ *.msm
395
+ *.msp
396
+
397
+ # JetBrains Rider
398
+ *.sln.iml
399
+
400
+ # MoGe
401
+ /data
402
+ /download
403
+ /extract
404
+ /view_point_cloud
405
+ /view_depth_map
406
+ /blobcache
407
+ /snapshot
408
+ /reference_embeddings
409
+ /.msra_intern_s_toolkit
410
+ /debug
411
+ /workspace
412
+ /mlruns
413
+ /infer_output
414
+ /video_output
415
+ /eval_output
416
+ /.blobcache
417
+ /test_images
418
+ /test_videos
419
+ /vis
420
+ /videos
421
+ /raid
422
+ /blobmnt
423
+ /eval_dump
424
+ /pretrained
425
+ /.gradio
submodules/MoGe/CHANGELOG.md ADDED
@@ -0,0 +1,15 @@
1
+ ## 2024-11-28
2
+ ### Added
3
+ - Supported user-provided camera FOV. See [scripts/infer.py](scripts/infer.py) --fov_x.
4
+ - Related issues: [#25](https://github.com/microsoft/MoGe/issues/25) and [#24](https://github.com/microsoft/MoGe/issues/24).
5
+ - Added inference scripts for panorama images. See [scripts/infer_panorama.py](scripts/infer_panorama.py).
6
+ - Related issue: [#19](https://github.com/microsoft/MoGe/issues/19).
7
+
8
+ ### Fixed
9
+ - Suppressed unnecessary numpy runtime warnings.
10
+ - Specified recommended versions of requirements.
11
+ - Related issue: [#21](https://github.com/microsoft/MoGe/issues/21).
12
+
13
+ ### Changed
14
+ - Moved `app.py` and `infer.py` to [scripts/](scripts/)
15
+ - Improved edge removal.
submodules/MoGe/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,9 @@
1
+ # Microsoft Open Source Code of Conduct
2
+
3
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4
+
5
+ Resources:
6
+
7
+ - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8
+ - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9
+ - Contact [[email protected]](mailto:[email protected]) with questions or concerns
submodules/MoGe/LICENSE ADDED
@@ -0,0 +1,224 @@
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE
22
+
23
+
24
+ Apache License
25
+ Version 2.0, January 2004
26
+ http://www.apache.org/licenses/
27
+
28
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
29
+
30
+ 1. Definitions.
31
+
32
+ "License" shall mean the terms and conditions for use, reproduction,
33
+ and distribution as defined by Sections 1 through 9 of this document.
34
+
35
+ "Licensor" shall mean the copyright owner or entity authorized by
36
+ the copyright owner that is granting the License.
37
+
38
+ "Legal Entity" shall mean the union of the acting entity and all
39
+ other entities that control, are controlled by, or are under common
40
+ control with that entity. For the purposes of this definition,
41
+ "control" means (i) the power, direct or indirect, to cause the
42
+ direction or management of such entity, whether by contract or
43
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
44
+ outstanding shares, or (iii) beneficial ownership of such entity.
45
+
46
+ "You" (or "Your") shall mean an individual or Legal Entity
47
+ exercising permissions granted by this License.
48
+
49
+ "Source" form shall mean the preferred form for making modifications,
50
+ including but not limited to software source code, documentation
51
+ source, and configuration files.
52
+
53
+ "Object" form shall mean any form resulting from mechanical
54
+ transformation or translation of a Source form, including but
55
+ not limited to compiled object code, generated documentation,
56
+ and conversions to other media types.
57
+
58
+ "Work" shall mean the work of authorship, whether in Source or
59
+ Object form, made available under the License, as indicated by a
60
+ copyright notice that is included in or attached to the work
61
+ (an example is provided in the Appendix below).
62
+
63
+ "Derivative Works" shall mean any work, whether in Source or Object
64
+ form, that is based on (or derived from) the Work and for which the
65
+ editorial revisions, annotations, elaborations, or other modifications
66
+ represent, as a whole, an original work of authorship. For the purposes
67
+ of this License, Derivative Works shall not include works that remain
68
+ separable from, or merely link (or bind by name) to the interfaces of,
69
+ the Work and Derivative Works thereof.
70
+
71
+ "Contribution" shall mean any work of authorship, including
72
+ the original version of the Work and any modifications or additions
73
+ to that Work or Derivative Works thereof, that is intentionally
74
+ submitted to Licensor for inclusion in the Work by the copyright owner
75
+ or by an individual or Legal Entity authorized to submit on behalf of
76
+ the copyright owner. For the purposes of this definition, "submitted"
77
+ means any form of electronic, verbal, or written communication sent
78
+ to the Licensor or its representatives, including but not limited to
79
+ communication on electronic mailing lists, source code control systems,
80
+ and issue tracking systems that are managed by, or on behalf of, the
81
+ Licensor for the purpose of discussing and improving the Work, but
82
+ excluding communication that is conspicuously marked or otherwise
83
+ designated in writing by the copyright owner as "Not a Contribution."
84
+
85
+ "Contributor" shall mean Licensor and any individual or Legal Entity
86
+ on behalf of whom a Contribution has been received by Licensor and
87
+ subsequently incorporated within the Work.
88
+
89
+ 2. Grant of Copyright License. Subject to the terms and conditions of
90
+ this License, each Contributor hereby grants to You a perpetual,
91
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
92
+ copyright license to reproduce, prepare Derivative Works of,
93
+ publicly display, publicly perform, sublicense, and distribute the
94
+ Work and such Derivative Works in Source or Object form.
95
+
96
+ 3. Grant of Patent License. Subject to the terms and conditions of
97
+ this License, each Contributor hereby grants to You a perpetual,
98
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
99
+ (except as stated in this section) patent license to make, have made,
100
+ use, offer to sell, sell, import, and otherwise transfer the Work,
101
+ where such license applies only to those patent claims licensable
102
+ by such Contributor that are necessarily infringed by their
103
+ Contribution(s) alone or by combination of their Contribution(s)
104
+ with the Work to which such Contribution(s) was submitted. If You
105
+ institute patent litigation against any entity (including a
106
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
107
+ or a Contribution incorporated within the Work constitutes direct
108
+ or contributory patent infringement, then any patent licenses
109
+ granted to You under this License for that Work shall terminate
110
+ as of the date such litigation is filed.
111
+
112
+ 4. Redistribution. You may reproduce and distribute copies of the
113
+ Work or Derivative Works thereof in any medium, with or without
114
+ modifications, and in Source or Object form, provided that You
115
+ meet the following conditions:
116
+
117
+ (a) You must give any other recipients of the Work or
118
+ Derivative Works a copy of this License; and
119
+
120
+ (b) You must cause any modified files to carry prominent notices
121
+ stating that You changed the files; and
122
+
123
+ (c) You must retain, in the Source form of any Derivative Works
124
+ that You distribute, all copyright, patent, trademark, and
125
+ attribution notices from the Source form of the Work,
126
+ excluding those notices that do not pertain to any part of
127
+ the Derivative Works; and
128
+
129
+ (d) If the Work includes a "NOTICE" text file as part of its
130
+ distribution, then any Derivative Works that You distribute must
131
+ include a readable copy of the attribution notices contained
132
+ within such NOTICE file, excluding those notices that do not
133
+ pertain to any part of the Derivative Works, in at least one
134
+ of the following places: within a NOTICE text file distributed
135
+ as part of the Derivative Works; within the Source form or
136
+ documentation, if provided along with the Derivative Works; or,
137
+ within a display generated by the Derivative Works, if and
138
+ wherever such third-party notices normally appear. The contents
139
+ of the NOTICE file are for informational purposes only and
140
+ do not modify the License. You may add Your own attribution
141
+ notices within Derivative Works that You distribute, alongside
142
+ or as an addendum to the NOTICE text from the Work, provided
143
+ that such additional attribution notices cannot be construed
144
+ as modifying the License.
145
+
146
+ You may add Your own copyright statement to Your modifications and
147
+ may provide additional or different license terms and conditions
148
+ for use, reproduction, or distribution of Your modifications, or
149
+ for any such Derivative Works as a whole, provided Your use,
150
+ reproduction, and distribution of the Work otherwise complies with
151
+ the conditions stated in this License.
152
+
153
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
154
+ any Contribution intentionally submitted for inclusion in the Work
155
+ by You to the Licensor shall be under the terms and conditions of
156
+ this License, without any additional terms or conditions.
157
+ Notwithstanding the above, nothing herein shall supersede or modify
158
+ the terms of any separate license agreement you may have executed
159
+ with Licensor regarding such Contributions.
160
+
161
+ 6. Trademarks. This License does not grant permission to use the trade
162
+ names, trademarks, service marks, or product names of the Licensor,
163
+ except as required for reasonable and customary use in describing the
164
+ origin of the Work and reproducing the content of the NOTICE file.
165
+
166
+ 7. Disclaimer of Warranty. Unless required by applicable law or
167
+ agreed to in writing, Licensor provides the Work (and each
168
+ Contributor provides its Contributions) on an "AS IS" BASIS,
169
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
170
+ implied, including, without limitation, any warranties or conditions
171
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
172
+ PARTICULAR PURPOSE. You are solely responsible for determining the
173
+ appropriateness of using or redistributing the Work and assume any
174
+ risks associated with Your exercise of permissions under this License.
175
+
176
+ 8. Limitation of Liability. In no event and under no legal theory,
177
+ whether in tort (including negligence), contract, or otherwise,
178
+ unless required by applicable law (such as deliberate and grossly
179
+ negligent acts) or agreed to in writing, shall any Contributor be
180
+ liable to You for damages, including any direct, indirect, special,
181
+ incidental, or consequential damages of any character arising as a
182
+ result of this License or out of the use or inability to use the
183
+ Work (including but not limited to damages for loss of goodwill,
184
+ work stoppage, computer failure or malfunction, or any and all
185
+ other commercial damages or losses), even if such Contributor
186
+ has been advised of the possibility of such damages.
187
+
188
+ 9. Accepting Warranty or Additional Liability. While redistributing
189
+ the Work or Derivative Works thereof, You may choose to offer,
190
+ and charge a fee for, acceptance of support, warranty, indemnity,
191
+ or other liability obligations and/or rights consistent with this
192
+ License. However, in accepting such obligations, You may act only
193
+ on Your own behalf and on Your sole responsibility, not on behalf
194
+ of any other Contributor, and only if You agree to indemnify,
195
+ defend, and hold each Contributor harmless for any liability
196
+ incurred by, or claims asserted against, such Contributor by reason
197
+ of your accepting any such warranty or additional liability.
198
+
199
+ END OF TERMS AND CONDITIONS
200
+
201
+ APPENDIX: How to apply the Apache License to your work.
202
+
203
+ To apply the Apache License to your work, attach the following
204
+ boilerplate notice, with the fields enclosed by brackets "[]"
205
+ replaced with your own identifying information. (Don't include
206
+ the brackets!) The text should be enclosed in the appropriate
207
+ comment syntax for the file format. We also recommend that a
208
+ file or class name and description of purpose be included on the
209
+ same "printed page" as the copyright notice for easier
210
+ identification within third-party archives.
211
+
212
+ Copyright [yyyy] [name of copyright owner]
213
+
214
+ Licensed under the Apache License, Version 2.0 (the "License");
215
+ you may not use this file except in compliance with the License.
216
+ You may obtain a copy of the License at
217
+
218
+ http://www.apache.org/licenses/LICENSE-2.0
219
+
220
+ Unless required by applicable law or agreed to in writing, software
221
+ distributed under the License is distributed on an "AS IS" BASIS,
222
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
223
+ See the License for the specific language governing permissions and
224
+ limitations under the License.