kairunwen committed on
Commit 57746f1 · 1 Parent(s): 3c9ccf0

Update Code

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +3 -0
  2. .gitignore +157 -0
  3. app.py +149 -9
  4. assets/examples/bicycle/_DSC8679.JPG +0 -0
  5. assets/examples/bicycle/_DSC8689.JPG +0 -0
  6. assets/examples/bonsai/DSCF5565.JPG +0 -0
  7. assets/examples/bonsai/DSCF5575.JPG +0 -0
  8. assets/examples/garden/DSC07956.JPG +0 -0
  9. assets/examples/garden/DSC07960.JPG +0 -0
  10. assets/examples/kitchen/0.jpg +0 -0
  11. assets/examples/kitchen/64.jpg +0 -0
  12. assets/examples/sofa/000000.jpg +0 -0
  13. assets/examples/sofa/000008.jpg +0 -0
  14. configs/model_config.yaml +20 -0
  15. requirements.txt +41 -0
  16. scannetv2-labels.combined.tsv +608 -0
  17. src/datasets/megadepth.py +125 -0
  18. src/datasets/scannet.py +109 -0
  19. src/datasets/scannetpp.py +107 -0
  20. src/datasets_preprocess/scannet_preprocess.py +209 -0
  21. src/datasets_preprocess/scannetpp_preprocess.py +227 -0
  22. src/gaussian_head.py +142 -0
  23. src/infer.py +23 -0
  24. src/losses.py +193 -0
  25. src/lseg.py +171 -0
  26. src/model.py +176 -0
  27. src/ptv3.py +13 -0
  28. src/train.py +73 -0
  29. src/utils/camera_utils.py +60 -0
  30. src/utils/cuda_splatting.py +216 -0
  31. src/utils/gaussian_model.py +160 -0
  32. src/utils/graphics_utils.py +77 -0
  33. src/utils/points_process.py +37 -0
  34. src/utils/sh_utils.py +117 -0
  35. src/utils/visualization_utils.py +355 -0
  36. submodules/PointTransformerV3/.gitmodules +3 -0
  37. submodules/PointTransformerV3/LICENSE +21 -0
  38. submodules/PointTransformerV3/Pointcept/.github/workflows/formatter.yml +20 -0
  39. submodules/PointTransformerV3/Pointcept/.gitignore +16 -0
  40. submodules/PointTransformerV3/Pointcept/LICENSE +21 -0
  41. submodules/PointTransformerV3/Pointcept/README.md +896 -0
  42. submodules/PointTransformerV3/Pointcept/configs/_base_/dataset/scannetpp.py +104 -0
  43. submodules/PointTransformerV3/Pointcept/configs/_base_/default_runtime.py +39 -0
  44. submodules/PointTransformerV3/Pointcept/configs/matterport3d/semseg-pt-v3m1-0-base.py +313 -0
  45. submodules/PointTransformerV3/Pointcept/configs/matterport3d/semseg-spunet-v1m1-0-base.py +282 -0
  46. submodules/PointTransformerV3/Pointcept/configs/modelnet40/cls-ptv3-v1m1-0-base.py +232 -0
  47. submodules/PointTransformerV3/Pointcept/configs/modelnet40/cls-spunet-v1m1-0-base.py +176 -0
  48. submodules/PointTransformerV3/Pointcept/configs/nuscenes/semseg-ppt-v1m1-0-nu-sk-wa-spunet.py +342 -0
  49. submodules/PointTransformerV3/Pointcept/configs/nuscenes/semseg-ppt-v1m2-0-nu-sk-wa-spunet.py +316 -0
  50. submodules/PointTransformerV3/Pointcept/configs/nuscenes/semseg-ppt-v1m2-1-nu-sk-wa-spunet-submit.py +292 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ wheel/*.whl filter=lfs diff=lfs merge=lfs -text
+ *.whl filter=lfs diff=lfs merge=lfs -text
+ checkpoints/* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,157 @@
+ # *.pth
+ *.pt
+ submodules/diff-gaussian-rasterization
+ submodules/simple-knn
+ # checkpoints/
+ output*
+ .gradio/
+
+ core.*
+ logs/*
+ /data/
+ # checkpoints/
+ video*
+ train_images*
+ test_images_save*
+ /pl_main
+ /to_be_test
+ /test_lsm
+ /test_img
+ /figure3
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+ video/
+ scannet_processed_scenes_1.tar.gz
+ test_results/*
+ output/*
+ test_images
+ colmap_scannet
+ /test_lseg
app.py CHANGED
@@ -1,12 +1,152 @@
- import os
- import shlex
  import gradio as gr
- import subprocess
- from huggingface_hub import HfApi

- hf_token = os.getenv("LSM_token")

- api = HfApi()
- api.snapshot_download(repo_id="kairunwen/LSM_private_mast3r", repo_type="space", local_dir=".", token=hf_token)
- subprocess.run(shlex.split("pip install -r requirements.txt"))
- subprocess.run(shlex.split("python app.py"))
+ import os, subprocess, shlex, sys, gc
+ import time
+ import torch
+ import numpy as np
+ import shutil
+ import argparse
  import gradio as gr
+ import uuid
+ import spaces
+ #

+ subprocess.run(shlex.split("pip install wheel/torch_scatter-2.1.2+pt21cu121-cp310-cp310-linux_x86_64.whl"))
+ subprocess.run(shlex.split("pip install wheel/flash_attn-2.6.3+cu123torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"))
+ subprocess.run(shlex.split("pip install wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl"))
+ subprocess.run(shlex.split("pip install wheel/simple_knn-0.0.0-cp310-cp310-linux_x86_64.whl"))
+ subprocess.run(shlex.split("pip install wheel/curope-0.0.0-cp310-cp310-linux_x86_64.whl"))
+ subprocess.run(shlex.split("pip install wheel/pointops-1.0-cp310-cp310-linux_x86_64.whl"))

+ from src.utils.visualization_utils import render_video_from_file
+ from src.model import LSM_MASt3R
+
+ model = LSM_MASt3R.from_pretrained("checkpoints/pretrained_model/checkpoint-40.pth")
+ model = model.eval()
+
+
+ @spaces.GPU(duration=80)
+ def process(inputfiles, input_path=None):
+     # Create a unique cache directory
+     cache_dir = os.path.join('outputs', str(uuid.uuid4()))
+     os.makedirs(cache_dir, exist_ok=True)
+
+     if input_path is not None:
+         imgs_path = './assets/examples/' + input_path
+         imgs_names = sorted(os.listdir(imgs_path))
+
+         inputfiles = []
+         for imgs_name in imgs_names:
+             file_path = os.path.join(imgs_path, imgs_name)
+             print(file_path)
+             inputfiles.append(file_path)
+         print(inputfiles)
+
+     filelist = inputfiles
+     if len(filelist) != 2:
+         gr.Warning("Please select 2 images")
+         shutil.rmtree(cache_dir)  # Clean up the cache directory
+         return None, None, None, None, None, None
+
+     ply_path = os.path.join(cache_dir, 'gaussians.ply')
+     # render_video_from_file(filelist, model, output_path=cache_dir, resolution=224)
+     render_video_from_file(filelist, model, output_path=cache_dir, resolution=512)
+
+     rgb_video_path = os.path.join(cache_dir, 'moved', 'output_images_video.mp4')
+     depth_video_path = os.path.join(cache_dir, 'moved', 'output_depth_video.mp4')
+     feature_video_path = os.path.join(cache_dir, 'moved', 'output_fmap_video.mp4')
+
+     return filelist, rgb_video_path, depth_video_path, feature_video_path, ply_path, ply_path
+
+
+ _TITLE = 'LargeSpatialModel'
+ _DESCRIPTION = '''
+ <div style="display: flex; justify-content: center; align-items: center;">
+     <div style="width: 100%; text-align: center; font-size: 30px;">
+         <strong>Large Spatial Model: End-to-end Unposed Images to Semantic 3D</strong>
+     </div>
+ </div>
+ <p></p>
+
+ <div align="center">
+ <a style="display:inline-block" href="https://arxiv.org/abs/2410.18956"><img src="https://img.shields.io/badge/ArXiv-2410.18956-b31b1b?logo=arxiv" alt='arxiv'></a>&nbsp;
+ <a style="display:inline-block" href="https://largespatialmodel.github.io/"><img src='https://img.shields.io/badge/Project_Page-ff7512?logo=lightning'></a>&nbsp;
+ <a title="Social" href="https://x.com/WayneINR" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+     <img src="https://www.obukhov.ai/img/badges/badge-social.svg" alt="social">
+ </a>
+
+ </div>
+ <p></p>
+
+ * Official demo of: [LargeSpatialModel: End-to-end Unposed Images to Semantic 3D](https://largespatialmodel.github.io/).
+ * Examples for direct viewing: you can simply click the examples (at the bottom of the page) to quickly view the results on representative data.
+ '''
+
+ block = gr.Blocks().queue()
+ with block:
+     gr.Markdown(_DESCRIPTION)
+
+     with gr.Column(variant="panel"):
+         with gr.Tab("Input"):
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     inputfiles = gr.File(file_count="multiple", label="Load Images")
+                     input_path = gr.Textbox(visible=False, label="example_path")
+                 with gr.Column(scale=1):
+                     image_gallery = gr.Gallery(
+                         label="Gallery",
+                         show_label=False,
+                         elem_id="gallery",
+                         columns=[2],
+                         height=300,  # Fixed height
+                         object_fit="cover"  # Make the images fill the available space
+                     )
+
+             button_gen = gr.Button("Start Reconstruction", elem_id="button_gen")
+             processing_msg = gr.Markdown("Processing...", visible=False, elem_id="processing_msg")
+
+
+     with gr.Column(variant="panel"):
+         with gr.Tab("Output"):
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     rgb_video = gr.Video(label="RGB Video", autoplay=True)
+                 with gr.Column(scale=1):
+                     feature_video = gr.Video(label="Feature Video", autoplay=True)
+                 with gr.Column(scale=1):
+                     depth_video = gr.Video(label="Depth Video", autoplay=True)
+             with gr.Row():
+                 with gr.Group():
+                     output_model = gr.Model3D(
+                         label="3D Dense Model under Gaussian Splats Formats, need more time to visualize",
+                         interactive=False,
+                         camera_position=[0.5, 0.5, 1],  # Offset slightly for a better view of the model
+                         height=600,
+                     )
+                     gr.Markdown(
+                         """
+                         <div class="model-description">
+                             &nbsp;&nbsp;Use the left mouse button to rotate, the scroll wheel to zoom, and the right mouse button to move.
+                         </div>
+                         """
+                     )
+             with gr.Row():
+                 output_file = gr.File(label="PLY File")
+
+     examples = gr.Examples(
+         examples=[
+             "sofa",
+         ],
+         inputs=[input_path],
+         outputs=[image_gallery, rgb_video, depth_video, feature_video, output_model, output_file],
+         fn=lambda x: process(inputfiles=None, input_path=x),
+         cache_examples=True,
+         label="Examples"
+     )
+
+
+     button_gen.click(
+         process,
+         inputs=[inputfiles],
+         outputs=[image_gallery, rgb_video, depth_video, feature_video, output_model, output_file],
+     )
+
+ block.launch(server_name="0.0.0.0", share=False)
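
Note: the new app.py path from two input views to the rendered videos and the .ply can also be exercised headlessly. The following is only a sketch, assuming the prebuilt wheels above are installed, the checkpoint exists at the hard-coded path, and render_video_from_file writes gaussians.ply plus the moved/*.mp4 files that process() later returns; the output directory name is hypothetical.

```python
# Headless sketch of the inference path wired up in app.py (assumptions noted above).
import os

from src.model import LSM_MASt3R
from src.utils.visualization_utils import render_video_from_file

# Same checkpoint and eval mode as the Space uses
model = LSM_MASt3R.from_pretrained("checkpoints/pretrained_model/checkpoint-40.pth").eval()

# process() insists on exactly two views
filelist = [
    "assets/examples/sofa/000000.jpg",
    "assets/examples/sofa/000008.jpg",
]

out_dir = "outputs/headless_demo"  # hypothetical output directory
os.makedirs(out_dir, exist_ok=True)
render_video_from_file(filelist, model, output_path=out_dir, resolution=512)

# Artifacts app.py hands back to the Gradio UI
print(os.path.join(out_dir, "gaussians.ply"))
print(os.path.join(out_dir, "moved", "output_images_video.mp4"))
print(os.path.join(out_dir, "moved", "output_depth_video.mp4"))
print(os.path.join(out_dir, "moved", "output_fmap_video.mp4"))
```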
assets/examples/bicycle/_DSC8679.JPG ADDED
assets/examples/bicycle/_DSC8689.JPG ADDED
assets/examples/bonsai/DSCF5565.JPG ADDED
assets/examples/bonsai/DSCF5575.JPG ADDED
assets/examples/garden/DSC07956.JPG ADDED
assets/examples/garden/DSC07960.JPG ADDED
assets/examples/kitchen/0.jpg ADDED
assets/examples/kitchen/64.jpg ADDED
assets/examples/sofa/000000.jpg ADDED
assets/examples/sofa/000008.jpg ADDED
configs/model_config.yaml ADDED
@@ -0,0 +1,20 @@
+ mast3r_config:
+   pretrained_model_name_or_path: "checkpoints/pretrained_model/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth"
+
+ point_transformer_config:
+   enc_depths: [1, 1, 1, 3, 1]
+   enc_channels: [32, 64, 128, 256, 512]
+   enc_num_head: [2, 4, 8, 16, 32]
+   enc_patch_size: [1024, 1024, 1024, 1024, 1024]
+   dec_depths: [1, 1, 1, 1]
+   dec_channels: [64, 64, 128, 256]
+   dec_num_head: [4, 4, 8, 16]
+   dec_patch_size: [1024, 1024, 1024, 1024]
+
+ gaussian_head_config:
+   rgb_residual: true
+   d_gs_feats: 32
+
+ lseg_config:
+   pretrained_model_name_or_path: "checkpoints/pretrained_model/lang_seg.ckpt"
+   half_res: true
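
Note: this file is plain YAML; a minimal loading sketch (assuming PyYAML, which requirements.txt lists as pyyaml; the actual consumer lives in src/model.py and is not shown in this view):

```python
import yaml

# Load the model configuration added above
with open("configs/model_config.yaml") as f:
    cfg = yaml.safe_load(f)

# The four top-level groups defined in the file
print(sorted(cfg))  # ['gaussian_head_config', 'lseg_config', 'mast3r_config', 'point_transformer_config']
print(cfg["point_transformer_config"]["enc_depths"])  # [1, 1, 1, 3, 1]
print(cfg["gaussian_head_config"]["d_gs_feats"])       # 32
```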
requirements.txt ADDED
@@ -0,0 +1,41 @@
+ torch==2.1.2
+ torchvision==0.16.2
+ pytorch-lightning==2.1.2
+ open3d
+ roma
+ gradio
+ matplotlib
+ tqdm
+ opencv-python
+ scipy
+ einops
+ trimesh
+ tensorboard
+ pyglet<2
+ numpy<2.0
+ huggingface-hub[torch]>=0.22
+ ninja
+ scikit-learn
+
+
+ arrow
+ pandas
+ torch-tb-profiler
+ jaxtyping
+ ninja
+ h5py
+ pyyaml
+ moviepy==1.0.3
+ jupyter
+ lpips
+ torch-geometric
+ spconv-cu120
+ git+https://github.com/openai/CLIP.git
+ sharedarray
+ tensorboardx
+ yapf
+ addict
+ plyfile
+ termcolor
+ timm
+
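
Note: the compiled extensions (torch_scatter, flash-attn, diff-gaussian-rasterization, simple-knn, curope, pointops) are not listed here; app.py installs them at startup from the prebuilt cp310/cu121 wheels under wheel/. A small, hedged sanity check that the environment matches those wheel tags (assumptions taken from the filenames, not from any project script):

```python
# Sketch: verify the interpreter and torch build match the wheel filenames app.py installs
# (CPython 3.10, torch 2.1.x, CUDA 12.1 -- assumptions from the wheel tags above).
import sys
import torch

assert sys.version_info[:2] == (3, 10), sys.version
assert torch.__version__.startswith("2.1"), torch.__version__
print(torch.version.cuda)  # expect "12.1" to match the +pt21cu121 torch_scatter wheel
```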
scannetv2-labels.combined.tsv ADDED
@@ -0,0 +1,608 @@
1
+ id raw_category category count nyu40id eigen13id nyuClass nyu40class eigen13class ModelNet40 ModelNet10 ShapeNetCore55 synsetoffset wnsynsetid wnsynsetkey mpcat40 mpcat40index
2
+ 1 wall wall 8277 1 12 wall wall Wall n04546855 wall.n.01 wall 1
3
+ 2 chair chair 4646 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3
4
+ 22 books book 1678 23 2 book books Books n02870526 book.n.11 objects 39
5
+ 3 floor floor 1553 2 5 floor floor Floor n03365592 floor.n.01 floor 2
6
+ 5 door door 1483 8 12 door door Wall door n03221720 door.n.01 door 4
7
+ 1163 object object 1313 40 7 otherprop Objects objects 39
8
+ 16 window window 1209 9 13 window window Window n04587648 window.n.01 window 9
9
+ 4 table table 1170 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5
10
+ 56 trash can trash can 1090 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39
11
+ 13 pillow pillow 937 18 7 pillow pillow Objects pillow 3938244 n03938244 pillow.n.01 cushion 8
12
+ 15 picture picture 862 11 8 picture picture Picture n03931044 picture.n.01 picture 6
13
+ 41 ceiling ceiling 806 22 3 ceiling ceiling Ceiling n02990373 ceiling.n.01 ceiling 17
14
+ 26 box box 775 29 7 box box Objects n02883344 box.n.01 objects 39
15
+ 161 doorframe doorframe 768 8 12 door door Wall door doorframe.n.01 door 4
16
+ 19 monitor monitor 765 40 7 monitor otherprop Objects monitor monitor tv or monitor 3211117 n03782190 monitor.n.04 objects 39
17
+ 7 cabinet cabinet 731 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7
18
+ 9 desk desk 680 14 10 desk desk Table desk desk table 4379243 n03179701 desk.n.01 table 5
19
+ 8 shelf shelf 641 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31
20
+ 10 office chair office chair 595 5 4 chair chair Chair chair chair chair 3001627 n04373704 swivel_chair.n.01 chair 3
21
+ 31 towel towel 570 27 7 towel towel Objects n04459362 towel.n.01 towel 20
22
+ 6 couch couch 502 6 9 sofa sofa Sofa sofa sofa sofa 4256520 n04256520 sofa.n.01 sofa 10
23
+ 14 sink sink 488 34 7 sink sink Objects sink n04223580 sink.n.01 sink 15
24
+ 48 backpack backpack 479 40 7 backpack otherprop Objects n02769748 backpack.n.01 objects 39
25
+ 28 lamp lamp 419 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28
26
+ 11 bed bed 370 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11
27
+ 18 bookshelf bookshelf 360 10 6 bookshelf bookshelf Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31
28
+ 71 mirror mirror 349 19 7 mirror mirror Objects n03773035 mirror.n.01 mirror 21
29
+ 21 curtain curtain 347 16 13 curtain curtain Window curtain n03151077 curtain.n.01 curtain 12
30
+ 40 plant plant 331 40 7 plant otherprop Objects plant n00017222 plant.n.02 plant 14
31
+ 52 whiteboard whiteboard 327 30 7 whiteboard whiteboard Objects n03211616 display_panel.n.01 board_panel 35
32
+ 96 radiator radiator 322 39 6 radiator otherfurniture Furniture n04041069 radiator.n.02 misc 40
33
+ 22 book book 318 23 2 book books Books n02870526 book.n.11 objects 39
34
+ 29 kitchen cabinet kitchen cabinet 310 3 6 cabinet cabinet Furniture n02933112 cabinet.n.01 cabinet 7
35
+ 49 toilet paper toilet paper 291 40 7 toilet paper otherprop Objects n15075141 toilet_tissue.n.01 objects 39
36
+ 29 kitchen cabinets kitchen cabinet 289 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7
37
+ 23 armchair armchair 281 5 4 chair chair Chair chair chair chair 3001627 n02738535 armchair.n.01 chair 3
38
+ 63 shoes shoe 272 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38
39
+ 24 coffee table coffee table 258 7 10 coffee table table Table table table table 4379243 n03063968 coffee_table.n.01 table 5
40
+ 17 toilet toilet 256 33 7 toilet toilet Objects toilet toilet n04446276 toilet.n.01 toilet 18
41
+ 47 bag bag 252 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39
42
+ 32 clothes clothes 248 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38
43
+ 46 keyboard keyboard 246 40 7 keyboard otherprop Objects keyboard computer keyboard 3085013 n03085013 computer_keyboard.n.01 objects 39
44
+ 65 bottle bottle 226 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
45
+ 97 recycling bin recycling bin 225 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39
46
+ 34 nightstand nightstand 224 32 6 night stand night stand Furniture night_stand night_stand n03015254 chest_of_drawers.n.01 chest_of_drawers 13
47
+ 38 stool stool 221 40 7 stool otherprop Objects stool n04326896 stool.n.01 stool 19
48
+ 33 tv tv 219 25 11 television television TV tv or monitor 3211117 n03211117 display.n.06 tv_monitor 22
49
+ 75 file cabinet file cabinet 217 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7
50
+ 36 dresser dresser 213 17 6 dresser dresser Furniture dresser dresser n03015254 chest_of_drawers.n.01 chest_of_drawers 13
51
+ 64 computer tower computer tower 203 40 7 computer otherprop Objects n03082979 computer.n.01 objects 39
52
+ 32 clothing clothes 165 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38
53
+ 101 telephone telephone 164 40 7 telephone otherprop Objects telephone 4401088 n04401088 telephone.n.01 objects 39
54
+ 130 cup cup 157 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39
55
+ 27 refrigerator refrigerator 154 24 6 refridgerator refridgerator Furniture n04070727 refrigerator.n.01 appliances 37
56
+ 44 end table end table 147 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5
57
+ 131 jacket jacket 146 40 7 jacket otherprop Objects n03589791 jacket.n.01 clothes 38
58
+ 55 shower curtain shower curtain 144 28 7 shower curtain shower curtain Objects curtain n04209239 shower_curtain.n.01 curtain 12
59
+ 42 bathtub bathtub 144 36 7 bathtub bathtub Objects bathtub bathtub tub 2808440 n02808440 bathtub.n.01 bathtub 25
60
+ 59 microwave microwave 141 40 7 microwave otherprop Objects microwave 3761084 n03761084 microwave.n.02 appliances 37
61
+ 159 kitchen counter kitchen counter 140 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26
62
+ 74 sofa chair sofa chair 129 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3
63
+ 82 paper towel dispenser paper towel dispenser 129 40 7 paper towel dispenser otherprop Objects objects 39
64
+ 1164 bathroom vanity bathroom vanity 126 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 table 5
65
+ 93 suitcase suitcase 118 40 7 luggage otherprop Objects n02773838 bag.n.06 objects 39
66
+ 77 laptop laptop 111 40 7 laptop otherprop Objects laptop laptop 3642806 n03642806 laptop.n.01 objects 39
67
+ 67 ottoman ottoman 111 39 6 ottoman otherfurniture Furniture stool n03380724 footstool.n.01 stool 19
68
+ 128 shower walls shower wall 109 1 12 wall wall Wall n04546855 wall.n.01 wall 1
69
+ 50 printer printer 106 40 7 printer otherprop Objects printer 4004475 n04004475 printer.n.03 appliances 37
70
+ 35 counter counter 104 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26
71
+ 69 board board 100 38 7 board otherstructure Objects board_panel 35
72
+ 100 soap dispenser soap dispenser 99 40 7 otherprop Objects n04254120 soap_dispenser.n.01 objects 39
73
+ 62 stove stove 95 38 7 stove otherstructure Objects stove 4330267 n04330267 stove.n.02 appliances 37
74
+ 105 light light 93 38 7 light otherstructure Objects n03665366 light.n.02 lighting 28
75
+ 1165 closet wall closet wall 90 1 12 wall wall Wall n04546855 wall.n.01 wall 1
76
+ 165 mini fridge mini fridge 87 24 6 refridgerator refridgerator Furniture n03273913 electric_refrigerator.n.01 appliances 37
77
+ 7 cabinets cabinet 79 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7
78
+ 5 doors door 76 8 12 door door Wall door n03221720 door.n.01 door 4
79
+ 76 fan fan 75 40 7 fan otherprop Objects n03320046 fan.n.01 misc 40
80
+ 230 tissue box tissue box 73 40 7 tissue box otherprop Objects n02883344 box.n.01 objects 39
81
+ 54 blanket blanket 72 40 7 blanket otherprop Objects n02849154 blanket.n.01 objects 39
82
+ 125 bathroom stall bathroom stall 71 38 7 otherstructure Objects n02873839 booth.n.02 misc 40
83
+ 72 copier copier 70 40 7 otherprop Objects n03257586 duplicator.n.01 appliances 37
84
+ 68 bench bench 66 39 6 bench otherfurniture Furniture bench bench 2828884 n02828884 bench.n.01 seating 34
85
+ 145 bar bar 66 38 7 bar otherstructure Objects n02788689 bar.n.03 misc 40
86
+ 157 soap dish soap dish 65 40 7 soap dish otherprop Objects n04254009 soap_dish.n.01 objects 39
87
+ 1166 laundry hamper laundry hamper 65 40 7 laundry basket otherprop Objects objects 39
88
+ 132 storage bin storage bin 63 40 7 storage bin otherprop Objects objects 39
89
+ 1167 bathroom stall door bathroom stall door 62 8 12 door door Wall door n03221720 door.n.01 door 4
90
+ 232 light switch light switch 61 38 7 light switch otherstructure Objects n04372370 switch.n.01 misc 40
91
+ 134 coffee maker coffee maker 61 40 7 otherprop Objects n03063338 coffee_maker.n.01 appliances 37
92
+ 51 tv stand tv stand 61 39 6 tv stand otherfurniture Furniture tv_stand n03290653 entertainment_center.n.01 furniture 36
93
+ 250 decoration decoration 60 40 7 otherprop Objects n03169390 decoration.n.01 misc 40
94
+ 1168 ceiling light ceiling light 59 38 7 light otherstructure Objects n03665366 light.n.02 lighting 28
95
+ 342 range hood range hood 59 38 7 range hood otherstructure Objects range_hood n04053677 range_hood.n.01 misc 40
96
+ 89 blackboard blackboard 58 38 7 blackboard otherstructure Objects n02846511 blackboard.n.01 board_panel 35
97
+ 103 clock clock 58 40 7 clock otherprop Objects clock 3046257 n03046257 clock.n.01 objects 39
98
+ 99 wardrobe closet wardrobe 54 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36
99
+ 95 rail rail 53 38 7 railing otherstructure Objects n04047401 railing.n.01 railing 30
100
+ 154 bulletin board bulletin board 53 38 7 board otherstructure Objects n03211616 display_panel.n.01 board_panel 35
101
+ 140 mat mat 52 20 5 floor mat floor mat Floor n03727837 mat.n.01 floor 2
102
+ 1169 trash bin trash bin 52 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39
103
+ 193 ledge ledge 51 38 7 otherstructure Objects n09337253 ledge.n.01 misc 40
104
+ 116 seat seat 49 39 6 furniture otherfurniture Furniture n04161981 seat.n.03 furniture 36
105
+ 202 mouse mouse 49 40 7 mouse otherprop Objects n03793489 mouse.n.04 objects 39
106
+ 73 basket basket 48 40 7 basket otherprop Objects basket 2801938 n02801938 basket.n.01 objects 39
107
+ 78 shower shower 48 38 7 otherstructure Objects n04208936 shower.n.01 shower 23
108
+ 1170 dumbbell dumbbell 48 40 7 otherprop Objects n03255030 dumbbell.n.01 objects 39
109
+ 79 paper paper 46 26 7 paper paper Objects n14974264 paper.n.01 objects 39
110
+ 80 person person 46 31 7 person person Objects person n05217688 person.n.02 misc 40
111
+ 141 windowsill windowsill 45 38 7 otherstructure Objects n04590263 windowsill.n.01 window 9
112
+ 57 closet closet 45 39 6 wardrobe otherfurniture Furniture wardrobe misc 40
113
+ 102 bucket bucket 45 40 7 bucket otherprop Objects n02909870 bucket.n.01 misc 40
114
+ 261 sign sign 44 40 7 sign otherprop Objects n04217882 signboard.n.01 objects 39
115
+ 118 speaker speaker 43 40 7 speaker otherprop Objects speaker 3691459 n03691459 loudspeaker.n.01 objects 39
116
+ 136 dishwasher dishwasher 43 38 7 dishwasher otherstructure Objects dishwasher 3207941 n03207941 dishwasher.n.01 appliances 37
117
+ 98 container container 43 40 7 container otherprop Objects n03094503 container.n.01 objects 39
118
+ 1171 stair rail stair rail 42 38 7 banister otherstructure Objects n02788148 bannister.n.02 railing 30
119
+ 170 shower curtain rod shower curtain rod 42 40 7 otherprop Objects curtain 12
120
+ 1172 tube tube 41 40 7 otherprop Objects misc 40
121
+ 1173 bathroom cabinet bathroom cabinet 39 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7
122
+ 79 papers paper 39 26 7 paper paper Objects n14974264 paper.n.01 objects 39
123
+ 221 storage container storage container 39 40 7 container otherprop Objects objects 39
124
+ 570 paper bag paper bag 39 37 7 bag bag Objects n04122825 sack.n.01 objects 39
125
+ 138 paper towel roll paper towel roll 39 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20
126
+ 168 ball ball 39 40 7 ball otherprop Objects objects 39
127
+ 276 closet doors closet door 38 8 12 door door Wall door n03221720 door.n.01 door 4
128
+ 106 laundry basket laundry basket 37 40 7 laundry basket otherprop Objects basket 2801938 n03050864 clothes_hamper.n.01 objects 39
129
+ 214 cart cart 37 40 7 cart otherprop Objects n03484083 handcart.n.01 shelving 31
130
+ 276 closet door closet door 35 8 12 door door Wall door n03221720 door.n.01 door 4
131
+ 323 dish rack dish rack 35 40 7 dish rack otherprop Objects n03207630 dish_rack.n.01 objects 39
132
+ 58 stairs stairs 35 38 7 stairs otherstructure Objects n04298308 stairway.n.01 stairs 16
133
+ 86 blinds blinds 35 13 13 blinds blinds Window n02851099 blind.n.03 blinds 32
134
+ 2 stack of chairs chair 35 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3
135
+ 399 purse purse 34 40 7 purse otherprop Objects n02774152 bag.n.04 objects 39
136
+ 121 bicycle bicycle 33 40 7 bicycle otherprop Objects bicycle 2834778 n02834778 bicycle.n.01 objects 39
137
+ 185 tray tray 32 40 7 tray otherprop Objects n04476259 tray.n.01 objects 39
138
+ 300 plunger plunger 30 40 7 otherprop Objects n03970156 plunger.n.03 objects 39
139
+ 180 paper cutter paper cutter 30 40 7 paper cutter otherprop Objects n03886940 paper_cutter.n.01 objects 39
140
+ 163 toilet paper dispenser toilet paper dispenser 29 40 7 otherprop Objects objects 39
141
+ 26 boxes box 29 29 7 box box Objects n02883344 box.n.01 objects 39
142
+ 66 bin bin 28 40 7 bin otherprop Objects n02839910 bin.n.01 objects 39
143
+ 208 toilet seat cover dispenser toilet seat cover dispenser 28 40 7 otherprop Objects objects 39
144
+ 112 guitar guitar 28 40 7 guitar otherprop Objects guitar guitar 3467517 n03467517 guitar.n.01 objects 39
145
+ 540 mailboxes mailbox 28 29 7 box box Objects mailbox 3710193 n03710193 mailbox.n.01 misc 40
146
+ 395 handicap bar handicap bar 27 38 7 bar otherstructure Objects misc 40
147
+ 166 fire extinguisher fire extinguisher 27 40 7 fire extinguisher otherprop Objects n03345837 fire_extinguisher.n.01 misc 40
148
+ 122 ladder ladder 27 39 6 ladder otherfurniture Furniture stairs n03632277 ladder.n.01 stairs 16
149
+ 120 column column 26 38 7 column otherstructure Objects n03074380 column.n.06 column 24
150
+ 107 pipe pipe 25 40 7 pipe otherprop Objects n03944672 pipe.n.02 misc 40
151
+ 283 vacuum cleaner vacuum cleaner 25 40 7 otherprop Objects n04517823 vacuum.n.04 objects 39
152
+ 88 plate plate 24 40 7 plate otherprop Objects n03959485 plate.n.04 objects 39
153
+ 90 piano piano 24 39 6 piano otherfurniture Furniture piano piano 3928116 n03928116 piano.n.01 furniture 36
154
+ 177 water cooler water cooler 24 39 6 water cooler otherfurniture Furniture n04559166 water_cooler.n.01 misc 40
155
+ 1174 cd case cd case 24 40 7 otherprop Objects objects 39
156
+ 562 bowl bowl 24 40 7 bowl otherprop Objects bowl bowl 2880940 n02880940 bowl.n.03 objects 39
157
+ 1175 closet rod closet rod 24 40 7 otherprop Objects n04100174 rod.n.01 misc 40
158
+ 1156 bathroom counter bathroom counter 24 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26
159
+ 84 oven oven 23 38 7 oven otherstructure Objects n03862676 oven.n.01 appliances 37
160
+ 104 stand stand 23 39 6 stand otherfurniture Furniture table table table 4379243 n04301000 stand.n.04 table 5
161
+ 229 scale scale 23 40 7 scale otherprop Objects n04141975 scale.n.07 objects 39
162
+ 70 washing machine washing machine 23 39 6 washing machine otherfurniture Furniture washing_machine 4554684 n04554684 washer.n.03 appliances 37
163
+ 325 broom broom 22 40 7 broom otherprop Objects n02906734 broom.n.01 objects 39
164
+ 169 hat hat 22 40 7 hat otherprop Objects n03497657 hat.n.01 clothes 38
165
+ 128 shower wall shower wall 22 1 12 wall wall Wall n04208936 shower.n.01 wall 1
166
+ 331 guitar case guitar case 21 40 7 guitar case otherprop Objects objects 39
167
+ 87 rack rack 21 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31
168
+ 488 water pitcher water pitcher 21 40 7 pitcher otherprop Objects n03950228 pitcher.n.02 objects 39
169
+ 776 laundry detergent laundry detergent 21 40 7 otherprop Objects objects 39
170
+ 370 hair dryer hair dryer 21 40 7 hair dryer otherprop Objects n03483316 hand_blower.n.01 objects 39
171
+ 191 pillar pillar 21 38 7 column otherstructure Objects n03073977 column.n.07 column 24
172
+ 748 divider divider 20 40 7 otherprop Objects wall 1
173
+ 242 power outlet power outlet 19 40 7 otherprop Objects misc 40
174
+ 45 dining table dining table 19 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5
175
+ 417 shower floor shower floor 19 2 5 floor floor Floor n04208936 shower.n.01 floor 2
176
+ 70 washing machines washing machine 19 39 6 washing machine otherfurniture Furniture washing_machine 4554684 n04554684 washer.n.03 appliances 37
177
+ 188 shower door shower door 19 8 12 door door Wall door n04208936 shower.n.01 door 4
178
+ 1176 coffee kettle coffee kettle 18 40 7 pot otherprop Objects n03612814 kettle.n.01 objects 39
179
+ 1177 wardrobe cabinet wardrobe 18 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36
180
+ 1178 structure structure 18 38 7 otherstructure Objects misc 40
181
+ 18 bookshelves bookshelf 17 10 6 bookshelf bookshelf Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31
182
+ 110 clothes dryer clothes dryer 17 39 6 otherfurniture Furniture n03251766 dryer.n.01 appliances 37
183
+ 148 toaster toaster 17 40 7 toaster otherprop Objects n04442312 toaster.n.02 appliances 37
184
+ 63 shoe shoe 17 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38
185
+ 155 ironing board ironing board 16 39 6 ironing board otherfurniture Furniture n03586090 ironing_board.n.01 objects 39
186
+ 572 alarm clock alarm clock 16 40 7 alarm clock otherprop Objects clock 3046257 n02694662 alarm_clock.n.01 objects 39
187
+ 1179 shower head shower head 15 38 7 otherstructure Objects shower 23
188
+ 28 lamp base lamp 15 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28
189
+ 392 water bottle water bottle 15 40 7 bottle otherprop Objects bottle bottle 2876657 n04557648 water_bottle.n.01 objects 39
190
+ 1180 keyboard piano keyboard piano 15 39 6 piano otherfurniture Furniture piano piano 3928116 n03928116 piano.n.01 furniture 36
191
+ 609 projector screen projector screen 15 38 7 projector screen otherstructure Objects misc 40
192
+ 1181 case of water bottles case of water bottles 15 40 7 otherprop Objects objects 39
193
+ 195 toaster oven toaster oven 14 40 7 toaster oven otherprop Objects n04442441 toaster_oven.n.01 appliances 37
194
+ 581 music stand music stand 14 39 6 music stand otherfurniture Furniture n03801760 music_stand.n.01 furniture 36
195
+ 58 staircase stairs 14 38 7 stairs otherstructure Objects n04298308 stairway.n.01 stairs 16
196
+ 1182 coat rack coat rack 14 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 3
197
+ 1183 storage organizer storage organizer 14 40 7 otherprop Objects shelving 3
198
+ 139 machine machine 14 40 7 machine otherprop Objects n03699975 machine.n.01 appliances 37
199
+ 1184 folded chair folded chair 14 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3
200
+ 1185 fire alarm fire alarm 14 40 7 otherprop Objects n03343737 fire_alarm.n.02 misc 40
201
+ 156 fireplace fireplace 13 38 7 fireplace otherstructure Objects n03346455 fireplace.n.01 fireplace 27
202
+ 408 vent vent 13 40 7 otherprop Objects n04526241 vent.n.01 misc 40
203
+ 213 furniture furniture 13 39 6 furniture otherfurniture Furniture n03405725 furniture.n.01 furniture 36
204
+ 1186 power strip power strip 13 40 7 otherprop Objects objects 39
205
+ 1187 calendar calendar 13 40 7 otherprop Objects objects 39
206
+ 1188 poster poster 13 11 8 picture picture Picture n03931044 picture.n.01 picture 6
207
+ 115 toilet paper holder toilet paper holder 13 40 7 toilet paper holder otherprop Objects objects 39
208
+ 1189 potted plant potted plant 12 40 7 plant otherprop Objects plant n00017222 plant.n.02 plant 14
209
+ 304 stuffed animal stuffed animal 12 40 7 stuffed animal otherprop Objects n04399382 teddy.n.01 objects 39
210
+ 1190 luggage luggage 12 40 7 luggage otherprop Objects n02774630 baggage.n.01 objects 39
211
+ 21 curtains curtain 12 16 13 curtain curtain Window curtain n03151077 curtain.n.01 curtain 12
212
+ 312 headphones headphones 12 40 7 otherprop Objects n03261776 earphone.n.01 objects 39
213
+ 233 crate crate 12 39 6 crate otherfurniture Furniture n03127925 crate.n.01 objects 39
214
+ 286 candle candle 12 40 7 candle otherprop Objects lamp n02948072 candle.n.01 objects 39
215
+ 264 projector projector 12 40 7 projector otherprop Objects n04009552 projector.n.02 objects 39
216
+ 110 clothes dryers clothes dryer 12 39 6 otherfurniture Furniture n03251766 dryer.n.01 appliances 37
217
+ 1191 mattress mattress 12 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11
218
+ 356 dustpan dustpan 12 40 7 otherprop Objects n03259009 dustpan.n.02 objects 39
219
+ 25 drawer drawer 11 39 6 drawer otherfurniture Furniture n03233905 drawer.n.01 furniture 36
220
+ 750 rod rod 11 40 7 otherprop Objects pistol 3948459 n03427202 gat.n.01 misc 40
221
+ 269 globe globe 11 40 7 globe otherprop Objects objects 39
222
+ 307 footrest footrest 11 39 6 foot rest otherfurniture Furniture stool n03380724 footstool.n.01 stool 19
223
+ 410 piano bench piano bench 11 39 6 piano bench otherfurniture Furniture bench bench 2828884 n02828884 bench.n.01 seating 34
224
+ 730 breakfast bar breakfast bar 11 38 7 bar otherstructure Objects counter 26
225
+ 216 step stool step stool 11 40 7 step stool otherprop Objects stool n04315713 step_stool.n.01 stool 19
226
+ 1192 hand rail hand rail 11 38 7 railing otherstructure Objects railing 30
227
+ 119 vending machine vending machine 11 40 7 machine otherprop Objects n04525305 vending_machine.n.01 appliances 37
228
+ 682 ceiling fan ceiling fan 11 40 7 fan otherprop Objects n03320046 fan.n.01 misc 40
229
+ 434 swiffer swiffer 11 40 7 otherprop Objects objects 39
230
+ 126 foosball table foosball table 11 39 6 foosball table otherfurniture Furniture table table table 4379243 n04379243 table.n.02 table 5
231
+ 919 jar jar 11 40 7 jar otherprop Objects jar 3593526 n03593526 jar.n.01 objects 39
232
+ 85 footstool footstool 11 39 6 ottoman otherfurniture Furniture stool n03380724 footstool.n.01 stool 19
233
+ 1193 folded table folded table 10 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5
234
+ 108 round table round table 10 7 10 table table Table table table table 4379243 n04114554 round_table.n.02 table 5
235
+ 135 hamper hamper 10 40 7 basket otherprop Objects basket 2801938 n03482405 hamper.n.02 objects 39
236
+ 1194 poster tube poster tube 10 40 7 otherprop Objects objects 39
237
+ 432 case case 10 40 7 case otherprop Objects objects 39
238
+ 53 carpet carpet 10 40 7 rug otherprop Objects n04118021 rug.n.01 floor 2
239
+ 1195 thermostat thermostat 10 40 7 otherprop Objects n04422875 thermostat.n.01 misc 40
240
+ 111 coat coat 10 40 7 jacket otherprop Objects n03057021 coat.n.01 clothes 38
241
+ 305 water fountain water fountain 10 38 7 water fountain otherstructure Objects n03241335 drinking_fountain.n.01 misc 40
242
+ 1125 smoke detector smoke detector 10 40 7 otherprop Objects misc 40
243
+ 13 pillows pillow 9 18 7 pillow pillow Objects pillow 3938244 n03938244 pillow.n.01 cushion 8
244
+ 1196 flip flops flip flops 9 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38
245
+ 1197 cloth cloth 9 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38
246
+ 1198 banner banner 9 40 7 otherprop Objects n02788021 banner.n.01 misc 40
247
+ 1199 clothes hanger clothes hanger 9 40 7 otherprop Objects n03057920 coat_hanger.n.01 objects 39
248
+ 1200 whiteboard eraser whiteboard eraser 9 40 7 otherprop Objects objects 39
249
+ 378 iron iron 9 40 7 otherprop Objects n03584829 iron.n.04 objects 39
250
+ 591 instrument case instrument case 9 40 7 case otherprop Objects objects 39
251
+ 49 toilet paper rolls toilet paper 9 40 7 toilet paper otherprop Objects n15075141 toilet_tissue.n.01 objects 39
252
+ 92 soap soap 9 40 7 soap otherprop Objects n04253437 soap.n.01 objects 39
253
+ 1098 block block 9 40 7 otherprop Objects misc 40
254
+ 291 wall hanging wall hanging 8 40 7 otherprop Objects n03491178 hanging.n.01 picture 6
255
+ 1063 kitchen island kitchen island 8 38 7 kitchen island otherstructure Objects n03620600 kitchen_island.n.01 counter 26
256
+ 107 pipes pipe 8 38 7 otherstructure Objects misc 40
257
+ 1135 toothbrush toothbrush 8 40 7 toothbrush otherprop Objects n04453156 toothbrush.n.01 objects 39
258
+ 189 shirt shirt 8 40 7 otherprop Objects n04197391 shirt.n.01 clothes 38
259
+ 245 cutting board cutting board 8 40 7 cutting board otherprop Objects n03025513 chopping_board.n.01 objects 39
260
+ 194 vase vase 8 40 7 vase otherprop Objects vase jar 3593526 n04522168 vase.n.01 objects 39
261
+ 1201 shower control valve shower control valve 8 38 7 otherstructure Objects n04208936 shower.n.01 shower 23
262
+ 386 exercise machine exercise machine 8 40 7 machine otherprop Objects gym_equipment 33
263
+ 1202 compost bin compost bin 8 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39
264
+ 857 shorts shorts 8 40 7 shorts otherprop Objects clothes 38
265
+ 452 tire tire 8 40 7 otherprop Objects n04440749 tire.n.01 objects 39
266
+ 1203 teddy bear teddy bear 7 40 7 stuffed animal otherprop Objects n04399382 teddy.n.01 objects 39
267
+ 346 bathrobe bathrobe 7 40 7 otherprop Objects n02807616 bathrobe.n.01 clothes 38
268
+ 152 handrail handrail 7 38 7 railing otherstructure Objects n02788148 bannister.n.02 railing 30
269
+ 83 faucet faucet 7 40 7 faucet otherprop Objects faucet 3325088 n03325088 faucet.n.01 misc 40
270
+ 1204 pantry wall pantry wall 7 1 12 wall wall Wall n04546855 wall.n.01 wall 1
271
+ 726 thermos thermos 7 40 7 flask otherprop Objects bottle bottle 2876657 n04422727 thermos.n.01 objects 39
272
+ 61 rug rug 7 40 7 rug otherprop Objects n04118021 rug.n.01 floor 2
273
+ 39 couch cushions cushion 7 18 7 pillow pillow Objects n03151500 cushion.n.03 cushion 8
274
+ 1117 tripod tripod 7 39 6 stand otherfurniture Furniture n04485082 tripod.n.01 objects 39
275
+ 540 mailbox mailbox 7 29 7 box box Objects mailbox 3710193 n03710193 mailbox.n.01 misc 40
276
+ 1205 tupperware tupperware 7 40 7 otherprop Objects objects 39
277
+ 415 shoe rack shoe rack 7 40 7 shoe rack otherprop Objects shelving 31
278
+ 31 towels towel 6 27 7 towel towel Objects n04459362 towel.n.01 towel 20
279
+ 1206 beer bottles beer bottle 6 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
280
+ 153 treadmill treadmill 6 39 6 treadmill otherfurniture Furniture n04477387 treadmill.n.01 gym_equipment 33
281
+ 1207 salt salt 6 40 7 otherprop Objects objects 39
282
+ 129 chest chest 6 39 6 chest otherfurniture Furniture dresser dresser chest_of_drawers 13
283
+ 220 dispenser dispenser 6 40 7 otherprop Objects n03210683 dispenser.n.01 objects 39
284
+ 1208 mirror doors mirror door 6 8 12 door door Wall door n03221720 door.n.01 door 4
285
+ 231 remote remote 6 40 7 otherprop Objects remote_control 4074963 n04074963 remote_control.n.01 objects 39
286
+ 1209 folded ladder folded ladder 6 39 6 ladder otherfurniture Furniture stairs n03632277 ladder.n.01 misc 40
287
+ 39 cushion cushion 6 18 7 pillow pillow Objects n03151500 cushion.n.03 cushion 8
288
+ 1210 carton carton 6 40 7 otherprop Objects objects 39
289
+ 117 step step 6 38 7 otherstructure Objects n04314914 step.n.04 misc 40
290
+ 822 drying rack drying rack 6 39 6 drying rack otherfurniture Furniture shelving 31
291
+ 238 slippers slipper 6 40 7 shoe otherprop Objects n04241394 slipper.n.01 clothes 38
292
+ 143 pool table pool table 6 39 6 pool table otherfurniture Furniture table table table 4379243 n03982430 pool_table.n.01 table 5
293
+ 1211 soda stream soda stream 6 40 7 otherprop Objects objects 39
294
+ 228 toilet brush toilet brush 6 40 7 toilet brush otherprop Objects objects 39
295
+ 494 loft bed loft bed 6 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11
296
+ 226 cooking pot cooking pot 6 40 7 pot otherprop Objects objects 39
297
+ 91 heater heater 6 39 6 heater otherfurniture Furniture n03508101 heater.n.01 misc 40
298
+ 1072 messenger bag messenger bag 6 37 7 bag bag Objects objects 39
299
+ 435 stapler stapler 6 40 7 stapler otherprop Objects n04303497 stapler.n.01 objects 39
300
+ 1165 closet walls closet wall 5 1 12 wall wall Wall n04546855 wall.n.01 wall 1
301
+ 345 scanner scanner 5 40 7 otherprop Objects appliances 37
302
+ 893 elliptical machine elliptical machine 5 40 7 machine otherprop Objects gym_equipment 33
303
+ 621 kettle kettle 5 40 7 pot otherprop Objects n03612814 kettle.n.01 objects 39
304
+ 1212 metronome metronome 5 40 7 otherprop Objects n03757604 metronome.n.01 objects 39
305
+ 297 dumbell dumbell 5 40 7 otherprop Objects objects 39
306
+ 1213 music book music book 5 23 2 book books Books n02870526 book.n.11 objects 39
307
+ 1214 rice cooker rice cooker 5 40 7 otherprop Objects objects 39
308
+ 1215 dart board dart board 5 38 7 board otherstructure Objects n03162940 dartboard.n.01 objects 39
309
+ 529 sewing machine sewing machine 5 40 7 sewing machine otherprop Objects n04179913 sewing_machine.n.01 objects 39
310
+ 1216 grab bar grab bar 5 38 7 railing otherstructure Objects railing 30
311
+ 1217 flowerpot flowerpot 5 40 7 vase otherprop Objects vase jar 3593526 n04522168 vase.n.01 objects 39
312
+ 1218 painting painting 5 11 8 picture picture Picture n03931044 picture.n.01 picture 6
313
+ 1219 railing railing 5 38 7 railing otherstructure Objects n04047401 railing.n.01 railing 30
314
+ 1220 stair stair 5 38 7 stairs otherstructure Objects stairs n04314914 step.n.04 stairs 16
315
+ 525 toolbox toolbox 5 39 6 chest otherfurniture Furniture n04452615 toolbox.n.01 objects 39
316
+ 204 nerf gun nerf gun 5 40 7 otherprop Objects objects 39
317
+ 693 binders binder 5 40 7 binder otherprop Objects objects 39
318
+ 179 desk lamp desk lamp 5 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28
319
+ 1221 quadcopter quadcopter 5 40 7 otherprop Objects objects 39
320
+ 1222 pitcher pitcher 5 40 7 pitcher otherprop Objects n03950228 pitcher.n.02 objects 39
321
+ 1223 hanging hanging 5 40 7 otherprop Objects misc 40
322
+ 1224 mail mail 5 40 7 otherprop Objects misc 40
323
+ 1225 closet ceiling closet ceiling 5 22 3 ceiling ceiling Ceiling n02990373 ceiling.n.01 ceiling 17
324
+ 1226 hoverboard hoverboard 5 40 7 otherprop Objects objects 39
325
+ 1227 beanbag chair beanbag chair 5 39 6 bean bag otherfurniture Furniture n02816656 beanbag.n.01 chair 3
326
+ 571 water heater water heater 5 40 7 water heater otherprop Objects n04560113 water_heater.n.01 misc 40
327
+ 1228 spray bottle spray bottle 5 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
328
+ 556 rope rope 5 40 7 rope otherprop Objects n04108268 rope.n.01 objects 39
329
+ 280 plastic container plastic container 5 40 7 container otherprop Objects objects 39
330
+ 1229 soap bottle soap bottle 5 40 7 soap otherprop Objects objects 39
331
+ 1230 ikea bag ikea bag 4 37 7 bag bag Objects 2773838 n02773838 bag.n.06 objects 39
332
+ 1231 sleeping bag sleeping bag 4 40 7 otherprop Objects n04235860 sleeping_bag.n.01 objects 39
333
+ 1232 duffel bag duffel bag 4 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39
334
+ 746 frying pan frying pan 4 40 7 frying pan otherprop Objects n03400231 frying_pan.n.01 objects 39
335
+ 1233 oven mitt oven mitt 4 40 7 otherprop Objects objects 39
336
+ 1234 pot pot 4 40 7 pot otherprop Objects n04235860 sleeping_bag.n.01 objects 39
337
+ 144 hand dryer hand dryer 4 40 7 otherprop Objects objects 39
338
+ 282 dollhouse dollhouse 4 39 6 doll house otherfurniture Furniture n03219483 dollhouse.n.01 objects 39
339
+ 167 shampoo bottle shampoo bottle 4 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
340
+ 1235 hair brush hair brush 4 40 7 otherprop Objects n02908217 brush.n.02 objects 39
341
+ 1236 tennis racket tennis racket 4 40 7 otherprop Objects n04409806 tennis_racket.n.01 objects 39
342
+ 1237 display case display case 4 40 7 case otherprop Objects objects 39
343
+ 234 ping pong table ping pong table 4 39 6 ping pong table otherfurniture Furniture table table table 4379243 n04379243 table.n.02 table 5
344
+ 563 boiler boiler 4 40 7 otherprop Objects misc 40
345
+ 1238 bag of coffee beans bag of coffee beans 4 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39
346
+ 1239 bananas banana 4 40 7 otherprop Objects n00021265 food.n.01 objects 39
347
+ 1240 carseat carseat 4 40 7 otherprop Objects misc 40
348
+ 366 helmet helmet 4 40 7 otherprop Objects helmet 3513137 n03513137 helmet.n.02 clothes 38
349
+ 816 umbrella umbrella 4 40 7 umbrella otherprop Objects n04507155 umbrella.n.01 objects 39
350
+ 1241 coffee box coffee box 4 40 7 otherprop Objects objects 39
351
+ 719 envelope envelope 4 40 7 envelope otherprop Objects n03291819 envelope.n.01 objects 39
352
+ 284 wet floor sign wet floor sign 4 40 7 sign otherprop Objects misc 40
353
+ 1242 clothing rack clothing rack 4 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31
354
+ 247 controller controller 4 40 7 otherprop Objects n03096960 control.n.09 objects 39
355
+ 1243 bath walls bathroom wall 4 1 12 wall wall Wall n04546855 wall.n.01 wall 1
356
+ 1244 podium podium 4 39 6 otherfurniture Furniture n03159640 dais.n.01 furniture 36
357
+ 1245 storage box storage box 4 29 7 box box Objects n02883344 box.n.01 objects 39
358
+ 1246 dolly dolly 4 40 7 otherprop Objects misc 40
359
+ 1247 shampoo shampoo 3 40 7 otherprop Objects n04183516 shampoo.n.01 objects 39
360
+ 592 paper tray paper tray 3 40 7 paper tray otherprop Objects objects 39
361
+ 385 cabinet door cabinet door 3 8 12 door door Wall door door 4
362
+ 1248 changing station changing station 3 40 7 otherprop Objects misc 40
363
+ 1249 poster printer poster printer 3 40 7 printer otherprop Objects printer 4004475 n04004475 printer.n.03 appliances 37
364
+ 133 screen screen 3 40 7 otherprop Objects n03151077 curtain.n.01 curtain 12
365
+ 301 soap bar soap bar 3 38 7 bar otherstructure Objects objects 39
366
+ 1250 crutches crutches 3 40 7 otherprop Objects n03141823 crutch.n.01 objects 39
367
+ 379 studio light studio light 3 38 7 light otherstructure Objects lighting 28
368
+ 130 stack of cups cup 3 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39
369
+ 1251 toilet flush button toilet flush button 3 40 7 otherprop Objects objects 39
370
+ 450 trunk trunk 3 40 7 otherprop Objects misc 40
371
+ 1252 grocery bag grocery bag 3 37 7 bag bag Objects suitcase 2773838 n03461288 grocery_bag.n.01 objects 39
372
+ 316 plastic bin plastic bin 3 40 7 bin otherprop Objects objects 39
373
+ 1253 pizza box pizza box 3 29 7 box box Objects objects 39
374
+ 385 cabinet doors cabinet door 3 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 door 4
375
+ 1254 legs legs 3 31 7 person person Objects person n05217688 person.n.02 misc 40
376
+ 461 car car 3 40 7 car otherprop Objects car car 2958343 n02958343 car.n.01 misc 40
377
+ 1255 shaving cream shaving cream 3 40 7 otherprop Objects n04186051 shaving_cream.n.01 objects 39
378
+ 1256 luggage stand luggage stand 3 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31
379
+ 599 shredder shredder 3 40 7 otherprop Objects n04210120 shredder.n.01 objects 39
380
+ 281 statue statue 3 40 7 sculpture otherprop Objects n04306847 statue.n.01 misc 40
381
+ 1257 urinal urinal 3 33 7 toilet toilet Objects toilet toilet n04515991 urinal.n.01 toilet 18
382
+ 1258 hose hose 3 40 7 otherprop Objects n03539875 hose.n.03 misc 40
383
+ 1259 bike pump bike pump 3 40 7 otherprop Objects objects 39
384
+ 319 coatrack coatrack 3 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31
385
+ 1260 bear bear 3 40 7 otherprop Objects objects 39
386
+ 28 wall lamp lamp 3 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28
387
+ 1261 humidifier humidifier 3 40 7 otherprop Objects objects 39
388
+ 546 toothpaste toothpaste 3 40 7 toothpaste otherprop Objects objects 39
389
+ 1262 mouthwash bottle mouthwash bottle 3 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
390
+ 1263 poster cutter poster cutter 3 40 7 otherprop Objects objects 39
391
+ 1264 golf bag golf bag 3 37 7 bag bag Objects suitcase 2773838 n03445617 golf_bag.n.01 objects 39
392
+ 1265 food container food container 3 40 7 container otherprop Objects n03094503 container.n.01 objects 39
393
+ 1266 camera camera 3 40 7 otherprop Objects objects 39
394
+ 28 table lamp lamp 3 35 7 lamp lamp Objects lamp lamp 3636649 n04380533 table_lamp.n.01 lighting 28
395
+ 1267 yoga mat yoga mat 3 20 5 floor mat floor mat Floor n03727837 mat.n.01 floor 2
396
+ 1268 card card 3 40 7 otherprop Objects objects 39
397
+ 1269 mug mug 3 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39
398
+ 188 shower doors shower door 3 38 7 otherstructure Objects n04208936 shower.n.01 door 4
399
+ 689 cardboard cardboard 3 40 7 otherprop Objects objects 39
400
+ 1270 rack stand rack stand 3 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31
401
+ 1271 boxes of paper boxes of paper 3 29 7 box box Objects n02883344 box.n.01 objects 39
402
+ 1272 flag flag 3 40 7 otherprop Objects misc 40
403
+ 354 futon futon 3 39 6 mattress otherfurniture Furniture n03408444 futon.n.01 sofa 10
404
+ 339 magazine magazine 3 40 7 magazine otherprop Objects n06595351 magazine.n.01 objects 39
405
+ 1009 exit sign exit sign 3 40 7 exit sign otherprop Objects misc 40
406
+ 1273 rolled poster rolled poster 3 40 7 otherprop Objects objects 39
407
+ 1274 wheel wheel 3 40 7 otherprop Objects objects 39
408
+ 15 pictures picture 3 11 8 picture picture Picture n03931044 picture.n.01 picture 6
409
+ 1275 blackboard eraser blackboard eraser 3 40 7 eraser otherprop Objects n03294833 eraser.n.01 objects 39
410
+ 361 organizer organizer 3 40 7 otherprop Objects n03918737 personal_digital_assistant.n.01 objects 39
411
+ 1276 doll doll 3 40 7 toy otherprop Objects n03219135 doll.n.01 objects 39
412
+ 326 book rack book rack 3 39 6 bookrack otherfurniture Furniture objects 39
413
+ 1277 laundry bag laundry bag 3 40 7 laundry basket otherprop Objects basket 2801938 n03050864 clothes_hamper.n.01 objects 39
414
+ 1278 sponge sponge 3 40 7 otherprop Objects n01906749 sponge.n.04 objects 39
415
+ 116 seating seat 3 39 6 furniture otherfurniture Furniture n04161981 seat.n.03 furniture 36
416
+ 1184 folded chairs folded chair 2 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3
417
+ 1279 lotion bottle lotion bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
418
+ 212 can can 2 40 7 can otherprop Objects can 2946921 n02946921 can.n.01 objects 39
419
+ 1280 lunch box lunch box 2 40 7 otherprop Objects objects 39
420
+ 1281 food display food display 2 40 7 otherprop Objects misc 40
421
+ 794 storage shelf storage shelf 2 40 7 otherprop Objects shelving 31
422
+ 1282 sliding wood door sliding wood door 2 40 7 otherprop Objects door 4
423
+ 955 pants pants 2 40 7 otherprop Objects n04489008 trouser.n.01 clothes 38
424
+ 387 wood wood 2 40 7 otherprop Objects misc 40
425
+ 69 boards board 2 38 7 board otherstructure Objects board_panel 35
426
+ 65 bottles bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
427
+ 523 washcloth washcloth 2 40 7 otherprop Objects n04554523 washcloth.n.01 towel 20
428
+ 389 workbench workbench 2 39 6 bench otherfurniture Furniture bench table 4379243 n04600486 workbench.n.01 table 5
429
+ 29 open kitchen cabinet kitchen cabinet 2 3 6 cabinet cabinet Furniture n02933112 cabinet.n.01 cabinet 7
430
+ 1283 organizer shelf organizer shelf 2 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31
431
+ 146 frame frame 2 38 7 otherstructure Objects misc 40
432
+ 130 cups cup 2 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39
433
+ 372 exercise ball exercise ball 2 40 7 ball otherprop Objects n04285146 sports_equipment.n.01 gym_equipment 33
434
+ 289 easel easel 2 39 6 stand otherfurniture Furniture n03262809 easel.n.01 furniture 36
435
+ 440 garbage bag garbage bag 2 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39
436
+ 321 roomba roomba 2 40 7 otherprop Objects objects 39
437
+ 976 garage door garage door 2 38 7 garage door otherstructure Objects door door 4
438
+ 1256 luggage rack luggage stand 2 39 6 stand otherfurniture Furniture n04038440 shelving 31
439
+ 1284 bike lock bike lock 2 40 7 otherprop Objects objects 39
440
+ 1285 briefcase briefcase 2 40 7 otherprop Objects n02900705 briefcase.n.01 objects 39
441
+ 357 hand towel hand towel 2 27 7 towel towel Objects n03490006 hand_towel.n.01 towel 20
442
+ 1286 bath products bath product 2 40 7 otherprop Objects objects 39
443
+ 1287 star star 2 40 7 otherprop Objects n09444783 star.n.03 misc 40
444
+ 365 map map 2 40 7 map otherprop Objects n03720163 map.n.01 misc 40
445
+ 1288 coffee bean bag coffee bean bag 2 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39
446
+ 81 headboard headboard 2 39 6 headboard otherfurniture Furniture n03502200 headboard.n.01 bed 11
447
+ 1289 ipad ipad 2 40 7 otherprop Objects objects 39
448
+ 1290 display rack display rack 2 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31
449
+ 948 traffic cone traffic cone 2 40 7 cone otherprop Objects cone objects 39
450
+ 174 toiletry toiletry 2 40 7 otherprop Objects n04447443 toiletry.n.01 objects 39
451
+ 1028 canopy canopy 2 40 7 otherprop Objects misc 40
452
+ 1291 massage chair massage chair 2 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3
453
+ 1292 paper organizer paper organizer 2 40 7 otherprop Objects objects 39
454
+ 1005 barricade barricade 2 40 7 otherprop Objects misc 40
455
+ 235 platform platform 2 38 7 otherstructure Objects misc 40
456
+ 1293 cap cap 2 40 7 hat otherprop Objects n03497657 hat.n.01 clothes 38
457
+ 1294 dumbbell plates dumbbell plates 2 40 7 otherprop Objects objects 39
458
+ 1295 elevator elevator 2 38 7 otherstructure Objects misc 40
459
+ 1296 cooking pan cooking pan 2 40 7 pan otherprop Objects n03880531 pan.n.01 objects 39
460
+ 1297 trash bag trash bag 2 37 7 bag bag Objects objects 39
461
+ 1298 santa santa 2 40 7 otherprop Objects misc 40
462
+ 1299 jewelry box jewelry box 2 29 7 box box Objects n02883344 box.n.01 objects 39
463
+ 1300 boat boat 2 40 7 otherprop Objects misc 40
464
+ 1301 sock sock 2 21 7 clothes clothes Objects n04254777 sock.n.01 clothes 38
465
+ 1051 kinect kinect 2 40 7 kinect otherprop Objects objects 39
466
+ 566 crib crib 2 39 6 crib otherfurniture Furniture furniture 36
467
+ 1302 plastic storage bin plastic storage bin 2 40 7 container otherprop Objects n03094503 container.n.01 objects 39
468
+ 1062 cooler cooler 2 24 6 refridgerator refridgerator Furniture n03102654 cooler.n.01 appliances 37
469
+ 1303 kitchen apron kitchen apron 2 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38
470
+ 1304 dishwashing soap bottle dishwashing soap bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
471
+ 1305 xbox controller xbox controller 2 40 7 otherprop Objects objects 39
472
+ 1306 banana holder banana holder 2 40 7 otherprop Objects objects 39
473
+ 298 ping pong paddle ping pong paddle 2 40 7 otherprop Objects table 5
474
+ 1307 airplane airplane 2 40 7 otherprop Objects misc 40
475
+ 1308 conditioner bottle conditioner bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
476
+ 1309 tea kettle tea kettle 2 40 7 tea kettle otherprop Objects n04397768 teakettle.n.01 objects 39
477
+ 43 bedframe bedframe 2 39 6 otherfurniture Furniture n02822579 bedstead.n.01 bed 11
478
+ 1310 wood beam wood beam 2 38 7 otherstructure Objects beam 29
479
+ 593 toilet paper package toilet paper package 2 40 7 otherprop Objects objects 39
480
+ 1311 wall mounted coat rack wall mounted coat rack 2 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31
481
+ 1312 film light film light 2 40 7 otherprop Objects lighting 28
482
+ 749 ceiling lamp ceiling lamp 1 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28
483
+ 623 chain chain 1 40 7 otherprop Objects chair 3
484
+ 1313 sofa sofa 1 6 9 sofa sofa Sofa sofa sofa sofa 4256520 n04256520 sofa.n.01 sofa 10
485
+ 99 closet wardrobe wardrobe 1 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36
486
+ 265 sweater sweater 1 40 7 otherprop Objects n04370048 sweater.n.01 clothes 38
487
+ 1314 kitchen mixer kitchen mixer 1 40 7 otherprop Objects appliances 37
488
+ 99 wardrobe wardrobe 1 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36
489
+ 1315 water softener water softener 1 40 7 otherprop Objects misc 40
490
+ 448 banister banister 1 38 7 banister otherstructure Objects n02788148 bannister.n.02 railing 30
491
+ 257 trolley trolley 1 40 7 trolley otherprop Objects n04335435 streetcar.n.01 misc 40
492
+ 1316 pantry shelf pantry shelf 1 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31
493
+ 786 sofa bed sofa bed 1 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11
494
+ 801 loofa loofa 1 40 7 otherprop Objects objects 39
495
+ 972 shower faucet handle shower faucet handle 1 40 7 handle otherprop Objects shower 23
496
+ 1317 toy piano toy piano 1 40 7 toy otherprop Objects n03964744 plaything.n.01 objects 39
497
+ 1318 fish fish 1 40 7 otherprop Objects n02512053 fish.n.01 objects 39
498
+ 75 file cabinets file cabinet 1 3 6 cabinet cabinet Furniture cabinet 2933112 n03337140 file.n.03 cabinet 7
499
+ 657 cat litter box cat litter box 1 29 7 box box Objects objects 39
500
+ 561 electric panel electric panel 1 40 7 otherprop Objects misc 40
501
+ 93 suitcases suitcase 1 40 7 luggage otherprop Objects n02774630 baggage.n.01 objects 39
502
+ 513 curtain rod curtain rod 1 38 7 curtain rod otherstructure Objects curtain 12
503
+ 411 bunk bed bunk bed 1 39 6 bunk bed otherfurniture Furniture bed bed bed 2818832 n02920259 bunk_bed.n.01 bed 11
504
+ 1122 chandelier chandelier 1 38 7 chandelier otherstructure Objects n03005285 chandelier.n.01 lighting 28
505
+ 922 tape tape 1 40 7 tape otherprop Objects objects 39
506
+ 88 plates plate 1 40 7 otherprop Objects n03959485 plate.n.04 objects 39
507
+ 518 alarm alarm 1 40 7 alarm otherprop Objects clock 3046257 n02694662 alarm_clock.n.01 objects 39
508
+ 814 fire hose fire hose 1 40 7 otherprop Objects n03346004 fire_hose.n.01 misc 40
509
+ 1319 toy dinosaur toy dinosaur 1 40 7 toy otherprop Objects n03964744 plaything.n.01 objects 39
510
+ 1320 cone cone 1 40 7 otherprop Objects objects 39
511
+ 649 glass doors glass door 1 8 12 door door Wall door n03221720 door.n.01 door 4
512
+ 607 hatrack hatrack 1 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31
513
+ 819 subwoofer subwoofer 1 40 7 speaker otherprop Objects speaker 3691459 n04349401 subwoofer.n.01 objects 39
514
+ 1321 fire sprinkler fire sprinkler 1 40 7 otherprop Objects misc 40
515
+ 1322 trash cabinet trash cabinet 1 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7
516
+ 1204 pantry walls pantry wall 1 1 12 wall wall Wall n04546855 wall.n.01 wall 1
517
+ 227 photo photo 1 40 7 photo otherprop Objects n03925226 photograph.n.01 picture 6
518
+ 817 barrier barrier 1 40 7 otherprop Objects n02796623 barrier.n.01 misc 40
519
+ 130 stacks of cups cup 1 40 7 otherprop Objects n03147509 cup.n.01 objects 39
520
+ 712 beachball beachball 1 40 7 ball otherprop Objects n02814224 beach_ball.n.01 objects 39
521
+ 1323 folded boxes folded boxes 1 40 7 otherprop Objects objects 39
522
+ 1324 contact lens solution bottle contact lens solution bottle 1 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
523
+ 673 covered box covered box 1 29 7 box box Objects objects 39
524
+ 459 folder folder 1 40 7 folder otherprop Objects n03376279 folder.n.02 objects 39
525
+ 643 mail trays mail tray 1 40 7 mail tray otherprop Objects objects 39
526
+ 238 slipper slipper 1 40 7 otherprop Objects n04241394 slipper.n.01 clothes 38
527
+ 765 magazine rack magazine rack 1 39 6 stand otherfurniture Furniture n03704549 magazine_rack.n.01 shelving 31
528
+ 1008 sticker sticker 1 40 7 sticker otherprop Objects n07272545 gummed_label.n.01 objects 39
529
+ 225 lotion lotion 1 40 7 otherprop Objects n03690938 lotion.n.01 objects 39
530
+ 1083 buddha buddha 1 40 7 otherprop Objects objects 39
531
+ 813 file organizer file organizer 1 40 7 otherprop Objects objects 39
532
+ 138 paper towel rolls paper towel roll 1 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20
533
+ 1145 night lamp night lamp 1 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28
534
+ 796 fuse box fuse box 1 40 7 otherprop Objects misc 40
535
+ 1325 knife block knife block 1 40 7 otherprop Objects objects 39
536
+ 363 furnace furnace 1 39 6 furnace otherfurniture Furniture n03404449 furnace.n.01
537
+ 1174 cd cases cd case 1 40 7 otherprop Objects objects 39
538
+ 38 stools stool 1 40 7 stool otherprop Objects stool n04326896 stool.n.01 stool 19
539
+ 1326 hand sanitzer dispenser hand sanitzer dispenser 1 40 7 otherprop Objects n04254120 soap_dispenser.n.01 objects 39
540
+ 997 teapot teapot 1 40 7 tea pot otherprop Objects n04398044 teapot.n.01 objects 39
541
+ 1327 pen holder pen holder 1 40 7 otherprop Objects objects 39
542
+ 1328 tray rack tray rack 1 40 7 otherprop Objects objects 39
543
+ 1329 wig wig 1 40 7 otherprop Objects n04584207 wig.n.01 objects 39
544
+ 182 switch switch 1 40 7 otherprop Objects n04372370 switch.n.01 misc 40
545
+ 280 plastic containers plastic container 1 40 7 container otherprop Objects n03094503 container.n.01 objects 39
546
+ 1330 night light night light 1 40 7 otherprop Objects lighting 28
547
+ 1331 notepad notepad 1 40 7 otherprop Objects objects 39
548
+ 1332 mail bin mail bin 1 40 7 otherprop Objects misc 40
549
+ 1333 elevator button elevator button 1 40 7 otherprop Objects misc 40
550
+ 939 gaming wheel gaming wheel 1 40 7 otherprop Objects objects 39
551
+ 1334 drum set drum set 1 40 7 otherprop Objects objects 39
552
+ 480 cosmetic bag cosmetic bag 1 37 7 bag bag Objects objects 39
553
+ 907 coffee mug coffee mug 1 40 7 vessel otherprop Objects cup or mug 3797390 n03063599 coffee_mug.n.01 objects 39
554
+ 1335 closet shelf closet shelf 1 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31
555
+ 1336 baby mobile baby mobile 1 40 7 otherprop Objects objects 39
556
+ 829 diaper bin diaper bin 1 40 7 bin otherprop Objects objects 39
557
+ 947 door wall door wall 1 1 12 wall wall Wall wall 1
558
+ 1116 stepstool stepstool 1 40 7 step stool otherprop Objects objects 39
559
+ 599 paper shredder shredder 1 40 7 otherprop Objects n04210120 shredder.n.01 objects 39
560
+ 733 dress rack dress rack 1 40 7 otherprop Objects n03238762 dress_rack.n.01 misc 40
561
+ 123 cover cover 1 40 7 blanket otherprop Objects objects 39
562
+ 506 shopping bag shopping bag 1 37 7 bag bag Objects n04204081 shopping_bag.n.01 objects 39
563
+ 569 sliding door sliding door 1 8 12 door door Wall door n04239074 sliding_door.n.01 door 4
564
+ 1337 exercise bike exercise bike 1 40 7 machine otherprop Objects n04210120 shredder.n.01 gym_equipment 33
565
+ 1338 recliner chair recliner chair 1 5 4 chair chair Chair chair chair chair 3001627 n03238762 dress_rack.n.01 chair 3
566
+ 1314 kitchenaid mixer kitchen mixer 1 40 7 otherprop Objects appliances 37
567
+ 1339 soda can soda can 1 40 7 can otherprop Objects can 2946921 n02946921 can.n.01 objects 39
568
+ 1340 stovetop stovetop 1 38 7 stove otherstructure Objects stove 4330267 n04330267 stove.n.02 appliances 37
569
+ 851 stepladder stepladder 1 39 6 ladder otherfurniture Furniture stairs n04315599 step_ladder.n.01 stairs 16
570
+ 142 tap tap 1 40 7 faucet otherprop Objects faucet 3325088 n04559451 water_faucet.n.01 objects 39
571
+ 436 cable cable 1 40 7 cables otherprop Objects objects 39
572
+ 1341 baby changing station baby changing station 1 39 6 otherfurniture Furniture furniture 36
573
+ 1342 costume costume 1 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38
574
+ 885 rocking chair rocking chair 1 5 4 chair chair Chair chair chair chair 3001627 n04099969 rocking_chair.n.01 chair 3
575
+ 693 binder binder 1 40 7 binder otherprop Objects objects 39
576
+ 815 media center media center 1 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7
577
+ 401 towel rack towel rack 1 40 7 otherprop Objects n04459773 towel_rack.n.01 misc 40
578
+ 1343 medal medal 1 40 7 otherprop Objects objects 39
579
+ 1184 stack of folded chairs folded chair 1 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3
580
+ 1344 telescope telescope 1 40 7 otherprop Objects n04403638 telescope.n.01 objects 39
581
+ 1345 closet doorframe closet doorframe 1 8 12 door door Wall door door 4
582
+ 160 glass glass 1 38 7 glass otherstructure Objects n03438257 glass.n.02 misc 40
583
+ 1126 baseball cap baseball cap 1 40 7 otherprop Objects cap 2954340 n02799323 baseball_cap.n.01 clothes 38
584
+ 1346 battery disposal jar battery disposal jar 1 40 7 jar otherprop Objects jar 3593526 n03593526 jar.n.01 objects 39
585
+ 332 mop mop 1 40 7 otherprop Objects n04367480 swab.n.02 objects 39
586
+ 397 tank tank 1 40 7 otherprop Objects objects 39
587
+ 643 mail tray mail tray 1 40 7 mail tray otherprop Objects objects 39
588
+ 551 centerpiece centerpiece 1 40 7 centerpiece otherprop Objects n02994419 centerpiece.n.02 objects 39
589
+ 1163 stick object 1 40 7 stick otherprop Objects objects 39
590
+ 1347 closet floor closet floor 1 2 5 floor floor Floor n03365592 floor.n.01 floor 2
591
+ 1348 dryer sheets dryer sheets 1 40 7 otherprop Objects objects 39
592
+ 803 bycicle bycicle 1 40 7 otherprop Objects misc 40
593
+ 484 flower stand flower stand 1 39 6 stand otherfurniture Furniture furniture 36
594
+ 1349 air mattress air mattress 1 4 1 bed bed Bed bed bed bed 2818832 n02690809 air_mattress.n.01 bed 11
595
+ 1350 clip clip 1 40 7 otherprop Objects objects 39
596
+ 222 side table side table 1 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5
597
+ 1253 pizza boxes pizza box 1 29 7 box box Objects n02883344 box.n.01 objects 39
598
+ 1351 display display 1 39 7 otherfurniture Furniture n03211117 display.n.06 misc 40
599
+ 1352 postcard postcard 1 40 7 otherprop Objects objects 39
600
+ 828 display sign display sign 1 40 7 sign otherprop Objects misc 40
601
+ 1353 paper towel paper towel 1 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20
602
+ 612 boots boot 1 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38
603
+ 1354 tennis racket bag tennis racket bag 1 40 7 otherprop Objects objects 39
604
+ 1355 air hockey table air hockey table 1 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5
605
+ 1301 socks sock 1 21 7 clothes clothes Objects n04254777 sock.n.01 clothes 38
606
+ 1356 food bag food bag 1 37 7 bag bag Objects objects 39
607
+ 1199 clothes hangers clothes hanger 1 40 7 otherprop Objects n03057920 coat_hanger.n.01 misc 40
608
+ 1357 starbucks cup starbucks cup 1 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39
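Note: the rows above extend scannetv2-labels.combined.tsv, ScanNet's label-mapping table (raw category id, raw and normalized names, instance count, NYU40 id/class, and related taxonomy columns). A minimal sketch of turning such a file into an id-to-class lookup is shown below; the header names "id" and "nyu40class" are assumptions based on the public ScanNet release and should be checked against the file's actual first row.

import csv

def load_label_map(tsv_path, key_col="id", value_col="nyu40class"):
    # Parse the tab-separated label table into {raw category id -> coarse class name}.
    # key_col / value_col are assumed column names; adjust them to the real header.
    mapping = {}
    with open(tsv_path, newline="") as f:
        for row in csv.DictReader(f, delimiter="\t"):
            if row.get(key_col) and row.get(value_col):
                mapping[int(row[key_col])] = row[value_col]
    return mapping

# e.g. load_label_map("scannetv2-labels.combined.tsv").get(1357) -> the coarse class of "starbucks cup"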
src/datasets/megadepth.py ADDED
@@ -0,0 +1,125 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Dataloader for preprocessed MegaDepth
6
+ # dataset at https://www.cs.cornell.edu/projects/megadepth/
7
+ # See datasets_preprocess/preprocess_megadepth.py
8
+ # --------------------------------------------------------
9
+ import os.path as osp
10
+ import numpy as np
11
+ import sys
12
+ sys.path.append("submodules/mast3r/dust3r")
13
+ from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset
14
+ from dust3r.utils.image import imread_cv2
15
+
16
+
17
+ class MegaDepth(BaseStereoViewDataset):
18
+ def __init__(self, *args, split, ROOT, **kwargs):
19
+ self.ROOT = ROOT
20
+ super().__init__(*args, **kwargs)
21
+ self.num_views = 3 # render third view
22
+ self.loaded_data = self._load_data(self.split)
23
+
24
+ if self.split is None:
25
+ pass
26
+ elif self.split == 'train':
27
+ self.select_scene(('0015', '0022'), opposite=True)
28
+ elif self.split == 'val':
29
+ self.select_scene(('0015', '0022'))
30
+ else:
31
+ raise ValueError(f'bad {self.split=}')
32
+
33
+ def _load_data(self, split):
34
+ with np.load(osp.join(self.ROOT, 'all_metadata.npz')) as data:
35
+ self.all_scenes = data['scenes']
36
+ self.all_images = data['images']
37
+ self.pairs = data['pairs']
38
+
39
+ def __len__(self):
40
+ return len(self.pairs)
41
+
42
+ def get_stats(self):
43
+ return f'{len(self)} pairs from {len(self.all_scenes)} scenes'
44
+
45
+ def select_scene(self, scene, *instances, opposite=False):
46
+ scenes = (scene,) if isinstance(scene, str) else tuple(scene)
47
+ scene_id = [s.startswith(scenes) for s in self.all_scenes]
48
+ assert any(scene_id), 'no scene found'
49
+
50
+ valid = np.in1d(self.pairs['scene_id'], np.nonzero(scene_id)[0])
51
+ if instances:
52
+ image_id = [i.startswith(instances) for i in self.all_images]
53
+ image_id = np.nonzero(image_id)[0]
54
+ assert len(image_id), 'no instance found'
55
+ # both together?
56
+ if len(instances) == 2:
57
+ valid &= np.in1d(self.pairs['im1_id'], image_id) & np.in1d(self.pairs['im2_id'], image_id)
58
+ else:
59
+ valid &= np.in1d(self.pairs['im1_id'], image_id) | np.in1d(self.pairs['im2_id'], image_id)
60
+
61
+ if opposite:
62
+ valid = ~valid
63
+ assert valid.any()
64
+ self.pairs = self.pairs[valid]
65
+
66
+ def _get_views(self, pair_idx, resolution, rng):
67
+ scene_id, im1_id, im2_id, score = self.pairs[pair_idx]
68
+ im3_id = int((im1_id + im2_id) / 2)
69
+ scene, subscene = self.all_scenes[scene_id].split()
70
+ seq_path = osp.join(self.ROOT, scene, subscene)
71
+
72
+ views = []
73
+
74
+ for im_id in [im1_id, im2_id, im3_id]: # third (target) view is the midpoint frame computed above
75
+ img = self.all_images[im_id]
76
+ try:
77
+ image = imread_cv2(osp.join(seq_path, img + '.jpg'))
78
+ depthmap = imread_cv2(osp.join(seq_path, img + ".exr"))
79
+ camera_params = np.load(osp.join(seq_path, img + ".npz"))
80
+ except Exception as e:
81
+ raise OSError(f'cannot load {img}, got exception {e}')
82
+
83
+ intrinsics = np.float32(camera_params['intrinsics'])
84
+ camera_pose = np.float32(camera_params['cam2world'])
85
+
86
+ image, depthmap, intrinsics = self._crop_resize_if_necessary(
87
+ image, depthmap, intrinsics, resolution, rng, info=(seq_path, img))
88
+
89
+ views.append(dict(
90
+ img=image,
91
+ depthmap=depthmap,
92
+ camera_pose=camera_pose, # cam2world
93
+ camera_intrinsics=intrinsics,
94
+ dataset='MegaDepth',
95
+ label=osp.relpath(seq_path, self.ROOT),
96
+ instance=img))
97
+
98
+ return views
99
+
100
+
101
+ if __name__ == "__main__":
102
+ from dust3r.datasets.base.base_stereo_view_dataset import view_name
103
+ from dust3r.viz import SceneViz, auto_cam_size
104
+ from dust3r.utils.image import rgb
105
+
106
+ dataset = MegaDepth(split='train', ROOT="data/megadepth_processed", resolution=224, aug_crop=16)
107
+
108
+ for idx in np.random.permutation(len(dataset)):
109
+ views = dataset[idx]
110
+ assert len(views) == 3
111
+ print(idx, view_name(views[0]), view_name(views[1]), view_name(views[2]))
112
+ viz = SceneViz()
113
+ poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1, 2]]
114
+ cam_size = max(auto_cam_size(poses), 0.001)
115
+ for view_idx in [0, 1, 2]:
116
+ pts3d = views[view_idx]['pts3d']
117
+ valid_mask = views[view_idx]['valid_mask']
118
+ colors = rgb(views[view_idx]['img'])
119
+ viz.add_pointcloud(pts3d, colors, valid_mask)
120
+ viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],
121
+ focal=views[view_idx]['camera_intrinsics'][0, 0],
122
+ color=(idx * 255, (1 - idx) * 255, 0),
123
+ image=colors,
124
+ cam_size=cam_size)
125
+ viz.show()
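Note: the split above holds out MegaDepth scenes '0015' and '0022' for validation and trains on everything else (opposite=True inverts the selection). A small standalone sketch of that scene masking, using made-up scene names, is:

import numpy as np

all_scenes = np.array(['0001 0', '0015 0', '0022 1'])                    # made-up scene list
is_val = np.array([s.startswith(('0015', '0022')) for s in all_scenes])
val_scene_ids = np.nonzero(is_val)[0]                                    # kept when split == 'val'
train_scene_ids = np.nonzero(~is_val)[0]                                 # kept when split == 'train' (opposite=True)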
src/datasets/scannet.py ADDED
@@ -0,0 +1,109 @@
1
+ import os
2
+ import os.path as osp
3
+ import sys
4
+ sys.path.append("submodules/mast3r/dust3r")
5
+ from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset
6
+ import numpy as np
7
+ import cv2
8
+ from dust3r.utils.image import imread_cv2
9
+
10
+ class Scannet(BaseStereoViewDataset):
11
+ def __init__(self, *args, ROOT, **kwargs):
12
+ self.ROOT = ROOT
13
+ super().__init__(*args, **kwargs)
14
+ self.num_views = 3 # render third view
15
+ self._load_data()
16
+
17
+ def _load_data(self):
18
+ # Traverse all the folders in the data_root
19
+ scene_names = [folder for folder in os.listdir(self.ROOT) if os.path.isdir(os.path.join(self.ROOT, folder))]
20
+ # Filter out scenes without scene_data.npz
21
+ valid_scenes = []
22
+ for scene_name in scene_names:
23
+ scene_data_path = osp.join(self.ROOT, scene_name, "scene_data.npz")
24
+ if osp.exists(scene_data_path):
25
+ valid_scenes.append(scene_name)
26
+ else:
27
+ print(f"Skipping {scene_name}: scene_data.npz not found")
28
+ scene_names = valid_scenes
29
+ scene_names.sort()
30
+ if self.split == 'train':
31
+ scene_names = scene_names[:-150]
32
+ else:
33
+ scene_names = scene_names[-150:]
34
+ # merge all pairs and images
35
+ pairs = [] # (scene_name, image_idx1, image_idx2)
36
+ images = {} # (scene_name, image_idx) -> image_path
37
+ for scene_name in scene_names:
38
+ scene_path = osp.join(self.ROOT, scene_name, "scene_data.npz")
39
+ scene_data = np.load(scene_path)
40
+ pairs.extend([(scene_name, *pair) for pair in scene_data['pairs']])
41
+ images.update({(scene_name, idx): path for idx, path in enumerate(scene_data['images'])})
42
+ self.pairs = pairs
43
+ self.images = images
44
+
45
+ def __len__(self):
46
+ return len(self.pairs)
47
+
48
+ def _get_views(self, idx, resolution, rng):
49
+ scene_name, image_idx1, image_idx2, _ = self.pairs[idx]
50
+ image_idx1 = int(image_idx1)
51
+ image_idx2 = int(image_idx2)
52
+ image_idx3 = int((image_idx1 + image_idx2) / 2)
53
+ views = []
54
+ for view_idx in [image_idx1, image_idx2, image_idx3]:
55
+ basename = self.images[(scene_name, view_idx)]
56
+ # Load RGB image
57
+ rgb_path = osp.join(self.ROOT, scene_name, 'images', f'{basename}.jpg')
58
+ rgb_image = imread_cv2(rgb_path)
59
+ # Load depthmap
60
+ depthmap_path = osp.join(self.ROOT, scene_name, 'depths', f'{basename}.png')
61
+ depthmap = imread_cv2(depthmap_path, cv2.IMREAD_UNCHANGED)
62
+ depthmap = depthmap.astype(np.float32) / 1000
63
+ depthmap[~np.isfinite(depthmap)] = 0 # invalid
64
+ # Load camera parameters
65
+ meta_path = osp.join(self.ROOT, scene_name, 'images', f'{basename}.npz')
66
+ meta = np.load(meta_path)
67
+ intrinsics = meta['camera_intrinsics']
68
+ camera_pose = meta['camera_pose']
69
+ # crop if necessary
70
+ rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
71
+ rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx)
72
+ views.append(dict(
73
+ img=rgb_image,
74
+ depthmap=depthmap.astype(np.float32),
75
+ camera_pose=camera_pose.astype(np.float32),
76
+ camera_intrinsics=intrinsics.astype(np.float32),
77
+ dataset='ScanNet',
78
+ label=scene_name + '_' + basename,
79
+ instance=f'{str(idx)}_{str(view_idx)}',
80
+ ))
81
+ return views
82
+
83
+ if __name__ == "__main__":
84
+ from dust3r.datasets.base.base_stereo_view_dataset import view_name
85
+ from dust3r.viz import SceneViz, auto_cam_size
86
+ from dust3r.utils.image import rgb
87
+
88
+ dataset = Scannet(split='train', ROOT="data/scannet_processed", resolution=224, aug_crop=16)
89
+
90
+ print(len(dataset))
91
+
92
+ for idx in np.random.permutation(len(dataset)):
93
+ views = dataset[idx]
94
+ assert len(views) == 3
95
+ print(view_name(views[0]), view_name(views[1]), view_name(views[2]))
96
+ viz = SceneViz()
97
+ poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1, 2]]
98
+ cam_size = max(auto_cam_size(poses), 0.001)
99
+ for view_idx in [0, 1, 2]:
100
+ pts3d = views[view_idx]['pts3d']
101
+ valid_mask = views[view_idx]['valid_mask']
102
+ colors = rgb(views[view_idx]['img'])
103
+ viz.add_pointcloud(pts3d, colors, valid_mask)
104
+ viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],
105
+ focal=views[view_idx]['camera_intrinsics'][0, 0],
106
+ color=(idx*255, (1 - idx)*255, 0),
107
+ image=colors,
108
+ cam_size=cam_size)
109
+ viz.show()
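Note: the loader above expects one folder per scene containing images/<basename>.jpg, depths/<basename>.png (depth in millimetres), images/<basename>.npz (with camera_intrinsics and camera_pose), plus a scene_data.npz holding the frame list and the (i, j, iou) pairs produced by the preprocessing script. A rough sketch of a dummy scene that satisfies those assumptions, useful only for smoke-testing the loader, could look like this:

import os
import numpy as np

def make_dummy_scene(root, scene="scene0000_00", n_frames=3):
    # Writes only the metadata files; real RGB .jpg and 16-bit depth .png frames
    # come from the preprocessing pipeline.
    img_dir = os.path.join(root, scene, "images")
    dep_dir = os.path.join(root, scene, "depths")
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(dep_dir, exist_ok=True)
    names = [f"{i:06d}" for i in range(n_frames)]
    for name in names:
        np.savez(os.path.join(img_dir, f"{name}.npz"),
                 camera_intrinsics=np.eye(3, dtype=np.float32),
                 camera_pose=np.eye(4, dtype=np.float32))
    # a single (i, j, iou) pair, in the same format the preprocessing script emits
    np.savez_compressed(os.path.join(root, scene, "scene_data.npz"),
                        pairs=np.array([(0, 2, 0.5)], dtype=np.float32),
                        images=np.array(names))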
src/datasets/scannetpp.py ADDED
@@ -0,0 +1,107 @@
1
+ import os
2
+ import os.path as osp
3
+ import sys
4
+ sys.path.append("submodules/mast3r/dust3r")
5
+ from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset
6
+ import numpy as np
7
+ import cv2
8
+ from dust3r.utils.image import imread_cv2
9
+
10
+ class Scannetpp(BaseStereoViewDataset):
11
+ def __init__(self, *args, ROOT, **kwargs):
12
+ self.ROOT = ROOT
13
+ super().__init__(*args, **kwargs)
14
+ assert self.split == 'train' # just for training
15
+ self.num_views = 3 # render third view
16
+ self._load_data()
17
+
18
+ def _load_data(self):
19
+ # Traverse all the folders in the data_root
20
+ scene_names = [folder for folder in os.listdir(self.ROOT) if os.path.isdir(os.path.join(self.ROOT, folder))]
21
+ # Filter out scenes without scene_data.npz
22
+ valid_scenes = []
23
+ for scene_name in scene_names:
24
+ scene_data_path = osp.join(self.ROOT, scene_name, "scene_data.npz")
25
+ if osp.exists(scene_data_path):
26
+ valid_scenes.append(scene_name)
27
+ else:
28
+ print(f"Skipping {scene_name}: scene_data.npz not found")
29
+ scene_names = valid_scenes
30
+ scene_names.sort()
31
+
32
+ # merge all pairs and images
33
+ pairs = [] # (scene_name, image_idx1, image_idx2)
34
+ images = {} # (scene_name, image_idx) -> image_path
35
+ for scene_name in scene_names:
36
+ scene_path = osp.join(self.ROOT, scene_name, "scene_data.npz")
37
+ scene_data = np.load(scene_path)
38
+ pairs.extend([(scene_name, *pair) for pair in scene_data['pairs']])
39
+ images.update({(scene_name, idx): path for idx, path in enumerate(scene_data['images'])})
40
+ self.pairs = pairs
41
+ self.images = images
42
+
43
+ def __len__(self):
44
+ return len(self.pairs)
45
+
46
+ def _get_views(self, idx, resolution, rng):
47
+ scene_name, image_idx1, image_idx2, _ = self.pairs[idx]
48
+ image_idx1 = int(image_idx1)
49
+ image_idx2 = int(image_idx2)
50
+ image_idx3 = int((image_idx1 + image_idx2) / 2)
51
+ views = []
52
+ for view_idx in [image_idx1, image_idx2, image_idx3]:
53
+ basename = self.images[(scene_name, view_idx)]
54
+ # Load RGB image
55
+ rgb_path = osp.join(self.ROOT, scene_name, 'images', f'{basename}.JPG')
56
+ rgb_image = imread_cv2(rgb_path)
57
+ # Load depthmap
58
+ depthmap_path = osp.join(self.ROOT, scene_name, 'depths', f'{basename}.png')
59
+ depthmap = imread_cv2(depthmap_path, cv2.IMREAD_UNCHANGED)
60
+ depthmap = depthmap.astype(np.float32) / 1000
61
+ depthmap[~np.isfinite(depthmap)] = 0 # invalid
62
+ # Load camera parameters
63
+ meta_path = osp.join(self.ROOT, scene_name, 'images', f'{basename}.npz')
64
+ meta = np.load(meta_path)
65
+ intrinsics = meta['camera_intrinsics']
66
+ camera_pose = meta['camera_pose']
67
+ # crop if necessary
68
+ rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
69
+ rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx)
70
+ views.append(dict(
71
+ img=rgb_image,
72
+ depthmap=depthmap.astype(np.float32),
73
+ camera_pose=camera_pose.astype(np.float32),
74
+ camera_intrinsics=intrinsics.astype(np.float32),
75
+ dataset='ScanNet++',
76
+ label=scene_name + '_' + basename,
77
+ instance=f'{str(idx)}_{str(view_idx)}',
78
+ ))
79
+ return views
80
+
81
+ if __name__ == "__main__":
82
+ from dust3r.datasets.base.base_stereo_view_dataset import view_name
83
+ from dust3r.viz import SceneViz, auto_cam_size
84
+ from dust3r.utils.image import rgb
85
+
86
+ dataset = Scannetpp(split='train', ROOT="data/scannetpp_processed", resolution=224, aug_crop=16)
87
+
88
+ print(len(dataset))
89
+
90
+ for idx in np.random.permutation(len(dataset)):
91
+ views = dataset[idx]
92
+ assert len(views) == 3
93
+ print(view_name(views[0]), view_name(views[1]), view_name(views[2]))
94
+ viz = SceneViz()
95
+ poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1, 2]]
96
+ cam_size = max(auto_cam_size(poses), 0.001)
97
+ for view_idx in [0, 1, 2]:
98
+ pts3d = views[view_idx]['pts3d']
99
+ valid_mask = views[view_idx]['valid_mask']
100
+ colors = rgb(views[view_idx]['img'])
101
+ viz.add_pointcloud(pts3d, colors, valid_mask)
102
+ viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],
103
+ focal=views[view_idx]['camera_intrinsics'][0, 0],
104
+ color=(idx*255, (1 - idx)*255, 0),
105
+ image=colors,
106
+ cam_size=cam_size)
107
+ viz.show()
src/datasets_preprocess/scannet_preprocess.py ADDED
@@ -0,0 +1,209 @@
1
+ import os
2
+ import numpy as np
3
+ import cv2
4
+ import torch
5
+ import torch.multiprocessing as mp
6
+
7
+ def process_scene_on_gpu(gpu_id, scene_names, data_root, output_queue):
8
+ torch.cuda.set_device(gpu_id)
9
+ local_pairs = {}
10
+ local_images = {}
11
+
12
+ for scene_name in scene_names:
13
+ save_path = os.path.join(data_root, scene_name, "scene_data.npz")
14
+ if os.path.exists(save_path):
15
+ print(f"Scene {scene_name} already processed, skipping")
16
+ continue
17
+ pairs, images = process_scene(data_root, scene_name)
18
+ np.savez_compressed(save_path, pairs=pairs, images=images)
+ local_pairs[scene_name] = pairs # record per-scene results so the parent process can aggregate them
+ local_images[scene_name] = images
19
+
20
+ output_queue.put((local_pairs, local_images))
21
+
22
+ def preprocess_scannet(data_root, threads_per_gpu=4):
23
+ scene_names = [folder for folder in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, folder))]
24
+ num_gpus = torch.cuda.device_count()
25
+ total_threads = num_gpus * threads_per_gpu
26
+
27
+ # Distribute the scenes evenly across all worker threads
28
+ scenes_per_thread = [scene_names[i::total_threads] for i in range(total_threads)]
29
+
30
+ output_queue = mp.Queue()
31
+ processes = []
32
+
33
+ # Spawn multiple worker processes per GPU
34
+ for gpu_id in range(num_gpus):
35
+ for thread_id in range(threads_per_gpu):
36
+ process_id = gpu_id * threads_per_gpu + thread_id
37
+ p = mp.Process(
38
+ target=process_scene_on_gpu,
39
+ args=(gpu_id, scenes_per_thread[process_id], data_root, output_queue)
40
+ )
41
+ p.start()
42
+ processes.append(p)
43
+
44
+ # Collect the results from every worker process
45
+ all_pairs = {}
46
+ all_images = {}
47
+ for _ in range(total_threads):
48
+ local_pairs, local_images = output_queue.get()
49
+ all_pairs.update(local_pairs)
50
+ all_images.update(local_images)
51
+
52
+ # Wait for all processes to complete
53
+ for p in processes:
54
+ p.join()
55
+
56
+ # Save to npz file
57
+ np.savez_compressed(os.path.join(data_root, "scannet_image_pairs.npz"), **all_pairs)
58
+ np.savez_compressed(os.path.join(data_root, "scannet_images.npz"), **all_images)
59
+
60
+ # print the number of image pairs
61
+ # sum up the number of image pairs for all scenes
62
+ total_pairs = sum(len(pairs) for pairs in all_pairs.values())
63
+ print(f"Total number of image pairs: {total_pairs}")
64
+ return all_pairs, all_images
65
+
66
+ def process_scene(data_root, scene_name):
67
+ pairs = []
68
+ images_dir = os.path.join(data_root, scene_name, "images")
69
+ images = [os.path.splitext(file)[0] for file in os.listdir(images_dir) if file.endswith(".jpg")]
70
+ images.sort()
71
+
72
+ # Check validity of c2w for each image
73
+ valid_images = []
74
+ for image in images:
75
+ _, c2w, _ = load_image(data_root, scene_name, image)
76
+ if is_valid_c2w(c2w):
77
+ valid_images.append(image)
78
+ else:
79
+ print(f"Invalid c2w for image {image} in scene {scene_name}")
80
+
81
+ # generate image pairs
82
+ slide_window = 50
83
+ num_sub_intervals = 5
84
+
85
+ pairs = generate_image_pairs(data_root, scene_name, valid_images, slide_window, num_sub_intervals)
86
+ print(f"Scene {scene_name} has {len(pairs)} image pairs and {len(valid_images)} valid images out of {len(images)} total images")
87
+ return pairs, valid_images
88
+
89
+ def is_valid_c2w(c2w):
90
+ return not np.any(np.isinf(c2w)) and not np.any(np.isnan(c2w))
91
+
92
+ def generate_image_pairs(data_root, scene_name, images, slide_window, num_sub_intervals=3):
93
+ pairs = []
94
+ n = len(images)
95
+
96
+ # Define IOU sub-intervals
97
+ iou_range = (0.3, 0.8)
98
+ sub_interval_size = (iou_range[1] - iou_range[0]) / num_sub_intervals
99
+ sub_intervals = [(iou_range[0] + i * sub_interval_size, iou_range[0] + (i + 1) * sub_interval_size)
100
+ for i in range(num_sub_intervals)]
101
+
102
+ for i in range(n):
103
+ # Keep track of whether a pair has been added for each sub-interval
104
+ interval_selected = [False] * num_sub_intervals
105
+
106
+ for j in range(i+1, min(i + slide_window, n)):
107
+ # Break early if all sub-intervals have been selected
108
+ if all(interval_selected):
109
+ break
110
+
111
+ # Load image pair
112
+ depth1, c2w1, K1 = load_image(data_root, scene_name, images[i])
113
+ depth2, c2w2, K2 = load_image(data_root, scene_name, images[j])
114
+
115
+ # Calculate mean IoU
116
+ try:
117
+ iou_1 = calculate_iou(depth1, c2w1, K1, depth2, c2w2, K2)
118
+ iou_2 = calculate_iou(depth2, c2w2, K2, depth1, c2w1, K1)
119
+ except Exception as e:
120
+ print(f"Error calculating IoU for images {images[i]} and {images[j]} in scene {scene_name}: {str(e)}")
121
+ continue
122
+
123
+ mean_iou = (iou_1 + iou_2) / 2
124
+
125
+ # Check which sub-interval the mean IoU falls into
126
+ for idx, (lower, upper) in enumerate(sub_intervals):
127
+ if lower <= mean_iou <= upper and not interval_selected[idx]:
128
+ pairs.append((i, j, mean_iou))
129
+ interval_selected[idx] = True # Mark this interval as selected
130
+ break # Move to the next pair after adding one in the current sub-interval
131
+
132
+ return pairs
133
+
134
+
135
+ def load_image(data_root, scene_name, image_id):
136
+ # load depthmap
137
+ depth_path = f"{data_root}/{scene_name}/depths/{image_id}.png"
138
+ depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 1000.0
139
+ # load camera parameters
140
+ meta_path = f"{data_root}/{scene_name}/images/{image_id}.npz"
141
+ meta = np.load(meta_path)
142
+ c2w = meta['camera_pose']
143
+ K = meta['camera_intrinsics']
144
+ return depth, c2w, K
145
+
146
+ # Unproject depthmap to point cloud and project to another camera
147
+ def calculate_iou(depth1, c2w1, K1, depth2, c2w2, K2):
148
+ # Move data to GPU and ensure float32 dtype
149
+ depth1 = torch.from_numpy(depth1).cuda().float()
150
+ depth2 = torch.from_numpy(depth2).cuda().float()
151
+ c2w1 = torch.from_numpy(c2w1).cuda().float()
152
+ c2w2 = torch.from_numpy(c2w2).cuda().float()
153
+ K1 = torch.from_numpy(K1).cuda().float()
154
+ K2 = torch.from_numpy(K2).cuda().float()
155
+
156
+ # Get image dimensions
157
+ h, w = depth1.shape
158
+
159
+ # Create pixel coordinates
160
+ y, x = torch.meshgrid(torch.arange(h, device='cuda', dtype=torch.float32),
161
+ torch.arange(w, device='cuda', dtype=torch.float32))
162
+ pixels = torch.stack((x.flatten(), y.flatten(), torch.ones_like(x.flatten())), dim=-1).T
163
+
164
+ # Unproject pixels to 3D points
165
+ pixels_3d = torch.linalg.inv(K1) @ pixels
166
+ pixels_3d *= depth1.flatten().unsqueeze(0)
167
+
168
+ # Transform 3D points to world coordinates
169
+ pixels_world = c2w1[:3, :3] @ pixels_3d + c2w1[:3, 3:4]
170
+
171
+ # Check if c2w2[:3, :3] is invertible
172
+ if torch.det(c2w2[:3, :3]) == 0:
173
+ raise ValueError("c2w2 rotation is singular") # caught by the caller, which then skips this pair
174
+
175
+ # Project world points to second camera
176
+ pixels_cam2 = torch.linalg.inv(c2w2[:3, :3]) @ (pixels_world - c2w2[:3, 3:4])
177
+ pixels_img2 = K2 @ pixels_cam2
178
+
179
+ # Normalize homogeneous coordinates
180
+ pixels_img2 = pixels_img2[:2] / pixels_img2[2]
181
+ pixels_img2 = pixels_img2.T
182
+
183
+ # Filter valid pixels
184
+ valid_mask = (pixels_img2[:, 0] >= 0) & (pixels_img2[:, 0] < w) & \
185
+ (pixels_img2[:, 1] >= 0) & (pixels_img2[:, 1] < h)
186
+
187
+ pixels_img2 = pixels_img2[valid_mask].long()
188
+
189
+ # Compare depths
190
+ projected_depth = pixels_cam2[2, valid_mask]
191
+ actual_depth = depth2[pixels_img2[:, 1], pixels_img2[:, 0]]
192
+
193
+ depth_diff = torch.abs(projected_depth - actual_depth)
194
+ depth_threshold = 0.1 # 10cm threshold
195
+
196
+ overlap_mask = depth_diff < depth_threshold
197
+
198
+ # Calculate IoU
199
+ intersection = torch.sum(overlap_mask)
200
+ union = torch.sum(valid_mask) + torch.sum(depth2 > 0) - intersection
201
+
202
+ iou = intersection.float() / union.float() if union > 0 else torch.tensor(0.0, device='cuda')
203
+
204
+ return iou.item()
205
+
206
+ if __name__ == "__main__":
207
+ data_root = "data/scannet_processed"
208
+ # The number of worker threads per GPU can be set via threads_per_gpu
209
+ preprocess_scannet(data_root, threads_per_gpu=12)
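Note: for every anchor frame, generate_image_pairs keeps at most one partner per covisibility band, so each frame contributes pairs spanning low- to high-overlap viewpoints. The bands implied by the constants used above are easy to enumerate:

# IoU sub-intervals used by generate_image_pairs (constants taken from the code above)
iou_range, num_sub_intervals = (0.3, 0.8), 5
width = (iou_range[1] - iou_range[0]) / num_sub_intervals           # 0.1
bins = [(round(iou_range[0] + i * width, 2), round(iou_range[0] + (i + 1) * width, 2))
        for i in range(num_sub_intervals)]
print(bins)   # [(0.3, 0.4), (0.4, 0.5), (0.5, 0.6), (0.6, 0.7), (0.7, 0.8)]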
src/datasets_preprocess/scannetpp_preprocess.py ADDED
@@ -0,0 +1,227 @@
1
+ import os
2
+ import numpy as np
3
+ import cv2
4
+ import torch
5
+ import torch.multiprocessing as mp
6
+ import shutil
7
+
8
+ def process_scene_on_gpu(gpu_id, scene_names, data_root, target_root, output_queue):
9
+ torch.cuda.set_device(gpu_id)
10
+ local_pairs = {}
11
+ local_images = {}
12
+
13
+ for scene_name in scene_names:
14
+ save_path = os.path.join(target_root, scene_name, "scene_data.npz")
15
+ if os.path.exists(save_path):
16
+ print(f"Scene {scene_name} already processed, skipping")
17
+ continue
18
+ pairs, images = process_scene(data_root, target_root, scene_name)
19
+ np.savez_compressed(save_path, pairs=pairs, images=images)
+ local_pairs[scene_name] = pairs # record per-scene results so the parent process can aggregate them
+ local_images[scene_name] = images
20
+
21
+ output_queue.put((local_pairs, local_images))
22
+
23
+ def preprocess_scannetpp(data_root, target_root):
24
+ # Traverse all the folders in the data_root
25
+ scene_names = [folder for folder in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, folder))]
26
+
27
+ # Get the number of available GPUs
28
+ num_gpus = torch.cuda.device_count()
29
+
30
+ # Distribute scenes across GPUs
31
+ scenes_per_gpu = [scene_names[i::num_gpus] for i in range(num_gpus)]
32
+
33
+ # Create a multiprocessing queue to collect results
34
+ output_queue = mp.Queue()
35
+
36
+ # Launch parallel processes
37
+ processes = []
38
+ for gpu_id in range(num_gpus):
39
+ p = mp.Process(target=process_scene_on_gpu, args=(gpu_id, scenes_per_gpu[gpu_id], data_root, target_root, output_queue))
40
+ p.start()
41
+ processes.append(p)
42
+
43
+ # Collect results from all processes
44
+ all_pairs = {}
45
+ all_images = {}
46
+ for _ in range(num_gpus):
47
+ local_pairs, local_images = output_queue.get()
48
+ all_pairs.update(local_pairs)
49
+ all_images.update(local_images)
50
+
51
+ # Wait for all processes to complete
52
+ for p in processes:
53
+ p.join()
54
+
55
+ # Save to npz file
56
+ np.savez_compressed(os.path.join(data_root, "scannet_image_pairs.npz"), **all_pairs)
57
+ np.savez_compressed(os.path.join(data_root, "scannet_images.npz"), **all_images)
58
+
59
+ # print the number of image pairs
60
+ # sum up the number of image pairs for all scenes
61
+ total_pairs = sum(len(pairs) for pairs in all_pairs.values())
62
+ print(f"Total number of image pairs: {total_pairs}")
63
+ return all_pairs, all_images
64
+
65
+ # def preprocess_scannetpp(data_root, target_root):
66
+ # # Traverse all the folders in the data_root
67
+ # scene_names = [folder for folder in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, folder))]
68
+
69
+ # for scene_name in scene_names:
70
+ # save_path = os.path.join(target_root, scene_name, "scene_data.npz")
71
+ # if os.path.exists(save_path):
72
+ # print(f"Scene {scene_name} already processed, skipping")
73
+ # continue
74
+ # pairs, images = process_scene(data_root, target_root, scene_name)
75
+ # np.savez_compressed(save_path, pairs=pairs, images=images)
76
+
77
+ def process_scene(data_root, target_root, scene_name):
78
+ pairs = []
79
+ images_dir = os.path.join(data_root, scene_name, "images")
80
+ images = [os.path.splitext(file)[0] for file in os.listdir(images_dir) if file.endswith(".JPG")]
81
+ images.sort()
82
+ # copy images, depths, and camera parameters to target_root
83
+ os.makedirs(os.path.join(target_root, scene_name, "images"), exist_ok=True)
84
+ os.makedirs(os.path.join(target_root, scene_name, "depths"), exist_ok=True)
85
+ for image in images:
86
+ shutil.copy(os.path.join(data_root, scene_name, "images", f"{image}.JPG"), os.path.join(target_root, scene_name, "images", f"{image}.JPG"))
87
+ shutil.copy(os.path.join(data_root, scene_name, "depths", f"{image}.png"), os.path.join(target_root, scene_name, "depths", f"{image}.png"))
88
+ shutil.copy(os.path.join(data_root, scene_name, "images", f"{image}.npz"), os.path.join(target_root, scene_name, "images", f"{image}.npz"))
89
+
90
+ # Check validity of c2w for each image
91
+ valid_images = []
92
+ for image in images:
93
+ _, c2w, _ = load_image(data_root, scene_name, image)
94
+ if is_valid_c2w(c2w):
95
+ valid_images.append(image)
96
+ else:
97
+ print(f"Invalid c2w for image {image} in scene {scene_name}")
98
+
99
+ # generate image pairs
100
+ slide_window = 100
101
+ num_sub_intervals = 5
102
+
103
+ pairs = generate_image_pairs(data_root, scene_name, valid_images, slide_window, num_sub_intervals)
104
+ print(f"Scene {scene_name} has {len(pairs)} image pairs and {len(valid_images)} valid images out of {len(images)} total images")
105
+ return pairs, valid_images
106
+
107
+ def is_valid_c2w(c2w):
108
+ return not np.any(np.isinf(c2w)) and not np.any(np.isnan(c2w))
109
+
110
+ def generate_image_pairs(data_root, scene_name, images, slide_window, num_sub_intervals=3):
111
+ pairs = []
112
+ n = len(images)
113
+
114
+ # Define IOU sub-intervals
115
+ iou_range = (0.3, 0.8)
116
+ sub_interval_size = (iou_range[1] - iou_range[0]) / num_sub_intervals
117
+ sub_intervals = [(iou_range[0] + i * sub_interval_size, iou_range[0] + (i + 1) * sub_interval_size)
118
+ for i in range(num_sub_intervals)]
119
+
120
+ for i in range(n):
121
+ # Keep track of whether a pair has been added for each sub-interval
122
+ interval_selected = [False] * num_sub_intervals
123
+
124
+ for j in range(i+1, min(i + slide_window, n)):
125
+ # Break early if all sub-intervals have been selected
126
+ if all(interval_selected):
127
+ break
128
+
129
+ # Load image pair
130
+ depth1, c2w1, K1 = load_image(data_root, scene_name, images[i])
131
+ depth2, c2w2, K2 = load_image(data_root, scene_name, images[j])
132
+
133
+ # Calculate mean IoU
134
+ try:
135
+ iou_1 = calculate_iou(depth1, c2w1, K1, depth2, c2w2, K2)
136
+ iou_2 = calculate_iou(depth2, c2w2, K2, depth1, c2w1, K1)
137
+ except Exception as e:
138
+ print(f"Error calculating IoU for images {images[i]} and {images[j]} in scene {scene_name}: {str(e)}")
139
+ continue
140
+
141
+ mean_iou = (iou_1 + iou_2) / 2
142
+
143
+ # Check which sub-interval the mean IoU falls into
144
+ for idx, (lower, upper) in enumerate(sub_intervals):
145
+ if lower <= mean_iou <= upper and not interval_selected[idx]:
146
+ pairs.append((i, j, mean_iou))
147
+ interval_selected[idx] = True # Mark this interval as selected
148
+ break # Move to the next pair after adding one in the current sub-interval
149
+
150
+ return pairs
151
+
152
+
153
+ def load_image(data_root, scene_name, image_id):
154
+ # load depthmap
155
+ depth_path = f"{data_root}/{scene_name}/depths/{image_id}.png"
156
+ depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 1000.0
157
+ # load camera parameters
158
+ meta_path = f"{data_root}/{scene_name}/images/{image_id}.npz"
159
+ meta = np.load(meta_path)
160
+ c2w = meta['camera_pose']
161
+ K = meta['camera_intrinsics']
162
+ return depth, c2w, K
163
+
164
+ # Unproject depthmap to point cloud and project to another camera
165
+ def calculate_iou(depth1, c2w1, K1, depth2, c2w2, K2):
166
+ # Move data to GPU and ensure float32 dtype
167
+ depth1 = torch.from_numpy(depth1).cuda().float()
168
+ depth2 = torch.from_numpy(depth2).cuda().float()
169
+ c2w1 = torch.from_numpy(c2w1).cuda().float()
170
+ c2w2 = torch.from_numpy(c2w2).cuda().float()
171
+ K1 = torch.from_numpy(K1).cuda().float()
172
+ K2 = torch.from_numpy(K2).cuda().float()
173
+
174
+ # Get image dimensions
175
+ h, w = depth1.shape
176
+
177
+ # Create pixel coordinates
178
+ y, x = torch.meshgrid(torch.arange(h, device='cuda', dtype=torch.float32),
179
+ torch.arange(w, device='cuda', dtype=torch.float32))
180
+ pixels = torch.stack((x.flatten(), y.flatten(), torch.ones_like(x.flatten())), dim=-1).T
181
+
182
+ # Unproject pixels to 3D points
183
+ pixels_3d = torch.linalg.inv(K1) @ pixels
184
+ pixels_3d *= depth1.flatten().unsqueeze(0)
185
+
186
+ # Transform 3D points to world coordinates
187
+ pixels_world = c2w1[:3, :3] @ pixels_3d + c2w1[:3, 3:4]
188
+
189
+ # Check if c2w2[:3, :3] is invertible
190
+ if torch.det(c2w2[:3, :3]) == 0:
191
+ raise ValueError("c2w2 rotation is singular") # caught by the caller, which then skips this pair
192
+
193
+ # Project world points to second camera
194
+ pixels_cam2 = torch.linalg.inv(c2w2[:3, :3]) @ (pixels_world - c2w2[:3, 3:4])
195
+ pixels_img2 = K2 @ pixels_cam2
196
+
197
+ # Normalize homogeneous coordinates
198
+ pixels_img2 = pixels_img2[:2] / pixels_img2[2]
199
+ pixels_img2 = pixels_img2.T
200
+
201
+ # Filter valid pixels
202
+ valid_mask = (pixels_img2[:, 0] >= 0) & (pixels_img2[:, 0] < w) & \
203
+ (pixels_img2[:, 1] >= 0) & (pixels_img2[:, 1] < h)
204
+
205
+ pixels_img2 = pixels_img2[valid_mask].long()
206
+
207
+ # Compare depths
208
+ projected_depth = pixels_cam2[2, valid_mask]
209
+ actual_depth = depth2[pixels_img2[:, 1], pixels_img2[:, 0]]
210
+
211
+ depth_diff = torch.abs(projected_depth - actual_depth)
212
+ depth_threshold = 0.1 # 10cm threshold
213
+
214
+ overlap_mask = depth_diff < depth_threshold
215
+
216
+ # Calculate IoU
217
+ intersection = torch.sum(overlap_mask)
218
+ union = torch.sum(valid_mask) + torch.sum(depth2 > 0) - intersection
219
+
220
+ iou = intersection.float() / union.float() if union > 0 else torch.tensor(0.0, device='cuda')
221
+
222
+ return iou.item()
223
+
224
+ if __name__ == "__main__":
225
+ data_root = "data/scannetpp_processed"
226
+ target_root = "data/scannetpp_target"
227
+ preprocess_scannetpp(data_root, target_root)
src/gaussian_head.py ADDED
@@ -0,0 +1,142 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from einops import rearrange
4
+ from src.utils.gaussian_model import build_covariance
5
+ from simple_knn._C import distCUDA2
6
+ from src.utils.sh_utils import RGB2SH
7
+
8
+ class GaussianHead(nn.Module):
9
+ def __init__(self, d_pt_feat=64, **kwargs):
10
+ super().__init__()
11
+ # args
12
+ self.args = kwargs
13
+ self.d_means = 3
14
+ self.d_scales = 3
15
+ self.d_rotations = 4
16
+ self.d_opacities = 1
17
+ self.sh_degree = 3
18
+ self.d_view_dep_features = 3 # RGB
19
+ self.d_sh = (self.sh_degree + 1) ** 2
20
+ self.d_attr = (self.d_scales + self.d_rotations + self.d_opacities + self.d_view_dep_features * self.d_sh)
21
+ if self.args.get('d_gs_feats'):
22
+ self.d_attr += self.args['d_gs_feats']
23
+
24
+ # Create a mask for the spherical harmonics coefficients.
25
+ # This ensures that at initialization, the coefficients are biased
26
+ # towards having a large DC component and small view-dependent components.
27
+ self.register_buffer(
28
+ "sh_mask",
29
+ torch.ones((self.d_sh,), dtype=torch.float32),
30
+ persistent=False,
31
+ )
32
+ for degree in range(1, self.sh_degree + 1):
33
+ self.sh_mask[degree**2 : (degree + 1) ** 2] = 0.5 * 0.25**degree
34
+
35
+ self.gaussian_proj = nn.Linear(d_pt_feat, self.d_attr)
36
+
37
+ # Activation functions
38
+ self.scale_activation = torch.exp
39
+ self.rotation_activation = torch.nn.functional.normalize
40
+ self.opacity_activation = torch.sigmoid
41
+
42
+ def forward(self, point_transformer_output, lseg_features=None):
43
+ pred1 = {}
44
+ pred2 = {}
45
+
46
+ scene_scale = point_transformer_output['scale'] # B, 1, 1
47
+ scene_center = point_transformer_output['center'] # B, 1, 3
48
+ B, H, W, _ = point_transformer_output['shape']
49
+ normalized_means = point_transformer_output['coord'] # B * V * H * W, 3
50
+ colors = point_transformer_output['color'] # B * V * H * W, 3
51
+
52
+ # split normalized_means to 2 views
53
+ normalized_means = rearrange(normalized_means, '(b v h w) c -> v b (h w) c', v=2, b=B, h=H, w=W)
54
+ means = normalized_means * scene_scale + scene_center # V, B, H * W, 3
55
+ means = rearrange(means, 'v b (h w) c -> b (v h w) c', b=B, v=2, h=H, w=W)
56
+
57
+ # get features
58
+ feat = point_transformer_output['feat']
59
+ gaussian_attr = self.gaussian_proj(feat)
60
+
61
+ # # split gaussian attributes
62
+ # scales, rotations, opacities, sh_coeffs = torch.split(gaussian_attr,
63
+ # [
64
+ # self.d_scales,
65
+ # self.d_rotations,
66
+ # self.d_opacities,
67
+ # self.d_view_dep_features * self.d_sh
68
+ # ],
69
+ # dim=-1)
70
+
71
+ scales, rotations, opacities, sh_coeffs, gs_feats = torch.split(gaussian_attr,
72
+ [
73
+ self.d_scales,
74
+ self.d_rotations,
75
+ self.d_opacities,
76
+ self.d_view_dep_features * self.d_sh,
77
+ self.args['d_gs_feats']
78
+ ],
79
+ dim=-1)
80
+
81
+ # scales
82
+ # calculate the distance between each point and its nearest neighbor
83
+ all_dist = torch.stack([torch.sqrt(torch.clamp_min(distCUDA2(pts3d), 0.0000001)) for pts3d in means]) # B, V * H * W
84
+ median_dist = all_dist.median(dim=-1)[0][:, None, None] # B, 1, 1
85
+ scales = self.scale_activation(scales)
86
+ scales = rearrange(scales, '(b v h w) c -> b (v h w) c', b=B, v=2, h=H, w=W)
87
+ scales = scales * all_dist[..., None]
88
+ # clip scales
89
+ scales = torch.clamp(scales, min=0.1 * median_dist, max=3.0 * median_dist)
90
+ scales = rearrange(scales, 'b (v h w) c -> (b v h w) c', b=B, v=2, h=H, w=W)
91
+
92
+ # activation
93
+ rotations = self.rotation_activation(rotations)
94
+ opacities = self.opacity_activation(opacities)
95
+
96
+ # build covariance matrix
97
+ covs = build_covariance(scales, rotations)
98
+
99
+ # sh_mask
100
+ sh_coeffs = rearrange(sh_coeffs, '(b v h w) (c d) -> (b v h w) c d', b=B, v=2, h=H, w=W, c=self.d_sh, d=self.d_view_dep_features)
101
+ sh_dc = sh_coeffs[..., 0, :]
102
+ sh_rest = sh_coeffs[..., 1:, :]
103
+ if self.args.get('rgb_residual'):
104
+ # denormalize colors
105
+ colors = colors * 0.5 + 0.5
106
+ sh_rgb = RGB2SH(colors) # (B * V * H * W, 3)
107
+ # add rgb residual to dc component
108
+ sh_dc = sh_dc + sh_rgb
109
+ # concatenate dc and rest
110
+ sh_coeffs = torch.cat([sh_dc[..., None, :], sh_rest], dim=-2)
111
+ sh_coeffs = sh_coeffs * self.sh_mask[None, :, None]
112
+
113
+ # lseg_features(learning residual)
114
+ lseg_features = rearrange(lseg_features, '(v b) c h w -> (b v h w) c', b=B, v=2, h=H, w=W)
115
+ gs_feats = gs_feats + lseg_features
116
+
117
+ # split to 2 views
118
+ scales = rearrange(scales, '(b v h w) ... -> v b h w ...', v=2, b=B, h=H, w=W)
119
+ rotations = rearrange(rotations, '(b v h w) ... -> v b h w ...', v=2, b=B, h=H, w=W)
120
+ opacities = rearrange(opacities, '(b v h w) ... -> v b h w ...', v=2, b=B, h=H, w=W)
121
+ sh_coeffs = rearrange(sh_coeffs, '(b v h w) ... -> v b h w ...', v=2, b=B, h=H, w=W)
122
+ covs = rearrange(covs, '(b v h w) ... -> v b h w ...', v=2, b=B, h=H, w=W)
123
+ means = rearrange(means, 'b (v h w) ... -> v b h w ...', v=2, b=B, h=H, w=W)
124
+ gs_feats = rearrange(gs_feats, '(b v h w) ... -> v b h w ...', v=2, b=B, h=H, w=W)
125
+
126
+ pred1['scales'] = scales[0]
127
+ pred1['rotations'] = rotations[0]
128
+ pred1['covs'] = covs[0]
129
+ pred1['opacities'] = opacities[0]
130
+ pred1['sh_coeffs'] = sh_coeffs[0]
131
+ pred1['means'] = means[0]
132
+ pred1['gs_feats'] = gs_feats[0]
133
+
134
+ pred2['scales'] = scales[1]
135
+ pred2['rotations'] = rotations[1]
136
+ pred2['covs'] = covs[1]
137
+ pred2['opacities'] = opacities[1]
138
+ pred2['sh_coeffs'] = sh_coeffs[1]
139
+ pred2['means'] = means[1]
140
+ pred2['gs_feats'] = gs_feats[1]
141
+
142
+ return pred1, pred2
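Note: the sh_mask built in the constructor damps every higher-order spherical-harmonics band by 0.5 * 0.25**degree, so at initialization the predicted colour is dominated by the view-independent DC term. The per-degree factors for sh_degree = 3 can be checked in isolation:

import torch

sh_degree = 3
d_sh = (sh_degree + 1) ** 2                     # 16 SH coefficients in total
sh_mask = torch.ones(d_sh)
for degree in range(1, sh_degree + 1):
    sh_mask[degree**2:(degree + 1)**2] = 0.5 * 0.25**degree
# degree 0 -> 1.0, degree 1 -> 0.125, degree 2 -> 0.03125, degree 3 -> 0.0078125
print(sh_mask)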
src/infer.py ADDED
@@ -0,0 +1,23 @@
1
+ import argparse
2
+ import sys
3
+
4
+ sys.path.append('.')
5
+ from src.model import LSM_MASt3R
6
+ from src.utils.visualization_utils import render_video_from_file
7
+
8
+ if __name__ == '__main__':
9
+ parser = argparse.ArgumentParser()
10
+ parser.add_argument('--file_list', type=str, nargs='+', required=True,
11
+ help='List of input image files or directories')
12
+ parser.add_argument('--model_path', type=str, required=True)
13
+ parser.add_argument('--output_path', type=str, required=True)
14
+ parser.add_argument('--resolution', type=int, default=512)
15
+ parser.add_argument('--n_interp', type=int, default=90)
16
+ parser.add_argument('--fps', type=int, default=30)
17
+
18
+ args = parser.parse_args()
19
+
20
+ # 1. load model
21
+ model = LSM_MASt3R.from_pretrained(args.model_path)
22
+ # 2. render video
23
+ render_video_from_file(args.file_list, model, args.output_path, resolution=args.resolution, n_interp=args.n_interp, fps=args.fps)
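Note: a programmatic equivalent of the CLI above, e.g. for use from a notebook; all paths below are illustrative placeholders, and the call mirrors the argparse defaults defined in infer.py:

import sys
sys.path.append('.')
from src.model import LSM_MASt3R
from src.utils.visualization_utils import render_video_from_file

model = LSM_MASt3R.from_pretrained("checkpoints/pretrained_model")        # placeholder checkpoint path
render_video_from_file(
    ["examples/view_0.jpg", "examples/view_1.jpg"],                       # two input views (placeholders)
    model,
    "output/scene.mp4",                                                   # placeholder output path
    resolution=512, n_interp=90, fps=30)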
src/losses.py ADDED
@@ -0,0 +1,193 @@
1
+ from submodules.mast3r.dust3r.dust3r.losses import *
2
+ from torchmetrics import PeakSignalNoiseRatio, StructuralSimilarityIndexMeasure, JaccardIndex, Accuracy
3
+ import lpips
4
+ from src.utils.gaussian_model import GaussianModel
5
+ from src.utils.cuda_splatting import render, DummyPipeline
6
+ from einops import rearrange
7
+ from src.utils.camera_utils import get_scaled_camera
8
+ from torchvision.utils import save_image
9
+ from dust3r.inference import make_batch_symmetric
10
+
11
+ class L2Loss (LLoss):
12
+ """ Euclidean distance between 3d points """
13
+
14
+ def distance(self, a, b):
15
+ return torch.norm(a - b, dim=-1) # per-point Euclidean (L2) distance
16
+
17
+ class L1Loss (LLoss):
18
+ """ Manhattan distance between 3d points """
19
+
20
+ def distance(self, a, b):
21
+ return torch.abs(a - b).mean()  # mean absolute (L1) distance
22
+
23
+ L2 = L2Loss()
24
+ L1 = L1Loss()
25
+
26
+ def merge_and_split_predictions(pred1, pred2):
27
+ merged = {}
28
+ for key in pred1.keys():
29
+ merged_pred = torch.stack([pred1[key], pred2[key]], dim=1)
30
+ merged_pred = rearrange(merged_pred, 'b v h w ... -> b (v h w) ...')
31
+ merged[key] = merged_pred
32
+
33
+ # Split along the batch dimension
34
+ batch_size = next(iter(merged.values())).shape[0]
35
+ split = [{key: value[i] for key, value in merged.items()} for i in range(batch_size)]
36
+
37
+ return split
38
+
39
+ class GaussianLoss(MultiLoss):
40
+ def __init__(self, ssim_weight=0.2):
41
+ super().__init__()
42
+ self.ssim_weight = ssim_weight
43
+ self.ssim = StructuralSimilarityIndexMeasure(data_range=1.0).cuda()
44
+ self.psnr = PeakSignalNoiseRatio(data_range=1.0).cuda()
45
+ self.lpips_vgg = lpips.LPIPS(net='vgg').cuda()
46
+ self.pipeline = DummyPipeline()
47
+ # bg_color
48
+ self.register_buffer('bg_color', torch.tensor([0.0, 0.0, 0.0]).cuda())
49
+
50
+ def get_name(self):
51
+ return f'GaussianLoss(ssim_weight={self.ssim_weight})'
52
+
53
+ # def compute_loss(self, gt1, gt2, target_view, pred1, pred2, model):
54
+ # # render images
55
+ # # 1. merge predictions
56
+ # pred = merge_and_split_predictions(pred1, pred2)
57
+
58
+ # # 2. calculate optimal scaling
59
+ # pred_pts1 = pred1['means']
60
+ # pred_pts2 = pred2['means']
61
+ # # convert to camera1 coordinates
62
+ # # everything is normalized w.r.t. camera of view1
63
+ # valid1 = gt1['valid_mask'].clone()
64
+ # valid2 = gt2['valid_mask'].clone()
65
+ # in_camera1 = inv(gt1['camera_pose'])
66
+ # gt_pts1 = geotrf(in_camera1, gt1['pts3d'].to(in_camera1.device)) # B,H,W,3
67
+ # gt_pts2 = geotrf(in_camera1, gt2['pts3d'].to(in_camera1.device)) # B,H,W,3
68
+ # scaling = find_opt_scaling(gt_pts1, gt_pts2, pred_pts1, pred_pts2, valid1=valid1, valid2=valid2)
69
+
70
+ # # 3. render images(need gaussian model, camera, pipeline)
71
+ # rendered_images = []
72
+ # rendered_feats = []
73
+ # for i in range(len(pred)):
74
+ # # get gaussian model
75
+ # gaussians = GaussianModel.from_predictions(pred[i], sh_degree=3)
76
+ # # get camera
77
+ # ref_camera_extrinsics = gt1['camera_pose'][i]
78
+ # target_extrinsics = target_view['camera_pose'][i]
79
+ # target_intrinsics = target_view['camera_intrinsics'][i]
80
+ # image_shape = target_view['true_shape'][i]
81
+ # scale = scaling[i]
82
+ # camera = get_scaled_camera(ref_camera_extrinsics, target_extrinsics, target_intrinsics, scale, image_shape)
83
+ # # render(image and features)
84
+ # rendered_output = render(camera, gaussians, self.pipeline, self.bg_color)
85
+ # rendered_images.append(rendered_output['render'])
86
+ # rendered_feats.append(rendered_output['feature_map'])
87
+
88
+ # rendered_images = torch.stack(rendered_images, dim=0) # B, 3, H, W
89
+ # rendered_feats = torch.stack(rendered_feats, dim=0) # B, d_feats, H, W
90
+ # rendered_feats = model.feature_expansion(rendered_feats) # B, 512, H//2, W//2
91
+
92
+ # gt_images = target_view['img'] * 0.5 + 0.5
93
+ # gt_feats = model.lseg_feature_extractor.extract_features(target_view['img']) # B, 512, H//2, W//2
94
+ # image_loss = torch.abs(rendered_images - gt_images).mean()
95
+ # feature_loss = torch.abs(rendered_feats - gt_feats).mean()
96
+ # loss = image_loss + 100 * feature_loss
97
+
98
+ # # # temp
99
+ # # gt_logits = model.lseg_feature_extractor.decode_feature(gt_feats, ['wall', 'floor', 'others'])
100
+ # # gt_labels = torch.argmax(gt_logits, dim=1, keepdim=True)
101
+ # # rendered_logits = model.lseg_feature_extractor.decode_feature(rendered_feats, ['wall', 'floor', 'others'])
102
+ # # rendered_labels = torch.argmax(rendered_logits, dim=1, keepdim=True)
103
+
104
+ # # calculate metric
105
+ # with torch.no_grad():
106
+ # ssim = self.ssim(rendered_images, gt_images)
107
+ # psnr = self.psnr(rendered_images, gt_images)
108
+ # lpips = self.lpips_vgg(rendered_images, gt_images).mean()
109
+
110
+ # return loss, {'ssim': ssim, 'psnr': psnr, 'lpips': lpips, 'image_loss': image_loss, 'feature_loss': feature_loss}
111
+
112
+ def compute_loss(self, gt1, gt2, target_view, pred1, pred2, model):
113
+ # render images
114
+ # 1. merge predictions
115
+ pred = merge_and_split_predictions(pred1, pred2)
116
+
117
+ # 2. calculate optimal scaling
118
+ pred_pts1 = pred1['means']
119
+ pred_pts2 = pred2['means']
120
+ # convert to camera1 coordinates
121
+ # everything is normalized w.r.t. camera of view1
122
+ valid1 = gt1['valid_mask'].clone()
123
+ valid2 = gt2['valid_mask'].clone()
124
+ in_camera1 = inv(gt1['camera_pose'])
125
+ gt_pts1 = geotrf(in_camera1, gt1['pts3d'].to(in_camera1.device)) # B,H,W,3
126
+ gt_pts2 = geotrf(in_camera1, gt2['pts3d'].to(in_camera1.device)) # B,H,W,3
127
+ scaling = find_opt_scaling(gt_pts1, gt_pts2, pred_pts1, pred_pts2, valid1=valid1, valid2=valid2)
128
+
129
+ # 3. render images(need gaussian model, camera, pipeline)
130
+ rendered_images = []
131
+ rendered_feats = []
132
+ gt_images = []
133
+
134
+ for i in range(len(pred)):
135
+ # get gaussian model
136
+ gaussians = GaussianModel.from_predictions(pred[i], sh_degree=3)
137
+ # get camera
138
+ ref_camera_extrinsics = gt1['camera_pose'][i]
139
+ target_view_list = [gt1, gt2, target_view] # use gt1, gt2, and target_view
140
+ for j in range(len(target_view_list)):
141
+ target_extrinsics = target_view_list[j]['camera_pose'][i]
142
+ target_intrinsics = target_view_list[j]['camera_intrinsics'][i]
143
+ image_shape = target_view_list[j]['true_shape'][i]
144
+ scale = scaling[i]
145
+ camera = get_scaled_camera(ref_camera_extrinsics, target_extrinsics, target_intrinsics, scale, image_shape)
146
+ # render(image and features)
147
+ rendered_output = render(camera, gaussians, self.pipeline, self.bg_color)
148
+ rendered_images.append(rendered_output['render'])
149
+ rendered_feats.append(rendered_output['feature_map'])
150
+ gt_images.append(target_view_list[j]['img'][i] * 0.5 + 0.5)
151
+
152
+ rendered_images = torch.stack(rendered_images, dim=0) # B, 3, H, W
153
+ gt_images = torch.stack(gt_images, dim=0)
154
+ rendered_feats = torch.stack(rendered_feats, dim=0) # B, d_feats, H, W
155
+ rendered_feats = model.feature_expansion(rendered_feats) # B, 512, H//2, W//2
156
+ gt_feats = model.lseg_feature_extractor.extract_features(gt_images) # B, 512, H//2, W//2
157
+ image_loss = torch.abs(rendered_images - gt_images).mean()
158
+ feature_loss = torch.abs(rendered_feats - gt_feats).mean()
159
+ loss = image_loss + feature_loss
160
+
161
+ # calculate metric
162
+ with torch.no_grad():
163
+ ssim = self.ssim(rendered_images, gt_images)
164
+ psnr = self.psnr(rendered_images, gt_images)
165
+ lpips = self.lpips_vgg(rendered_images, gt_images).mean()
166
+
167
+ return loss, {'ssim': ssim, 'psnr': psnr, 'lpips': lpips, 'image_loss': image_loss, 'feature_loss': feature_loss}
168
+
169
+ # loss for one batch
170
+ def loss_of_one_batch(batch, model, criterion, device, symmetrize_batch=False, use_amp=False, ret=None):
171
+ view1, view2, target_view = batch
172
+ ignore_keys = set(['depthmap', 'dataset', 'label', 'instance', 'idx', 'true_shape', 'rng', 'pts3d'])
173
+ for view in batch:
174
+ for name in view.keys(): # pseudo_focal
175
+ if name in ignore_keys:
176
+ continue
177
+ view[name] = view[name].to(device, non_blocking=True)
178
+
179
+ if symmetrize_batch:
180
+ view1, view2 = make_batch_symmetric(batch)
181
+
182
+ # Get the actual model if it's distributed
183
+ actual_model = model.module if hasattr(model, 'module') else model
184
+
185
+ with torch.cuda.amp.autocast(enabled=bool(use_amp)):
186
+ pred1, pred2 = actual_model(view1, view2)
187
+
188
+ # loss is supposed to be symmetric
189
+ with torch.cuda.amp.autocast(enabled=False):
190
+ loss = criterion(view1, view2, target_view, pred1, pred2, actual_model) if criterion is not None else None
191
+
192
+ result = dict(view1=view1, view2=view2, target_view=target_view, pred1=pred1, pred2=pred2, loss=loss)
193
+ return result[ret] if ret else result
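A shape-level sketch of what merge_and_split_predictions does, with dummy tensors (illustrative only, not part of the committed file): the two per-view prediction dicts are stacked along a view axis, flattened into a single Gaussian set of size 2*H*W, and then split per batch element.

import torch
from einops import rearrange

B, H, W = 2, 4, 6
pred1 = {'means': torch.randn(B, H, W, 3), 'opacities': torch.rand(B, H, W, 1)}
pred2 = {'means': torch.randn(B, H, W, 3), 'opacities': torch.rand(B, H, W, 1)}

# stack the two views, then flatten (v h w) into one Gaussian set per sample
merged = {k: rearrange(torch.stack([pred1[k], pred2[k]], dim=1),
                       'b v h w ... -> b (v h w) ...') for k in pred1}
per_sample = [{k: v[i] for k, v in merged.items()} for i in range(B)]

assert merged['means'].shape == (B, 2 * H * W, 3)
assert per_sample[0]['opacities'].shape == (2 * H * W, 1)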
src/lseg.py ADDED
@@ -0,0 +1,171 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from submodules.lang_seg.modules.models.lseg_net import LSegNet, clip
4
+
5
+ class LSegFeatureExtractor(LSegNet):
6
+ def __init__(self, half_res=True):
7
+ super().__init__(
8
+ labels='',
9
+ backbone='clip_vitl16_384',
10
+ features=256,
11
+ crop_size=224,
12
+ arch_option=0,
13
+ block_depth=0,
14
+ activation='lrelu'
15
+ )
16
+
17
+ self.half_res = half_res
18
+
19
+ @torch.no_grad()
20
+ def extract_features(self, x):
21
+ layer_1, layer_2, layer_3, layer_4 = forward_layers(self.pretrained, x)
22
+ # layer:(b, 1024, h//16, w//16)
23
+ # image_features = torch.cat([layer_1, layer_2, layer_3, layer_4], dim=1)
24
+ # # image_features:(b, 4096, h//16, w//16)
25
+
26
+ # dense feature
27
+ # DPT head
28
+ pretrained = self.pretrained
29
+ layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
30
+ layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
31
+ layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
32
+ layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
33
+
34
+ # refinenet
35
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
36
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
37
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
38
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
39
+
40
+ path_4 = self.scratch.refinenet4(layer_4_rn)
41
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
42
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
43
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
44
+
45
+ # (b, 512, h//2, w//2)
46
+ image_features = self.scratch.head1(path_1)
47
+ if self.half_res:
48
+ return image_features
49
+
50
+ # (b, 512, h, w)
51
+ image_features = self.scratch.output_conv(image_features)
52
+
53
+ return image_features
54
+
55
+ @torch.no_grad()
56
+ def decode_feature(self, image_features, labelset=''):
57
+ # # image_features:(b, 4096, h//16, w//16)
58
+ # # split image_features into 4 parts
59
+ # layer_1, layer_2, layer_3, layer_4 = torch.split(image_features, 1024, dim=1)
60
+
61
+ # # DPT head
62
+ # pretrained = self.pretrained
63
+ # layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
64
+ # layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
65
+ # layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
66
+ # layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
67
+
68
+ # # refinenet
69
+ # layer_1_rn = self.scratch.layer1_rn(layer_1)
70
+ # layer_2_rn = self.scratch.layer2_rn(layer_2)
71
+ # layer_3_rn = self.scratch.layer3_rn(layer_3)
72
+ # layer_4_rn = self.scratch.layer4_rn(layer_4)
73
+
74
+ # path_4 = self.scratch.refinenet4(layer_4_rn)
75
+ # path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
76
+ # path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
77
+ # path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
78
+
79
+ # image_features = self.scratch.head1(path_1)
80
+ imshape = image_features.shape
81
+
82
+ # encode text
83
+ if labelset == '':
84
+ text = self.text
85
+ else:
86
+ text = clip.tokenize(labelset)
87
+
88
+ self.logit_scale = self.logit_scale.to(image_features.device)
89
+ text = text.to(image_features.device)
90
+ text_features = self.clip_pretrained.encode_text(text)
91
+ image_features = image_features.permute(0,2,3,1).reshape(-1, self.out_c)
92
+
93
+ # normalized features
94
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
95
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
96
+
97
+ logits_per_image = self.logit_scale * image_features.half() @ text_features.t()
98
+ out = logits_per_image.float().view(imshape[0], imshape[2], imshape[3], -1).permute(0,3,1,2)
99
+
100
+ if self.arch_option in [1, 2]:
101
+ for _ in range(self.block_depth - 1):
102
+ out = self.scratch.head_block(out)
103
+ out = self.scratch.head_block(out, False)
104
+
105
+ if self.half_res:
106
+ out = self.scratch.output_conv(out)
107
+
108
+ return out
109
+
110
+ @classmethod
111
+ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
112
+ print(f"Loading checkpoint from: {pretrained_model_name_or_path}")
113
+ ckpt = torch.load(pretrained_model_name_or_path, map_location='cpu')
114
+ print(f"Checkpoint loaded. Keys in checkpoint: {ckpt.keys()}")
115
+
116
+ print("Processing state dict...")
117
+ new_state_dict = {k[len("net."):]: v for k, v in ckpt['state_dict'].items() if k.startswith("net.")}
118
+ print(f"Processed state dict. Number of keys: {len(new_state_dict)}")
119
+
120
+ print("Initializing model...")
121
+ model = cls(*args, **kwargs)
122
+
123
+ print("Loading state dict into model...")
124
+ model.load_state_dict(new_state_dict, strict=True)
125
+ print("State dict loaded successfully.")
126
+
127
+ print("Cleaning up...")
128
+ del ckpt
129
+ del new_state_dict
130
+
131
+ print("Model loading complete.")
132
+ return model
133
+
134
+ def forward_layers(pretrained, x):
135
+ b, c, h, w = x.shape
136
+
137
+ # encoder
138
+ glob = pretrained.model.forward_flex(x)
139
+
140
+ layer_1 = pretrained.activations["1"]
141
+ layer_2 = pretrained.activations["2"]
142
+ layer_3 = pretrained.activations["3"]
143
+ layer_4 = pretrained.activations["4"]
144
+
145
+ layer_1 = pretrained.act_postprocess1[0:2](layer_1)
146
+ layer_2 = pretrained.act_postprocess2[0:2](layer_2)
147
+ layer_3 = pretrained.act_postprocess3[0:2](layer_3)
148
+ layer_4 = pretrained.act_postprocess4[0:2](layer_4)
149
+
150
+ unflatten = nn.Sequential(
151
+ nn.Unflatten(
152
+ 2,
153
+ torch.Size(
154
+ [
155
+ h // pretrained.model.patch_size[1],
156
+ w // pretrained.model.patch_size[0],
157
+ ]
158
+ ),
159
+ )
160
+ )
161
+
162
+ if layer_1.ndim == 3:
163
+ layer_1 = unflatten(layer_1)
164
+ if layer_2.ndim == 3:
165
+ layer_2 = unflatten(layer_2)
166
+ if layer_3.ndim == 3:
167
+ layer_3 = unflatten(layer_3)
168
+ if layer_4.ndim == 3:
169
+ layer_4 = unflatten(layer_4)
170
+
171
+ return layer_1, layer_2, layer_3, layer_4
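The label-scoring step inside decode_feature boils down to a cosine similarity between per-pixel image features and CLIP text embeddings of the label set. The sketch below is illustrative only: it uses random stand-ins for the text embeddings and an assumed value for the learned logit scale, whereas the real code obtains both from the CLIP model.

import torch

b, c, h, w = 1, 512, 8, 8
image_features = torch.randn(b, c, h, w)
text_features = torch.randn(3, c)            # stand-in for e.g. ['wall', 'floor', 'others']

img = image_features.permute(0, 2, 3, 1).reshape(-1, c)
img = img / img.norm(dim=-1, keepdim=True)   # normalize per pixel
txt = text_features / text_features.norm(dim=-1, keepdim=True)

logit_scale = 100.0                          # assumed value of the learned scale
logits = logit_scale * img @ txt.t()         # (b*h*w, num_labels)
labels = logits.view(b, h, w, -1).argmax(-1) # per-pixel label map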
src/model.py ADDED
@@ -0,0 +1,176 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import yaml
4
+ import sys
5
+ sys.path.append(".")
6
+ sys.path.append("submodules")
7
+ sys.path.append("submodules/mast3r")
8
+ from mast3r.model import AsymmetricMASt3R
9
+ from src.ptv3 import PTV3
10
+ from src.gaussian_head import GaussianHead
11
+ from src.utils.points_process import merge_points
12
+ from src.losses import GaussianLoss
13
+ from src.lseg import LSegFeatureExtractor
14
+ import argparse
15
+
16
+ class LSM_MASt3R(nn.Module):
17
+ def __init__(self,
18
+ mast3r_config,
19
+ point_transformer_config,
20
+ gaussian_head_config,
21
+ lseg_config,
22
+ ):
23
+
24
+ super().__init__()
25
+ # self.config
26
+ self.config = {
27
+ 'mast3r_config': mast3r_config,
28
+ 'point_transformer_config': point_transformer_config,
29
+ 'gaussian_head_config': gaussian_head_config,
30
+ 'lseg_config': lseg_config
31
+ }
32
+
33
+ # Initialize AsymmetricMASt3R
34
+ self.mast3r = AsymmetricMASt3R.from_pretrained(**mast3r_config)
35
+
36
+ # Freeze MASt3R parameters
37
+ for param in self.mast3r.parameters():
38
+ param.requires_grad = False
39
+ self.mast3r.eval()
40
+
41
+ # Initialize PointTransformerV3
42
+ self.point_transformer = PTV3(**point_transformer_config)
43
+
44
+ # Initialize the gaussian head
45
+ self.gaussian_head = GaussianHead(**gaussian_head_config)
46
+
47
+ # Initialize the lseg feature extractor
48
+ self.lseg_feature_extractor = LSegFeatureExtractor.from_pretrained(**lseg_config)
49
+ for param in self.lseg_feature_extractor.parameters():
50
+ param.requires_grad = False
51
+ self.lseg_feature_extractor.eval()
52
+
53
+ # Define two linear layers
54
+ d_gs_feats = gaussian_head_config.get('d_gs_feats', 32)
55
+ self.feature_reduction = nn.Sequential(
56
+ nn.Conv2d(512, d_gs_feats, kernel_size=1),
57
+ nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
58
+ ) # (b, 512, h//2, w//2) -> (b, d_features, h, w)
59
+
60
+ self.feature_expansion = nn.Sequential(
61
+ nn.Conv2d(d_gs_feats, 512, kernel_size=1),
62
+ nn.Upsample(scale_factor=0.5, mode='bilinear', align_corners=True)
63
+ ) # (b, d_features, h, w) -> (b, 512, h//2, w//2)
64
+
65
+ def forward(self, view1, view2):
66
+ # AsymmetricMASt3R forward pass
67
+ mast3r_output = self.mast3r(view1, view2)
68
+
69
+ # merge points from two views
70
+ data_dict = merge_points(mast3r_output, view1, view2)
71
+
72
+ # PointTransformerV3 forward pass
73
+ point_transformer_output = self.point_transformer(data_dict)
74
+
75
+ # extract lseg features
76
+ lseg_features = self.extract_lseg_features(view1, view2)
77
+
78
+ # Gaussian head forward pass
79
+ final_output = self.gaussian_head(point_transformer_output, lseg_features)
80
+
81
+ return final_output
82
+
83
+ def extract_lseg_features(self, view1, view2):
84
+ # concat view1 and view2
85
+ img = torch.cat([view1['img'], view2['img']], dim=0) # (v*b, 3, h, w)
86
+ # extract features
87
+ lseg_features = self.lseg_feature_extractor.extract_features(img) # (v*b, 512, h//2, w//2)
88
+ # reduce dimensions
89
+ lseg_features = self.feature_reduction(lseg_features) # (v*b, d_features, h, w)
90
+
91
+ return lseg_features
92
+
93
+ @staticmethod
94
+ def from_pretrained(checkpoint_path, device='cuda'):
95
+ # Load the checkpoint
96
+ ckpt = torch.load(checkpoint_path, map_location='cpu')
97
+
98
+ # Extract the configuration from the checkpoint
99
+ config = ckpt['args']
100
+
101
+ # Create a new instance of LSM_MASt3R
102
+ model = eval(config.model)
103
+
104
+ # Load the state dict
105
+ model.load_state_dict(ckpt['model'])
106
+
107
+ # Move the model to the specified device
108
+ model = model.to(device)
109
+
110
+ return model
111
+
112
+ def state_dict(self, destination=None, prefix='', keep_vars=False):
+ # collect the state_dict of all parameters
+ full_state_dict = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars)
+
+ # keep only the trainable parameters (frozen MASt3R and LSeg weights are excluded)
+ trainable_state_dict = {
+ k: v for k, v in full_state_dict.items()
+ if not (k.startswith('mast3r.') or k.startswith('lseg_feature_extractor.'))
+ }
+
+ return trainable_state_dict
+
+ def load_state_dict(self, state_dict, strict=True):
+ # get the full state_dict of the current model
+ model_state = super().state_dict()
+
+ # update only the trainable parameters
+ for k in list(state_dict.keys()):
+ if k in model_state and not (k.startswith('mast3r.') or k.startswith('lseg_feature_extractor.')):
+ model_state[k] = state_dict[k]
+
+ # load the model with the updated state_dict
+ super().load_state_dict(model_state, strict=False)
135
+
136
+ if __name__ == "__main__":
137
+ from torch.utils.data import DataLoader
138
+ import argparse
139
+ parser = argparse.ArgumentParser()
140
+ parser.add_argument('--checkpoint', type=str)
141
+ args = parser.parse_args()
142
+
143
+ # Load config
144
+ with open("configs/model_config.yaml", "r") as f:
145
+ config = yaml.safe_load(f)
146
+ # Initialize model
147
+ if args.checkpoint is not None:
148
+ model = LSM_MASt3R.from_pretrained(args.checkpoint, device='cuda')
149
+ else:
150
+ model = LSM_MASt3R(**config).to('cuda')
151
+
152
+ model.eval()
153
+
154
+ # Print model
155
+ print(model)
156
+ # Load dataset
157
+ from src.datasets.scannet import Scannet
158
+ dataset = Scannet(split='train', ROOT="data/scannet_processed", resolution=[(512, 384)])
159
+ # Print dataset
160
+ print(dataset)
161
+ # Test model
162
+ data_loader = DataLoader(dataset, batch_size=3, shuffle=True)
163
+ data = next(iter(data_loader))
164
+ # move data to cuda
165
+ for view in data:
166
+ view['img'] = view['img'].to('cuda')
167
+ view['depthmap'] = view['depthmap'].to('cuda')
168
+ view['camera_pose'] = view['camera_pose'].to('cuda')
169
+ view['camera_intrinsics'] = view['camera_intrinsics'].to('cuda')
170
+ # Forward pass
171
+ output = model(*data[:2])
172
+
173
+ # Loss
174
+ loss = GaussianLoss()
175
+ loss_value = loss(*data, *output, model)
176
+ print(loss_value)
src/ptv3.py ADDED
@@ -0,0 +1,13 @@
+ from PointTransformerV3.model import *
+
+ class PTV3(PointTransformerV3):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ def encode(self, data_dict):
+ point = Point(data_dict)
+ point.serialization(order=self.order, shuffle_orders=self.shuffle_orders)
+ point.sparsify()
+ point = self.embedding(point)
+ point = self.enc(point)
+ return point.feats
src/train.py ADDED
@@ -0,0 +1,73 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
3
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
4
+ #
5
+ # --------------------------------------------------------
6
+ # training executable for MASt3R
7
+ # --------------------------------------------------------
8
+ import sys
9
+ sys.path.append('.')
10
+ sys.path.append('submodules/mast3r')
11
+ from mast3r.model import AsymmetricMASt3R
12
+ from mast3r.losses import ConfMatchingLoss, MatchingLoss, APLoss, Regr3D, InfoNCE, Regr3D_ScaleShiftInv
13
+ from mast3r.datasets import ARKitScenes, BlendedMVS, Co3d, MegaDepth, ScanNetpp, StaticThings3D, Waymo, WildRGBD
14
+
15
+ import mast3r.utils.path_to_dust3r # noqa
16
+ # add mast3r classes to dust3r imports
17
+ import dust3r.training
18
+ dust3r.training.AsymmetricMASt3R = AsymmetricMASt3R
19
+ dust3r.training.Regr3D = Regr3D
20
+ dust3r.training.Regr3D_ScaleShiftInv = Regr3D_ScaleShiftInv
21
+ dust3r.training.MatchingLoss = MatchingLoss
22
+ dust3r.training.ConfMatchingLoss = ConfMatchingLoss
23
+ dust3r.training.InfoNCE = InfoNCE
24
+ dust3r.training.APLoss = APLoss
25
+
26
+ import dust3r.datasets
27
+ dust3r.datasets.ARKitScenes = ARKitScenes
28
+ dust3r.datasets.BlendedMVS = BlendedMVS
29
+ dust3r.datasets.Co3d = Co3d
30
+ dust3r.datasets.MegaDepth = MegaDepth
31
+ dust3r.datasets.ScanNetpp = ScanNetpp
32
+ dust3r.datasets.StaticThings3D = StaticThings3D
33
+ dust3r.datasets.Waymo = Waymo
34
+ dust3r.datasets.WildRGBD = WildRGBD
35
+ from src.datasets.scannet import Scannet
36
+ from src.datasets.scannetpp import Scannetpp
37
+ from src.datasets.megadepth import MegaDepth
38
+ dust3r.datasets.Scannet = Scannet
39
+ dust3r.datasets.Scannetpp = Scannetpp
40
+ dust3r.datasets.MegaDepth = MegaDepth
41
+
42
+ from src.model import LSM_MASt3R
43
+ dust3r.training.LSM_MASt3R = LSM_MASt3R
44
+ from src.losses import GaussianLoss
45
+ dust3r.training.GaussianLoss = GaussianLoss
46
+
47
+ from dust3r.training import get_args_parser as dust3r_get_args_parser # noqa
48
+ from dust3r.training import train # noqa
49
+
50
+ import yaml
51
+
52
+
53
+ def get_args_parser():
54
+ parser = dust3r_get_args_parser()
55
+ parser.prog = 'LSM_MASt3R training'
56
+
57
+ # Load the configuration
58
+ with open("configs/model_config.yaml", "r") as f:
59
+ config = yaml.safe_load(f)
60
+
61
+ # Convert the config dict to a string of keyword arguments
62
+ config_str = ", ".join(f"{k}={v}" for k, v in config.items())
63
+
64
+ # Set the default model string with parameters
65
+ parser.set_defaults(model=f"LSM_MASt3R({config_str})")
66
+
67
+ return parser
68
+
69
+
70
+ if __name__ == '__main__':
71
+ args = get_args_parser()
72
+ args = args.parse_args()
73
+ train(args)
src/utils/camera_utils.py ADDED
@@ -0,0 +1,60 @@
1
+ import math
2
+ import torch
3
+ from dust3r.utils.geometry import inv
4
+ from src.utils.cuda_splatting import DummyCamera
5
+
6
+ def get_scaled_camera(ref_camera_extrinsics, target_camera_extrinsics, target_camera_intrinsics, scale, image_shape):
7
+ """
8
+ get a scaled camera from a reference camera to a target camera
9
+
10
+ """
11
+
12
+ # get extrinsics(target_camera to ref_camera)
13
+ target_camera_extrinsics = inv(ref_camera_extrinsics) @ target_camera_extrinsics
14
+ # scale translation
15
+ target_camera_extrinsics[:3, 3] = target_camera_extrinsics[:3, 3] * scale
16
+ # invert extrinsics(ref_camera to target_camera)
17
+ target_camera_extrinsics_inv = inv(target_camera_extrinsics)
18
+ # calculate fov
19
+ fovx = 2 * math.atan(image_shape[1] / (2 * target_camera_intrinsics[0, 0]))
20
+ fovy = 2 * math.atan(image_shape[0] / (2 * target_camera_intrinsics[1, 1]))
21
+ # return camera(numpy)
22
+ R = target_camera_extrinsics_inv[:3, :3].cpu().numpy().transpose() # R.transpose() : ref_camera_2_target_camera
23
+ T = target_camera_extrinsics_inv[:3, 3].cpu().numpy() # T : ref_camera_2_target_camera
24
+ image_shape = image_shape.cpu().numpy()
25
+ return DummyCamera(R, T, fovx, fovy, image_shape[1], image_shape[0])
26
+
27
+ def move_c2w_along_z(extrinsics: torch.Tensor, distance: float) -> torch.Tensor:
+ """
+ Move a batch of Camera-to-World (C2W) matrices backwards, pushing each camera away along its own Z axis.
+
+ Args:
+ extrinsics (torch.Tensor): tensor of shape [N, 4, 4] holding N C2W matrices.
+ distance (float): how far to move each camera backwards.
+
+ Returns:
+ torch.Tensor: the updated C2W matrices, same shape as the input.
+ """
+ # make sure the input is a batch of 4x4 matrices
+ assert extrinsics.dim() == 3 and extrinsics.shape[1:] == (4, 4), \
+ "extrinsics must be a tensor of shape [N, 4, 4]"
+
+ # work on a copy so the original matrices are left untouched
+ updated_extrinsics = extrinsics.clone()
+
+ # process each C2W matrix
+ for i in range(updated_extrinsics.shape[0]):
+ # extract the rotation matrix R and translation vector t
+ R = updated_extrinsics[i, :3, :3] # shape [3, 3]
+ t = updated_extrinsics[i, :3, 3] # shape [3]
+
+ # camera Z axis direction (third column of R)
+ z_axis = R[:, 2] # shape [3]
+
+ # new translation: move backwards along the Z axis
+ t_new = t - distance * z_axis
+
+ # update the translation part of the C2W matrix
+ updated_extrinsics[i, :3, 3] = t_new
+
+ return updated_extrinsics
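get_scaled_camera above recovers the field of view from the pinhole intrinsics via fov = 2 * atan(size / (2 * focal)). A quick numeric check with illustrative values:

import math

W, H, fx, fy = 512, 384, 400.0, 400.0
fovx = 2 * math.atan(W / (2 * fx))   # ~1.139 rad
fovy = 2 * math.atan(H / (2 * fy))   # ~0.895 rad
print(math.degrees(fovx), math.degrees(fovy))  # ~65.2 and ~51.3 degrees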
src/utils/cuda_splatting.py ADDED
@@ -0,0 +1,216 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+ import numpy as np
12
+ import torch
13
+ import math
14
+ from diff_gaussian_rasterization import GaussianRasterizationSettings, GaussianRasterizer
15
+ from .gaussian_model import GaussianModel
16
+ from .sh_utils import eval_sh
17
+ from .graphics_utils import getWorld2View2, getProjectionMatrix
18
+
19
+ class DummyCamera:
20
+ def __init__(self, R, T, FoVx, FoVy, W, H):
21
+ self.projection_matrix = getProjectionMatrix(znear=0.01, zfar=100.0, fovX=FoVx, fovY=FoVy).transpose(0,1).cuda()
22
+ self.R = R
23
+ self.T = T
24
+ self.world_view_transform = torch.tensor(getWorld2View2(R, T, np.array([0,0,0]), 1.0)).transpose(0, 1).cuda()
25
+ self.full_proj_transform = (self.world_view_transform.unsqueeze(0).bmm(self.projection_matrix.unsqueeze(0))).squeeze(0)
26
+ self.camera_center = self.world_view_transform.inverse()[3, :3]
27
+ self.image_width = W
28
+ self.image_height = H
29
+ self.FoVx = FoVx
30
+ self.FoVy = FoVy
31
+
32
+ class DummyPipeline:
33
+ convert_SHs_python = False
34
+ compute_cov3D_python = False
35
+ debug = False
36
+
37
+ def calculate_fov(output_width, output_height, focal_length, aspect_ratio=1.0, invert_y=False):
38
+ fovx = 2 * math.atan((output_width / (2 * focal_length)))
39
+ fovy = 2 * math.atan((output_height / aspect_ratio) / (2 * focal_length))
40
+
41
+ if invert_y:
42
+ fovy = -fovy
43
+
44
+ return fovx, fovy
45
+
46
+ # def render(viewpoint_camera, pc : GaussianModel, pipe, bg_color : torch.Tensor, scaling_modifier = 1.0, override_color = None):
47
+ # """
48
+ # Render the scene.
49
+
50
+ # Background tensor (bg_color) must be on GPU!
51
+ # """
52
+
53
+ # # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means
54
+ # screenspace_points = torch.zeros_like(pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda") + 0
55
+ # try:
56
+ # screenspace_points.retain_grad()
57
+ # except:
58
+ # pass
59
+
60
+ # # Set up rasterization configuration
61
+ # tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
62
+ # tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)
63
+
64
+ # raster_settings = GaussianRasterizationSettings(
65
+ # image_height=int(viewpoint_camera.image_height),
66
+ # image_width=int(viewpoint_camera.image_width),
67
+ # tanfovx=tanfovx,
68
+ # tanfovy=tanfovy,
69
+ # bg=bg_color,
70
+ # scale_modifier=scaling_modifier,
71
+ # viewmatrix=viewpoint_camera.world_view_transform,
72
+ # projmatrix=viewpoint_camera.full_proj_transform,
73
+ # sh_degree=pc.active_sh_degree,
74
+ # campos=viewpoint_camera.camera_center,
75
+ # prefiltered=False,
76
+ # debug=pipe.debug
77
+ # )
78
+
79
+ # rasterizer = GaussianRasterizer(raster_settings=raster_settings)
80
+
81
+ # means3D = pc.get_xyz
82
+ # means2D = screenspace_points
83
+ # opacity = pc.get_opacity
84
+
85
+ # # If precomputed 3d covariance is provided, use it. If not, then it will be computed from
86
+ # # scaling / rotation by the rasterizer.
87
+ # scales = None
88
+ # rotations = None
89
+ # cov3D_precomp = None
90
+ # if pipe.compute_cov3D_python:
91
+ # cov3D_precomp = pc.get_covariance(scaling_modifier)
92
+ # else:
93
+ # scales = pc.get_scaling
94
+ # rotations = pc.get_rotation
95
+
96
+ # # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors
97
+ # # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer.
98
+ # shs = None
99
+ # colors_precomp = None
100
+ # if override_color is None:
101
+ # if pipe.convert_SHs_python:
102
+ # shs_view = pc.get_features.transpose(1, 2).view(-1, 3, (pc.max_sh_degree+1)**2)
103
+ # dir_pp = (pc.get_xyz - viewpoint_camera.camera_center.repeat(pc.get_features.shape[0], 1))
104
+ # dir_pp_normalized = dir_pp/dir_pp.norm(dim=1, keepdim=True)
105
+ # sh2rgb = eval_sh(pc.active_sh_degree, shs_view, dir_pp_normalized)
106
+ # colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0)
107
+ # else:
108
+ # shs = pc.get_features
109
+ # else:
110
+ # colors_precomp = override_color
111
+
112
+ # # Rasterize visible Gaussians to image, obtain their radii (on screen).
113
+ # rendered_image, radii = rasterizer(
114
+ # means3D = means3D,
115
+ # means2D = means2D,
116
+ # shs = shs,
117
+ # colors_precomp = colors_precomp,
118
+ # opacities = opacity,
119
+ # scales = scales,
120
+ # rotations = rotations,
121
+ # cov3D_precomp = cov3D_precomp)
122
+
123
+ # # Those Gaussians that were frustum culled or had a radius of 0 were not visible.
124
+ # # They will be excluded from value updates used in the splitting criteria.
125
+ # return {"render": rendered_image,
126
+ # "viewspace_points": screenspace_points,
127
+ # "visibility_filter" : radii > 0,
128
+ # "radii": radii}
129
+
130
+ def render(viewpoint_camera, pc : GaussianModel, pipe, bg_color : torch.Tensor, scaling_modifier = 1.0, override_color = None):
131
+ """
132
+ Render the scene.
133
+
134
+ Background tensor (bg_color) must be on GPU!
135
+ """
136
+
137
+ # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means
138
+ screenspace_points = torch.zeros_like(pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda") + 0
139
+ try:
140
+ screenspace_points.retain_grad()
141
+ except:
142
+ pass
143
+
144
+ # Set up rasterization configuration
145
+ tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
146
+ tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)
147
+
148
+ raster_settings = GaussianRasterizationSettings(
149
+ image_height=int(viewpoint_camera.image_height),
150
+ image_width=int(viewpoint_camera.image_width),
151
+ tanfovx=tanfovx,
152
+ tanfovy=tanfovy,
153
+ bg=bg_color,
154
+ scale_modifier=scaling_modifier,
155
+ viewmatrix=viewpoint_camera.world_view_transform,
156
+ projmatrix=viewpoint_camera.full_proj_transform,
157
+ sh_degree=pc.active_sh_degree,
158
+ campos=viewpoint_camera.camera_center,
159
+ prefiltered=False,
160
+ debug=pipe.debug
161
+ )
162
+
163
+ rasterizer = GaussianRasterizer(raster_settings=raster_settings)
164
+
165
+ means3D = pc.get_xyz
166
+ means2D = screenspace_points
167
+ opacity = pc.get_opacity
168
+
169
+ # If precomputed 3d covariance is provided, use it. If not, then it will be computed from
170
+ # scaling / rotation by the rasterizer.
171
+ scales = None
172
+ rotations = None
173
+ cov3D_precomp = None
174
+ if pipe.compute_cov3D_python:
175
+ cov3D_precomp = pc.get_covariance(scaling_modifier)
176
+ else:
177
+ scales = pc.get_scaling
178
+ rotations = pc.get_rotation
179
+
180
+ # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors
181
+ # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer.
182
+ shs = None
183
+ colors_precomp = None
184
+ if override_color is None:
185
+ if pipe.convert_SHs_python:
186
+ shs_view = pc.get_features.transpose(1, 2).view(-1, 3, (pc.max_sh_degree+1)**2)
187
+ dir_pp = (pc.get_xyz - viewpoint_camera.camera_center.repeat(pc.get_features.shape[0], 1))
188
+ dir_pp_normalized = dir_pp/dir_pp.norm(dim=1, keepdim=True)
189
+ sh2rgb = eval_sh(pc.active_sh_degree, shs_view, dir_pp_normalized)
190
+ colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0)
191
+ else:
192
+ shs = pc.get_features
193
+ else:
194
+ colors_precomp = override_color
195
+ semantic_feature = pc.get_semantic_feature
196
+
197
+ # Rasterize visible Gaussians to image, obtain their radii (on screen).
198
+ rendered_image, feature_map, radii, depth = rasterizer(
199
+ means3D = means3D,
200
+ means2D = means2D,
201
+ shs = shs,
202
+ colors_precomp = colors_precomp,
203
+ semantic_feature = semantic_feature,
204
+ opacities = opacity,
205
+ scales = scales,
206
+ rotations = rotations,
207
+ cov3D_precomp = cov3D_precomp)
208
+
209
+ # Those Gaussians that were frustum culled or had a radius of 0 were not visible.
210
+ # They will be excluded from value updates used in the splitting criteria.
211
+ return {"render": rendered_image,
212
+ "viewspace_points": screenspace_points,
213
+ "visibility_filter" : radii > 0,
214
+ "radii": radii,
215
+ 'feature_map': feature_map,
216
+ "depth": depth} ###d
src/utils/gaussian_model.py ADDED
@@ -0,0 +1,160 @@
1
+ import os
2
+ import torch
3
+ from einops import rearrange
4
+ import numpy as np
5
+ from plyfile import PlyData, PlyElement
6
+ from os import makedirs, path
7
+ from errno import EEXIST
8
+
9
+ def mkdir_p(folder_path):
10
+ # Creates a directory. equivalent to using mkdir -p on the command line
11
+ try:
12
+ makedirs(folder_path)
13
+ except OSError as exc: # Python >2.5
14
+ if exc.errno == EEXIST and path.isdir(folder_path):
15
+ pass
16
+ else:
17
+ raise
18
+
19
+ def RGB2SH(rgb):
20
+ return (rgb - 0.5) / C0
21
+
22
+ C0 = 0.28209479177387814
23
+
24
+ # https://github.com/facebookresearch/pytorch3d/blob/main/pytorch3d/transforms/rotation_conversions.py
25
+ def quaternion_to_matrix(
26
+ quaternions,
27
+ eps=1e-8,
28
+ ) :
29
+ # Order changed to match scipy format!
30
+ i, j, k, r = torch.unbind(quaternions, dim=-1)
31
+ two_s = 2 / ((quaternions * quaternions).sum(dim=-1) + eps)
32
+
33
+ o = torch.stack(
34
+ (
35
+ 1 - two_s * (j * j + k * k),
36
+ two_s * (i * j - k * r),
37
+ two_s * (i * k + j * r),
38
+ two_s * (i * j + k * r),
39
+ 1 - two_s * (i * i + k * k),
40
+ two_s * (j * k - i * r),
41
+ two_s * (i * k - j * r),
42
+ two_s * (j * k + i * r),
43
+ 1 - two_s * (i * i + j * j),
44
+ ),
45
+ -1,
46
+ )
47
+ return rearrange(o, "... (i j) -> ... i j", i=3, j=3)
48
+
49
+
50
+ def build_covariance(
51
+ scale,
52
+ rotation_xyzw,
53
+ ):
54
+ scale = scale.diag_embed()
55
+ rotation = quaternion_to_matrix(rotation_xyzw)
56
+ return (
57
+ rotation
58
+ @ scale
59
+ @ rearrange(scale, "... i j -> ... j i")
60
+ @ rearrange(rotation, "... i j -> ... j i")
61
+ )
62
+
63
+ def inverse_sigmoid(x):
64
+ return torch.log(x/(1-x))
65
+
66
+ class GaussianModel:
67
+
68
+ def __init__(self, sh_degree : int):
69
+ self.active_sh_degree = 0
70
+ self.max_sh_degree = sh_degree
71
+ self._xyz = torch.empty(0)
72
+ self._features_dc = torch.empty(0)
73
+ self._features_rest = torch.empty(0)
74
+ self._scaling = torch.empty(0)
75
+ self._rotation = torch.empty(0)
76
+ self._opacity = torch.empty(0)
77
+ self.max_radii2D = torch.empty(0)
78
+ self.xyz_gradient_accum = torch.empty(0)
79
+ self.denom = torch.empty(0)
80
+ self.optimizer = None
81
+ self.percent_dense = 0
82
+ self.spatial_lr_scale = 0
83
+ self._semantic_feature = torch.empty(0)
84
+
85
+ @property
86
+ def get_scaling(self):
87
+ return self._scaling
88
+
89
+ @property
90
+ def get_rotation(self):
91
+ return self._rotation
92
+
93
+ @property
94
+ def get_xyz(self):
95
+ return self._xyz
96
+
97
+ @property
98
+ def get_features(self):
99
+ features_dc = self._features_dc
100
+ features_rest = self._features_rest
101
+ return torch.cat((features_dc, features_rest), dim=1)
102
+
103
+ @property
104
+ def get_opacity(self):
105
+ return self._opacity
106
+
107
+ @property
108
+ def get_semantic_feature(self):
109
+ return self._semantic_feature
110
+
111
+ def construct_list_of_attributes(self):
112
+ l = ['x', 'y', 'z', 'nx', 'ny', 'nz']
113
+ # All channels except the 3 DC
114
+ for i in range(self._features_dc.shape[1]*self._features_dc.shape[2]):
115
+ l.append('f_dc_{}'.format(i))
116
+ for i in range(self._features_rest.shape[1]*self._features_rest.shape[2]):
117
+ l.append('f_rest_{}'.format(i))
118
+
119
+ l.append('opacity')
120
+ for i in range(self._scaling.shape[1]):
121
+ l.append('scale_{}'.format(i))
122
+ for i in range(self._rotation.shape[1]):
123
+ l.append('rot_{}'.format(i))
124
+ # Add semantic features
125
+ for i in range(self._semantic_feature.shape[1]*self._semantic_feature.shape[2]):
126
+ l.append('semantic_{}'.format(i))
127
+ return l
128
+
129
+ @staticmethod
130
+ def from_predictions(pred, sh_degree):
131
+ gaussians = GaussianModel(sh_degree=sh_degree)
132
+ gaussians._xyz = pred['means']
133
+ gaussians._features_dc = pred['sh_coeffs'][:, :1] # N, 1, d_sh
134
+ gaussians._features_rest = pred['sh_coeffs'][:, 1:] # N, d_sh-1, d_sh
135
+ gaussians._opacity = pred['opacities'] # N, 1
136
+ gaussians._scaling = pred['scales'] # N, 3, 3
137
+ gaussians._rotation = pred['rotations'] # N, 4
138
+ gaussians._semantic_feature = pred['gs_feats'][:, None, :] # N, 1, d_feats
139
+ return gaussians
140
+
141
+ def save_ply(self, path):
142
+ mkdir_p(os.path.dirname(path))
143
+
144
+ xyz = self._xyz.detach().cpu().numpy()
145
+ normals = np.zeros_like(xyz)
146
+ f_dc = self._features_dc.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy()
147
+ f_rest = self._features_rest.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy()
148
+ opacities = inverse_sigmoid(self._opacity).detach().cpu().numpy()
149
+ scale = torch.log(self._scaling).detach().cpu().numpy()
150
+ rotation = self._rotation.detach().cpu().numpy()
151
+ semantic_feature = self._semantic_feature.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy()
152
+
153
+ dtype_full = [(attribute, 'f4') for attribute in self.construct_list_of_attributes()]
154
+
155
+ elements = np.empty(xyz.shape[0], dtype=dtype_full)
156
+ attributes = np.concatenate((xyz, normals, f_dc, f_rest, opacities, scale, rotation, semantic_feature), axis=1)
157
+ # attributes = np.concatenate((xyz, normals, f_dc, f_rest, opacities, scale, rotation), axis=1)
158
+ elements[:] = list(map(tuple, attributes))
159
+ el = PlyElement.describe(elements, 'vertex')
160
+ PlyData([el]).write(path)
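build_covariance above implements the usual Gaussian-splatting parameterization cov = R S S^T R^T, with R built from an xyzw quaternion and S a diagonal matrix of scales. An illustrative sanity check with the identity rotation (not part of the committed file):

import torch
from src.utils.gaussian_model import build_covariance

scale = torch.tensor([[0.1, 0.2, 0.3]])
rot_xyzw = torch.tensor([[0.0, 0.0, 0.0, 1.0]])   # identity rotation
cov = build_covariance(scale, rot_xyzw)
# with R = I the covariance reduces to a diagonal of squared scales
assert torch.allclose(cov[0], torch.diag(scale[0] ** 2), atol=1e-6)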
src/utils/graphics_utils.py ADDED
@@ -0,0 +1,77 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ import torch
13
+ import math
14
+ import numpy as np
15
+ from typing import NamedTuple
16
+
17
+ class BasicPointCloud(NamedTuple):
18
+ points : np.array
19
+ colors : np.array
20
+ normals : np.array
21
+
22
+ def geom_transform_points(points, transf_matrix):
23
+ P, _ = points.shape
24
+ ones = torch.ones(P, 1, dtype=points.dtype, device=points.device)
25
+ points_hom = torch.cat([points, ones], dim=1)
26
+ points_out = torch.matmul(points_hom, transf_matrix.unsqueeze(0))
27
+
28
+ denom = points_out[..., 3:] + 0.0000001
29
+ return (points_out[..., :3] / denom).squeeze(dim=0)
30
+
31
+ def getWorld2View(R, t):
32
+ Rt = np.zeros((4, 4))
33
+ Rt[:3, :3] = R.transpose()
34
+ Rt[:3, 3] = t
35
+ Rt[3, 3] = 1.0
36
+ return np.float32(Rt)
37
+
38
+ def getWorld2View2(R, t, translate=np.array([.0, .0, .0]), scale=1.0):
39
+ Rt = np.zeros((4, 4))
40
+ Rt[:3, :3] = R.transpose()
41
+ Rt[:3, 3] = t
42
+ Rt[3, 3] = 1.0
43
+
44
+ C2W = np.linalg.inv(Rt)
45
+ cam_center = C2W[:3, 3]
46
+ cam_center = (cam_center + translate) * scale
47
+ C2W[:3, 3] = cam_center
48
+ Rt = np.linalg.inv(C2W)
49
+ return np.float32(Rt)
50
+
51
+ def getProjectionMatrix(znear, zfar, fovX, fovY):
52
+ tanHalfFovY = math.tan((fovY / 2))
53
+ tanHalfFovX = math.tan((fovX / 2))
54
+
55
+ top = tanHalfFovY * znear
56
+ bottom = -top
57
+ right = tanHalfFovX * znear
58
+ left = -right
59
+
60
+ P = torch.zeros(4, 4)
61
+
62
+ z_sign = 1.0
63
+
64
+ P[0, 0] = 2.0 * znear / (right - left)
65
+ P[1, 1] = 2.0 * znear / (top - bottom)
66
+ P[0, 2] = (right + left) / (right - left)
67
+ P[1, 2] = (top + bottom) / (top - bottom)
68
+ P[3, 2] = z_sign
69
+ P[2, 2] = z_sign * zfar / (zfar - znear)
70
+ P[2, 3] = -(zfar * znear) / (zfar - znear)
71
+ return P
72
+
73
+ def fov2focal(fov, pixels):
74
+ return pixels / (2 * math.tan(fov / 2))
75
+
76
+ def focal2fov(focal, pixels):
77
+ return 2*math.atan(pixels/(2*focal))
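fov2focal and focal2fov are exact inverses of each other for a fixed image side length; an illustrative round-trip check:

import math
from src.utils.graphics_utils import focal2fov, fov2focal

pixels, focal = 512, 400.0
fov = focal2fov(focal, pixels)                 # 2 * atan(pixels / (2 * focal))
assert abs(fov2focal(fov, pixels) - focal) < 1e-9
print(math.degrees(fov))                       # ~65.2 degrees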
src/utils/points_process.py ADDED
@@ -0,0 +1,37 @@
1
+ import torch
2
+ from einops import rearrange
3
+
4
+ # merge points from two views and add color information
5
+ def merge_points(mast3r_output, view1, view2, grid_size=0.01):
6
+ # get points from mast3r_output
7
+ points1 = mast3r_output[0]['pts3d'].detach() # B, H, W, 3
8
+ points2 = mast3r_output[1]['pts3d_in_other_view'].detach() # B, H, W, 3
9
+ shape = points1.shape
10
+ # add color information
11
+ colors = torch.stack([view1['img'], view2['img']], dim=1) # B, V, 3, H, W
12
+ colors = rearrange(colors, 'b v c h w -> b (v h w) c') # B, V * H * W, 3
13
+ # merge points
14
+ points = torch.stack([points1, points2], dim=1) # B, V, H, W, 3
15
+ points = rearrange(points, 'b v h w c -> b (v h w) c') # B, V * H * W, 3
16
+ B, N, _ = points.shape
17
+ offset = torch.arange(1, B + 1, device=points.device) * N
18
+ # Center and normalize points
19
+ center = torch.mean(points, dim=1, keepdim=True)
20
+ points = points - center
21
+ scale = torch.max(torch.norm(points, dim=2, keepdim=True), dim=1, keepdim=True)[0]
22
+ points = points / scale
23
+ # concat points and colors
24
+ feat = torch.cat([points, colors], dim=-1) # B, V * H * W, 6
25
+
26
+ data_dict = {
27
+ 'coord': rearrange(points, 'b n c -> (b n) c'),
28
+ 'color': rearrange(colors, 'b n c -> (b n) c'),
29
+ 'feat': rearrange(feat, 'b n c -> (b n) c'),
30
+ 'offset': offset,
31
+ 'grid_size': grid_size,
32
+ 'center': center,
33
+ 'scale': scale,
34
+ 'shape': shape,
35
+ }
36
+
37
+ return data_dict
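merge_points packs both views of every batch element into one flat point cloud, and `offset` stores the cumulative end index of each element, which is how PointTransformerV3 separates samples in a packed tensor. A shape sketch with illustrative sizes (not part of the committed file):

import torch

B, H, W = 2, 4, 6
N = 2 * H * W                            # points per batch element (two views)
offset = torch.arange(1, B + 1) * N      # tensor([48, 96])
coord = torch.randn(B * N, 3)            # packed (b n) x 3 coordinates
assert offset[-1].item() == coord.shape[0]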
src/utils/sh_utils.py ADDED
@@ -0,0 +1,117 @@
1
+ # Copyright 2021 The PlenOctree Authors.
2
+ # Redistribution and use in source and binary forms, with or without
3
+ # modification, are permitted provided that the following conditions are met:
4
+ #
5
+ # 1. Redistributions of source code must retain the above copyright notice,
6
+ # this list of conditions and the following disclaimer.
7
+ #
8
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
9
+ # this list of conditions and the following disclaimer in the documentation
10
+ # and/or other materials provided with the distribution.
11
+ #
12
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
13
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
14
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
15
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
16
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
17
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
18
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
19
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
20
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
21
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
22
+ # POSSIBILITY OF SUCH DAMAGE.
23
+
24
+
25
+ C0 = 0.28209479177387814
26
+ C1 = 0.4886025119029199
27
+ C2 = [
28
+ 1.0925484305920792,
29
+ -1.0925484305920792,
30
+ 0.31539156525252005,
31
+ -1.0925484305920792,
32
+ 0.5462742152960396
33
+ ]
34
+ C3 = [
35
+ -0.5900435899266435,
36
+ 2.890611442640554,
37
+ -0.4570457994644658,
38
+ 0.3731763325901154,
39
+ -0.4570457994644658,
40
+ 1.445305721320277,
41
+ -0.5900435899266435
42
+ ]
43
+ C4 = [
44
+ 2.5033429417967046,
45
+ -1.7701307697799304,
46
+ 0.9461746957575601,
47
+ -0.6690465435572892,
48
+ 0.10578554691520431,
49
+ -0.6690465435572892,
50
+ 0.47308734787878004,
51
+ -1.7701307697799304,
52
+ 0.6258357354491761,
53
+ ]
54
+
55
+
56
+ def eval_sh(deg, sh, dirs):
57
+ """
58
+ Evaluate spherical harmonics at unit directions
59
+ using hardcoded SH polynomials.
60
+ Works with torch/np/jnp.
61
+ ... Can be 0 or more batch dimensions.
62
+ Args:
63
+ deg: int SH deg. Currently, 0-3 supported
64
+ sh: jnp.ndarray SH coeffs [..., C, (deg + 1) ** 2]
65
+ dirs: jnp.ndarray unit directions [..., 3]
66
+ Returns:
67
+ [..., C]
68
+ """
69
+ assert deg <= 4 and deg >= 0
70
+ coeff = (deg + 1) ** 2
71
+ assert sh.shape[-1] >= coeff
72
+
73
+ result = C0 * sh[..., 0]
74
+ if deg > 0:
75
+ x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3]
76
+ result = (result -
77
+ C1 * y * sh[..., 1] +
78
+ C1 * z * sh[..., 2] -
79
+ C1 * x * sh[..., 3])
80
+
81
+ if deg > 1:
82
+ xx, yy, zz = x * x, y * y, z * z
83
+ xy, yz, xz = x * y, y * z, x * z
84
+ result = (result +
85
+ C2[0] * xy * sh[..., 4] +
86
+ C2[1] * yz * sh[..., 5] +
87
+ C2[2] * (2.0 * zz - xx - yy) * sh[..., 6] +
88
+ C2[3] * xz * sh[..., 7] +
89
+ C2[4] * (xx - yy) * sh[..., 8])
90
+
91
+ if deg > 2:
92
+ result = (result +
93
+ C3[0] * y * (3 * xx - yy) * sh[..., 9] +
94
+ C3[1] * xy * z * sh[..., 10] +
95
+ C3[2] * y * (4 * zz - xx - yy)* sh[..., 11] +
96
+ C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12] +
97
+ C3[4] * x * (4 * zz - xx - yy) * sh[..., 13] +
98
+ C3[5] * z * (xx - yy) * sh[..., 14] +
99
+ C3[6] * x * (xx - 3 * yy) * sh[..., 15])
100
+
101
+ if deg > 3:
102
+ result = (result + C4[0] * xy * (xx - yy) * sh[..., 16] +
103
+ C4[1] * yz * (3 * xx - yy) * sh[..., 17] +
104
+ C4[2] * xy * (7 * zz - 1) * sh[..., 18] +
105
+ C4[3] * yz * (7 * zz - 3) * sh[..., 19] +
106
+ C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20] +
107
+ C4[5] * xz * (7 * zz - 3) * sh[..., 21] +
108
+ C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22] +
109
+ C4[7] * xz * (xx - 3 * yy) * sh[..., 23] +
110
+ C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) * sh[..., 24])
111
+ return result
112
+
113
+ def RGB2SH(rgb):
114
+ return (rgb - 0.5) / C0
115
+
116
+ def SH2RGB(sh):
117
+ return sh * C0 + 0.5
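At SH degree 0 the color path is a simple affine map: RGB2SH stores (rgb - 0.5) / C0 as the DC coefficient and eval_sh returns C0 * sh0, so adding the 0.5 offset (as the renderer does before clamping) recovers the input color. An illustrative check:

import torch
from src.utils.sh_utils import RGB2SH, eval_sh

rgb = torch.tensor([[0.2, 0.5, 0.9]])
sh0 = RGB2SH(rgb)                               # (1, 3) DC coefficients
dirs = torch.tensor([[0.0, 0.0, 1.0]])          # direction is unused at degree 0
recovered = eval_sh(0, sh0.unsqueeze(-1), dirs) + 0.5
assert torch.allclose(recovered, rgb, atol=1e-6)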
src/utils/visualization_utils.py ADDED
@@ -0,0 +1,355 @@
1
+ import sys
2
+ import os
3
+ import numpy as np
4
+ import scipy.interpolate
5
+ import PIL
6
+ import torch
7
+ import matplotlib.pyplot as plt
8
+ from sklearn.preprocessing import StandardScaler
9
+ from sklearn.decomposition import PCA
10
+ import moviepy.editor as mpy
11
+
12
+ sys.path.append('submodules/mast3r/dust3r')
13
+ from dust3r.utils.image import heif_support_enabled, exif_transpose, _resize_pil_image, ImgNorm
14
+ from dust3r.image_pairs import make_pairs
15
+ from dust3r.inference import inference
16
+ from dust3r.cloud_opt import global_aligner, GlobalAlignerMode
17
+
18
+ sys.path.append('.')
19
+ from src.utils.cuda_splatting import render, DummyPipeline
20
+ from src.utils.gaussian_model import GaussianModel
21
+ from src.utils.camera_utils import get_scaled_camera
22
+ from src.losses import merge_and_split_predictions
23
+ from src.utils.camera_utils import move_c2w_along_z
24
+
25
+ from einops import rearrange
26
+ LABELS = ['wall', 'floor', 'ceiling', 'chair', 'table', 'sofa', 'bed', 'other']
27
+ NUM_LABELS = len(LABELS) + 1
28
+ PALLETE = plt.cm.get_cmap('tab10', NUM_LABELS)
29
+ COLORS_LIST = [PALLETE(i)[:3] for i in range(NUM_LABELS)]
30
+ COLORS = torch.tensor(COLORS_LIST, dtype=torch.float32)
31
+
32
+ def load_images(folder_or_list, size, square_ok=False, verbose=True, save_dir=None):
33
+ """ open and convert all images in a list or folder to proper input format for DUSt3R
34
+ """
35
+ if isinstance(folder_or_list, str):
36
+ if verbose:
37
+ print(f'>> Loading images from {folder_or_list}')
38
+ root, folder_content = folder_or_list, sorted(os.listdir(folder_or_list))
39
+
40
+ elif isinstance(folder_or_list, list):
41
+ if verbose:
42
+ print(f'>> Loading a list of {len(folder_or_list)} images')
43
+ root, folder_content = '', folder_or_list
44
+
45
+ else:
46
+ raise ValueError(f'bad {folder_or_list=} ({type(folder_or_list)})')
47
+
48
+ supported_images_extensions = ['.jpg', '.jpeg', '.png']
49
+ if heif_support_enabled:
50
+ supported_images_extensions += ['.heic', '.heif']
51
+ supported_images_extensions = tuple(supported_images_extensions)
52
+
53
+ imgs = []
54
+ for path in folder_content:
55
+ if not path.lower().endswith(supported_images_extensions):
56
+ continue
57
+ img = exif_transpose(PIL.Image.open(os.path.join(root, path))).convert('RGB')
58
+ W1, H1 = img.size
59
+ if size == 224:
60
+ # resize short side to 224 (then crop)
61
+ img = _resize_pil_image(img, round(size * max(W1/H1, H1/W1)))
62
+ else:
63
+ # resize long side to 512
64
+ img = _resize_pil_image(img, size)
65
+ W, H = img.size
66
+ cx, cy = W//2, H//2
67
+ if size == 224:
68
+ half = min(cx, cy)
69
+ img = img.crop((cx-half, cy-half, cx+half, cy+half))
70
+ else:
71
+ halfw, halfh = ((2*cx)//32)*16, ((2*cy)//32)*16
72
+ if not (square_ok) and W == H:
73
+ halfh = 3*halfw/4
74
+ img = img.crop((cx-halfw, cy-halfh, cx+halfw, cy+halfh))
75
+
76
+ W2, H2 = img.size
77
+ if verbose:
78
+ print(f' - adding {path} with resolution {W1}x{H1} --> {W2}x{H2}')
79
+
80
+ # Save the processed image if save_dir is provided
81
+ if save_dir:
82
+ os.makedirs(save_dir, exist_ok=True)
83
+ save_path = os.path.join(save_dir, f"processed_{len(imgs):03d}.png")
84
+ img.save(save_path)
85
+ if verbose:
86
+ print(f' - saved processed image to {save_path}')
87
+
88
+ imgs.append(dict(img=ImgNorm(img)[None], true_shape=np.int32(
89
+ [img.size[::-1]]), idx=len(imgs), instance=str(len(imgs))))
90
+
91
+ assert imgs, 'no images found at ' + root
92
+ if verbose:
93
+ print(f' (Found {len(imgs)} images)')
94
+ return imgs
95
+
96
+ def normalize(x):
97
+ """Normalization helper function."""
98
+ return x / np.linalg.norm(x)
99
+
100
+ def viewmatrix(lookdir, up, position):
101
+ """Construct lookat view matrix."""
102
+ vec2 = normalize(lookdir)
103
+ vec0 = normalize(np.cross(up, vec2))
104
+ vec1 = normalize(np.cross(vec2, vec0))
105
+ m = np.stack([vec0, vec1, vec2, position], axis=1)
106
+ return m
107
+
108
+ def poses_to_points(poses, dist):
109
+ """Converts from pose matrices to (position, lookat, up) format."""
110
+ pos = poses[:, :3, -1]
111
+ lookat = poses[:, :3, -1] - dist * poses[:, :3, 2]
112
+ up = poses[:, :3, -1] + dist * poses[:, :3, 1]
113
+ return np.stack([pos, lookat, up], 1)
114
+
115
+ def points_to_poses(points):
116
+ """Converts from (position, lookat, up) format to pose matrices."""
117
+ return np.array([viewmatrix(p - l, u - p, p) for p, l, u in points])
118
+
119
+ def interp(points, n, k, s):
120
+ """Runs multidimensional B-spline interpolation on the input points."""
121
+ sh = points.shape
122
+ pts = np.reshape(points, (sh[0], -1))
123
+ k = min(k, sh[0] - 1)
124
+ tck, _ = scipy.interpolate.splprep(pts.T, k=k, s=s)
125
+ u = np.linspace(0, 1, n, endpoint=False)
126
+ new_points = np.array(scipy.interpolate.splev(u, tck))
127
+ new_points = np.reshape(new_points.T, (n, sh[1], sh[2]))
128
+ return new_points
129
+
130
+ def generate_interpolated_path(poses, n_interp, spline_degree=5,
131
+ smoothness=.03, rot_weight=.1):
132
+ """Creates a smooth spline path between input keyframe camera poses.
133
+
134
+ Spline is calculated with poses in format (position, lookat-point, up-point).
135
+
136
+ Args:
137
+ poses: (n, 3, 4) array of input pose keyframes.
138
+ n_interp: returned path will have n_interp * (n - 1) total poses.
139
+ spline_degree: polynomial degree of B-spline.
140
+ smoothness: parameter for spline smoothing, 0 forces exact interpolation.
141
+ rot_weight: relative weighting of rotation/translation in spline solve.
142
+
143
+ Returns:
144
+ Array of new camera poses with shape (n_interp * (n - 1), 3, 4).
145
+ """
146
+
147
+ points = poses_to_points(poses, dist=rot_weight)
148
+ new_points = interp(points,
149
+ n_interp * (points.shape[0] - 1),
150
+ k=spline_degree,
151
+ s=smoothness)
152
+ return points_to_poses(new_points)
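Usage sketch for the path interpolation above, with illustrative keyframes only: given n keyframe camera-to-world matrices of shape (3, 4), the returned path has n_interp * (n - 1) poses.

import numpy as np

keyframes = np.tile(np.eye(4)[:3], (3, 1, 1))                             # three identity-rotation poses
keyframes[:, :, 3] = [[0.0, 0.0, 0.0], [0.2, 0.1, 0.0], [0.4, 0.0, 0.1]]  # camera centers
path = generate_interpolated_path(keyframes, n_interp=30)
assert path.shape == (30 * (len(keyframes) - 1), 3, 4)                    # (60, 3, 4)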
153
+
154
+ def batch_visualize_tensor_global_pca(tensor_batch, num_components=3):
155
+ B, C, H, W = tensor_batch.shape
156
+
157
+ tensor_flat_all = tensor_batch.reshape(B, C, -1).permute(1, 0, 2).reshape(C, -1).T
158
+
159
+ tensor_flat_all_np = tensor_flat_all.cpu().numpy()
160
+
161
+ scaler = StandardScaler()
162
+ tensor_flat_all_np = scaler.fit_transform(tensor_flat_all_np)
163
+
164
+ pca = PCA(n_components=num_components)
165
+ tensor_reduced_all_np = pca.fit_transform(tensor_flat_all_np)
166
+
167
+ tensor_reduced_all = torch.tensor(tensor_reduced_all_np, dtype=tensor_batch.dtype).T.reshape(num_components, B, H * W).permute(1, 0, 2)
168
+
169
+ output_tensor = torch.zeros((B, 3, H, W))
170
+
171
+ for i in range(B):
172
+ tensor_reduced = tensor_reduced_all[i].reshape(num_components, H, W)
173
+ tensor_reduced -= tensor_reduced.min()
174
+ tensor_reduced /= tensor_reduced.max()
175
+ output_tensor[i] = tensor_reduced[:3]
176
+
177
+ return output_tensor
178
+
179
+ def depth_to_colormap(depth_tensor, colormap='jet'):
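+ """Normalize a (B, 1, H, W) depth batch with a global min/max and map it through a matplotlib colormap, returning a (B, 3, H, W) tensor."""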
180
+ B, _, _, _ = depth_tensor.shape
181
+
182
+ depth_tensor = (depth_tensor - depth_tensor.min()) / (depth_tensor.max() - depth_tensor.min())
183
+
184
+ depth_np = depth_tensor.squeeze(1).cpu().numpy()
185
+
186
+ cmap = plt.get_cmap(colormap)
187
+ colored_images = []
188
+
189
+ for i in range(B):
190
+ colored_image = cmap(depth_np[i])
191
+ colored_images.append(colored_image[..., :3])
192
+
193
+ colored_tensor = torch.tensor(np.array(colored_images), dtype=torch.float32).permute(0, 3, 1, 2)
194
+
195
+ return colored_tensor
196
+
197
+ def save_video(frames, video_path, fps=24):
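+ """Encode a list of H x W x 3 uint8 frames into a video file with moviepy."""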
198
+ clips = [mpy.ImageClip(frame).set_duration(1/fps) for frame in frames]
199
+ video = mpy.concatenate_videoclips(clips, method="compose")
200
+ video.write_videofile(video_path, fps=fps)
201
+
202
+ def tensors_to_videos(all_images, all_depth_vis, all_fmap_vis, all_sems_vis, video_dir='videos', fps=24):
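+ """Write the rendered images, depth visualizations, and feature-map visualizations as mp4 videos under video_dir."""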
203
+ B, C, H, W = all_images.shape
204
+ assert all_depth_vis.shape == (B, C, H, W)
205
+ assert all_fmap_vis.shape == (B, C, H, W)
206
+ assert all_sems_vis.shape == (B, C, H, W)
207
+ os.makedirs(video_dir, exist_ok=True)
208
+
209
+ all_images = (all_images.permute(0, 2, 3, 1).cpu().numpy() * 255).astype(np.uint8)
210
+ all_depth_vis = (all_depth_vis.permute(0, 2, 3, 1).cpu().numpy() * 255).astype(np.uint8)
211
+ all_fmap_vis = (all_fmap_vis.permute(0, 2, 3, 1).cpu().numpy() * 255).astype(np.uint8)
212
+ all_sems_vis = (all_sems_vis.permute(0, 2, 3, 1).cpu().numpy() * 255).astype(np.uint8)
213
+
214
+ save_video(all_images, os.path.join(video_dir, 'output_images_video.mp4'), fps=fps)
215
+ save_video(all_depth_vis, os.path.join(video_dir, 'output_depth_video.mp4'), fps=fps)
216
+ save_video(all_fmap_vis, os.path.join(video_dir, 'output_fmap_video.mp4'), fps=fps)
217
+ # save_video(all_sems_vis, os.path.join(video_dir, 'output_sems_video.mp4'), fps=fps)
218
+
219
+ print(f'Videos saved to {video_dir}')
220
+
221
+ def transfer_images_to_device(images, device):
222
+ """
223
+ Transfer the loaded images to the specified device.
224
+
225
+ Args:
226
+ images (list): List of dictionaries containing image data.
227
+ device (str or torch.device): The device to transfer the data to.
228
+
229
+ Returns:
230
+ list: List of dictionaries with image data transferred to the specified device.
231
+ """
232
+ transferred_images = []
233
+ for img_dict in images:
234
+ transferred_dict = {
235
+ 'img': img_dict['img'].to(device),
236
+ 'true_shape': torch.tensor(img_dict['true_shape'], device=device),
237
+ 'idx': img_dict['idx'],
238
+ 'instance': img_dict['instance']
239
+ }
240
+ transferred_images.append(transferred_dict)
241
+ return transferred_images
242
+
243
+ def render_camera_path(video_poses, camera_params, gaussians, model, device, pipeline, bg_color, image_shape):
244
+ """Helper function for rendering a camera path.
245
+
246
+ Args:
247
+ video_poses: list of camera poses.
248
+ camera_params: camera parameters containing extrinsics and intrinsics.
249
+ gaussians: Gaussian model.
250
+ model: feature extraction model.
251
+ device: compute device.
252
+ pipeline: rendering pipeline.
253
+ bg_color: background color.
254
+ image_shape: image size.
255
+
256
+ Returns:
257
+ rendered_images: rendered images.
258
+ rendered_feats: rendered feature maps.
259
+ rendered_depths: rendered depth maps.
260
+ rendered_sems: rendered semantic maps.
261
+ """
262
+ extrinsics, intrinsics = camera_params
263
+ rendered_images = []
264
+ rendered_feats = []
265
+ rendered_depths = []
266
+ rendered_sems = []
267
+
268
+ for i in range(len(video_poses)):
269
+ target_extrinsics = torch.zeros(4, 4).to(device)
270
+ target_extrinsics[3, 3] = 1.0
271
+ target_extrinsics[:3, :4] = torch.tensor(video_poses[i], device=device)
272
+ camera = get_scaled_camera(extrinsics[0], target_extrinsics, intrinsics[0], 1.0, image_shape)
273
+
274
+ rendered_output = render(camera, gaussians, pipeline, bg_color)
275
+ rendered_images.append(rendered_output['render'])
276
+
277
+ # Process the feature map
278
+ feature_map = rendered_output['feature_map']
279
+ feature_map = model.feature_expansion(feature_map[None, ...])
280
+
281
+ # Decode the semantic map
282
+ logits = model.lseg_feature_extractor.decode_feature(feature_map, labelset=LABELS)
283
+ semantic_map = torch.argmax(logits, dim=1) + 1
284
+ mask = COLORS[semantic_map.cpu()]
285
+ mask = rearrange(mask, 'b h w c -> b c h w')
286
+ rendered_sems.append(mask.squeeze(0))
287
+
288
+ # Subsample feature channels, then upsample the feature map spatially
289
+ feature_map = feature_map[:, ::16, ...]
290
+ feature_map = torch.nn.functional.interpolate(feature_map, scale_factor=2, mode='bilinear', align_corners=True)
291
+ rendered_feats.append(feature_map[0])
292
+ del feature_map
293
+
294
+ rendered_depths.append(rendered_output['depth'])
295
+
296
+ # Stack and post-process the results
297
+ rendered_images = torch.clamp(torch.stack(rendered_images, dim=0), 0, 1)
298
+ rendered_feats = torch.stack(rendered_feats, dim=0)
299
+ rendered_depths = torch.stack(rendered_depths, dim=0)
300
+ rendered_sems = torch.stack(rendered_sems, dim=0)
301
+
302
+ return rendered_images, rendered_feats, rendered_depths, rendered_sems
303
+
304
+ @torch.no_grad()
305
+ def render_video_from_file(file_list, model, output_path, device='cuda', resolution=224, n_interp=90, fps=30, path_type='default'):
306
+ # 1. load images
307
+ images = load_images(file_list, resolution, save_dir=os.path.join(output_path, 'processed_images'))
308
+ images = transfer_images_to_device(images, device) # Transfer images to the specified device
309
+ image_shape = images[0]['true_shape'][0]
310
+ # 2. get camera pose
311
+ pairs = make_pairs(images, prefilter=None, symmetrize=True)
312
+ output = inference(pairs, model.mast3r, device, batch_size=1)
313
+ mode = GlobalAlignerMode.PairViewer
314
+ scene = global_aligner(output, device=device, mode=mode)
315
+ extrinsics = scene.get_im_poses()
316
+ intrinsics = scene.get_intrinsics()
317
+ video_poses = generate_interpolated_path(extrinsics[:, :3, :].cpu().numpy(), n_interp=n_interp) # extrinsics: (b, 3, 4)
318
+ # 3. get gaussians
319
+ pred1, pred2 = model(*images)
320
+ pred = merge_and_split_predictions(pred1, pred2)
321
+ gaussians = GaussianModel.from_predictions(pred[0], sh_degree=3)
322
+ # 4. Render from the original viewpoints
323
+ pipeline = DummyPipeline()
324
+ bg_color = torch.tensor([0.0, 0.0, 0.0]).to(device)
325
+ camera_params = (extrinsics, intrinsics)
326
+
327
+ rendered_images, rendered_feats, rendered_depths, rendered_sems = render_camera_path(
328
+ video_poses, camera_params, gaussians, model, device, pipeline, bg_color, image_shape)
329
+
330
+ # 5. Visualize
331
+ all_fmap_vis = batch_visualize_tensor_global_pca(rendered_feats)
332
+ all_depth_vis = depth_to_colormap(rendered_depths)
333
+ all_sems_vis = rendered_sems
334
+
335
+ # 6. Save videos and the Gaussian point cloud
336
+ tensors_to_videos(rendered_images, all_depth_vis, all_fmap_vis, all_sems_vis, output_path, fps=fps)
337
+ gaussians.save_ply(os.path.join(output_path, 'gaussians.ply'))
338
+
339
+ # 7. Render from the moved viewpoints
340
+ moved_extrinsics = move_c2w_along_z(extrinsics, 2.0)
341
+ moved_video_poses = generate_interpolated_path(moved_extrinsics[:, :3, :].cpu().numpy(), n_interp=n_interp)
342
+ camera_params = (extrinsics, intrinsics)
343
+
344
+ moved_rendered_images, moved_rendered_feats, moved_rendered_depths, moved_rendered_sems = render_camera_path(
345
+ moved_video_poses, camera_params, gaussians, model, device, pipeline, bg_color, image_shape)
346
+
347
+ # 8. Visualize and save the moved-view results
348
+ moved_all_fmap_vis = batch_visualize_tensor_global_pca(moved_rendered_feats)
349
+ moved_all_depth_vis = depth_to_colormap(moved_rendered_depths)
350
+ moved_all_sems_vis = moved_rendered_sems
351
+
352
+ moved_output_path = os.path.join(output_path, 'moved')
353
+ os.makedirs(moved_output_path, exist_ok=True)
354
+ tensors_to_videos(moved_rendered_images, moved_all_depth_vis, moved_all_fmap_vis, moved_all_sems_vis,
355
+ moved_output_path, fps=fps)
submodules/PointTransformerV3/.gitmodules ADDED
@@ -0,0 +1,3 @@
1
+ [submodule "Pointcept"]
2
+ path = Pointcept
3
+ url = https://github.com/Pointcept/Pointcept
submodules/PointTransformerV3/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Pointcept
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
submodules/PointTransformerV3/Pointcept/.github/workflows/formatter.yml ADDED
@@ -0,0 +1,20 @@
1
+ name: Formatter
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ push:
6
+ branches:
7
+ - main
8
+ pull_request:
9
+ types: [opened, reopened, synchronize]
10
+
11
+ concurrency:
12
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
13
+ cancel-in-progress: true
14
+
15
+ jobs:
16
+ formatter:
17
+ runs-on: ubuntu-latest
18
+ steps:
19
+ - uses: actions/checkout@v3
20
+ - uses: psf/black@stable
submodules/PointTransformerV3/Pointcept/.gitignore ADDED
@@ -0,0 +1,16 @@
1
+ image/
2
+ __pycache__
3
+ **/build/
4
+ **/*.egg-info/
5
+ **/dist/
6
+ *.so
7
+ exp
8
+ weights
9
+ data
10
+ log
11
+ outputs/
12
+ .vscode
13
+ .idea
14
+ */.DS_Store
15
+ **/*.out
16
+ Dockerfile
submodules/PointTransformerV3/Pointcept/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Pointcept
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
submodules/PointTransformerV3/Pointcept/README.md ADDED
@@ -0,0 +1,896 @@
1
+ <p align="center">
2
+ <!-- pypi-strip -->
3
+ <picture>
4
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/Pointcept/Pointcept/main/docs/logo_dark.png">
5
+ <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/Pointcept/Pointcept/main/docs/logo.png">
6
+ <!-- /pypi-strip -->
7
+ <img alt="pointcept" src="https://raw.githubusercontent.com/Pointcept/Pointcept/main/docs/logo.png" width="400">
8
+ <!-- pypi-strip -->
9
+ </picture><br>
10
+ <!-- /pypi-strip -->
11
+ </p>
12
+
13
+ [![Formatter](https://github.com/pointcept/pointcept/actions/workflows/formatter.yml/badge.svg)](https://github.com/pointcept/pointcept/actions/workflows/formatter.yml)
14
+
15
+ **Pointcept** is a powerful and flexible codebase for point cloud perception research. It is also an official implementation of the following papers:
16
+ - **Point Transformer V3: Simpler, Faster, Stronger**
17
+ *Xiaoyang Wu, Li Jiang, Peng-Shuai Wang, Zhijian Liu, Xihui Liu, Yu Qiao, Wanli Ouyang, Tong He, Hengshuang Zhao*
18
+ IEEE Conference on Computer Vision and Pattern Recognition (**CVPR**) 2024 - Oral
19
+ [ Backbone ] [PTv3] - [ [arXiv](https://arxiv.org/abs/2312.10035) ] [ [Bib](https://xywu.me/research/ptv3/bib.txt) ] [ [Project](https://github.com/Pointcept/PointTransformerV3) ] &rarr; [here](https://github.com/Pointcept/PointTransformerV3)
20
+
21
+ - **OA-CNNs: Omni-Adaptive Sparse CNNs for 3D Semantic Segmentation**
22
+ *Bohao Peng, Xiaoyang Wu, Li Jiang, Yukang Chen, Hengshuang Zhao, Zhuotao Tian, Jiaya Jia*
23
+ IEEE Conference on Computer Vision and Pattern Recognition (**CVPR**) 2024
24
+ [ Backbone ] [ OA-CNNs ] - [ [arXiv](https://arxiv.org/abs/2403.14418) ] [ [Bib](https://xywu.me/research/oacnns/bib.txt) ] &rarr; [here](#oa-cnns)
25
+
26
+ - **PonderV2: Pave the Way for 3D Foundation Model with A Universal Pre-training Paradigm**
27
+ *Haoyi Zhu\*, Honghui Yang\*, Xiaoyang Wu\*, Di Huang\*, Sha Zhang, Xianglong He, Tong He, Hengshuang Zhao, Chunhua Shen, Yu Qiao, Wanli Ouyang*
28
+ arXiv Preprint 2023
29
+ [ Pretrain ] [PonderV2] - [ [arXiv](https://arxiv.org/abs/2310.08586) ] [ [Bib](https://xywu.me/research/ponderv2/bib.txt) ] [ [Project](https://github.com/OpenGVLab/PonderV2) ] &rarr; [here](https://github.com/OpenGVLab/PonderV2)
30
+
31
+
32
+ - **Towards Large-scale 3D Representation Learning with Multi-dataset Point Prompt Training**
33
+ *Xiaoyang Wu, Zhuotao Tian, Xin Wen, Bohao Peng, Xihui Liu, Kaicheng Yu, Hengshuang Zhao*
34
+ IEEE Conference on Computer Vision and Pattern Recognition (**CVPR**) 2024
35
+ [ Pretrain ] [PPT] - [ [arXiv](https://arxiv.org/abs/2308.09718) ] [ [Bib](https://xywu.me/research/ppt/bib.txt) ] &rarr; [here](#point-prompt-training-ppt)
36
+
37
+ - **Masked Scene Contrast: A Scalable Framework for Unsupervised 3D Representation Learning**
38
+ *Xiaoyang Wu, Xin Wen, Xihui Liu, Hengshuang Zhao*
39
+ IEEE Conference on Computer Vision and Pattern Recognition (**CVPR**) 2023
40
+ [ Pretrain ] [ MSC ] - [ [arXiv](https://arxiv.org/abs/2303.14191) ] [ [Bib](https://xywu.me/research/msc/bib.txt) ] &rarr; [here](#masked-scene-contrast-msc)
41
+
42
+
43
+ - **Learning Context-aware Classifier for Semantic Segmentation** (3D Part)
44
+ *Zhuotao Tian, Jiequan Cui, Li Jiang, Xiaojuan Qi, Xin Lai, Yixin Chen, Shu Liu, Jiaya Jia*
45
+ AAAI Conference on Artificial Intelligence (**AAAI**) 2023 - Oral
46
+ [ SemSeg ] [ CAC ] - [ [arXiv](https://arxiv.org/abs/2303.11633) ] [ [Bib](https://xywu.me/research/cac/bib.txt) ] [ [2D Part](https://github.com/tianzhuotao/CAC) ] &rarr; [here](#context-aware-classifier)
47
+
48
+
49
+ - **Point Transformer V2: Grouped Vector Attention and Partition-based Pooling**
50
+ *Xiaoyang Wu, Yixing Lao, Li Jiang, Xihui Liu, Hengshuang Zhao*
51
+ Conference on Neural Information Processing Systems (**NeurIPS**) 2022
52
+ [ Backbone ] [ PTv2 ] - [ [arXiv](https://arxiv.org/abs/2210.05666) ] [ [Bib](https://xywu.me/research/ptv2/bib.txt) ] &rarr; [here](#point-transformers)
53
+
54
+
55
+ - **Point Transformer**
56
+ *Hengshuang Zhao, Li Jiang, Jiaya Jia, Philip Torr, Vladlen Koltun*
57
+ IEEE International Conference on Computer Vision (**ICCV**) 2021 - Oral
58
+ [ Backbone ] [ PTv1 ] - [ [arXiv](https://arxiv.org/abs/2012.09164) ] [ [Bib](https://hszhao.github.io/papers/iccv21_pointtransformer_bib.txt) ] &rarr; [here](#point-transformers)
59
+
60
+ Additionally, **Pointcept** integrates the following excellent work (including the above):
61
+ Backbone:
62
+ [MinkUNet](https://github.com/NVIDIA/MinkowskiEngine) ([here](#sparseunet)),
63
+ [SpUNet](https://github.com/traveller59/spconv) ([here](#sparseunet)),
64
+ [SPVCNN](https://github.com/mit-han-lab/spvnas) ([here](#spvcnn)),
65
+ [OACNNs](https://arxiv.org/abs/2403.14418) ([here](#oa-cnns)),
66
+ [PTv1](https://arxiv.org/abs/2012.09164) ([here](#point-transformers)),
67
+ [PTv2](https://arxiv.org/abs/2210.05666) ([here](#point-transformers)),
68
+ [PTv3](https://arxiv.org/abs/2312.10035) ([here](#point-transformers)),
69
+ [StratifiedFormer](https://github.com/dvlab-research/Stratified-Transformer) ([here](#stratified-transformer)),
70
+ [OctFormer](https://github.com/octree-nn/octformer) ([here](#octformer)),
71
+ [Swin3D](https://github.com/microsoft/Swin3D) ([here](#swin3d));
72
+ Semantic Segmentation:
73
+ [Mix3d](https://github.com/kumuji/mix3d) ([here](https://github.com/Pointcept/Pointcept/blob/main/configs/scannet/semseg-spunet-v1m1-0-base.py#L5)),
74
+ [CAC](https://arxiv.org/abs/2303.11633) ([here](#context-aware-classifier));
75
+ Instance Segmentation:
76
+ [PointGroup](https://github.com/dvlab-research/PointGroup) ([here](#pointgroup));
77
+ Pre-training:
78
+ [PointContrast](https://github.com/facebookresearch/PointContrast) ([here](#pointcontrast)),
79
+ [Contrastive Scene Contexts](https://github.com/facebookresearch/ContrastiveSceneContexts) ([here](#contrastive-scene-contexts)),
80
+ [Masked Scene Contrast](https://arxiv.org/abs/2303.14191) ([here](#masked-scene-contrast-msc)),
81
+ [Point Prompt Training](https://arxiv.org/abs/2308.09718) ([here](#point-prompt-training-ppt));
82
+ Datasets:
83
+ [ScanNet](http://www.scan-net.org/) ([here](#scannet-v2)),
84
+ [ScanNet200](http://www.scan-net.org/) ([here](#scannet-v2)),
85
+ [ScanNet++](https://kaldir.vc.in.tum.de/scannetpp/) ([here](#scannet)),
86
+ [S3DIS](https://docs.google.com/forms/d/e/1FAIpQLScDimvNMCGhy_rmBA2gHfDu3naktRm6A8BPwAWWDv-Uhm6Shw/viewform?c=0&w=1) ([here](#s3dis)),
87
+ [Matterport3D](https://niessner.github.io/Matterport/) ([here](#matterport3d)),
88
+ [ArkitScene](https://github.com/apple/ARKitScenes),
89
+ [Structured3D](https://structured3d-dataset.org/) ([here](#structured3d)),
90
+ [SemanticKITTI](http://www.semantic-kitti.org/) ([here](#semantickitti)),
91
+ [nuScenes](https://www.nuscenes.org/nuscenes) ([here](#nuscenes)),
92
+ [ModelNet40](https://modelnet.cs.princeton.edu/) ([here](#modelnet)),
93
+ [Waymo](https://waymo.com/open/) ([here](#waymo)).
94
+
95
+
96
+ ## Highlights
97
+ - *May, 2024*: In v1.5.2, we redesigned the default structure for each dataset for better performance. Please **re-preprocess** datasets or **download** our preprocessed datasets from **[here](https://huggingface.co/Pointcept)**.
98
+ - *Apr, 2024*: **PTv3** is selected as one of the 90 **Oral** papers (3.3% accepted papers, 0.78% submissions) by CVPR'24!
99
+ - *Mar, 2024*: We released the code for **OA-CNNs**, accepted by CVPR'24. Issues related to **OA-CNNs** can be directed to @Pbihao.
100
+ - *Feb, 2024*: **PTv3** and **PPT** are accepted by CVPR'24, another **two** papers by our Pointcept team have also been accepted by CVPR'24 🎉🎉🎉. We will make them publicly available soon!
101
+ - *Dec, 2023*: **PTv3** is released on arXiv, and the code is available in Pointcept. PTv3 is an efficient backbone model that achieves SOTA performances across indoor and outdoor scenarios.
102
+ - *Aug, 2023*: **PPT** is released on arXiv. PPT presents a multi-dataset pre-training framework that achieves SOTA performance in both **indoor** and **outdoor** scenarios. It is compatible with various existing pre-training frameworks and backbones. A **pre-release** version of the code is accessible; for those interested, please feel free to contact me directly for access.
103
+ - *Mar, 2023*: We released our codebase, **Pointcept**, a highly potent tool for point cloud representation learning and perception. We welcome new work to join the _Pointcept_ family and highly recommend reading [Quick Start](#quick-start) before starting your trial.
104
+ - *Feb, 2023*: **MSC** and **CeCo** accepted by CVPR 2023. _MSC_ is a highly efficient and effective pretraining framework that facilitates cross-dataset large-scale pretraining, while _CeCo_ is a segmentation method specifically designed for long-tail datasets. Both approaches are compatible with all existing backbone models in our codebase, and we will soon make the code available for public use.
105
+ - *Jan, 2023*: **CAC**, oral work of AAAI 2023, has expanded its 3D result with the incorporation of Pointcept. This addition will allow CAC to serve as a pluggable segmentor within our codebase.
106
+ - *Sep, 2022*: **PTv2** accepted by NeurIPS 2022. It is a continuation of the Point Transformer. The proposed GVA theory can apply to most existing attention mechanisms, while Grid Pooling is also a practical addition to existing pooling methods.
107
+
108
+ ## Citation
109
+ If you find _Pointcept_ useful to your research, please cite our work as encouragement. (੭ˊ꒳​ˋ)੭✧
110
+ ```
111
+ @misc{pointcept2023,
112
+ title={Pointcept: A Codebase for Point Cloud Perception Research},
113
+ author={Pointcept Contributors},
114
+ howpublished = {\url{https://github.com/Pointcept/Pointcept}},
115
+ year={2023}
116
+ }
117
+ ```
118
+
119
+ ## Overview
120
+
121
+ - [Installation](#installation)
122
+ - [Data Preparation](#data-preparation)
123
+ - [Quick Start](#quick-start)
124
+ - [Model Zoo](#model-zoo)
125
+ - [Citation](#citation)
126
+ - [Acknowledgement](#acknowledgement)
127
+
128
+ ## Installation
129
+
130
+ ### Requirements
131
+ - Ubuntu: 18.04 and above.
132
+ - CUDA: 11.3 and above.
133
+ - PyTorch: 1.10.0 and above.
134
+
135
+ ### Conda Environment
136
+
137
+ ```bash
138
+ conda create -n pointcept python=3.8 -y
139
+ conda activate pointcept
140
+ conda install ninja -y
141
+ # Choose version you want here: https://pytorch.org/get-started/previous-versions/
142
+ conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch -y
143
+ conda install h5py pyyaml -c anaconda -y
144
+ conda install sharedarray tensorboard tensorboardx yapf addict einops scipy plyfile termcolor timm -c conda-forge -y
145
+ conda install pytorch-cluster pytorch-scatter pytorch-sparse -c pyg -y
146
+ pip install torch-geometric
147
+
148
+ # spconv (SparseUNet)
149
+ # refer https://github.com/traveller59/spconv
150
+ pip install spconv-cu113
151
+
152
+ # PPT (clip)
153
+ pip install ftfy regex tqdm
154
+ pip install git+https://github.com/openai/CLIP.git
155
+
156
+ # PTv1 & PTv2 or precise eval
157
+ cd libs/pointops
158
+ # usual
159
+ python setup.py install
160
+ # docker & multi GPU arch
161
+ TORCH_CUDA_ARCH_LIST="ARCH LIST" python setup.py install
162
+ # e.g. 7.5: RTX 3000; 8.0: a100 More available in: https://developer.nvidia.com/cuda-gpus
163
+ TORCH_CUDA_ARCH_LIST="7.5 8.0" python setup.py install
164
+ cd ../..
165
+
166
+ # Open3D (visualization, optional)
167
+ pip install open3d
168
+ ```
169
+
170
+ ## Data Preparation
171
+
172
+ ### ScanNet v2
173
+
174
+ The preprocessing supports semantic and instance segmentation for `ScanNet20`, `ScanNet200`, and `ScanNet Data Efficient`.
175
+ - Download the [ScanNet](http://www.scan-net.org/) v2 dataset.
176
+ - Run preprocessing code for raw ScanNet as follows:
177
+
178
+ ```bash
179
+ # RAW_SCANNET_DIR: the directory of downloaded ScanNet v2 raw dataset.
180
+ # PROCESSED_SCANNET_DIR: the directory of the processed ScanNet dataset (output dir).
181
+ python pointcept/datasets/preprocessing/scannet/preprocess_scannet.py --dataset_root ${RAW_SCANNET_DIR} --output_root ${PROCESSED_SCANNET_DIR}
182
+ ```
183
+ - (Optional) Download ScanNet Data Efficient files:
184
+ ```bash
185
+ # download-scannet.py is the official download script
186
+ # or follow instructions here: https://kaldir.vc.in.tum.de/scannet_benchmark/data_efficient/documentation#download
187
+ python download-scannet.py --data_efficient -o ${RAW_SCANNET_DIR}
188
+ # unzip downloads
189
+ cd ${RAW_SCANNET_DIR}/tasks
190
+ unzip limited-annotation-points.zip
191
+ unzip limited-reconstruction-scenes.zip
192
+ # copy files to processed dataset folder
193
+ mkdir ${PROCESSED_SCANNET_DIR}/tasks
194
+ cp -r ${RAW_SCANNET_DIR}/tasks/points ${PROCESSED_SCANNET_DIR}/tasks
195
+ cp -r ${RAW_SCANNET_DIR}/tasks/scenes ${PROCESSED_SCANNET_DIR}/tasks
196
+ ```
197
+ - (Alternative) Our preprocessed data can be downloaded directly [[here](https://huggingface.co/datasets/Pointcept/scannet-compressed)]; please agree to the official license before downloading it.
198
+
199
+ - Link processed dataset to codebase:
200
+ ```bash
201
+ # PROCESSED_SCANNET_DIR: the directory of the processed ScanNet dataset.
202
+ mkdir data
203
+ ln -s ${PROCESSED_SCANNET_DIR} ${CODEBASE_DIR}/data/scannet
204
+ ```
205
+
206
+ ### ScanNet++
207
+ - Download the [ScanNet++](https://kaldir.vc.in.tum.de/scannetpp/) dataset.
208
+ - Run preprocessing code for raw ScanNet++ as follows:
209
+ ```bash
210
+ # RAW_SCANNETPP_DIR: the directory of downloaded ScanNet++ raw dataset.
211
+ # PROCESSED_SCANNETPP_DIR: the directory of the processed ScanNet++ dataset (output dir).
212
+ # NUM_WORKERS: the number of workers for parallel preprocessing.
213
+ python pointcept/datasets/preprocessing/scannetpp/preprocess_scannetpp.py --dataset_root ${RAW_SCANNETPP_DIR} --output_root ${PROCESSED_SCANNETPP_DIR} --num_workers ${NUM_WORKERS}
214
+ ```
215
+ - Sampling and chunking large point cloud data in train/val split as follows (only used for training):
216
+ ```bash
217
+ # PROCESSED_SCANNETPP_DIR: the directory of the processed ScanNet++ dataset (output dir).
218
+ # NUM_WORKERS: the number of workers for parallel preprocessing.
219
+ python pointcept/datasets/preprocessing/sampling_chunking_data.py --dataset_root ${PROCESSED_SCANNETPP_DIR} --grid_size 0.01 --chunk_range 6 6 --chunk_stride 3 3 --split train --num_workers ${NUM_WORKERS}
220
+ python pointcept/datasets/preprocessing/sampling_chunking_data.py --dataset_root ${PROCESSED_SCANNETPP_DIR} --grid_size 0.01 --chunk_range 6 6 --chunk_stride 3 3 --split val --num_workers ${NUM_WORKERS}
221
+ ```
222
+ - (Alternative) Our preprocessed data can be downloaded directly [[here](https://huggingface.co/datasets/Pointcept/scannetpp-compressed)]; please agree to the official license before downloading it.
223
+ - Link processed dataset to codebase:
224
+ ```bash
225
+ # PROCESSED_SCANNETPP_DIR: the directory of the processed ScanNet++ dataset.
226
+ mkdir data
227
+ ln -s ${PROCESSED_SCANNETPP_DIR} ${CODEBASE_DIR}/data/scannetpp
228
+ ```
229
+
230
+ ### S3DIS
231
+
232
+ - Download S3DIS data by filling this [Google form](https://docs.google.com/forms/d/e/1FAIpQLScDimvNMCGhy_rmBA2gHfDu3naktRm6A8BPwAWWDv-Uhm6Shw/viewform?c=0&w=1). Download the `Stanford3dDataset_v1.2.zip` file and unzip it.
233
+ - Fix error in `Area_5/office_19/Annotations/ceiling` Line 323474 (103.0�0000 => 103.000000).
234
+ - (Optional) Download Full 2D-3D S3DIS dataset (no XYZ) from [here](https://github.com/alexsax/2D-3D-Semantics) for parsing normal.
235
+ - Run preprocessing code for S3DIS as follows:
236
+
237
+ ```bash
238
+ # S3DIS_DIR: the directory of downloaded Stanford3dDataset_v1.2 dataset.
239
+ # RAW_S3DIS_DIR: the directory of Stanford2d3dDataset_noXYZ dataset. (optional, for parsing normal)
240
+ # PROCESSED_S3DIS_DIR: the directory of processed S3DIS dataset (output dir).
241
+
242
+ # S3DIS without aligned angle
243
+ python pointcept/datasets/preprocessing/s3dis/preprocess_s3dis.py --dataset_root ${S3DIS_DIR} --output_root ${PROCESSED_S3DIS_DIR}
244
+ # S3DIS with aligned angle
245
+ python pointcept/datasets/preprocessing/s3dis/preprocess_s3dis.py --dataset_root ${S3DIS_DIR} --output_root ${PROCESSED_S3DIS_DIR} --align_angle
246
+ # S3DIS with normal vector (recommended, normal is helpful)
247
+ python pointcept/datasets/preprocessing/s3dis/preprocess_s3dis.py --dataset_root ${S3DIS_DIR} --output_root ${PROCESSED_S3DIS_DIR} --raw_root ${RAW_S3DIS_DIR} --parse_normal
248
+ python pointcept/datasets/preprocessing/s3dis/preprocess_s3dis.py --dataset_root ${S3DIS_DIR} --output_root ${PROCESSED_S3DIS_DIR} --raw_root ${RAW_S3DIS_DIR} --align_angle --parse_normal
249
+ ```
250
+
251
+ - (Alternative) Our preprocessed data can also be downloaded [[here](https://huggingface.co/datasets/Pointcept/s3dis-compressed
252
+ )] (with normal vector and aligned angle); please agree to the official license before downloading it.
253
+
254
+ - Link processed dataset to codebase.
255
+ ```bash
256
+ # PROCESSED_S3DIS_DIR: the directory of processed S3DIS dataset.
257
+ mkdir data
258
+ ln -s ${PROCESSED_S3DIS_DIR} ${CODEBASE_DIR}/data/s3dis
259
+ ```
260
+ ### Structured3D
261
+
262
+ - Download Structured3D panorama related and perspective (full) related zip files by filling this [Google form](https://docs.google.com/forms/d/e/1FAIpQLSc0qtvh4vHSoZaW6UvlXYy79MbcGdZfICjh4_t4bYofQIVIdw/viewform?pli=1) (no need to unzip them).
263
+ - Organize all downloaded zip file in one folder (`${STRUCT3D_DIR}`).
264
+ - Run preprocessing code for Structured3D as follows:
265
+ ```bash
266
+ # STRUCT3D_DIR: the directory of downloaded Structured3D dataset.
267
+ # PROCESSED_STRUCT3D_DIR: the directory of processed Structured3D dataset (output dir).
268
+ # NUM_WORKERS: number of workers for preprocessing; defaults to the CPU count (might OOM).
269
+ export PYTHONPATH=./
270
+ python pointcept/datasets/preprocessing/structured3d/preprocess_structured3d.py --dataset_root ${STRUCT3D_DIR} --output_root ${PROCESSED_STRUCT3D_DIR} --num_workers ${NUM_WORKERS} --grid_size 0.01 --fuse_prsp --fuse_pano
271
+ ```
272
+ Following the instruction of [Swin3D](https://arxiv.org/abs/2304.06906), we keep 25 categories with frequencies of more than 0.001, out of the original 40 categories.
273
+
274
+ [//]: # (- &#40;Alternative&#41; Our preprocess data can also be downloaded [[here]&#40;&#41;], please agree the official license before download it.)
275
+
276
+ - (Alternative) Our preprocessed data can also be downloaded [[here](https://huggingface.co/datasets/Pointcept/structured3d-compressed
277
+ )] (with perspective views and panorama view, 471.7G after unzipping); please agree to the official license before downloading it.
278
+
279
+ - Link processed dataset to codebase.
280
+ ```bash
281
+ # PROCESSED_STRUCT3D_DIR: the directory of processed Structured3D dataset (output dir).
282
+ mkdir data
283
+ ln -s ${PROCESSED_STRUCT3D_DIR} ${CODEBASE_DIR}/data/structured3d
284
+ ```
285
+ ### Matterport3D
286
+ - Follow [this page](https://niessner.github.io/Matterport/#download) to request access to the dataset.
287
+ - Download the "region_segmentation" type, which represents the division of a scene into individual rooms.
288
+ ```bash
289
+ # download-mp.py is the official download script
290
+ # MATTERPORT3D_DIR: the directory of downloaded Matterport3D dataset.
291
+ python download-mp.py -o {MATTERPORT3D_DIR} --type region_segmentations
292
+ ```
293
+ - Unzip the region_segmentations data
294
+ ```bash
295
+ # MATTERPORT3D_DIR: the directory of downloaded Matterport3D dataset.
296
+ python pointcept/datasets/preprocessing/matterport3d/unzip_matterport3d_region_segmentation.py --dataset_root {MATTERPORT3D_DIR}
297
+ ```
298
+ - Run preprocessing code for Matterport3D as follows:
299
+ ```bash
300
+ # MATTERPORT3D_DIR: the directory of downloaded Matterport3D dataset.
301
+ # PROCESSED_MATTERPORT3D_DIR: the directory of processed Matterport3D dataset (output dir).
302
+ # NUM_WORKERS: the number of workers for this preprocessing.
303
+ python pointcept/datasets/preprocessing/matterport3d/preprocess_matterport3d_mesh.py --dataset_root ${MATTERPORT3D_DIR} --output_root ${PROCESSED_MATTERPORT3D_DIR} --num_workers ${NUM_WORKERS}
304
+ ```
305
+ - Link processed dataset to codebase.
306
+ ```bash
307
+ # PROCESSED_MATTERPORT3D_DIR: the directory of processed Matterport3D dataset (output dir).
308
+ mkdir data
309
+ ln -s ${PROCESSED_MATTERPORT3D_DIR} ${CODEBASE_DIR}/data/matterport3d
310
+ ```
311
+
312
+ Following the instruction of [OpenRooms](https://github.com/ViLab-UCSD/OpenRooms), we remapped Matterport3D's categories to ScanNet 20 semantic categories with the addition of a ceiling category.
313
+ * (Alternative) Our preprocessed data can also be downloaded [here](https://huggingface.co/datasets/Pointcept/matterport3d-compressed); please agree to the official license before downloading it.
314
+
315
+ ### SemanticKITTI
316
+ - Download [SemanticKITTI](http://www.semantic-kitti.org/dataset.html#download) dataset.
317
+ - Link dataset to codebase.
318
+ ```bash
319
+ # SEMANTIC_KITTI_DIR: the directory of SemanticKITTI dataset.
320
+ # |- SEMANTIC_KITTI_DIR
321
+ # |- dataset
322
+ # |- sequences
323
+ # |- 00
324
+ # |- 01
325
+ # |- ...
326
+
327
+ mkdir -p data
328
+ ln -s ${SEMANTIC_KITTI_DIR} ${CODEBASE_DIR}/data/semantic_kitti
329
+ ```
330
+
331
+ ### nuScenes
332
+ - Download the official [NuScene](https://www.nuscenes.org/nuscenes#download) dataset (with Lidar Segmentation) and organize the downloaded files as follows:
333
+ ```bash
334
+ NUSCENES_DIR
335
+ │── samples
336
+ │── sweeps
337
+ │── lidarseg
338
+ ...
339
+ │── v1.0-trainval
340
+ │── v1.0-test
341
+ ```
342
+ - Run information preprocessing code (modified from OpenPCDet) for nuScenes as follows:
343
+ ```bash
344
+ # NUSCENES_DIR: the directory of downloaded nuScenes dataset.
345
+ # PROCESSED_NUSCENES_DIR: the directory of processed nuScenes dataset (output dir).
346
+ # MAX_SWEEPS: Max number of sweeps. Default: 10.
347
+ pip install nuscenes-devkit pyquaternion
348
+ python pointcept/datasets/preprocessing/nuscenes/preprocess_nuscenes_info.py --dataset_root ${NUSCENES_DIR} --output_root ${PROCESSED_NUSCENES_DIR} --max_sweeps ${MAX_SWEEPS} --with_camera
349
+ ```
350
+ - (Alternative) Our preprocessed nuScenes information data can also be downloaded [[here](
351
+ https://huggingface.co/datasets/Pointcept/nuscenes-compressed)] (only the processed information; you still need to download the raw dataset and link it to the folder); please agree to the official license before downloading it.
352
+
353
+ - Link raw dataset to processed NuScene dataset folder:
354
+ ```bash
355
+ # NUSCENES_DIR: the directory of downloaded nuScenes dataset.
356
+ # PROCESSED_NUSCENES_DIR: the directory of processed nuScenes dataset (output dir).
357
+ ln -s ${NUSCENES_DIR} ${PROCESSED_NUSCENES_DIR}/raw
358
+ ```
359
+ then the processed nuscenes folder is organized as follows:
360
+ ```bash
361
+ nuscene
362
+ |── raw
363
+ │── samples
364
+ │── sweeps
365
+ │── lidarseg
366
+ ...
367
+ │── v1.0-trainval
368
+ │── v1.0-test
369
+ |── info
370
+ ```
371
+
372
+ - Link processed dataset to codebase.
373
+ ```bash
374
+ # PROCESSED_NUSCENES_DIR: the directory of processed nuScenes dataset (output dir).
375
+ mkdir data
376
+ ln -s ${PROCESSED_NUSCENES_DIR} ${CODEBASE_DIR}/data/nuscenes
377
+ ```
378
+
379
+ ### Waymo
380
+ - Download the official [Waymo](https://waymo.com/open/download/) dataset (v1.4.3) and organize the downloaded files as follows:
381
+ ```bash
382
+ WAYMO_RAW_DIR
383
+ │── training
384
+ │── validation
385
+ │── testing
386
+ ```
387
+ - Install the following dependencies:
388
+ ```bash
389
+ # If shows "No matching distribution found", download whl directly from Pypi and install the package.
390
+ conda create -n waymo python=3.10 -y
391
+ conda activate waymo
392
+ pip install waymo-open-dataset-tf-2-12-0
393
+ ```
394
+ - Run the preprocessing code as follows:
395
+ ```bash
396
+ # WAYMO_DIR: the directory of the downloaded Waymo dataset.
397
+ # PROCESSED_WAYMO_DIR: the directory of the processed Waymo dataset (output dir).
398
+ # NUM_WORKERS: num workers for preprocessing
399
+ python pointcept/datasets/preprocessing/waymo/preprocess_waymo.py --dataset_root ${WAYMO_DIR} --output_root ${PROCESSED_WAYMO_DIR} --splits training validation --num_workers ${NUM_WORKERS}
400
+ ```
401
+
402
+ - Link processed dataset to the codebase.
403
+ ```bash
404
+ # PROCESSED_WAYMO_DIR: the directory of the processed Waymo dataset (output dir).
405
+ mkdir data
406
+ ln -s ${PROCESSED_WAYMO_DIR} ${CODEBASE_DIR}/data/waymo
407
+ ```
408
+
409
+ ### ModelNet
410
+ - Download [modelnet40_normal_resampled.zip](https://shapenet.cs.stanford.edu/media/modelnet40_normal_resampled.zip) and unzip it.
411
+ - Link dataset to the codebase.
412
+ ```bash
413
+ mkdir -p data
414
+ ln -s ${MODELNET_DIR} ${CODEBASE_DIR}/data/modelnet40_normal_resampled
415
+ ```
416
+
417
+ ## Quick Start
418
+
419
+ ### Training
420
+ **Train from scratch.** The training process is based on the configs in the `configs` folder.
421
+ The training script will generate an experiment folder in the `exp` folder and back up essential code in the experiment folder.
422
+ Training config, log, tensorboard, and checkpoints will also be saved into the experiment folder during the training process.
423
+ ```bash
424
+ export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}
425
+ # Script (Recommended)
426
+ sh scripts/train.sh -p ${INTERPRETER_PATH} -g ${NUM_GPU} -d ${DATASET_NAME} -c ${CONFIG_NAME} -n ${EXP_NAME}
427
+ # Direct
428
+ export PYTHONPATH=./
429
+ python tools/train.py --config-file ${CONFIG_PATH} --num-gpus ${NUM_GPU} --options save_path=${SAVE_PATH}
430
+ ```
431
+
432
+ For example:
433
+ ```bash
434
+ # By script (Recommended)
435
+ # -p is default set as python and can be ignored
436
+ sh scripts/train.sh -p python -d scannet -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
437
+ # Direct
438
+ export PYTHONPATH=./
439
+ python tools/train.py --config-file configs/scannet/semseg-pt-v2m2-0-base.py --options save_path=exp/scannet/semseg-pt-v2m2-0-base
440
+ ```
441
+ **Resume training from checkpoint.** If the training process is interrupted by accident, the following script can resume training from a given checkpoint.
442
+ ```bash
443
+ export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}
444
+ # Script (Recommended)
445
+ # simply add "-r true"
446
+ sh scripts/train.sh -p ${INTERPRETER_PATH} -g ${NUM_GPU} -d ${DATASET_NAME} -c ${CONFIG_NAME} -n ${EXP_NAME} -r true
447
+ # Direct
448
+ export PYTHONPATH=./
449
+ python tools/train.py --config-file ${CONFIG_PATH} --num-gpus ${NUM_GPU} --options save_path=${SAVE_PATH} resume=True weight=${CHECKPOINT_PATH}
450
+ ```
451
+
452
+ ### Testing
453
+ During training, model evaluation is performed on point clouds after grid sampling (voxelization), providing an initial assessment of model performance. However, to obtain precise evaluation results, testing is **essential**. The testing process involves subsampling a dense point cloud into a sequence of voxelized point clouds, ensuring comprehensive coverage of all points. These sub-results are then predicted and collected to form a complete prediction of the entire point cloud. This approach yields higher evaluation results compared to simply mapping/interpolating the prediction. In addition, our testing code supports TTA (test time augmentation) testing, which further enhances the stability of evaluation performance.
454
+
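+ A rough sketch of this gather-and-vote idea (illustrative only; the names below are hypothetical and the real logic lives in the tester classes):
+
+ ```python
+ import torch
+
+ def fragment_inference(model, fragments, num_points, num_classes):
+     # Every original point appears in at least one voxelized fragment;
+     # frag["index"] maps fragment points back to dense point indices.
+     logits = torch.zeros(num_points, num_classes)
+     for frag in fragments:
+         pred = model(frag)                      # (n_frag, num_classes) logits
+         logits[frag["index"]] += pred.softmax(-1).cpu()
+     return logits.argmax(-1)                    # one label per original point
+ ```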
455
+ ```bash
456
+ # By script (Based on experiment folder created by training script)
457
+ sh scripts/test.sh -p ${INTERPRETER_PATH} -g ${NUM_GPU} -d ${DATASET_NAME} -n ${EXP_NAME} -w ${CHECKPOINT_NAME}
458
+ # Direct
459
+ export PYTHONPATH=./
460
+ python tools/test.py --config-file ${CONFIG_PATH} --num-gpus ${NUM_GPU} --options save_path=${SAVE_PATH} weight=${CHECKPOINT_PATH}
461
+ ```
462
+ For example:
463
+ ```bash
464
+ # By script (Based on experiment folder created by training script)
465
+ # -p is default set as python and can be ignored
466
+ # -w is default set as model_best and can be ignored
467
+ sh scripts/test.sh -p python -d scannet -n semseg-pt-v2m2-0-base -w model_best
468
+ # Direct
469
+ export PYTHONPATH=./
470
+ python tools/test.py --config-file configs/scannet/semseg-pt-v2m2-0-base.py --options save_path=exp/scannet/semseg-pt-v2m2-0-base weight=exp/scannet/semseg-pt-v2m2-0-base/model/model_best.pth
471
+ ```
472
+
473
+ TTA can be disabled by replacing `data.test.test_cfg.aug_transform = [...]` with:
474
+
475
+ ```python
476
+ data = dict(
477
+ train = dict(...),
478
+ val = dict(...),
479
+ test = dict(
480
+ ...,
481
+ test_cfg = dict(
482
+ ...,
483
+ aug_transform = [
484
+ [dict(type="RandomRotateTargetAngle", angle=[0], axis="z", center=[0, 0, 0], p=1)]
485
+ ]
486
+ )
487
+ )
488
+ )
489
+ ```
490
+
491
+ ### Offset
492
+ `Offset` is the separator of point clouds in batch data, and it is similar to the concept of `Batch` in PyG.
493
+ A visual illustration of batch and offset is as follows:
494
+ <p align="center">
495
+ <!-- pypi-strip -->
496
+ <picture>
497
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/Pointcept/Pointcept/main/docs/offset_dark.png">
498
+ <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/Pointcept/Pointcept/main/docs/offset.png">
499
+ <!-- /pypi-strip -->
500
+ <img alt="pointcept" src="https://raw.githubusercontent.com/Pointcept/Pointcept/main/docs/offset.png" width="480">
501
+ <!-- pypi-strip -->
502
+ </picture><br>
503
+ <!-- /pypi-strip -->
504
+ </p>
505
+
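+ As a rough sketch (plain PyTorch, not tied to a specific Pointcept utility), the two formats convert into each other as follows:
+
+ ```python
+ import torch
+
+ # batch: per-point sample index, e.g. 3 point clouds with 2, 3, and 1 points
+ batch = torch.tensor([0, 0, 1, 1, 1, 2])
+
+ # offset: cumulative end index of each cloud in the concatenated point list
+ offset = torch.cumsum(torch.bincount(batch), dim=0)        # tensor([2, 5, 6])
+
+ # back again: repeat each sample index by the size of its cloud
+ counts = torch.diff(offset, prepend=torch.zeros(1, dtype=offset.dtype))
+ assert torch.equal(batch, torch.repeat_interleave(torch.arange(len(offset)), counts))
+ ```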
506
+ ## Model Zoo
507
+ ### 1. Backbones and Semantic Segmentation
508
+ #### SparseUNet
509
+
510
+ _Pointcept_ provides `SparseUNet` implemented by `SpConv` and `MinkowskiEngine`. The SpConv version is recommended since SpConv is easy to install and faster than MinkowskiEngine. Meanwhile, SpConv is also widely applied in outdoor perception.
511
+
512
+ - **SpConv (recommended)**
513
+
514
+ The SpConv version of `SparseUNet` in the codebase was fully rewritten from the `MinkowskiEngine` version; example running scripts are as follows:
515
+
516
+ ```bash
517
+ # ScanNet val
518
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-0-base -n semseg-spunet-v1m1-0-base
519
+ # ScanNet200
520
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-spunet-v1m1-0-base -n semseg-spunet-v1m1-0-base
521
+ # S3DIS
522
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-spunet-v1m1-0-base -n semseg-spunet-v1m1-0-base
523
+ # S3DIS (with normal)
524
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-spunet-v1m1-0-cn-base -n semseg-spunet-v1m1-0-cn-base
525
+ # SemanticKITTI
526
+ sh scripts/train.sh -g 4 -d semantic_kitti -c semseg-spunet-v1m1-0-base -n semseg-spunet-v1m1-0-base
527
+ # nuScenes
528
+ sh scripts/train.sh -g 4 -d nuscenes -c semseg-spunet-v1m1-0-base -n semseg-spunet-v1m1-0-base
529
+ # ModelNet40
530
+ sh scripts/train.sh -g 2 -d modelnet40 -c cls-spunet-v1m1-0-base -n cls-spunet-v1m1-0-base
531
+
532
+ # ScanNet Data Efficient
533
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-la20 -n semseg-spunet-v1m1-2-efficient-la20
534
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-la50 -n semseg-spunet-v1m1-2-efficient-la50
535
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-la100 -n semseg-spunet-v1m1-2-efficient-la100
536
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-la200 -n semseg-spunet-v1m1-2-efficient-la200
537
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-lr1 -n semseg-spunet-v1m1-2-efficient-lr1
538
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-lr5 -n semseg-spunet-v1m1-2-efficient-lr5
539
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-lr10 -n semseg-spunet-v1m1-2-efficient-lr10
540
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-lr20 -n semseg-spunet-v1m1-2-efficient-lr20
541
+
542
+ # Profile model run time
543
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-0-enable-profiler -n semseg-spunet-v1m1-0-enable-profiler
544
+ ```
545
+
546
+ - **MinkowskiEngine**
547
+
548
+ The MinkowskiEngine version of `SparseUNet` in the codebase was modified from the original MinkowskiEngine repo; example running scripts are as follows:
549
+ 1. Install MinkowskiEngine, refer https://github.com/NVIDIA/MinkowskiEngine
550
+ 2. Training with the following example scripts:
551
+ ```bash
552
+ # Uncomment "# from .sparse_unet import *" in "pointcept/models/__init__.py"
553
+ # Uncomment "# from .mink_unet import *" in "pointcept/models/sparse_unet/__init__.py"
554
+ # ScanNet
555
+ sh scripts/train.sh -g 4 -d scannet -c semseg-minkunet34c-0-base -n semseg-minkunet34c-0-base
556
+ # ScanNet200
557
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-minkunet34c-0-base -n semseg-minkunet34c-0-base
558
+ # S3DIS
559
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-minkunet34c-0-base -n semseg-minkunet34c-0-base
560
+ # SemanticKITTI
561
+ sh scripts/train.sh -g 2 -d semantic_kitti -c semseg-minkunet34c-0-base -n semseg-minkunet34c-0-base
562
+ ```
563
+
564
+ #### OA-CNNs
565
+ Introducing Omni-Adaptive 3D CNNs (**OA-CNNs**), a family of networks that integrates a lightweight module to greatly enhance the adaptivity of sparse CNNs at minimal computational cost. Without any self-attention modules, **OA-CNNs** favorably surpass point transformers in terms of accuracy in both indoor and outdoor scenes, with much less latency and memory cost. Issues related to **OA-CNNs** can be directed to @Pbihao.
566
+ ```bash
567
+ # ScanNet
568
+ sh scripts/train.sh -g 4 -d scannet -c semseg-oacnns-v1m1-0-base -n semseg-oacnns-v1m1-0-base
569
+ ```
570
+
571
+ #### Point Transformers
572
+ - **PTv3**
573
+
574
+ [PTv3](https://arxiv.org/abs/2312.10035) is an efficient backbone model that achieves SOTA performance across indoor and outdoor scenarios. The full PTv3 relies on FlashAttention, which in turn requires CUDA 11.6 or above, so make sure your local Pointcept environment satisfies this requirement.
575
+
576
+ If you cannot upgrade your local environment to satisfy the requirement (CUDA >= 11.6), you can disable FlashAttention by setting the model parameter `enable_flash` to `false` and reducing `enc_patch_size` and `dec_patch_size` to a smaller value (e.g. 128).
577
+
578
+ FlashAttention forcibly disables RPE and reduces the attention precision to fp16. If you require these features, please disable `enable_flash` and adjust `enable_rpe`, `upcast_attention`, and `upcast_softmax`.
579
+
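+ A hedged sketch of such an override (the parameter names are the PTv3 model parameters mentioned above; the patch-size tuple lengths must follow the encoder/decoder stage counts of the base config you inherit from):
+
+ ```python
+ # Derived-config sketch: assumes the base config already defines the full PTv3 model dict.
+ model = dict(
+     backbone=dict(
+         enable_flash=False,      # drop the FlashAttention / CUDA >= 11.6 requirement
+         enable_rpe=True,         # RPE only takes effect when FlashAttention is disabled
+         upcast_attention=True,
+         upcast_softmax=True,
+         enc_patch_size=(128, 128, 128, 128, 128),
+         dec_patch_size=(128, 128, 128, 128),
+     ),
+ )
+ ```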
580
+ Detailed instructions and experiment records (containing weights) are available on the [project repository](https://github.com/Pointcept/PointTransformerV3). Example running scripts are as follows:
581
+ ```bash
582
+ # Scratched ScanNet
583
+ sh scripts/train.sh -g 4 -d scannet -c semseg-pt-v3m1-0-base -n semseg-pt-v3m1-0-base
584
+ # PPT joint training (ScanNet + Structured3D) and evaluate in ScanNet
585
+ sh scripts/train.sh -g 8 -d scannet -c semseg-pt-v3m1-1-ppt-extreme -n semseg-pt-v3m1-1-ppt-extreme
586
+
587
+ # Scratched ScanNet200
588
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-pt-v3m1-0-base -n semseg-pt-v3m1-0-base
589
+ # Fine-tuning from PPT joint training (ScanNet + Structured3D) with ScanNet200
590
+ # PTV3_PPT_WEIGHT_PATH: Path to model weight trained by PPT multi-dataset joint training
591
+ # e.g. exp/scannet/semseg-pt-v3m1-1-ppt-extreme/model/model_best.pth
592
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-pt-v3m1-1-ppt-ft -n semseg-pt-v3m1-1-ppt-ft -w ${PTV3_PPT_WEIGHT_PATH}
593
+
594
+ # Scratched ScanNet++
595
+ sh scripts/train.sh -g 4 -d scannetpp -c semseg-pt-v3m1-0-base -n semseg-pt-v3m1-0-base
596
+ # Scratched ScanNet++ test
597
+ sh scripts/train.sh -g 4 -d scannetpp -c semseg-pt-v3m1-1-submit -n semseg-pt-v3m1-1-submit
598
+
599
+
600
+ # Scratched S3DIS
601
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-pt-v3m1-0-base -n semseg-pt-v3m1-0-base
602
+ # an example for disbale flash_attention and enable rpe.
603
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-pt-v3m1-1-rpe -n semseg-pt-v3m1-0-rpe
604
+ # PPT joint training (ScanNet + S3DIS + Structured3D) and evaluate in ScanNet
605
+ sh scripts/train.sh -g 8 -d s3dis -c semseg-pt-v3m1-1-ppt-extreme -n semseg-pt-v3m1-1-ppt-extreme
606
+ # S3DIS 6-fold cross validation
607
+ # 1. The default configs are evaluated on Area_5, modify the "data.train.split", "data.val.split", and "data.test.split" to make the config evaluated on Area_1 ~ Area_6 respectively.
608
+ # 2. Train and evaluate the model on each split of areas and gather result files located in "exp/s3dis/EXP_NAME/result/Area_x.pth" in one single folder, noted as RECORD_FOLDER.
609
+ # 3. Run the following script to get S3DIS 6-fold cross validation performance:
610
+ export PYTHONPATH=./
611
+ python tools/test_s3dis_6fold.py --record_root ${RECORD_FOLDER}
612
+
613
+ # Scratched nuScenes
614
+ sh scripts/train.sh -g 4 -d nuscenes -c semseg-pt-v3m1-0-base -n semseg-pt-v3m1-0-base
615
+ # Scratched Waymo
616
+ sh scripts/train.sh -g 4 -d waymo -c semseg-pt-v3m1-0-base -n semseg-pt-v3m1-0-base
617
+
618
+ # More configs and exp records for PTv3 will be available soon.
619
+ ```
620
+
621
+ Indoor semantic segmentation
622
+ | Model | Benchmark | Additional Data | Num GPUs | Val mIoU | Config | Tensorboard | Exp Record |
623
+ | :---: | :---: |:---------------:| :---: | :---: | :---: | :---: | :---: |
624
+ | PTv3 | ScanNet | &cross; | 4 | 77.6% | [link](https://github.com/Pointcept/Pointcept/blob/main/configs/scannet/semseg-pt-v3m1-0-base.py) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tensorboard) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tree/main/scannet-semseg-pt-v3m1-0-base) |
625
+ | PTv3 + PPT | ScanNet | &check; | 8 | 78.5% | [link](https://github.com/Pointcept/Pointcept/blob/main/configs/scannet/semseg-pt-v3m1-1-ppt-extreme.py) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tensorboard) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tree/main/scannet-semseg-pt-v3m1-1-ppt-extreme) |
626
+ | PTv3 | ScanNet200 | &cross; | 4 | 35.3% | [link](https://github.com/Pointcept/Pointcept/blob/main/configs/scannet200/semseg-pt-v3m1-0-base.py) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tensorboard) |[link](https://huggingface.co/Pointcept/PointTransformerV3/tree/main/scannet200-semseg-pt-v3m1-0-base)|
627
+ | PTv3 + PPT | ScanNet200 | &check; (f.t.) | 4 | | | | |
628
+ | PTv3 | S3DIS (Area5) | &cross; | 4 | 73.6% | [link](https://github.com/Pointcept/Pointcept/blob/main/configs/s3dis/semseg-pt-v3m1-0-rpe.py) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tensorboard) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tree/main/s3dis-semseg-pt-v3m1-0-rpe) |
629
+ | PTv3 + PPT | S3DIS (Area5) | &check; | 8 | 75.4% | [link](https://github.com/Pointcept/Pointcept/blob/main/configs/s3dis/semseg-pt-v3m1-1-ppt-extreme.py) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tensorboard) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tree/main/s3dis-semseg-pt-v3m1-1-ppt-extreme) |
630
+
631
+ Outdoor semantic segmentation
632
+ | Model | Benchmark | Additional Data | Num GPUs | Val mIoU | Config | Tensorboard | Exp Record |
633
+ | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
634
+ | PTv3 | nuScenes | &cross; | 4 | 80.3 | [link](https://github.com/Pointcept/Pointcept/blob/main/configs/nuscenes/semseg-pt-v3m1-0-base.py) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tensorboard)|[link](https://huggingface.co/Pointcept/PointTransformerV3/tree/main/nuscenes-semseg-pt-v3m1-0-base) |
635
+ | PTv3 + PPT | nuScenes | &check; | 8 | | | | |
636
+ | PTv3 | SemanticKITTI | &cross; | 4 | | | | |
637
+ | PTv3 + PPT | SemanticKITTI | &check; | 8 | | | | |
638
+ | PTv3 | Waymo | &cross; | 4 | 71.2 | [link](https://github.com/Pointcept/Pointcept/blob/main/configs/waymo/semseg-pt-v3m1-0-base.py) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tensorboard) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tree/main/waymo-semseg-pt-v3m1-0-base) (log only) |
639
+ | PTv3 + PPT | Waymo | &check; | 8 | | | | |
640
+
641
+ _**\*Released model weights are trained for v1.5.1; weights for v1.5.2 and later are still in progress.**_
642
+
643
+ - **PTv2 mode2**
644
+
645
+ The original PTv2 was trained on 4 * RTX A6000 (48G memory). Even with AMP enabled, the memory cost of the original PTv2 is slightly larger than 24G. Considering GPUs with 24G memory are much more accessible, I tuned PTv2 on the latest Pointcept and made it runnable on 4 * RTX 3090 machines.
646
+
647
+ `PTv2 Mode2` enables AMP and disables _Position Encoding Multiplier_ & _Grouped Linear_. During our further research, we found that precise coordinates are not necessary for point cloud understanding (replacing precise coordinates with grid coordinates does not affect performance; SparseUNet is another example). As for Grouped Linear, my implementation of Grouped Linear seems to cost more memory than the Linear layer provided by PyTorch. Benefiting from the codebase and better parameter tuning, we also alleviated the overfitting problem. The reproduced performance is even better than the results reported in our paper.
648
+
649
+ Example running scripts are as follows:
650
+
651
+ ```bash
652
+ # ptv2m2: PTv2 mode2, disable PEM & Grouped Linear, GPU memory cost < 24G (recommend)
653
+ # ScanNet
654
+ sh scripts/train.sh -g 4 -d scannet -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
655
+ sh scripts/train.sh -g 4 -d scannet -c semseg-pt-v2m2-3-lovasz -n semseg-pt-v2m2-3-lovasz
656
+
657
+ # ScanNet test
658
+ sh scripts/train.sh -g 4 -d scannet -c semseg-pt-v2m2-1-submit -n semseg-pt-v2m2-1-submit
659
+ # ScanNet200
660
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
661
+ # ScanNet++
662
+ sh scripts/train.sh -g 4 -d scannetpp -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
663
+ # ScanNet++ test
664
+ sh scripts/train.sh -g 4 -d scannetpp -c semseg-pt-v2m2-1-submit -n semseg-pt-v2m2-1-submit
665
+ # S3DIS
666
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
667
+ # SemanticKITTI
668
+ sh scripts/train.sh -g 4 -d semantic_kitti -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
669
+ # nuScenes
670
+ sh scripts/train.sh -g 4 -d nuscenes -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
671
+ ```
672
+
673
+ - **PTv2 mode1**
674
+
675
+ `PTv2 mode1` is the original PTv2 we reported in our paper, example running scripts are as follows:
676
+
677
+ ```bash
678
+ # ptv2m1: PTv2 mode1, Original PTv2, GPU memory cost > 24G
679
+ # ScanNet
680
+ sh scripts/train.sh -g 4 -d scannet -c semseg-pt-v2m1-0-base -n semseg-pt-v2m1-0-base
681
+ # ScanNet200
682
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-pt-v2m1-0-base -n semseg-pt-v2m1-0-base
683
+ # S3DIS
684
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-pt-v2m1-0-base -n semseg-pt-v2m1-0-base
685
+ ```
686
+
687
+ - **PTv1**
688
+
689
+ The original PTv1 is also available in our Pointcept codebase. I haven't run PTv1 for a long time, but I have ensured that the example running script works well.
690
+
691
+ ```bash
692
+ # ScanNet
693
+ sh scripts/train.sh -g 4 -d scannet -c semseg-pt-v1-0-base -n semseg-pt-v1-0-base
694
+ # ScanNet200
695
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-pt-v1-0-base -n semseg-pt-v1-0-base
696
+ # S3DIS
697
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-pt-v1-0-base -n semseg-pt-v1-0-base
698
+ ```
699
+
700
+
701
+ #### Stratified Transformer
702
+ 1. Additional requirements:
703
+ ```bash
704
+ pip install torch-points3d
705
+ # Fix dependency issue caused by installing torch-points3d
706
+ pip uninstall SharedArray
707
+ pip install SharedArray==3.2.1
708
+
709
+ cd libs/pointops2
710
+ python setup.py install
711
+ cd ../..
712
+ ```
713
+ 2. Uncomment `# from .stratified_transformer import *` in `pointcept/models/__init__.py`.
714
+ 3. Refer to [Optional Installation](installation) to install dependencies.
715
+ 4. Training with the following example scripts:
716
+ ```bash
717
+ # stv1m1: Stratified Transformer mode1, modified from the original Stratified Transformer code.
718
+ # stv1m2: Stratified Transformer mode2, my rewritten version (recommended).
719
+
720
+ # ScanNet
721
+ sh scripts/train.sh -g 4 -d scannet -c semseg-st-v1m2-0-refined -n semseg-st-v1m2-0-refined
722
+ sh scripts/train.sh -g 4 -d scannet -c semseg-st-v1m1-0-origin -n semseg-st-v1m1-0-origin
723
+ # ScanNet200
724
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-st-v1m2-0-refined -n semseg-st-v1m2-0-refined
725
+ # S3DIS
726
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-st-v1m2-0-refined -n semseg-st-v1m2-0-refined
727
+ ```
728
+
729
+ #### SPVCNN
730
+ `SPVCNN` is the baseline model of [SPVNAS](https://github.com/mit-han-lab/spvnas) and a practical baseline for outdoor datasets.
731
+ 1. Install torchsparse:
732
+ ```bash
733
+ # refer https://github.com/mit-han-lab/torchsparse
734
+ # install method without sudo apt install
735
+ conda install google-sparsehash -c bioconda
736
+ export C_INCLUDE_PATH=${CONDA_PREFIX}/include:$C_INCLUDE_PATH
737
+ export CPLUS_INCLUDE_PATH=${CONDA_PREFIX}/include:$CPLUS_INCLUDE_PATH
738
+ pip install --upgrade git+https://github.com/mit-han-lab/torchsparse.git
739
+ ```
740
+ 2. Training with the following example scripts:
741
+ ```bash
742
+ # SemanticKITTI
743
+ sh scripts/train.sh -g 2 -d semantic_kitti -c semseg-spvcnn-v1m1-0-base -n semseg-spvcnn-v1m1-0-base
744
+ ```
745
+
746
+ #### OctFormer
747
+ OctFormer from _OctFormer: Octree-based Transformers for 3D Point Clouds_.
748
+ 1. Additional requirements:
749
+ ```bash
750
+ cd libs
751
+ git clone https://github.com/octree-nn/dwconv.git
752
+ pip install ./dwconv
753
+ pip install ocnn
754
+ ```
755
+ 2. Uncomment `# from .octformer import *` in `pointcept/models/__init__.py`.
756
+ 3. Training with the following example scripts:
757
+ ```bash
758
+ # ScanNet
759
+ sh scripts/train.sh -g 4 -d scannet -c semseg-octformer-v1m1-0-base -n semseg-octformer-v1m1-0-base
760
+ ```
761
+
762
+ #### Swin3D
763
+ Swin3D from _Swin3D: A Pretrained Transformer Backbone for 3D Indoor Scene Understanding_.
764
+ 1. Additional requirements:
765
+ ```bash
766
+ # 1. Install MinkEngine v0.5.4, follow readme in https://github.com/NVIDIA/MinkowskiEngine;
767
+ # 2. Install Swin3D, mainly for cuda operation:
768
+ cd libs
769
+ git clone https://github.com/microsoft/Swin3D.git
770
+ cd Swin3D
771
+ pip install ./
772
+ ```
773
+ 2. Uncomment `# from .swin3d import *` in `pointcept/models/__init__.py`.
774
+ 3. Pre-training with the following example scripts (for Structured3D preprocessing, refer [here](#structured3d)):
775
+ ```bash
776
+ # Structured3D + Swin-S
777
+ sh scripts/train.sh -g 4 -d structured3d -c semseg-swin3d-v1m1-0-small -n semseg-swin3d-v1m1-0-small
778
+ # Structured3D + Swin-L
779
+ sh scripts/train.sh -g 4 -d structured3d -c semseg-swin3d-v1m1-1-large -n semseg-swin3d-v1m1-1-large
780
+
781
+ # Addition
782
+ # Structured3D + SpUNet
783
+ sh scripts/train.sh -g 4 -d structured3d -c semseg-spunet-v1m1-0-base -n semseg-spunet-v1m1-0-base
784
+ # Structured3D + PTv2
785
+ sh scripts/train.sh -g 4 -d structured3d -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
786
+ ```
787
+ 4. Fine-tuning with the following example scripts:
788
+ ```bash
789
+ # ScanNet + Swin-S
790
+ sh scripts/train.sh -g 4 -d scannet -w exp/structured3d/semseg-swin3d-v1m1-1-large/model/model_last.pth -c semseg-swin3d-v1m1-0-small -n semseg-swin3d-v1m1-0-small
791
+ # ScanNet + Swin-L
792
+ sh scripts/train.sh -g 4 -d scannet -w exp/structured3d/semseg-swin3d-v1m1-1-large/model/model_last.pth -c semseg-swin3d-v1m1-1-large -n semseg-swin3d-v1m1-1-large
793
+
794
+ # S3DIS + Swin-S (here we provide a config supporting the S3DIS normal vector)
795
+ sh scripts/train.sh -g 4 -d s3dis -w exp/structured3d/semseg-swin3d-v1m1-1-large/model/model_last.pth -c semseg-swin3d-v1m1-0-small -n semseg-swin3d-v1m1-0-small
796
+ # S3DIS + Swin-L (here we provide a config supporting the S3DIS normal vector)
797
+ sh scripts/train.sh -g 4 -d s3dis -w exp/structured3d/semseg-swin3d-v1m1-1-large/model/model_last.pth -c semseg-swin3d-v1m1-1-large -n semseg-swin3d-v1m1-1-large
798
+ ```
799
+
800
+ #### Context-Aware Classifier
801
+ `Context-Aware Classifier` is a segmentor that can further boost the performance of each backbone, serving as a replacement for the `Default Segmentor`; a conceptual sketch of the idea follows the scripts below. Train with the following example scripts:
802
+ ```bash
803
+ # ScanNet
804
+ sh scripts/train.sh -g 4 -d scannet -c semseg-cac-v1m1-0-spunet-base -n semseg-cac-v1m1-0-spunet-base
805
+ sh scripts/train.sh -g 4 -d scannet -c semseg-cac-v1m1-1-spunet-lovasz -n semseg-cac-v1m1-1-spunet-lovasz
806
+ sh scripts/train.sh -g 4 -d scannet -c semseg-cac-v1m1-2-ptv2-lovasz -n semseg-cac-v1m1-2-ptv2-lovasz
807
+
808
+ # ScanNet200
809
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-cac-v1m1-0-spunet-base -n semseg-cac-v1m1-0-spunet-base
810
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-cac-v1m1-1-spunet-lovasz -n semseg-cac-v1m1-1-spunet-lovasz
811
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-cac-v1m1-2-ptv2-lovasz -n semseg-cac-v1m1-2-ptv2-lovasz
812
+ ```
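+
+ For intuition, the following is a rough, framework-agnostic sketch of the general context-aware classifier idea (refining a static classifier with class centers pooled from the current sample); it is only an illustration, not the Pointcept `CAC` module.
+
+ ```python
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class ContextAwareHead(nn.Module):
+     """Sketch: fuse static classifier weights with per-sample class centers."""
+
+     def __init__(self, channels: int, num_classes: int):
+         super().__init__()
+         self.static_cls = nn.Linear(channels, num_classes)
+         self.fuse = nn.Linear(2 * channels, channels)
+
+     def forward(self, feat: torch.Tensor) -> torch.Tensor:
+         coarse = self.static_cls(feat)            # (N, K) coarse logits
+         weight = F.softmax(coarse, dim=0)         # soft assignment of points to classes
+         centers = weight.t() @ feat               # (K, C) contextual class centers
+         dynamic = self.fuse(torch.cat([centers, self.static_cls.weight], dim=-1))
+         return feat @ dynamic.t()                 # (N, K) context-aware logits
+
+ head = ContextAwareHead(channels=96, num_classes=20)
+ logits = head(torch.randn(8192, 96))              # toy per-point features
+ ```
+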
813
+
814
+
815
+ ### 2. Instance Segmentation
816
+ #### PointGroup
817
+ [PointGroup](https://github.com/dvlab-research/PointGroup) is a baseline framework for point cloud instance segmentation.
818
+ 1. Additional requirements:
819
+ ```bash
820
+ conda install -c bioconda google-sparsehash
821
+ cd libs/pointgroup_ops
822
+ python setup.py install --include_dirs=${CONDA_PREFIX}/include
823
+ cd ../..
824
+ ```
825
+ 2. Uncomment `# from .point_group import *` in `pointcept/models/__init__.py`.
826
+ 3. Training with the following example scripts:
827
+ ```bash
828
+ # ScanNet
829
+ sh scripts/train.sh -g 4 -d scannet -c insseg-pointgroup-v1m1-0-spunet-base -n insseg-pointgroup-v1m1-0-spunet-base
830
+ # S3DIS
831
+ sh scripts/train.sh -g 4 -d s3dis -c insseg-pointgroup-v1m1-0-spunet-base -n insseg-pointgroup-v1m1-0-spunet-base
832
+ ```
833
+
834
+ ### 3. Pre-training
835
+ #### Masked Scene Contrast (MSC)
836
+ 1. Pre-training with the following example scripts:
837
+ ```bash
838
+ # ScanNet
839
+ sh scripts/train.sh -g 8 -d scannet -c pretrain-msc-v1m1-0-spunet-base -n pretrain-msc-v1m1-0-spunet-base
840
+ ```
841
+
842
+ 2. Fine-tuning with the following example scripts:
843
+ Enable PointGroup ([here](#pointgroup)) before fine-tuning on the instance segmentation task.
844
+ ```bash
845
+ # ScanNet20 Semantic Segmentation
846
+ sh scripts/train.sh -g 8 -d scannet -w exp/scannet/pretrain-msc-v1m1-0-spunet-base/model/model_last.pth -c semseg-spunet-v1m1-4-ft -n semseg-msc-v1m1-0f-spunet-base
847
+ # ScanNet20 Instance Segmentation (enable PointGroup before running the script)
848
+ sh scripts/train.sh -g 4 -d scannet -w exp/scannet/pretrain-msc-v1m1-0-spunet-base/model/model_last.pth -c insseg-pointgroup-v1m1-0-spunet-base -n insseg-msc-v1m1-0f-pointgroup-spunet-base
849
+ ```
850
+ 3. Example log and weight: [[Pretrain](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wuxy_connect_hku_hk/EYvNV4XUJ_5Mlk-g15RelN4BW_P8lVBfC_zhjC_BlBDARg?e=UoGFWH)] [[Semseg](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wuxy_connect_hku_hk/EQkDiv5xkOFKgCpGiGtAlLwBon7i8W6my3TIbGVxuiTttQ?e=tQFnbr)]
851
+
852
+ #### Point Prompt Training (PPT)
853
+ PPT presents a multi-dataset pre-training framework that is compatible with various existing pre-training frameworks and backbones.
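+
+ For intuition, the snippet below is a rough sketch of the decoupled-normalization idea behind the `conditions=(...)` / `norm_decouple=True` options that appear in the PPT configs; it is only an illustration, not the actual `PPT-v1m1` implementation.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class ConditionalBatchNorm(nn.Module):
+     """Decoupled norm: one set of statistics/affine params per source dataset."""
+
+     def __init__(self, num_features, conditions=("SemanticKITTI", "nuScenes", "Waymo")):
+         super().__init__()
+         self.norms = nn.ModuleDict({c: nn.BatchNorm1d(num_features) for c in conditions})
+
+     def forward(self, feat: torch.Tensor, condition: str) -> torch.Tensor:
+         # Route the batch through the norm layer matching its source dataset,
+         # so the statistics of different datasets do not interfere with each other.
+         return self.norms[condition](feat)
+
+ norm = ConditionalBatchNorm(96)
+ feat = torch.randn(4096, 96)                 # per-point features of one batch
+ out = norm(feat, condition="nuScenes")       # condition comes from the data dict
+ ```
+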
854
+ 1. PPT supervised joint training with the following example scripts:
855
+ ```bash
856
+ # ScanNet + Structured3D, validate on ScanNet (S3DIS might cause long data loading time; leave it out for a quick validation), >= 3090 * 8
857
+ sh scripts/train.sh -g 8 -d scannet -c semseg-ppt-v1m1-0-sc-st-spunet -n semseg-ppt-v1m1-0-sc-st-spunet
858
+ sh scripts/train.sh -g 8 -d scannet -c semseg-ppt-v1m1-1-sc-st-spunet-submit -n semseg-ppt-v1m1-1-sc-st-spunet-submit
859
+ # ScanNet + S3DIS + Structured3D, validate on S3DIS (>= A100 * 8)
860
+ sh scripts/train.sh -g 8 -d s3dis -c semseg-ppt-v1m1-0-s3-sc-st-spunet -n semseg-ppt-v1m1-0-s3-sc-st-spunet
861
+ # SemanticKITTI + nuScenes + Waymo, validate on SemanticKITTI (bs12 >= 3090 * 4; bs24 >= 3090 * 8, v1m1-0 is still being tuned)
862
+ sh scripts/train.sh -g 4 -d semantic_kitti -c semseg-ppt-v1m1-0-nu-sk-wa-spunet -n semseg-ppt-v1m1-0-nu-sk-wa-spunet
863
+ sh scripts/train.sh -g 4 -d semantic_kitti -c semseg-ppt-v1m2-0-sk-nu-wa-spunet -n semseg-ppt-v1m2-0-sk-nu-wa-spunet
864
+ sh scripts/train.sh -g 4 -d semantic_kitti -c semseg-ppt-v1m2-1-sk-nu-wa-spunet-submit -n semseg-ppt-v1m2-1-sk-nu-wa-spunet-submit
865
+ # SemanticKITTI + nuScenes + Waymo, validate on nuScenes (bs12 >= 3090 * 4; bs24 >= 3090 * 8, v1m1-0 is still being tuned)
866
+ sh scripts/train.sh -g 4 -d nuscenes -c semseg-ppt-v1m1-0-nu-sk-wa-spunet -n semseg-ppt-v1m1-0-nu-sk-wa-spunet
867
+ sh scripts/train.sh -g 4 -d nuscenes -c semseg-ppt-v1m2-0-nu-sk-wa-spunet -n semseg-ppt-v1m2-0-nu-sk-wa-spunet
868
+ sh scripts/train.sh -g 4 -d nuscenes -c semseg-ppt-v1m2-1-nu-sk-wa-spunet-submit -n semseg-ppt-v1m2-1-nu-sk-wa-spunet-submit
869
+ ```
870
+
871
+ #### PointContrast
872
+ 1. Preprocess and link the ScanNet-Pair dataset (pair-wise matching over raw ScanNet RGB-D frames, ~1.5T):
873
+ ```bash
874
+ # RAW_SCANNET_DIR: the directory of downloaded ScanNet v2 raw dataset.
875
+ # PROCESSED_SCANNET_PAIR_DIR: the directory of processed ScanNet pair dataset (output dir).
876
+ python pointcept/datasets/preprocessing/scannet/scannet_pair/preprocess.py --dataset_root ${RAW_SCANNET_DIR} --output_root ${PROCESSED_SCANNET_PAIR_DIR}
877
+ ln -s ${PROCESSED_SCANNET_PAIR_DIR} ${CODEBASE_DIR}/data/scannet
878
+ ```
879
+ 2. Pre-training with the following example scripts:
880
+ ```bash
881
+ # ScanNet
882
+ sh scripts/train.sh -g 8 -d scannet -c pretrain-msc-v1m1-1-spunet-pointcontrast -n pretrain-msc-v1m1-1-spunet-pointcontrast
883
+ ```
884
+ 3. For fine-tuning, refer to [MSC](#masked-scene-contrast-msc).
885
+
886
+ #### Contrastive Scene Contexts
887
+ 1. Preprocess and link the ScanNet-Pair dataset (refer to [PointContrast](#pointcontrast)).
888
+ 2. Pre-training with the following example scripts:
889
+ ```bash
890
+ # ScanNet
891
+ sh scripts/train.sh -g 8 -d scannet -c pretrain-msc-v1m2-0-spunet-csc -n pretrain-msc-v1m2-0-spunet-csc
892
+ ```
893
+ 3. For fine-tuning, refer to [MSC](#masked-scene-contrast-msc).
894
+
895
+ ## Acknowledgement
896
+ _Pointcept_ is designed by [Xiaoyang](https://xywu.me/), named by [Yixing](https://github.com/yxlao), and the logo is created by [Yuechen](https://julianjuaner.github.io/). It is derived from [Hengshuang](https://hszhao.github.io/)'s [Semseg](https://github.com/hszhao/semseg) and inspired by several repos, e.g., [MinkowskiEngine](https://github.com/NVIDIA/MinkowskiEngine), [pointnet2](https://github.com/charlesq34/pointnet2), [mmcv](https://github.com/open-mmlab/mmcv/tree/master/mmcv), and [Detectron2](https://github.com/facebookresearch/detectron2).
submodules/PointTransformerV3/Pointcept/configs/_base_/dataset/scannetpp.py ADDED
@@ -0,0 +1,104 @@
1
+ data = dict(
2
+ names=[
3
+ "wall",
4
+ "ceiling",
5
+ "floor",
6
+ "table",
7
+ "door",
8
+ "ceiling lamp",
9
+ "cabinet",
10
+ "blinds",
11
+ "curtain",
12
+ "chair",
13
+ "storage cabinet",
14
+ "office chair",
15
+ "bookshelf",
16
+ "whiteboard",
17
+ "window",
18
+ "box",
19
+ "window frame",
20
+ "monitor",
21
+ "shelf",
22
+ "doorframe",
23
+ "pipe",
24
+ "heater",
25
+ "kitchen cabinet",
26
+ "sofa",
27
+ "windowsill",
28
+ "bed",
29
+ "shower wall",
30
+ "trash can",
31
+ "book",
32
+ "plant",
33
+ "blanket",
34
+ "tv",
35
+ "computer tower",
36
+ "kitchen counter",
37
+ "refrigerator",
38
+ "jacket",
39
+ "electrical duct",
40
+ "sink",
41
+ "bag",
42
+ "picture",
43
+ "pillow",
44
+ "towel",
45
+ "suitcase",
46
+ "backpack",
47
+ "crate",
48
+ "keyboard",
49
+ "rack",
50
+ "toilet",
51
+ "paper",
52
+ "printer",
53
+ "poster",
54
+ "painting",
55
+ "microwave",
56
+ "board",
57
+ "shoes",
58
+ "socket",
59
+ "bottle",
60
+ "bucket",
61
+ "cushion",
62
+ "basket",
63
+ "shoe rack",
64
+ "telephone",
65
+ "file folder",
66
+ "cloth",
67
+ "blind rail",
68
+ "laptop",
69
+ "plant pot",
70
+ "exhaust fan",
71
+ "cup",
72
+ "coat hanger",
73
+ "light switch",
74
+ "speaker",
75
+ "table lamp",
76
+ "air vent",
77
+ "clothes hanger",
78
+ "kettle",
79
+ "smoke detector",
80
+ "container",
81
+ "power strip",
82
+ "slippers",
83
+ "paper bag",
84
+ "mouse",
85
+ "cutting board",
86
+ "toilet paper",
87
+ "paper towel",
88
+ "pot",
89
+ "clock",
90
+ "pan",
91
+ "tap",
92
+ "jar",
93
+ "soap dispenser",
94
+ "binder",
95
+ "bowl",
96
+ "tissue box",
97
+ "whiteboard eraser",
98
+ "toilet brush",
99
+ "spray bottle",
100
+ "headphones",
101
+ "stapler",
102
+ "marker",
103
+ ]
104
+ )
submodules/PointTransformerV3/Pointcept/configs/_base_/default_runtime.py ADDED
@@ -0,0 +1,39 @@
1
+ weight = None # path to model weight
2
+ resume = False # whether to resume training process
3
+ evaluate = True # evaluate after each epoch training process
4
+ test_only = False # test process
5
+
6
+ seed = None # train process will init a random seed and record
7
+ save_path = "exp/default"
8
+ num_worker = 16 # total worker in all gpu
9
+ batch_size = 16 # total batch size in all gpu
10
+ batch_size_val = None # auto adapt to bs 1 for each gpu
11
+ batch_size_test = None # auto adapt to bs 1 for each gpu
12
+ epoch = 100 # total epoch, data loop = epoch // eval_epoch
13
+ eval_epoch = 100 # scheduled total eval & checkpoint epochs
14
+ clip_grad = None # disable with None, enable with a float
15
+
16
+ sync_bn = False
17
+ enable_amp = False
18
+ empty_cache = False
19
+ empty_cache_per_epoch = False
20
+ find_unused_parameters = False
21
+
22
+ mix_prob = 0
23
+ param_dicts = None # example: param_dicts = [dict(keyword="block", lr_scale=0.1)]
24
+
25
+ # hook
26
+ hooks = [
27
+ dict(type="CheckpointLoader"),
28
+ dict(type="IterationTimer", warmup_iter=2),
29
+ dict(type="InformationWriter"),
30
+ dict(type="SemSegEvaluator"),
31
+ dict(type="CheckpointSaver", save_freq=None),
32
+ dict(type="PreciseEvaluator", test_last=False),
33
+ ]
34
+
35
+ # Trainer
36
+ train = dict(type="DefaultTrainer")
37
+
38
+ # Tester
39
+ test = dict(type="SemSegTester", verbose=True)
submodules/PointTransformerV3/Pointcept/configs/matterport3d/semseg-pt-v3m1-0-base.py ADDED
@@ -0,0 +1,313 @@
1
+ _base_ = ["../_base_/default_runtime.py"]
2
+
3
+ # misc custom setting
4
+ batch_size = 12 # bs: total bs in all gpus
5
+ num_worker = 24
6
+ mix_prob = 0.8
7
+ empty_cache = False
8
+ enable_amp = True
9
+
10
+ # model settings
11
+ model = dict(
12
+ type="DefaultSegmentorV2",
13
+ num_classes=21,
14
+ backbone_out_channels=64,
15
+ backbone=dict(
16
+ type="PT-v3m1",
17
+ in_channels=6,
18
+ order=("z", "z-trans", "hilbert", "hilbert-trans"),
19
+ stride=(2, 2, 2, 2),
20
+ enc_depths=(2, 2, 2, 6, 2),
21
+ enc_channels=(32, 64, 128, 256, 512),
22
+ enc_num_head=(2, 4, 8, 16, 32),
23
+ enc_patch_size=(1024, 1024, 1024, 1024, 1024),
24
+ dec_depths=(2, 2, 2, 2),
25
+ dec_channels=(64, 64, 128, 256),
26
+ dec_num_head=(4, 4, 8, 16),
27
+ dec_patch_size=(1024, 1024, 1024, 1024),
28
+ mlp_ratio=4,
29
+ qkv_bias=True,
30
+ qk_scale=None,
31
+ attn_drop=0.0,
32
+ proj_drop=0.0,
33
+ drop_path=0.3,
34
+ shuffle_orders=True,
35
+ pre_norm=True,
36
+ enable_rpe=False,
37
+ enable_flash=True,
38
+ upcast_attention=False,
39
+ upcast_softmax=False,
40
+ cls_mode=False,
41
+ pdnorm_bn=False,
42
+ pdnorm_ln=False,
43
+ pdnorm_decouple=True,
44
+ pdnorm_adaptive=False,
45
+ pdnorm_affine=True,
46
+ pdnorm_conditions=("ScanNet", "S3DIS", "Structured3D"),
47
+ ),
48
+ criteria=[
49
+ dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=-1),
50
+ dict(type="LovaszLoss", mode="multiclass", loss_weight=1.0, ignore_index=-1),
51
+ ],
52
+ )
53
+
54
+ # scheduler settings
55
+ epoch = 800
56
+ optimizer = dict(type="AdamW", lr=0.006, weight_decay=0.05)
57
+ scheduler = dict(
58
+ type="OneCycleLR",
59
+ max_lr=[0.006, 0.0006],
60
+ pct_start=0.05,
61
+ anneal_strategy="cos",
62
+ div_factor=10.0,
63
+ final_div_factor=1000.0,
64
+ )
65
+ param_dicts = [dict(keyword="block", lr=0.0006)]
66
+
67
+ # dataset settings
68
+ dataset_type = "DefaultDataset"
69
+ data_root = "data/matterport3d"
70
+
71
+ data = dict(
72
+ num_classes=21,
73
+ ignore_index=-1,
74
+ names=(
75
+ "wall",
76
+ "floor",
77
+ "cabinet",
78
+ "bed",
79
+ "chair",
80
+ "sofa",
81
+ "table",
82
+ "door",
83
+ "window",
84
+ "bookshelf",
85
+ "picture",
86
+ "counter",
87
+ "desk",
88
+ "curtain",
89
+ "refrigerator",
90
+ "shower curtain",
91
+ "toilet",
92
+ "sink",
93
+ "bathtub",
94
+ "other",
95
+ "ceiling",
96
+ ),
97
+ train=dict(
98
+ type=dataset_type,
99
+ split="train",
100
+ data_root=data_root,
101
+ transform=[
102
+ dict(type="CenterShift", apply_z=True),
103
+ dict(
104
+ type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2
105
+ ),
106
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
107
+ dict(type="RandomRotate", angle=[-1, 1], axis="z", center=[0, 0, 0], p=0.5),
108
+ dict(type="RandomRotate", angle=[-1 / 64, 1 / 64], axis="x", p=0.5),
109
+ dict(type="RandomRotate", angle=[-1 / 64, 1 / 64], axis="y", p=0.5),
110
+ dict(type="RandomScale", scale=[0.9, 1.1]),
111
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
112
+ dict(type="RandomFlip", p=0.5),
113
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
114
+ dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
115
+ dict(type="ChromaticAutoContrast", p=0.2, blend_factor=None),
116
+ dict(type="ChromaticTranslation", p=0.95, ratio=0.05),
117
+ dict(type="ChromaticJitter", p=0.95, std=0.05),
118
+ # dict(type="HueSaturationTranslation", hue_max=0.2, saturation_max=0.2),
119
+ # dict(type="RandomColorDrop", p=0.2, color_augment=0.0),
120
+ dict(
121
+ type="GridSample",
122
+ grid_size=0.02,
123
+ hash_type="fnv",
124
+ mode="train",
125
+ return_grid_coord=True,
126
+ ),
127
+ dict(type="SphereCrop", point_max=102400, mode="random"),
128
+ dict(type="CenterShift", apply_z=False),
129
+ dict(type="NormalizeColor"),
130
+ # dict(type="ShufflePoint"),
131
+ dict(type="ToTensor"),
132
+ dict(
133
+ type="Collect",
134
+ keys=("coord", "grid_coord", "segment"),
135
+ feat_keys=("color", "normal"),
136
+ ),
137
+ ],
138
+ test_mode=False,
139
+ ),
140
+ val=dict(
141
+ type=dataset_type,
142
+ split="val",
143
+ data_root=data_root,
144
+ transform=[
145
+ dict(type="CenterShift", apply_z=True),
146
+ dict(
147
+ type="GridSample",
148
+ grid_size=0.02,
149
+ hash_type="fnv",
150
+ mode="train",
151
+ return_grid_coord=True,
152
+ ),
153
+ dict(type="CenterShift", apply_z=False),
154
+ dict(type="NormalizeColor"),
155
+ dict(type="ToTensor"),
156
+ dict(
157
+ type="Collect",
158
+ keys=("coord", "grid_coord", "segment"),
159
+ feat_keys=("color", "normal"),
160
+ ),
161
+ ],
162
+ test_mode=False,
163
+ ),
164
+ test=dict(
165
+ type=dataset_type,
166
+ split="val",
167
+ data_root=data_root,
168
+ transform=[
169
+ dict(type="CenterShift", apply_z=True),
170
+ dict(type="NormalizeColor"),
171
+ ],
172
+ test_mode=True,
173
+ test_cfg=dict(
174
+ voxelize=dict(
175
+ type="GridSample",
176
+ grid_size=0.02,
177
+ hash_type="fnv",
178
+ mode="test",
179
+ keys=("coord", "color", "normal"),
180
+ return_grid_coord=True,
181
+ ),
182
+ crop=None,
183
+ post_transform=[
184
+ dict(type="CenterShift", apply_z=False),
185
+ dict(type="ToTensor"),
186
+ dict(
187
+ type="Collect",
188
+ keys=("coord", "grid_coord", "index"),
189
+ feat_keys=("color", "normal"),
190
+ ),
191
+ ],
192
+ aug_transform=[
193
+ [
194
+ dict(
195
+ type="RandomRotateTargetAngle",
196
+ angle=[0],
197
+ axis="z",
198
+ center=[0, 0, 0],
199
+ p=1,
200
+ )
201
+ ],
202
+ [
203
+ dict(
204
+ type="RandomRotateTargetAngle",
205
+ angle=[1 / 2],
206
+ axis="z",
207
+ center=[0, 0, 0],
208
+ p=1,
209
+ )
210
+ ],
211
+ [
212
+ dict(
213
+ type="RandomRotateTargetAngle",
214
+ angle=[1],
215
+ axis="z",
216
+ center=[0, 0, 0],
217
+ p=1,
218
+ )
219
+ ],
220
+ [
221
+ dict(
222
+ type="RandomRotateTargetAngle",
223
+ angle=[3 / 2],
224
+ axis="z",
225
+ center=[0, 0, 0],
226
+ p=1,
227
+ )
228
+ ],
229
+ [
230
+ dict(
231
+ type="RandomRotateTargetAngle",
232
+ angle=[0],
233
+ axis="z",
234
+ center=[0, 0, 0],
235
+ p=1,
236
+ ),
237
+ dict(type="RandomScale", scale=[0.95, 0.95]),
238
+ ],
239
+ [
240
+ dict(
241
+ type="RandomRotateTargetAngle",
242
+ angle=[1 / 2],
243
+ axis="z",
244
+ center=[0, 0, 0],
245
+ p=1,
246
+ ),
247
+ dict(type="RandomScale", scale=[0.95, 0.95]),
248
+ ],
249
+ [
250
+ dict(
251
+ type="RandomRotateTargetAngle",
252
+ angle=[1],
253
+ axis="z",
254
+ center=[0, 0, 0],
255
+ p=1,
256
+ ),
257
+ dict(type="RandomScale", scale=[0.95, 0.95]),
258
+ ],
259
+ [
260
+ dict(
261
+ type="RandomRotateTargetAngle",
262
+ angle=[3 / 2],
263
+ axis="z",
264
+ center=[0, 0, 0],
265
+ p=1,
266
+ ),
267
+ dict(type="RandomScale", scale=[0.95, 0.95]),
268
+ ],
269
+ [
270
+ dict(
271
+ type="RandomRotateTargetAngle",
272
+ angle=[0],
273
+ axis="z",
274
+ center=[0, 0, 0],
275
+ p=1,
276
+ ),
277
+ dict(type="RandomScale", scale=[1.05, 1.05]),
278
+ ],
279
+ [
280
+ dict(
281
+ type="RandomRotateTargetAngle",
282
+ angle=[1 / 2],
283
+ axis="z",
284
+ center=[0, 0, 0],
285
+ p=1,
286
+ ),
287
+ dict(type="RandomScale", scale=[1.05, 1.05]),
288
+ ],
289
+ [
290
+ dict(
291
+ type="RandomRotateTargetAngle",
292
+ angle=[1],
293
+ axis="z",
294
+ center=[0, 0, 0],
295
+ p=1,
296
+ ),
297
+ dict(type="RandomScale", scale=[1.05, 1.05]),
298
+ ],
299
+ [
300
+ dict(
301
+ type="RandomRotateTargetAngle",
302
+ angle=[3 / 2],
303
+ axis="z",
304
+ center=[0, 0, 0],
305
+ p=1,
306
+ ),
307
+ dict(type="RandomScale", scale=[1.05, 1.05]),
308
+ ],
309
+ [dict(type="RandomFlip", p=1)],
310
+ ],
311
+ ),
312
+ ),
313
+ )
submodules/PointTransformerV3/Pointcept/configs/matterport3d/semseg-spunet-v1m1-0-base.py ADDED
@@ -0,0 +1,282 @@
1
+ _base_ = ["../_base_/default_runtime.py"]
2
+
3
+ # misc custom setting
4
+ batch_size = 12 # bs: total bs in all gpus
5
+ mix_prob = 0.8
6
+ empty_cache = False
7
+ enable_amp = True
8
+
9
+ # model settings
10
+ model = dict(
11
+ type="DefaultSegmentor",
12
+ backbone=dict(
13
+ type="SpUNet-v1m1",
14
+ in_channels=6,
15
+ num_classes=21,
16
+ channels=(32, 64, 128, 256, 256, 128, 96, 96),
17
+ layers=(2, 3, 4, 6, 2, 2, 2, 2),
18
+ ),
19
+ criteria=[dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=-1)],
20
+ )
21
+
22
+
23
+ # scheduler settings
24
+ epoch = 800
25
+ optimizer = dict(type="SGD", lr=0.05, momentum=0.9, weight_decay=0.0001, nesterov=True)
26
+ scheduler = dict(
27
+ type="OneCycleLR",
28
+ max_lr=optimizer["lr"],
29
+ pct_start=0.05,
30
+ anneal_strategy="cos",
31
+ div_factor=10.0,
32
+ final_div_factor=10000.0,
33
+ )
34
+
35
+ # dataset settings
36
+ dataset_type = "DefaultDataset"
37
+ data_root = "data/matterport3d"
38
+
39
+ data = dict(
40
+ num_classes=21,
41
+ ignore_index=-1,
42
+ names=(
43
+ "wall",
44
+ "floor",
45
+ "cabinet",
46
+ "bed",
47
+ "chair",
48
+ "sofa",
49
+ "table",
50
+ "door",
51
+ "window",
52
+ "bookshelf",
53
+ "picture",
54
+ "counter",
55
+ "desk",
56
+ "curtain",
57
+ "refrigerator",
58
+ "shower curtain",
59
+ "toilet",
60
+ "sink",
61
+ "bathtub",
62
+ "other",
63
+ "ceiling",
64
+ ),
65
+ train=dict(
66
+ type=dataset_type,
67
+ split="train",
68
+ data_root=data_root,
69
+ transform=[
70
+ dict(type="CenterShift", apply_z=True),
71
+ dict(
72
+ type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2
73
+ ),
74
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
75
+ dict(type="RandomRotate", angle=[-1, 1], axis="z", center=[0, 0, 0], p=0.5),
76
+ dict(type="RandomRotate", angle=[-1 / 64, 1 / 64], axis="x", p=0.5),
77
+ dict(type="RandomRotate", angle=[-1 / 64, 1 / 64], axis="y", p=0.5),
78
+ dict(type="RandomScale", scale=[0.9, 1.1]),
79
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
80
+ dict(type="RandomFlip", p=0.5),
81
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
82
+ dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
83
+ dict(type="ChromaticAutoContrast", p=0.2, blend_factor=None),
84
+ dict(type="ChromaticTranslation", p=0.95, ratio=0.05),
85
+ dict(type="ChromaticJitter", p=0.95, std=0.05),
86
+ # dict(type="HueSaturationTranslation", hue_max=0.2, saturation_max=0.2),
87
+ # dict(type="RandomColorDrop", p=0.2, color_augment=0.0),
88
+ dict(
89
+ type="GridSample",
90
+ grid_size=0.02,
91
+ hash_type="fnv",
92
+ mode="train",
93
+ return_grid_coord=True,
94
+ ),
95
+ dict(type="SphereCrop", point_max=100000, mode="random"),
96
+ dict(type="CenterShift", apply_z=False),
97
+ dict(type="NormalizeColor"),
98
+ dict(type="ShufflePoint"),
99
+ dict(type="ToTensor"),
100
+ dict(
101
+ type="Collect",
102
+ keys=("coord", "grid_coord", "segment"),
103
+ feat_keys=("color", "normal"),
104
+ ),
105
+ ],
106
+ test_mode=False,
107
+ ),
108
+ val=dict(
109
+ type=dataset_type,
110
+ split="val",
111
+ data_root=data_root,
112
+ transform=[
113
+ dict(type="CenterShift", apply_z=True),
114
+ dict(
115
+ type="GridSample",
116
+ grid_size=0.02,
117
+ hash_type="fnv",
118
+ mode="train",
119
+ return_grid_coord=True,
120
+ ),
121
+ # dict(type="SphereCrop", point_max=1000000, mode="center"),
122
+ dict(type="CenterShift", apply_z=False),
123
+ dict(type="NormalizeColor"),
124
+ dict(type="ToTensor"),
125
+ dict(
126
+ type="Collect",
127
+ keys=("coord", "grid_coord", "segment"),
128
+ feat_keys=("color", "normal"),
129
+ ),
130
+ ],
131
+ test_mode=False,
132
+ ),
133
+ test=dict(
134
+ type=dataset_type,
135
+ split="val",
136
+ data_root=data_root,
137
+ transform=[
138
+ dict(type="CenterShift", apply_z=True),
139
+ dict(type="NormalizeColor"),
140
+ ],
141
+ test_mode=True,
142
+ test_cfg=dict(
143
+ voxelize=dict(
144
+ type="GridSample",
145
+ grid_size=0.02,
146
+ hash_type="fnv",
147
+ mode="test",
148
+ return_grid_coord=True,
149
+ keys=("coord", "color", "normal"),
150
+ ),
151
+ crop=None,
152
+ post_transform=[
153
+ dict(type="CenterShift", apply_z=False),
154
+ dict(type="ToTensor"),
155
+ dict(
156
+ type="Collect",
157
+ keys=("coord", "grid_coord", "index"),
158
+ feat_keys=("color", "normal"),
159
+ ),
160
+ ],
161
+ aug_transform=[
162
+ [
163
+ dict(
164
+ type="RandomRotateTargetAngle",
165
+ angle=[0],
166
+ axis="z",
167
+ center=[0, 0, 0],
168
+ p=1,
169
+ )
170
+ ],
171
+ [
172
+ dict(
173
+ type="RandomRotateTargetAngle",
174
+ angle=[1 / 2],
175
+ axis="z",
176
+ center=[0, 0, 0],
177
+ p=1,
178
+ )
179
+ ],
180
+ [
181
+ dict(
182
+ type="RandomRotateTargetAngle",
183
+ angle=[1],
184
+ axis="z",
185
+ center=[0, 0, 0],
186
+ p=1,
187
+ )
188
+ ],
189
+ [
190
+ dict(
191
+ type="RandomRotateTargetAngle",
192
+ angle=[3 / 2],
193
+ axis="z",
194
+ center=[0, 0, 0],
195
+ p=1,
196
+ )
197
+ ],
198
+ [
199
+ dict(
200
+ type="RandomRotateTargetAngle",
201
+ angle=[0],
202
+ axis="z",
203
+ center=[0, 0, 0],
204
+ p=1,
205
+ ),
206
+ dict(type="RandomScale", scale=[0.95, 0.95]),
207
+ ],
208
+ [
209
+ dict(
210
+ type="RandomRotateTargetAngle",
211
+ angle=[1 / 2],
212
+ axis="z",
213
+ center=[0, 0, 0],
214
+ p=1,
215
+ ),
216
+ dict(type="RandomScale", scale=[0.95, 0.95]),
217
+ ],
218
+ [
219
+ dict(
220
+ type="RandomRotateTargetAngle",
221
+ angle=[1],
222
+ axis="z",
223
+ center=[0, 0, 0],
224
+ p=1,
225
+ ),
226
+ dict(type="RandomScale", scale=[0.95, 0.95]),
227
+ ],
228
+ [
229
+ dict(
230
+ type="RandomRotateTargetAngle",
231
+ angle=[3 / 2],
232
+ axis="z",
233
+ center=[0, 0, 0],
234
+ p=1,
235
+ ),
236
+ dict(type="RandomScale", scale=[0.95, 0.95]),
237
+ ],
238
+ [
239
+ dict(
240
+ type="RandomRotateTargetAngle",
241
+ angle=[0],
242
+ axis="z",
243
+ center=[0, 0, 0],
244
+ p=1,
245
+ ),
246
+ dict(type="RandomScale", scale=[1.05, 1.05]),
247
+ ],
248
+ [
249
+ dict(
250
+ type="RandomRotateTargetAngle",
251
+ angle=[1 / 2],
252
+ axis="z",
253
+ center=[0, 0, 0],
254
+ p=1,
255
+ ),
256
+ dict(type="RandomScale", scale=[1.05, 1.05]),
257
+ ],
258
+ [
259
+ dict(
260
+ type="RandomRotateTargetAngle",
261
+ angle=[1],
262
+ axis="z",
263
+ center=[0, 0, 0],
264
+ p=1,
265
+ ),
266
+ dict(type="RandomScale", scale=[1.05, 1.05]),
267
+ ],
268
+ [
269
+ dict(
270
+ type="RandomRotateTargetAngle",
271
+ angle=[3 / 2],
272
+ axis="z",
273
+ center=[0, 0, 0],
274
+ p=1,
275
+ ),
276
+ dict(type="RandomScale", scale=[1.05, 1.05]),
277
+ ],
278
+ [dict(type="RandomFlip", p=1)],
279
+ ],
280
+ ),
281
+ ),
282
+ )
submodules/PointTransformerV3/Pointcept/configs/modelnet40/cls-ptv3-v1m1-0-base.py ADDED
@@ -0,0 +1,232 @@
1
+ _base_ = ["../_base_/default_runtime.py"]
2
+ # misc custom setting
3
+ batch_size = 32 # bs: total bs in all gpus
4
+ num_worker = 16
5
+ batch_size_val = 8
6
+ empty_cache = False
7
+ enable_amp = False
8
+
9
+ # model settings
10
+ model = dict(
11
+ type="DefaultClassifier",
12
+ num_classes=40,
13
+ backbone_embed_dim=512,
14
+ backbone=dict(
15
+ type="PT-v3m1",
16
+ in_channels=6,
17
+ order=("z", "z-trans", "hilbert", "hilbert-trans"),
18
+ stride=(2, 2, 2, 2),
19
+ enc_depths=(2, 2, 2, 6, 2),
20
+ enc_channels=(32, 64, 128, 256, 512),
21
+ enc_num_head=(2, 4, 8, 16, 32),
22
+ enc_patch_size=(1024, 1024, 1024, 1024, 1024),
23
+ dec_depths=(2, 2, 2, 2),
24
+ dec_channels=(64, 64, 128, 256),
25
+ dec_num_head=(4, 4, 8, 16),
26
+ dec_patch_size=(1024, 1024, 1024, 1024),
27
+ mlp_ratio=4,
28
+ qkv_bias=True,
29
+ qk_scale=None,
30
+ attn_drop=0.0,
31
+ proj_drop=0.0,
32
+ drop_path=0.3,
33
+ shuffle_orders=True,
34
+ pre_norm=True,
35
+ enable_rpe=False,
36
+ enable_flash=True,
37
+ upcast_attention=False,
38
+ upcast_softmax=False,
39
+ cls_mode=True,
40
+ pdnorm_bn=False,
41
+ pdnorm_ln=False,
42
+ pdnorm_decouple=True,
43
+ pdnorm_adaptive=False,
44
+ pdnorm_affine=True,
45
+ pdnorm_conditions=("ScanNet", "S3DIS", "Structured3D"),
46
+ ),
47
+ criteria=[
48
+ dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=-1),
49
+ dict(type="LovaszLoss", mode="multiclass", loss_weight=1.0, ignore_index=-1),
50
+ ],
51
+ )
52
+
53
+ # scheduler settings
54
+ epoch = 300
55
+ # optimizer = dict(type="SGD", lr=0.1, momentum=0.9, weight_decay=0.0001, nesterov=True)
56
+ # scheduler = dict(type="MultiStepLR", milestones=[0.6, 0.8], gamma=0.1)
57
+ optimizer = dict(type="AdamW", lr=0.001, weight_decay=0.01)
58
+ scheduler = dict(
59
+ type="OneCycleLR",
60
+ max_lr=[0.001, 0.0001],
61
+ pct_start=0.05,
62
+ anneal_strategy="cos",
63
+ div_factor=10.0,
64
+ final_div_factor=1000.0,
65
+ )
66
+ param_dicts = [dict(keyword="block", lr=0.0001)]
67
+
68
+ # dataset settings
69
+ dataset_type = "ModelNetDataset"
70
+ data_root = "data/modelnet40_normal_resampled"
71
+ cache_data = False
72
+ class_names = [
73
+ "airplane",
74
+ "bathtub",
75
+ "bed",
76
+ "bench",
77
+ "bookshelf",
78
+ "bottle",
79
+ "bowl",
80
+ "car",
81
+ "chair",
82
+ "cone",
83
+ "cup",
84
+ "curtain",
85
+ "desk",
86
+ "door",
87
+ "dresser",
88
+ "flower_pot",
89
+ "glass_box",
90
+ "guitar",
91
+ "keyboard",
92
+ "lamp",
93
+ "laptop",
94
+ "mantel",
95
+ "monitor",
96
+ "night_stand",
97
+ "person",
98
+ "piano",
99
+ "plant",
100
+ "radio",
101
+ "range_hood",
102
+ "sink",
103
+ "sofa",
104
+ "stairs",
105
+ "stool",
106
+ "table",
107
+ "tent",
108
+ "toilet",
109
+ "tv_stand",
110
+ "vase",
111
+ "wardrobe",
112
+ "xbox",
113
+ ]
114
+
115
+ data = dict(
116
+ num_classes=40,
117
+ ignore_index=-1,
118
+ names=class_names,
119
+ train=dict(
120
+ type=dataset_type,
121
+ split="train",
122
+ data_root=data_root,
123
+ class_names=class_names,
124
+ transform=[
125
+ dict(type="NormalizeCoord"),
126
+ # dict(type="CenterShift", apply_z=True),
127
+ # dict(type="RandomRotate", angle=[-1, 1], axis="z", center=[0, 0, 0], p=0.5),
128
+ # dict(type="RandomRotate", angle=[-1/24, 1/24], axis="x", p=0.5),
129
+ # dict(type="RandomRotate", angle=[-1/24, 1/24], axis="y", p=0.5),
130
+ dict(type="RandomScale", scale=[0.7, 1.5], anisotropic=True),
131
+ dict(type="RandomShift", shift=((-0.2, 0.2), (-0.2, 0.2), (-0.2, 0.2))),
132
+ # dict(type="RandomFlip", p=0.5),
133
+ # dict(type="RandomJitter", sigma=0.005, clip=0.02),
134
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
135
+ dict(
136
+ type="GridSample",
137
+ grid_size=0.01,
138
+ hash_type="fnv",
139
+ mode="train",
140
+ keys=("coord", "normal"),
141
+ return_grid_coord=True,
142
+ ),
143
+ # dict(type="SphereCrop", point_max=10000, mode="random"),
144
+ # dict(type="CenterShift", apply_z=True),
145
+ dict(type="ShufflePoint"),
146
+ dict(type="ToTensor"),
147
+ dict(
148
+ type="Collect",
149
+ keys=("coord", "grid_coord", "category"),
150
+ feat_keys=["coord", "normal"],
151
+ ),
152
+ ],
153
+ test_mode=False,
154
+ ),
155
+ val=dict(
156
+ type=dataset_type,
157
+ split="test",
158
+ data_root=data_root,
159
+ class_names=class_names,
160
+ transform=[
161
+ dict(type="NormalizeCoord"),
162
+ dict(
163
+ type="GridSample",
164
+ grid_size=0.01,
165
+ hash_type="fnv",
166
+ mode="train",
167
+ keys=("coord", "normal"),
168
+ return_grid_coord=True,
169
+ ),
170
+ dict(type="ToTensor"),
171
+ dict(
172
+ type="Collect",
173
+ keys=("coord", "grid_coord", "category"),
174
+ feat_keys=["coord", "normal"],
175
+ ),
176
+ ],
177
+ test_mode=False,
178
+ ),
179
+ test=dict(
180
+ type=dataset_type,
181
+ split="test",
182
+ data_root=data_root,
183
+ class_names=class_names,
184
+ transform=[
185
+ dict(type="NormalizeCoord"),
186
+ ],
187
+ test_mode=True,
188
+ test_cfg=dict(
189
+ post_transform=[
190
+ dict(
191
+ type="GridSample",
192
+ grid_size=0.01,
193
+ hash_type="fnv",
194
+ mode="train",
195
+ keys=("coord", "normal"),
196
+ return_grid_coord=True,
197
+ ),
198
+ dict(type="ToTensor"),
199
+ dict(
200
+ type="Collect",
201
+ keys=("coord", "grid_coord"),
202
+ feat_keys=["coord", "normal"],
203
+ ),
204
+ ],
205
+ aug_transform=[
206
+ [dict(type="RandomScale", scale=[1, 1], anisotropic=True)], # 1
207
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 2
208
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 3
209
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 4
210
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 5
211
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 5
212
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 6
213
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 7
214
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 8
215
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 9
216
+ ],
217
+ ),
218
+ ),
219
+ )
220
+
221
+ # hooks
222
+ hooks = [
223
+ dict(type="CheckpointLoader"),
224
+ dict(type="IterationTimer", warmup_iter=2),
225
+ dict(type="InformationWriter"),
226
+ dict(type="ClsEvaluator"),
227
+ dict(type="CheckpointSaver", save_freq=None),
228
+ dict(type="PreciseEvaluator", test_last=False),
229
+ ]
230
+
231
+ # tester
232
+ test = dict(type="ClsVotingTester", num_repeat=100)
submodules/PointTransformerV3/Pointcept/configs/modelnet40/cls-spunet-v1m1-0-base.py ADDED
@@ -0,0 +1,176 @@
1
+ _base_ = ["../_base_/default_runtime.py"]
2
+ # misc custom setting
3
+ batch_size = 16 # bs: total bs in all gpus
4
+ # batch_size_val = 8
5
+ empty_cache = False
6
+ enable_amp = False
7
+
8
+ # model settings
9
+ model = dict(
10
+ type="DefaultClassifier",
11
+ num_classes=40,
12
+ backbone_embed_dim=256,
13
+ backbone=dict(
14
+ type="SpUNet-v1m1",
15
+ in_channels=6,
16
+ num_classes=0,
17
+ channels=(32, 64, 128, 256, 256, 128, 96, 96),
18
+ layers=(2, 3, 4, 6, 2, 2, 2, 2),
19
+ cls_mode=True,
20
+ ),
21
+ criteria=[dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=-1)],
22
+ )
23
+
24
+ # scheduler settings
25
+ epoch = 200
26
+ optimizer = dict(type="SGD", lr=0.1, momentum=0.9, weight_decay=0.0001, nesterov=True)
27
+ scheduler = dict(type="MultiStepLR", milestones=[0.6, 0.8], gamma=0.1)
28
+
29
+ # dataset settings
30
+ dataset_type = "ModelNetDataset"
31
+ data_root = "data/modelnet40_normal_resampled"
32
+ cache_data = False
33
+ class_names = [
34
+ "airplane",
35
+ "bathtub",
36
+ "bed",
37
+ "bench",
38
+ "bookshelf",
39
+ "bottle",
40
+ "bowl",
41
+ "car",
42
+ "chair",
43
+ "cone",
44
+ "cup",
45
+ "curtain",
46
+ "desk",
47
+ "door",
48
+ "dresser",
49
+ "flower_pot",
50
+ "glass_box",
51
+ "guitar",
52
+ "keyboard",
53
+ "lamp",
54
+ "laptop",
55
+ "mantel",
56
+ "monitor",
57
+ "night_stand",
58
+ "person",
59
+ "piano",
60
+ "plant",
61
+ "radio",
62
+ "range_hood",
63
+ "sink",
64
+ "sofa",
65
+ "stairs",
66
+ "stool",
67
+ "table",
68
+ "tent",
69
+ "toilet",
70
+ "tv_stand",
71
+ "vase",
72
+ "wardrobe",
73
+ "xbox",
74
+ ]
75
+
76
+ data = dict(
77
+ num_classes=40,
78
+ ignore_index=-1,
79
+ names=class_names,
80
+ train=dict(
81
+ type=dataset_type,
82
+ split="train",
83
+ data_root=data_root,
84
+ class_names=class_names,
85
+ transform=[
86
+ dict(type="NormalizeCoord"),
87
+ # dict(type="CenterShift", apply_z=True),
88
+ # dict(type="RandomRotate", angle=[-1, 1], axis="z", center=[0, 0, 0], p=0.5),
89
+ # dict(type="RandomRotate", angle=[-1/24, 1/24], axis="x", p=0.5),
90
+ # dict(type="RandomRotate", angle=[-1/24, 1/24], axis="y", p=0.5),
91
+ dict(type="RandomScale", scale=[0.9, 1.1]),
92
+ dict(type="RandomShift", shift=((-0.2, 0.2), (-0.2, 0.2), (-0.2, 0.2))),
93
+ # dict(type="RandomFlip", p=0.5),
94
+ # dict(type="RandomJitter", sigma=0.005, clip=0.02),
95
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
96
+ dict(
97
+ type="GridSample",
98
+ grid_size=0.01,
99
+ hash_type="fnv",
100
+ mode="train",
101
+ keys=("coord", "normal"),
102
+ return_grid_coord=True,
103
+ ),
104
+ # dict(type="SphereCrop", point_max=10000, mode="random"),
105
+ # dict(type="CenterShift", apply_z=True),
106
+ dict(type="ShufflePoint"),
107
+ dict(type="ToTensor"),
108
+ dict(
109
+ type="Collect",
110
+ keys=("coord", "grid_coord", "category"),
111
+ feat_keys=["coord", "normal"],
112
+ ),
113
+ ],
114
+ test_mode=False,
115
+ ),
116
+ val=dict(
117
+ type=dataset_type,
118
+ split="test",
119
+ data_root=data_root,
120
+ class_names=class_names,
121
+ transform=[
122
+ dict(type="NormalizeCoord"),
123
+ dict(
124
+ type="GridSample",
125
+ grid_size=0.01,
126
+ hash_type="fnv",
127
+ mode="train",
128
+ keys=("coord", "normal"),
129
+ return_grid_coord=True,
130
+ ),
131
+ dict(type="ToTensor"),
132
+ dict(
133
+ type="Collect",
134
+ keys=("coord", "grid_coord", "category"),
135
+ feat_keys=["coord", "normal"],
136
+ ),
137
+ ],
138
+ test_mode=False,
139
+ ),
140
+ test=dict(
141
+ type=dataset_type,
142
+ split="test",
143
+ data_root=data_root,
144
+ class_names=class_names,
145
+ transform=[
146
+ dict(type="NormalizeCoord"),
147
+ dict(
148
+ type="GridSample",
149
+ grid_size=0.01,
150
+ hash_type="fnv",
151
+ mode="train",
152
+ keys=("coord", "normal"),
153
+ return_grid_coord=True,
154
+ ),
155
+ dict(type="ToTensor"),
156
+ dict(
157
+ type="Collect",
158
+ keys=("coord", "grid_coord", "category"),
159
+ feat_keys=["coord", "normal"],
160
+ ),
161
+ ],
162
+ test_mode=True,
163
+ ),
164
+ )
165
+
166
+ # hooks
167
+ hooks = [
168
+ dict(type="CheckpointLoader"),
169
+ dict(type="IterationTimer", warmup_iter=2),
170
+ dict(type="InformationWriter"),
171
+ dict(type="ClsEvaluator"),
172
+ dict(type="CheckpointSaver", save_freq=None),
173
+ ]
174
+
175
+ # tester
176
+ test = dict(type="ClsTester")
submodules/PointTransformerV3/Pointcept/configs/nuscenes/semseg-ppt-v1m1-0-nu-sk-wa-spunet.py ADDED
@@ -0,0 +1,342 @@
1
+ _base_ = ["../_base_/default_runtime.py"]
2
+
3
+ # misc custom setting
4
+ batch_size = 12 # bs: total bs in all gpus
5
+ num_worker = 24
6
+ mix_prob = 0.8
7
+ empty_cache = False
8
+ enable_amp = True
9
+ find_unused_parameters = True
10
+
11
+ # trainer
12
+ train = dict(
13
+ type="MultiDatasetTrainer",
14
+ )
15
+
16
+ # model settings
17
+ model = dict(
18
+ type="PPT-v1m1",
19
+ backbone=dict(
20
+ type="SpUNet-v1m3",
21
+ in_channels=4,
22
+ num_classes=0,
23
+ base_channels=32,
24
+ context_channels=256,
25
+ channels=(32, 64, 128, 256, 256, 128, 96, 96),
26
+ layers=(2, 3, 4, 6, 2, 2, 2, 2),
27
+ cls_mode=False,
28
+ conditions=("SemanticKITTI", "nuScenes", "Waymo"),
29
+ zero_init=False,
30
+ norm_decouple=True,
31
+ norm_adaptive=False,
32
+ norm_affine=True,
33
+ ),
34
+ criteria=[
35
+ dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=-1),
36
+ dict(type="LovaszLoss", mode="multiclass", loss_weight=1.0, ignore_index=-1),
37
+ ],
38
+ backbone_out_channels=96,
39
+ context_channels=256,
40
+ conditions=("SemanticKITTI", "nuScenes", "Waymo"),
41
+ template="[x]",
42
+ clip_model="ViT-B/16",
43
+ # fmt: off
44
+ class_name=(
45
+ # SemanticKITTI
46
+ "car", "bicycle", "motorcycle", "truck", "other vehicle",
47
+ "person", "person who rides a bicycle", "person who rides a motorcycle", "road", "parking",
48
+ "path for pedestrians at the side of a road", "other ground", "building", "fence", "vegetation",
49
+ "trunk", "terrain", "pole", "traffic sign",
50
+ # nuScenes
51
+ "barrier", "bicycle", "bus", "car", "construction vehicle",
52
+ "motorcycle", "pedestrian", "traffic cone", "trailer", "truck",
53
+ "path suitable or safe for driving", "other flat", "sidewalk", "terrain", "man made", "vegetation",
54
+ # waymo
55
+ "car", "truck", "bus", "other vehicle", "person who rides a motorcycle",
56
+ "person who rides a bicycle", "pedestrian", "sign", "traffic light", "pole",
57
+ "construction cone", "bicycle", "motorcycle", "building", "vegetation",
58
+ "tree trunk", "curb", "road", "lane marker", "other ground", "horizontal surface that can not drive",
59
+ "surface when pedestrians most likely to walk on",
60
+ ),
61
+ valid_index=(
62
+ [i for i in range(19)],
63
+ [i for i in range(19, 19 + 16)],
64
+ [i for i in range(19 + 16, 19 + 16 + 22)],
65
+ ),
66
+ # fmt: on
67
+ backbone_mode=False,
68
+ )
69
+
70
+ # scheduler settings
71
+ epoch = 50
72
+ eval_epoch = 50
73
+ optimizer = dict(type="AdamW", lr=0.002, weight_decay=0.005)
74
+ scheduler = dict(
75
+ type="OneCycleLR",
76
+ max_lr=optimizer["lr"],
77
+ pct_start=0.04,
78
+ anneal_strategy="cos",
79
+ div_factor=10.0,
80
+ final_div_factor=100.0,
81
+ )
82
+ # param_dicts = [dict(keyword="modulation", lr=0.0002)]
83
+
84
+ # dataset settings
85
+ data = dict(
86
+ num_classes=16,
87
+ ignore_index=-1,
88
+ names=[
89
+ "barrier",
90
+ "bicycle",
91
+ "bus",
92
+ "car",
93
+ "construction_vehicle",
94
+ "motorcycle",
95
+ "pedestrian",
96
+ "traffic_cone",
97
+ "trailer",
98
+ "truck",
99
+ "driveable_surface",
100
+ "other_flat",
101
+ "sidewalk",
102
+ "terrain",
103
+ "manmade",
104
+ "vegetation",
105
+ ],
106
+ train=dict(
107
+ type="ConcatDataset",
108
+ datasets=[
109
+ # nuScenes
110
+ dict(
111
+ type="NuScenesDataset",
112
+ split="train",
113
+ data_root="data/nuscenes",
114
+ transform=[
115
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
116
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis='z', p=0.75),
117
+ dict(
118
+ type="RandomRotate",
119
+ angle=[-1, 1],
120
+ axis="z",
121
+ center=[0, 0, 0],
122
+ p=0.5,
123
+ ),
124
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis='x', p=0.5),
125
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis='y', p=0.5),
126
+ dict(
127
+ type="PointClip",
128
+ point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2),
129
+ ),
130
+ dict(type="RandomScale", scale=[0.9, 1.1]),
131
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
132
+ dict(type="RandomFlip", p=0.5),
133
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
134
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
135
+ dict(
136
+ type="GridSample",
137
+ grid_size=0.05,
138
+ hash_type="fnv",
139
+ mode="train",
140
+ keys=("coord", "strength", "segment"),
141
+ return_grid_coord=True,
142
+ ),
143
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
144
+ # dict(type="CenterShift", apply_z=False),
145
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
146
+ dict(type="ToTensor"),
147
+ dict(
148
+ type="Collect",
149
+ keys=("coord", "grid_coord", "segment", "condition"),
150
+ feat_keys=("coord", "strength"),
151
+ ),
152
+ ],
153
+ test_mode=False,
154
+ ignore_index=-1,
155
+ loop=1,
156
+ ),
157
+ # SemanticKITTI
158
+ dict(
159
+ type="SemanticKITTIDataset",
160
+ split="train",
161
+ data_root="data/semantic_kitti",
162
+ transform=[
163
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
164
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
165
+ dict(
166
+ type="RandomRotate",
167
+ angle=[-1, 1],
168
+ axis="z",
169
+ center=[0, 0, 0],
170
+ p=0.5,
171
+ ),
172
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5),
173
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5),
174
+ dict(
175
+ type="PointClip",
176
+ point_cloud_range=(-75.2, -75.2, -4, 75.2, 75.2, 2),
177
+ ),
178
+ dict(type="RandomScale", scale=[0.9, 1.1]),
179
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
180
+ dict(type="RandomFlip", p=0.5),
181
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
182
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
183
+ dict(
184
+ type="GridSample",
185
+ grid_size=0.05,
186
+ hash_type="fnv",
187
+ mode="train",
188
+ keys=("coord", "strength", "segment"),
189
+ return_grid_coord=True,
190
+ ),
191
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
192
+ # dict(type="CenterShift", apply_z=False),
193
+ dict(type="Add", keys_dict={"condition": "SemanticKITTI"}),
194
+ dict(type="ToTensor"),
195
+ dict(
196
+ type="Collect",
197
+ keys=("coord", "grid_coord", "segment", "condition"),
198
+ feat_keys=("coord", "strength"),
199
+ ),
200
+ ],
201
+ test_mode=False,
202
+ ignore_index=-1,
203
+ loop=1,
204
+ ),
205
+ # Waymo
206
+ dict(
207
+ type="WaymoDataset",
208
+ split="training",
209
+ data_root="data/waymo",
210
+ transform=[
211
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
212
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
213
+ dict(
214
+ type="RandomRotate",
215
+ angle=[-1, 1],
216
+ axis="z",
217
+ center=[0, 0, 0],
218
+ p=0.5,
219
+ ),
220
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5),
221
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5),
222
+ dict(
223
+ type="PointClip",
224
+ point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2),
225
+ ),
226
+ dict(type="RandomScale", scale=[0.9, 1.1]),
227
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
228
+ dict(type="RandomFlip", p=0.5),
229
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
230
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
231
+ dict(
232
+ type="GridSample",
233
+ grid_size=0.05,
234
+ hash_type="fnv",
235
+ mode="train",
236
+ keys=("coord", "strength", "segment"),
237
+ return_grid_coord=True,
238
+ ),
239
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
240
+ # dict(type="CenterShift", apply_z=False),
241
+ dict(type="Add", keys_dict={"condition": "Waymo"}),
242
+ dict(type="ToTensor"),
243
+ dict(
244
+ type="Collect",
245
+ keys=("coord", "grid_coord", "segment", "condition"),
246
+ feat_keys=("coord", "strength"),
247
+ ),
248
+ ],
249
+ test_mode=False,
250
+ ignore_index=-1,
251
+ loop=1,
252
+ ),
253
+ ],
254
+ ),
255
+ val=dict(
256
+ type="NuScenesDataset",
257
+ split="val",
258
+ data_root="data/nuscenes",
259
+ transform=[
260
+ dict(type="PointClip", point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2)),
261
+ dict(
262
+ type="GridSample",
263
+ grid_size=0.05,
264
+ hash_type="fnv",
265
+ mode="train",
266
+ keys=("coord", "strength", "segment"),
267
+ return_grid_coord=True,
268
+ ),
269
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
270
+ dict(type="ToTensor"),
271
+ dict(
272
+ type="Collect",
273
+ keys=("coord", "grid_coord", "segment", "condition"),
274
+ feat_keys=("coord", "strength"),
275
+ ),
276
+ ],
277
+ test_mode=False,
278
+ ignore_index=-1,
279
+ ),
280
+ test=dict(
281
+ type="NuScenesDataset",
282
+ split="val",
283
+ data_root="data/nuscenes",
284
+ transform=[
285
+ dict(type="Copy", keys_dict={"segment": "origin_segment"}),
286
+ dict(
287
+ type="GridSample",
288
+ grid_size=0.025,
289
+ hash_type="fnv",
290
+ mode="train",
291
+ keys=("coord", "strength", "segment"),
292
+ return_inverse=True,
293
+ ),
294
+ ],
295
+ test_mode=True,
296
+ test_cfg=dict(
297
+ voxelize=dict(
298
+ type="GridSample",
299
+ grid_size=0.05,
300
+ hash_type="fnv",
301
+ mode="test",
302
+ return_grid_coord=True,
303
+ keys=("coord", "strength"),
304
+ ),
305
+ crop=None,
306
+ post_transform=[
307
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
308
+ dict(type="ToTensor"),
309
+ dict(
310
+ type="Collect",
311
+ keys=("coord", "grid_coord", "index", "condition"),
312
+ feat_keys=("coord", "strength"),
313
+ ),
314
+ ],
315
+ aug_transform=[
316
+ [dict(type="RandomScale", scale=[0.9, 0.9])],
317
+ [dict(type="RandomScale", scale=[0.95, 0.95])],
318
+ [dict(type="RandomScale", scale=[1, 1])],
319
+ [dict(type="RandomScale", scale=[1.05, 1.05])],
320
+ [dict(type="RandomScale", scale=[1.1, 1.1])],
321
+ [
322
+ dict(type="RandomScale", scale=[0.9, 0.9]),
323
+ dict(type="RandomFlip", p=1),
324
+ ],
325
+ [
326
+ dict(type="RandomScale", scale=[0.95, 0.95]),
327
+ dict(type="RandomFlip", p=1),
328
+ ],
329
+ [dict(type="RandomScale", scale=[1, 1]), dict(type="RandomFlip", p=1)],
330
+ [
331
+ dict(type="RandomScale", scale=[1.05, 1.05]),
332
+ dict(type="RandomFlip", p=1),
333
+ ],
334
+ [
335
+ dict(type="RandomScale", scale=[1.1, 1.1]),
336
+ dict(type="RandomFlip", p=1),
337
+ ],
338
+ ],
339
+ ),
340
+ ignore_index=-1,
341
+ ),
342
+ )
submodules/PointTransformerV3/Pointcept/configs/nuscenes/semseg-ppt-v1m2-0-nu-sk-wa-spunet.py ADDED
@@ -0,0 +1,316 @@
1
+ _base_ = ["../_base_/default_runtime.py"]
2
+
3
+ # misc custom setting
4
+ batch_size = 12 # bs: total bs in all gpus
5
+ num_worker = 24
6
+ mix_prob = 0.8
7
+ empty_cache = False
8
+ enable_amp = True
9
+ find_unused_parameters = True
10
+
11
+ # trainer
12
+ train = dict(
13
+ type="MultiDatasetTrainer",
14
+ )
15
+
16
+ # model settings
17
+ model = dict(
18
+ type="PPT-v1m2",
19
+ backbone=dict(
20
+ type="SpUNet-v1m3",
21
+ in_channels=4,
22
+ num_classes=0,
23
+ base_channels=32,
24
+ context_channels=256,
25
+ channels=(32, 64, 128, 256, 256, 128, 96, 96),
26
+ layers=(2, 3, 4, 6, 2, 2, 2, 2),
27
+ cls_mode=False,
28
+ conditions=("SemanticKITTI", "nuScenes", "Waymo"),
29
+ zero_init=False,
30
+ norm_decouple=True,
31
+ norm_adaptive=False,
32
+ norm_affine=True,
33
+ ),
34
+ criteria=[
35
+ dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=-1),
36
+ dict(type="LovaszLoss", mode="multiclass", loss_weight=1.0, ignore_index=-1),
37
+ ],
38
+ backbone_out_channels=96,
39
+ context_channels=256,
40
+ conditions=("SemanticKITTI", "nuScenes", "Waymo"),
41
+ num_classes=(19, 16, 22),
42
+ )
43
+
44
+ # scheduler settings
45
+ epoch = 50
46
+ eval_epoch = 50
47
+ optimizer = dict(type="AdamW", lr=0.002, weight_decay=0.005)
48
+ scheduler = dict(
49
+ type="OneCycleLR",
50
+ max_lr=optimizer["lr"],
51
+ pct_start=0.04,
52
+ anneal_strategy="cos",
53
+ div_factor=10.0,
54
+ final_div_factor=100.0,
55
+ )
56
+ # param_dicts = [dict(keyword="modulation", lr=0.0002)]
57
+
58
+ # dataset settings
59
+ data = dict(
60
+ num_classes=16,
61
+ ignore_index=-1,
62
+ names=[
63
+ "barrier",
64
+ "bicycle",
65
+ "bus",
66
+ "car",
67
+ "construction_vehicle",
68
+ "motorcycle",
69
+ "pedestrian",
70
+ "traffic_cone",
71
+ "trailer",
72
+ "truck",
73
+ "driveable_surface",
74
+ "other_flat",
75
+ "sidewalk",
76
+ "terrain",
77
+ "manmade",
78
+ "vegetation",
79
+ ],
80
+ train=dict(
81
+ type="ConcatDataset",
82
+ datasets=[
83
+ # nuScenes
84
+ dict(
85
+ type="NuScenesDataset",
86
+ split="train",
87
+ data_root="data/nuscenes",
88
+ transform=[
89
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
90
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis='z', p=0.75),
91
+ dict(
92
+ type="RandomRotate",
93
+ angle=[-1, 1],
94
+ axis="z",
95
+ center=[0, 0, 0],
96
+ p=0.5,
97
+ ),
98
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis='x', p=0.5),
99
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis='y', p=0.5),
100
+ dict(
101
+ type="PointClip",
102
+ point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2),
103
+ ),
104
+ dict(type="RandomScale", scale=[0.9, 1.1]),
105
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
106
+ dict(type="RandomFlip", p=0.5),
107
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
108
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
109
+ dict(
110
+ type="GridSample",
111
+ grid_size=0.05,
112
+ hash_type="fnv",
113
+ mode="train",
114
+ keys=("coord", "strength", "segment"),
115
+ return_grid_coord=True,
116
+ ),
117
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
118
+ # dict(type="CenterShift", apply_z=False),
119
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
120
+ dict(type="ToTensor"),
121
+ dict(
122
+ type="Collect",
123
+ keys=("coord", "grid_coord", "segment", "condition"),
124
+ feat_keys=("coord", "strength"),
125
+ ),
126
+ ],
127
+ test_mode=False,
128
+ ignore_index=-1,
129
+ loop=1,
130
+ ),
131
+ # SemanticKITTI
132
+ dict(
133
+ type="SemanticKITTIDataset",
134
+ split="train",
135
+ data_root="data/semantic_kitti",
136
+ transform=[
137
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
138
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
139
+ dict(
140
+ type="RandomRotate",
141
+ angle=[-1, 1],
142
+ axis="z",
143
+ center=[0, 0, 0],
144
+ p=0.5,
145
+ ),
146
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5),
147
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5),
148
+ dict(
149
+ type="PointClip",
150
+ point_cloud_range=(-75.2, -75.2, -4, 75.2, 75.2, 2),
151
+ ),
152
+ dict(type="RandomScale", scale=[0.9, 1.1]),
153
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
154
+ dict(type="RandomFlip", p=0.5),
155
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
156
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
157
+ dict(
158
+ type="GridSample",
159
+ grid_size=0.05,
160
+ hash_type="fnv",
161
+ mode="train",
162
+ keys=("coord", "strength", "segment"),
163
+ return_grid_coord=True,
164
+ ),
165
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
166
+ # dict(type="CenterShift", apply_z=False),
167
+ dict(type="Add", keys_dict={"condition": "SemanticKITTI"}),
168
+ dict(type="ToTensor"),
169
+ dict(
170
+ type="Collect",
171
+ keys=("coord", "grid_coord", "segment", "condition"),
172
+ feat_keys=("coord", "strength"),
173
+ ),
174
+ ],
175
+ test_mode=False,
176
+ ignore_index=-1,
177
+ loop=1,
178
+ ),
179
+ # Waymo
180
+ dict(
181
+ type="WaymoDataset",
182
+ split="training",
183
+ data_root="data/waymo",
184
+ transform=[
185
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
186
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
187
+ dict(
188
+ type="RandomRotate",
189
+ angle=[-1, 1],
190
+ axis="z",
191
+ center=[0, 0, 0],
192
+ p=0.5,
193
+ ),
194
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5),
195
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5),
196
+ dict(
197
+ type="PointClip",
198
+ point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2),
199
+ ),
200
+ dict(type="RandomScale", scale=[0.9, 1.1]),
201
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
202
+ dict(type="RandomFlip", p=0.5),
203
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
204
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
205
+ dict(
206
+ type="GridSample",
207
+ grid_size=0.05,
208
+ hash_type="fnv",
209
+ mode="train",
210
+ keys=("coord", "strength", "segment"),
211
+ return_grid_coord=True,
212
+ ),
213
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
214
+ # dict(type="CenterShift", apply_z=False),
215
+ dict(type="Add", keys_dict={"condition": "Waymo"}),
216
+ dict(type="ToTensor"),
217
+ dict(
218
+ type="Collect",
219
+ keys=("coord", "grid_coord", "segment", "condition"),
220
+ feat_keys=("coord", "strength"),
221
+ ),
222
+ ],
223
+ test_mode=False,
224
+ ignore_index=-1,
225
+ loop=1,
226
+ ),
227
+ ],
228
+ ),
229
+ val=dict(
230
+ type="NuScenesDataset",
231
+ split="val",
232
+ data_root="data/nuscenes",
233
+ transform=[
234
+ dict(type="PointClip", point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2)),
235
+ dict(
236
+ type="GridSample",
237
+ grid_size=0.05,
238
+ hash_type="fnv",
239
+ mode="train",
240
+ keys=("coord", "strength", "segment"),
241
+ return_grid_coord=True,
242
+ ),
243
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
244
+ dict(type="ToTensor"),
245
+ dict(
246
+ type="Collect",
247
+ keys=("coord", "grid_coord", "segment", "condition"),
248
+ feat_keys=("coord", "strength"),
249
+ ),
250
+ ],
251
+ test_mode=False,
252
+ ignore_index=-1,
253
+ ),
254
+ test=dict(
255
+ type="NuScenesDataset",
256
+ split="val",
257
+ data_root="data/nuscenes",
258
+ transform=[
259
+ dict(type="Copy", keys_dict={"segment": "origin_segment"}),
260
+ dict(
261
+ type="GridSample",
262
+ grid_size=0.025,
263
+ hash_type="fnv",
264
+ mode="train",
265
+ keys=("coord", "strength", "segment"),
266
+ return_inverse=True,
267
+ ),
268
+ ],
269
+ test_mode=True,
270
+ test_cfg=dict(
271
+ voxelize=dict(
272
+ type="GridSample",
273
+ grid_size=0.05,
274
+ hash_type="fnv",
275
+ mode="test",
276
+ return_grid_coord=True,
277
+ keys=("coord", "strength"),
278
+ ),
279
+ crop=None,
280
+ post_transform=[
281
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
282
+ dict(type="ToTensor"),
283
+ dict(
284
+ type="Collect",
285
+ keys=("coord", "grid_coord", "index", "condition"),
286
+ feat_keys=("coord", "strength"),
287
+ ),
288
+ ],
289
+ aug_transform=[
290
+ [dict(type="RandomScale", scale=[0.9, 0.9])],
291
+ [dict(type="RandomScale", scale=[0.95, 0.95])],
292
+ [dict(type="RandomScale", scale=[1, 1])],
293
+ [dict(type="RandomScale", scale=[1.05, 1.05])],
294
+ [dict(type="RandomScale", scale=[1.1, 1.1])],
295
+ [
296
+ dict(type="RandomScale", scale=[0.9, 0.9]),
297
+ dict(type="RandomFlip", p=1),
298
+ ],
299
+ [
300
+ dict(type="RandomScale", scale=[0.95, 0.95]),
301
+ dict(type="RandomFlip", p=1),
302
+ ],
303
+ [dict(type="RandomScale", scale=[1, 1]), dict(type="RandomFlip", p=1)],
304
+ [
305
+ dict(type="RandomScale", scale=[1.05, 1.05]),
306
+ dict(type="RandomFlip", p=1),
307
+ ],
308
+ [
309
+ dict(type="RandomScale", scale=[1.1, 1.1]),
310
+ dict(type="RandomFlip", p=1),
311
+ ],
312
+ ],
313
+ ),
314
+ ignore_index=-1,
315
+ ),
316
+ )
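Every dataset branch in this config injects its name through dict(type="Add", keys_dict={"condition": ...}), and the SpUNet-v1m3 backbone is declared with matching conditions and norm_decouple=True. A rough sketch of that decoupled-normalization idea, assuming one norm layer per dataset selected by the sample's condition string (illustrative only, not the actual PPT implementation):

import torch.nn as nn

class ConditionalBN(nn.Module):
    # One BatchNorm per dataset; the incoming condition picks which one is applied.
    def __init__(self, channels, conditions=("SemanticKITTI", "nuScenes", "Waymo")):
        super().__init__()
        self.conditions = conditions
        self.norms = nn.ModuleList(nn.BatchNorm1d(channels) for _ in conditions)

    def forward(self, feat, condition):
        return self.norms[self.conditions.index(condition)](feat)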
submodules/PointTransformerV3/Pointcept/configs/nuscenes/semseg-ppt-v1m2-1-nu-sk-wa-spunet-submit.py ADDED
@@ -0,0 +1,292 @@
1
+ _base_ = ["../_base_/default_runtime.py"]
2
+
3
+ # misc custom setting
4
+ batch_size = 12 # bs: total bs in all gpus
5
+ num_worker = 24
6
+ mix_prob = 0.8
7
+ empty_cache = False
8
+ enable_amp = True
9
+ find_unused_parameters = True
10
+ evaluate = False
11
+
12
+ # trainer
13
+ train = dict(
14
+ type="MultiDatasetTrainer",
15
+ )
16
+
17
+ # model settings
18
+ model = dict(
19
+ type="PPT-v1m2",
20
+ backbone=dict(
21
+ type="SpUNet-v1m3",
22
+ in_channels=4,
23
+ num_classes=0,
24
+ base_channels=32,
25
+ context_channels=256,
26
+ channels=(32, 64, 128, 256, 256, 128, 96, 96),
27
+ layers=(2, 3, 4, 6, 2, 2, 2, 2),
28
+ cls_mode=False,
29
+ conditions=("SemanticKITTI", "nuScenes", "Waymo"),
30
+ zero_init=False,
31
+ norm_decouple=True,
32
+ norm_adaptive=False,
33
+ norm_affine=True,
34
+ ),
35
+ criteria=[
36
+ dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=-1),
37
+ dict(type="LovaszLoss", mode="multiclass", loss_weight=1.0, ignore_index=-1),
38
+ ],
39
+ backbone_out_channels=96,
40
+ context_channels=256,
41
+ conditions=("SemanticKITTI", "nuScenes", "Waymo"),
42
+ num_classes=(19, 16, 22),
43
+ )
44
+
45
+ # scheduler settings
46
+ epoch = 50
47
+ eval_epoch = 50
48
+ optimizer = dict(type="AdamW", lr=0.002, weight_decay=0.005)
49
+ scheduler = dict(
50
+ type="OneCycleLR",
51
+ max_lr=optimizer["lr"],
52
+ pct_start=0.04,
53
+ anneal_strategy="cos",
54
+ div_factor=10.0,
55
+ final_div_factor=100.0,
56
+ )
57
+ # param_dicts = [dict(keyword="modulation", lr=0.0002)]
58
+
59
+ # dataset settings
60
+ data = dict(
61
+ num_classes=16,
62
+ ignore_index=-1,
63
+ names=[
64
+ "barrier",
65
+ "bicycle",
66
+ "bus",
67
+ "car",
68
+ "construction_vehicle",
69
+ "motorcycle",
70
+ "pedestrian",
71
+ "traffic_cone",
72
+ "trailer",
73
+ "truck",
74
+ "driveable_surface",
75
+ "other_flat",
76
+ "sidewalk",
77
+ "terrain",
78
+ "manmade",
79
+ "vegetation",
80
+ ],
81
+ train=dict(
82
+ type="ConcatDataset",
83
+ datasets=[
84
+ # nuScenes
85
+ dict(
86
+ type="NuScenesDataset",
87
+ split=["train", "val"],
88
+ data_root="data/nuscenes",
89
+ transform=[
90
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
91
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis='z', p=0.75),
92
+ dict(
93
+ type="RandomRotate",
94
+ angle=[-1, 1],
95
+ axis="z",
96
+ center=[0, 0, 0],
97
+ p=0.5,
98
+ ),
99
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis='x', p=0.5),
100
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis='y', p=0.5),
101
+ dict(
102
+ type="PointClip",
103
+ point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2),
104
+ ),
105
+ dict(type="RandomScale", scale=[0.9, 1.1]),
106
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
107
+ dict(type="RandomFlip", p=0.5),
108
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
109
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
110
+ dict(
111
+ type="GridSample",
112
+ grid_size=0.05,
113
+ hash_type="fnv",
114
+ mode="train",
115
+ keys=("coord", "strength", "segment"),
116
+ return_grid_coord=True,
117
+ ),
118
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
119
+ # dict(type="CenterShift", apply_z=False),
120
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
121
+ dict(type="ToTensor"),
122
+ dict(
123
+ type="Collect",
124
+ keys=("coord", "grid_coord", "segment", "condition"),
125
+ feat_keys=("coord", "strength"),
126
+ ),
127
+ ],
128
+ test_mode=False,
129
+ ignore_index=-1,
130
+ loop=1,
131
+ ),
132
+ # SemanticKITTI
133
+ dict(
134
+ type="SemanticKITTIDataset",
135
+ split=["train", "val"],
136
+ data_root="data/semantic_kitti",
137
+ transform=[
138
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
139
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
140
+ dict(
141
+ type="RandomRotate",
142
+ angle=[-1, 1],
143
+ axis="z",
144
+ center=[0, 0, 0],
145
+ p=0.5,
146
+ ),
147
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5),
148
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5),
149
+ dict(
150
+ type="PointClip",
151
+ point_cloud_range=(-75.2, -75.2, -4, 75.2, 75.2, 2),
152
+ ),
153
+ dict(type="RandomScale", scale=[0.9, 1.1]),
154
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
155
+ dict(type="RandomFlip", p=0.5),
156
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
157
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
158
+ dict(
159
+ type="GridSample",
160
+ grid_size=0.05,
161
+ hash_type="fnv",
162
+ mode="train",
163
+ keys=("coord", "strength", "segment"),
164
+ return_grid_coord=True,
165
+ ),
166
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
167
+ # dict(type="CenterShift", apply_z=False),
168
+ dict(type="Add", keys_dict={"condition": "SemanticKITTI"}),
169
+ dict(type="ToTensor"),
170
+ dict(
171
+ type="Collect",
172
+ keys=("coord", "grid_coord", "segment", "condition"),
173
+ feat_keys=("coord", "strength"),
174
+ ),
175
+ ],
176
+ test_mode=False,
177
+ ignore_index=-1,
178
+ loop=1,
179
+ ),
180
+ # Waymo
181
+ dict(
182
+ type="WaymoDataset",
183
+ split=["training", "validation"],
184
+ data_root="data/waymo",
185
+ transform=[
186
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
187
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
188
+ dict(
189
+ type="RandomRotate",
190
+ angle=[-1, 1],
191
+ axis="z",
192
+ center=[0, 0, 0],
193
+ p=0.5,
194
+ ),
195
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5),
196
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5),
197
+ dict(
198
+ type="PointClip",
199
+ point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2),
200
+ ),
201
+ dict(type="RandomScale", scale=[0.9, 1.1]),
202
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
203
+ dict(type="RandomFlip", p=0.5),
204
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
205
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
206
+ dict(
207
+ type="GridSample",
208
+ grid_size=0.05,
209
+ hash_type="fnv",
210
+ mode="train",
211
+ keys=("coord", "strength", "segment"),
212
+ return_grid_coord=True,
213
+ ),
214
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
215
+ # dict(type="CenterShift", apply_z=False),
216
+ dict(type="Add", keys_dict={"condition": "Waymo"}),
217
+ dict(type="ToTensor"),
218
+ dict(
219
+ type="Collect",
220
+ keys=("coord", "grid_coord", "segment", "condition"),
221
+ feat_keys=("coord", "strength"),
222
+ ),
223
+ ],
224
+ test_mode=False,
225
+ ignore_index=-1,
226
+ loop=1,
227
+ ),
228
+ ],
229
+ ),
230
+ test=dict(
231
+ type="NuScenesDataset",
232
+ split="test",
233
+ data_root="data/nuscenes",
234
+ transform=[
235
+ dict(type="Copy", keys_dict={"segment": "origin_segment"}),
236
+ dict(
237
+ type="GridSample",
238
+ grid_size=0.025,
239
+ hash_type="fnv",
240
+ mode="train",
241
+ keys=("coord", "strength", "segment"),
242
+ return_inverse=True,
243
+ ),
244
+ ],
245
+ test_mode=True,
246
+ test_cfg=dict(
247
+ voxelize=dict(
248
+ type="GridSample",
249
+ grid_size=0.05,
250
+ hash_type="fnv",
251
+ mode="test",
252
+ return_grid_coord=True,
253
+ keys=("coord", "strength"),
254
+ ),
255
+ crop=None,
256
+ post_transform=[
257
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
258
+ dict(type="ToTensor"),
259
+ dict(
260
+ type="Collect",
261
+ keys=("coord", "grid_coord", "index", "condition"),
262
+ feat_keys=("coord", "strength"),
263
+ ),
264
+ ],
265
+ aug_transform=[
266
+ [dict(type="RandomScale", scale=[0.9, 0.9])],
267
+ [dict(type="RandomScale", scale=[0.95, 0.95])],
268
+ [dict(type="RandomScale", scale=[1, 1])],
269
+ [dict(type="RandomScale", scale=[1.05, 1.05])],
270
+ [dict(type="RandomScale", scale=[1.1, 1.1])],
271
+ [
272
+ dict(type="RandomScale", scale=[0.9, 0.9]),
273
+ dict(type="RandomFlip", p=1),
274
+ ],
275
+ [
276
+ dict(type="RandomScale", scale=[0.95, 0.95]),
277
+ dict(type="RandomFlip", p=1),
278
+ ],
279
+ [dict(type="RandomScale", scale=[1, 1]), dict(type="RandomFlip", p=1)],
280
+ [
281
+ dict(type="RandomScale", scale=[1.05, 1.05]),
282
+ dict(type="RandomFlip", p=1),
283
+ ],
284
+ [
285
+ dict(type="RandomScale", scale=[1.1, 1.1]),
286
+ dict(type="RandomFlip", p=1),
287
+ ],
288
+ ],
289
+ ),
290
+ ignore_index=-1,
291
+ ),
292
+ )
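All three configs pair AdamW (lr=0.002, weight_decay=0.005) with OneCycleLR and reuse optimizer["lr"] as max_lr, so the warmup starts at max_lr / div_factor = 2e-4 and decays toward 2e-4 / final_div_factor = 2e-6. A plain-PyTorch sketch of the same schedule (Pointcept builds it from these dicts internally; `model` and `steps` below are placeholders):

import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR

model = torch.nn.Linear(4, 16)   # stand-in for the real backbone
steps = 50 * 1000                # epoch * iterations per epoch (placeholder)

optimizer = AdamW(model.parameters(), lr=0.002, weight_decay=0.005)
scheduler = OneCycleLR(
    optimizer,
    max_lr=0.002,            # mirrors max_lr=optimizer["lr"]
    total_steps=steps,
    pct_start=0.04,
    anneal_strategy="cos",
    div_factor=10.0,         # initial lr = 0.002 / 10 = 2e-4
    final_div_factor=100.0,  # final lr   = 2e-4 / 100 = 2e-6
)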