kairunwen committed on
Commit 57746f1 · 1 Parent(s): 3c9ccf0

Update Code

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +3 -0
  2. .gitignore +157 -0
  3. app.py +149 -9
  4. assets/examples/bicycle/_DSC8679.JPG +0 -0
  5. assets/examples/bicycle/_DSC8689.JPG +0 -0
  6. assets/examples/bonsai/DSCF5565.JPG +0 -0
  7. assets/examples/bonsai/DSCF5575.JPG +0 -0
  8. assets/examples/garden/DSC07956.JPG +0 -0
  9. assets/examples/garden/DSC07960.JPG +0 -0
  10. assets/examples/kitchen/0.jpg +0 -0
  11. assets/examples/kitchen/64.jpg +0 -0
  12. assets/examples/sofa/000000.jpg +0 -0
  13. assets/examples/sofa/000008.jpg +0 -0
  14. configs/model_config.yaml +20 -0
  15. requirements.txt +41 -0
  16. scannetv2-labels.combined.tsv +608 -0
  17. src/datasets/megadepth.py +125 -0
  18. src/datasets/scannet.py +109 -0
  19. src/datasets/scannetpp.py +107 -0
  20. src/datasets_preprocess/scannet_preprocess.py +209 -0
  21. src/datasets_preprocess/scannetpp_preprocess.py +227 -0
  22. src/gaussian_head.py +142 -0
  23. src/infer.py +23 -0
  24. src/losses.py +193 -0
  25. src/lseg.py +171 -0
  26. src/model.py +176 -0
  27. src/ptv3.py +13 -0
  28. src/train.py +73 -0
  29. src/utils/camera_utils.py +60 -0
  30. src/utils/cuda_splatting.py +216 -0
  31. src/utils/gaussian_model.py +160 -0
  32. src/utils/graphics_utils.py +77 -0
  33. src/utils/points_process.py +37 -0
  34. src/utils/sh_utils.py +117 -0
  35. src/utils/visualization_utils.py +355 -0
  36. submodules/PointTransformerV3/.gitmodules +3 -0
  37. submodules/PointTransformerV3/LICENSE +21 -0
  38. submodules/PointTransformerV3/Pointcept/.github/workflows/formatter.yml +20 -0
  39. submodules/PointTransformerV3/Pointcept/.gitignore +16 -0
  40. submodules/PointTransformerV3/Pointcept/LICENSE +21 -0
  41. submodules/PointTransformerV3/Pointcept/README.md +896 -0
  42. submodules/PointTransformerV3/Pointcept/configs/_base_/dataset/scannetpp.py +104 -0
  43. submodules/PointTransformerV3/Pointcept/configs/_base_/default_runtime.py +39 -0
  44. submodules/PointTransformerV3/Pointcept/configs/matterport3d/semseg-pt-v3m1-0-base.py +313 -0
  45. submodules/PointTransformerV3/Pointcept/configs/matterport3d/semseg-spunet-v1m1-0-base.py +282 -0
  46. submodules/PointTransformerV3/Pointcept/configs/modelnet40/cls-ptv3-v1m1-0-base.py +232 -0
  47. submodules/PointTransformerV3/Pointcept/configs/modelnet40/cls-spunet-v1m1-0-base.py +176 -0
  48. submodules/PointTransformerV3/Pointcept/configs/nuscenes/semseg-ppt-v1m1-0-nu-sk-wa-spunet.py +342 -0
  49. submodules/PointTransformerV3/Pointcept/configs/nuscenes/semseg-ppt-v1m2-0-nu-sk-wa-spunet.py +316 -0
  50. submodules/PointTransformerV3/Pointcept/configs/nuscenes/semseg-ppt-v1m2-1-nu-sk-wa-spunet-submit.py +292 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ wheel/*.whl filter=lfs diff=lfs merge=lfs -text
+ *.whl filter=lfs diff=lfs merge=lfs -text
+ checkpoints/* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,157 @@
+ # *.pth
+ *.pt
+ submodules/diff-gaussian-rasterization
+ submodules/simple-knn
+ # checkpoints/
+ output*
+ .gradio/
+
+ core.*
+ logs/*
+ /data/
+ # checkpoints/
+ video*
+ train_images*
+ test_images_save*
+ /pl_main
+ /to_be_test
+ /test_lsm
+ /test_img
+ /figure3
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+ video/
+ scannet_processed_scenes_1.tar.gz
+ test_results/*
+ output/*
+ test_images
+ colmap_scannet
+ /test_lseg
app.py CHANGED
@@ -1,12 +1,152 @@
- import os
- import shlex
  import gradio as gr
- import subprocess
- from huggingface_hub import HfApi

- hf_token = os.getenv("LSM_token")

- api = HfApi()
- api.snapshot_download(repo_id="kairunwen/LSM_private_mast3r", repo_type="space", local_dir=".", token=hf_token)
- subprocess.run(shlex.split("pip install -r requirements.txt"))
- subprocess.run(shlex.split("python app.py"))
+ import os, subprocess, shlex, sys, gc
+ import time
+ import torch
+ import numpy as np
+ import shutil
+ import argparse
  import gradio as gr
+ import uuid
+ import spaces
+ #

+ subprocess.run(shlex.split("pip install wheel/torch_scatter-2.1.2+pt21cu121-cp310-cp310-linux_x86_64.whl"))
+ subprocess.run(shlex.split("pip install wheel/flash_attn-2.6.3+cu123torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"))
+ subprocess.run(shlex.split("pip install wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl"))
+ subprocess.run(shlex.split("pip install wheel/simple_knn-0.0.0-cp310-cp310-linux_x86_64.whl"))
+ subprocess.run(shlex.split("pip install wheel/curope-0.0.0-cp310-cp310-linux_x86_64.whl"))
+ subprocess.run(shlex.split("pip install wheel/pointops-1.0-cp310-cp310-linux_x86_64.whl"))

+ from src.utils.visualization_utils import render_video_from_file
+ from src.model import LSM_MASt3R
+
+ model = LSM_MASt3R.from_pretrained("checkpoints/pretrained_model/checkpoint-40.pth")
+ model = model.eval()
+
+
+ @spaces.GPU(duration=80)
+ def process(inputfiles, input_path=None):
+     # Create a unique cache directory
+     cache_dir = os.path.join('outputs', str(uuid.uuid4()))
+     os.makedirs(cache_dir, exist_ok=True)
+
+     if input_path is not None:
+         imgs_path = './assets/examples/' + input_path
+         imgs_names = sorted(os.listdir(imgs_path))
+
+         inputfiles = []
+         for imgs_name in imgs_names:
+             file_path = os.path.join(imgs_path, imgs_name)
+             print(file_path)
+             inputfiles.append(file_path)
+         print(inputfiles)
+
+     filelist = inputfiles
+     if len(filelist) != 2:
+         gr.Warning("Please select 2 images")
+         shutil.rmtree(cache_dir)  # Clean up the cache directory
+         return None, None, None, None, None, None
+
+     ply_path = os.path.join(cache_dir, 'gaussians.ply')
+     # render_video_from_file(filelist, model, output_path=cache_dir, resolution=224)
+     render_video_from_file(filelist, model, output_path=cache_dir, resolution=512)
+
+     rgb_video_path = os.path.join(cache_dir, 'moved', 'output_images_video.mp4')
+     depth_video_path = os.path.join(cache_dir, 'moved', 'output_depth_video.mp4')
+     feature_video_path = os.path.join(cache_dir, 'moved', 'output_fmap_video.mp4')
+
+     return filelist, rgb_video_path, depth_video_path, feature_video_path, ply_path, ply_path
+
+
+ _TITLE = 'LargeSpatialModel'
+ _DESCRIPTION = '''
+ <div style="display: flex; justify-content: center; align-items: center;">
+     <div style="width: 100%; text-align: center; font-size: 30px;">
+         <strong>Large Spatial Model: End-to-end Unposed Images to Semantic 3D</strong>
+     </div>
+ </div>
+ <p></p>
+
+ <div align="center">
+ <a style="display:inline-block" href="https://arxiv.org/abs/2410.18956"><img src="https://img.shields.io/badge/ArXiv-2410.18956-b31b1b?logo=arxiv" alt='arxiv'></a>&nbsp;
+ <a style="display:inline-block" href="https://largespatialmodel.github.io/"><img src='https://img.shields.io/badge/Project_Page-ff7512?logo=lightning'></a>&nbsp;
+ <a title="Social" href="https://x.com/WayneINR" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+     <img src="https://www.obukhov.ai/img/badges/badge-social.svg" alt="social">
+ </a>
+
+ </div>
+ <p></p>
+
+ * Official demo of: [LargeSpatialModel: End-to-end Unposed Images to Semantic 3D](https://largespatialmodel.github.io/).
+ * Examples for direct viewing: you can simply click the examples (at the bottom of the page) to quickly view the results on representative data.
+ '''
+
+ block = gr.Blocks().queue()
+ with block:
+     gr.Markdown(_DESCRIPTION)
+
+     with gr.Column(variant="panel"):
+         with gr.Tab("Input"):
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     inputfiles = gr.File(file_count="multiple", label="Load Images")
+                     input_path = gr.Textbox(visible=False, label="example_path")
+                 with gr.Column(scale=1):
+                     image_gallery = gr.Gallery(
+                         label="Gallery",
+                         show_label=False,
+                         elem_id="gallery",
+                         columns=[2],
+                         height=300,  # Fixed height
+                         object_fit="cover"  # Make the images fill the available space
+                     )
+
+             button_gen = gr.Button("Start Reconstruction", elem_id="button_gen")
+             processing_msg = gr.Markdown("Processing...", visible=False, elem_id="processing_msg")
+
+
+     with gr.Column(variant="panel"):
+         with gr.Tab("Output"):
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     rgb_video = gr.Video(label="RGB Video", autoplay=True)
+                 with gr.Column(scale=1):
+                     feature_video = gr.Video(label="Feature Video", autoplay=True)
+                 with gr.Column(scale=1):
+                     depth_video = gr.Video(label="Depth Video", autoplay=True)
+             with gr.Row():
+                 with gr.Group():
+                     output_model = gr.Model3D(
+                         label="3D Dense Model under Gaussian Splats Formats, need more time to visualize",
+                         interactive=False,
+                         camera_position=[0.5, 0.5, 1],  # Offset slightly for a better view of the model
+                         height=600,
+                     )
+                     gr.Markdown(
+                         """
+                         <div class="model-description">
+                             &nbsp;&nbsp;Use the left mouse button to rotate, the scroll wheel to zoom, and the right mouse button to move.
+                         </div>
+                         """
+                     )
+             with gr.Row():
+                 output_file = gr.File(label="PLY File")
+
+     examples = gr.Examples(
+         examples=[
+             "sofa",
+         ],
+         inputs=[input_path],
+         outputs=[image_gallery, rgb_video, depth_video, feature_video, output_model, output_file],
+         fn=lambda x: process(inputfiles=None, input_path=x),
+         cache_examples=True,
+         label="Examples"
+     )
+
+
+     button_gen.click(
+         process,
+         inputs=[inputfiles],
+         outputs=[image_gallery, rgb_video, depth_video, feature_video, output_model, output_file],
+     )
+
+ block.launch(server_name="0.0.0.0", share=False)
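
Note: the new app.py path from two input views to the rendered videos and the .ply can also be exercised headlessly. The following is only a sketch, assuming the prebuilt wheels above are installed, the checkpoint exists at the hard-coded path, and render_video_from_file writes gaussians.ply plus the moved/*.mp4 files that process() later returns; the output directory name is hypothetical.

```python
# Headless sketch of the inference path wired up in app.py (assumptions noted above).
import os

from src.model import LSM_MASt3R
from src.utils.visualization_utils import render_video_from_file

# Same checkpoint and eval mode as the Space uses
model = LSM_MASt3R.from_pretrained("checkpoints/pretrained_model/checkpoint-40.pth").eval()

# process() insists on exactly two views
filelist = [
    "assets/examples/sofa/000000.jpg",
    "assets/examples/sofa/000008.jpg",
]

out_dir = "outputs/headless_demo"  # hypothetical output directory
os.makedirs(out_dir, exist_ok=True)
render_video_from_file(filelist, model, output_path=out_dir, resolution=512)

# Artifacts app.py hands back to the Gradio UI
print(os.path.join(out_dir, "gaussians.ply"))
print(os.path.join(out_dir, "moved", "output_images_video.mp4"))
print(os.path.join(out_dir, "moved", "output_depth_video.mp4"))
print(os.path.join(out_dir, "moved", "output_fmap_video.mp4"))
```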
assets/examples/bicycle/_DSC8679.JPG ADDED
assets/examples/bicycle/_DSC8689.JPG ADDED
assets/examples/bonsai/DSCF5565.JPG ADDED
assets/examples/bonsai/DSCF5575.JPG ADDED
assets/examples/garden/DSC07956.JPG ADDED
assets/examples/garden/DSC07960.JPG ADDED
assets/examples/kitchen/0.jpg ADDED
assets/examples/kitchen/64.jpg ADDED
assets/examples/sofa/000000.jpg ADDED
assets/examples/sofa/000008.jpg ADDED
configs/model_config.yaml ADDED
@@ -0,0 +1,20 @@
+ mast3r_config:
+   pretrained_model_name_or_path: "checkpoints/pretrained_model/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth"
+
+ point_transformer_config:
+   enc_depths: [1, 1, 1, 3, 1]
+   enc_channels: [32, 64, 128, 256, 512]
+   enc_num_head: [2, 4, 8, 16, 32]
+   enc_patch_size: [1024, 1024, 1024, 1024, 1024]
+   dec_depths: [1, 1, 1, 1]
+   dec_channels: [64, 64, 128, 256]
+   dec_num_head: [4, 4, 8, 16]
+   dec_patch_size: [1024, 1024, 1024, 1024]
+
+ gaussian_head_config:
+   rgb_residual: true
+   d_gs_feats: 32
+
+ lseg_config:
+   pretrained_model_name_or_path: "checkpoints/pretrained_model/lang_seg.ckpt"
+   half_res: true
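
Note: this file is plain YAML; a minimal loading sketch (assuming PyYAML, which requirements.txt lists as pyyaml; the actual consumer lives in src/model.py and is not shown in this view):

```python
import yaml

# Load the model configuration added above
with open("configs/model_config.yaml") as f:
    cfg = yaml.safe_load(f)

# The four top-level groups defined in the file
print(sorted(cfg))  # ['gaussian_head_config', 'lseg_config', 'mast3r_config', 'point_transformer_config']
print(cfg["point_transformer_config"]["enc_depths"])  # [1, 1, 1, 3, 1]
print(cfg["gaussian_head_config"]["d_gs_feats"])       # 32
```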
requirements.txt ADDED
@@ -0,0 +1,41 @@
+ torch==2.1.2
+ torchvision==0.16.2
+ pytorch-lightning==2.1.2
+ open3d
+ roma
+ gradio
+ matplotlib
+ tqdm
+ opencv-python
+ scipy
+ einops
+ trimesh
+ tensorboard
+ pyglet<2
+ numpy<2.0
+ huggingface-hub[torch]>=0.22
+ ninja
+ scikit-learn
+
+
+ arrow
+ pandas
+ torch-tb-profiler
+ jaxtyping
+ ninja
+ h5py
+ pyyaml
+ moviepy==1.0.3
+ jupyter
+ lpips
+ torch-geometric
+ spconv-cu120
+ git+https://github.com/openai/CLIP.git
+ sharedarray
+ tensorboardx
+ yapf
+ addict
+ plyfile
+ termcolor
+ timm
+
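
Note: the compiled extensions (torch_scatter, flash-attn, diff-gaussian-rasterization, simple-knn, curope, pointops) are not listed here; app.py installs them at startup from the prebuilt cp310/cu121 wheels under wheel/. A small, hedged sanity check that the environment matches those wheel tags (assumptions taken from the filenames, not from any project script):

```python
# Sketch: verify the interpreter and torch build match the wheel filenames app.py installs
# (CPython 3.10, torch 2.1.x, CUDA 12.1 -- assumptions from the wheel tags above).
import sys
import torch

assert sys.version_info[:2] == (3, 10), sys.version
assert torch.__version__.startswith("2.1"), torch.__version__
print(torch.version.cuda)  # expect "12.1" to match the +pt21cu121 torch_scatter wheel
```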
scannetv2-labels.combined.tsv ADDED
@@ -0,0 +1,608 @@
1
+ id raw_category category count nyu40id eigen13id nyuClass nyu40class eigen13class ModelNet40 ModelNet10 ShapeNetCore55 synsetoffset wnsynsetid wnsynsetkey mpcat40 mpcat40index
2
+ 1 wall wall 8277 1 12 wall wall Wall n04546855 wall.n.01 wall 1
3
+ 2 chair chair 4646 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3
4
+ 22 books book 1678 23 2 book books Books n02870526 book.n.11 objects 39
5
+ 3 floor floor 1553 2 5 floor floor Floor n03365592 floor.n.01 floor 2
6
+ 5 door door 1483 8 12 door door Wall door n03221720 door.n.01 door 4
7
+ 1163 object object 1313 40 7 otherprop Objects objects 39
8
+ 16 window window 1209 9 13 window window Window n04587648 window.n.01 window 9
9
+ 4 table table 1170 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5
10
+ 56 trash can trash can 1090 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39
11
+ 13 pillow pillow 937 18 7 pillow pillow Objects pillow 3938244 n03938244 pillow.n.01 cushion 8
12
+ 15 picture picture 862 11 8 picture picture Picture n03931044 picture.n.01 picture 6
13
+ 41 ceiling ceiling 806 22 3 ceiling ceiling Ceiling n02990373 ceiling.n.01 ceiling 17
14
+ 26 box box 775 29 7 box box Objects n02883344 box.n.01 objects 39
15
+ 161 doorframe doorframe 768 8 12 door door Wall door doorframe.n.01 door 4
16
+ 19 monitor monitor 765 40 7 monitor otherprop Objects monitor monitor tv or monitor 3211117 n03782190 monitor.n.04 objects 39
17
+ 7 cabinet cabinet 731 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7
18
+ 9 desk desk 680 14 10 desk desk Table desk desk table 4379243 n03179701 desk.n.01 table 5
19
+ 8 shelf shelf 641 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31
20
+ 10 office chair office chair 595 5 4 chair chair Chair chair chair chair 3001627 n04373704 swivel_chair.n.01 chair 3
21
+ 31 towel towel 570 27 7 towel towel Objects n04459362 towel.n.01 towel 20
22
+ 6 couch couch 502 6 9 sofa sofa Sofa sofa sofa sofa 4256520 n04256520 sofa.n.01 sofa 10
23
+ 14 sink sink 488 34 7 sink sink Objects sink n04223580 sink.n.01 sink 15
24
+ 48 backpack backpack 479 40 7 backpack otherprop Objects n02769748 backpack.n.01 objects 39
25
+ 28 lamp lamp 419 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28
26
+ 11 bed bed 370 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11
27
+ 18 bookshelf bookshelf 360 10 6 bookshelf bookshelf Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31
28
+ 71 mirror mirror 349 19 7 mirror mirror Objects n03773035 mirror.n.01 mirror 21
29
+ 21 curtain curtain 347 16 13 curtain curtain Window curtain n03151077 curtain.n.01 curtain 12
30
+ 40 plant plant 331 40 7 plant otherprop Objects plant n00017222 plant.n.02 plant 14
31
+ 52 whiteboard whiteboard 327 30 7 whiteboard whiteboard Objects n03211616 display_panel.n.01 board_panel 35
32
+ 96 radiator radiator 322 39 6 radiator otherfurniture Furniture n04041069 radiator.n.02 misc 40
33
+ 22 book book 318 23 2 book books Books n02870526 book.n.11 objects 39
34
+ 29 kitchen cabinet kitchen cabinet 310 3 6 cabinet cabinet Furniture n02933112 cabinet.n.01 cabinet 7
35
+ 49 toilet paper toilet paper 291 40 7 toilet paper otherprop Objects n15075141 toilet_tissue.n.01 objects 39
36
+ 29 kitchen cabinets kitchen cabinet 289 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7
37
+ 23 armchair armchair 281 5 4 chair chair Chair chair chair chair 3001627 n02738535 armchair.n.01 chair 3
38
+ 63 shoes shoe 272 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38
39
+ 24 coffee table coffee table 258 7 10 coffee table table Table table table table 4379243 n03063968 coffee_table.n.01 table 5
40
+ 17 toilet toilet 256 33 7 toilet toilet Objects toilet toilet n04446276 toilet.n.01 toilet 18
41
+ 47 bag bag 252 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39
42
+ 32 clothes clothes 248 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38
43
+ 46 keyboard keyboard 246 40 7 keyboard otherprop Objects keyboard computer keyboard 3085013 n03085013 computer_keyboard.n.01 objects 39
44
+ 65 bottle bottle 226 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
45
+ 97 recycling bin recycling bin 225 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39
46
+ 34 nightstand nightstand 224 32 6 night stand night stand Furniture night_stand night_stand n03015254 chest_of_drawers.n.01 chest_of_drawers 13
47
+ 38 stool stool 221 40 7 stool otherprop Objects stool n04326896 stool.n.01 stool 19
48
+ 33 tv tv 219 25 11 television television TV tv or monitor 3211117 n03211117 display.n.06 tv_monitor 22
49
+ 75 file cabinet file cabinet 217 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7
50
+ 36 dresser dresser 213 17 6 dresser dresser Furniture dresser dresser n03015254 chest_of_drawers.n.01 chest_of_drawers 13
51
+ 64 computer tower computer tower 203 40 7 computer otherprop Objects n03082979 computer.n.01 objects 39
52
+ 32 clothing clothes 165 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38
53
+ 101 telephone telephone 164 40 7 telephone otherprop Objects telephone 4401088 n04401088 telephone.n.01 objects 39
54
+ 130 cup cup 157 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39
55
+ 27 refrigerator refrigerator 154 24 6 refridgerator refridgerator Furniture n04070727 refrigerator.n.01 appliances 37
56
+ 44 end table end table 147 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5
57
+ 131 jacket jacket 146 40 7 jacket otherprop Objects n03589791 jacket.n.01 clothes 38
58
+ 55 shower curtain shower curtain 144 28 7 shower curtain shower curtain Objects curtain n04209239 shower_curtain.n.01 curtain 12
59
+ 42 bathtub bathtub 144 36 7 bathtub bathtub Objects bathtub bathtub tub 2808440 n02808440 bathtub.n.01 bathtub 25
60
+ 59 microwave microwave 141 40 7 microwave otherprop Objects microwave 3761084 n03761084 microwave.n.02 appliances 37
61
+ 159 kitchen counter kitchen counter 140 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26
62
+ 74 sofa chair sofa chair 129 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3
63
+ 82 paper towel dispenser paper towel dispenser 129 40 7 paper towel dispenser otherprop Objects objects 39
64
+ 1164 bathroom vanity bathroom vanity 126 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 table 5
65
+ 93 suitcase suitcase 118 40 7 luggage otherprop Objects n02773838 bag.n.06 objects 39
66
+ 77 laptop laptop 111 40 7 laptop otherprop Objects laptop laptop 3642806 n03642806 laptop.n.01 objects 39
67
+ 67 ottoman ottoman 111 39 6 ottoman otherfurniture Furniture stool n03380724 footstool.n.01 stool 19
68
+ 128 shower walls shower wall 109 1 12 wall wall Wall n04546855 wall.n.01 wall 1
69
+ 50 printer printer 106 40 7 printer otherprop Objects printer 4004475 n04004475 printer.n.03 appliances 37
70
+ 35 counter counter 104 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26
71
+ 69 board board 100 38 7 board otherstructure Objects board_panel 35
72
+ 100 soap dispenser soap dispenser 99 40 7 otherprop Objects n04254120 soap_dispenser.n.01 objects 39
73
+ 62 stove stove 95 38 7 stove otherstructure Objects stove 4330267 n04330267 stove.n.02 appliances 37
74
+ 105 light light 93 38 7 light otherstructure Objects n03665366 light.n.02 lighting 28
75
+ 1165 closet wall closet wall 90 1 12 wall wall Wall n04546855 wall.n.01 wall 1
76
+ 165 mini fridge mini fridge 87 24 6 refridgerator refridgerator Furniture n03273913 electric_refrigerator.n.01 appliances 37
77
+ 7 cabinets cabinet 79 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7
78
+ 5 doors door 76 8 12 door door Wall door n03221720 door.n.01 door 4
79
+ 76 fan fan 75 40 7 fan otherprop Objects n03320046 fan.n.01 misc 40
80
+ 230 tissue box tissue box 73 40 7 tissue box otherprop Objects n02883344 box.n.01 objects 39
81
+ 54 blanket blanket 72 40 7 blanket otherprop Objects n02849154 blanket.n.01 objects 39
82
+ 125 bathroom stall bathroom stall 71 38 7 otherstructure Objects n02873839 booth.n.02 misc 40
83
+ 72 copier copier 70 40 7 otherprop Objects n03257586 duplicator.n.01 appliances 37
84
+ 68 bench bench 66 39 6 bench otherfurniture Furniture bench bench 2828884 n02828884 bench.n.01 seating 34
85
+ 145 bar bar 66 38 7 bar otherstructure Objects n02788689 bar.n.03 misc 40
86
+ 157 soap dish soap dish 65 40 7 soap dish otherprop Objects n04254009 soap_dish.n.01 objects 39
87
+ 1166 laundry hamper laundry hamper 65 40 7 laundry basket otherprop Objects objects 39
88
+ 132 storage bin storage bin 63 40 7 storage bin otherprop Objects objects 39
89
+ 1167 bathroom stall door bathroom stall door 62 8 12 door door Wall door n03221720 door.n.01 door 4
90
+ 232 light switch light switch 61 38 7 light switch otherstructure Objects n04372370 switch.n.01 misc 40
91
+ 134 coffee maker coffee maker 61 40 7 otherprop Objects n03063338 coffee_maker.n.01 appliances 37
92
+ 51 tv stand tv stand 61 39 6 tv stand otherfurniture Furniture tv_stand n03290653 entertainment_center.n.01 furniture 36
93
+ 250 decoration decoration 60 40 7 otherprop Objects n03169390 decoration.n.01 misc 40
94
+ 1168 ceiling light ceiling light 59 38 7 light otherstructure Objects n03665366 light.n.02 lighting 28
95
+ 342 range hood range hood 59 38 7 range hood otherstructure Objects range_hood n04053677 range_hood.n.01 misc 40
96
+ 89 blackboard blackboard 58 38 7 blackboard otherstructure Objects n02846511 blackboard.n.01 board_panel 35
97
+ 103 clock clock 58 40 7 clock otherprop Objects clock 3046257 n03046257 clock.n.01 objects 39
98
+ 99 wardrobe closet wardrobe 54 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36
99
+ 95 rail rail 53 38 7 railing otherstructure Objects n04047401 railing.n.01 railing 30
100
+ 154 bulletin board bulletin board 53 38 7 board otherstructure Objects n03211616 display_panel.n.01 board_panel 35
101
+ 140 mat mat 52 20 5 floor mat floor mat Floor n03727837 mat.n.01 floor 2
102
+ 1169 trash bin trash bin 52 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39
103
+ 193 ledge ledge 51 38 7 otherstructure Objects n09337253 ledge.n.01 misc 40
104
+ 116 seat seat 49 39 6 furniture otherfurniture Furniture n04161981 seat.n.03 furniture 36
105
+ 202 mouse mouse 49 40 7 mouse otherprop Objects n03793489 mouse.n.04 objects 39
106
+ 73 basket basket 48 40 7 basket otherprop Objects basket 2801938 n02801938 basket.n.01 objects 39
107
+ 78 shower shower 48 38 7 otherstructure Objects n04208936 shower.n.01 shower 23
108
+ 1170 dumbbell dumbbell 48 40 7 otherprop Objects n03255030 dumbbell.n.01 objects 39
109
+ 79 paper paper 46 26 7 paper paper Objects n14974264 paper.n.01 objects 39
110
+ 80 person person 46 31 7 person person Objects person n05217688 person.n.02 misc 40
111
+ 141 windowsill windowsill 45 38 7 otherstructure Objects n04590263 windowsill.n.01 window 9
112
+ 57 closet closet 45 39 6 wardrobe otherfurniture Furniture wardrobe misc 40
113
+ 102 bucket bucket 45 40 7 bucket otherprop Objects n02909870 bucket.n.01 misc 40
114
+ 261 sign sign 44 40 7 sign otherprop Objects n04217882 signboard.n.01 objects 39
115
+ 118 speaker speaker 43 40 7 speaker otherprop Objects speaker 3691459 n03691459 loudspeaker.n.01 objects 39
116
+ 136 dishwasher dishwasher 43 38 7 dishwasher otherstructure Objects dishwasher 3207941 n03207941 dishwasher.n.01 appliances 37
117
+ 98 container container 43 40 7 container otherprop Objects n03094503 container.n.01 objects 39
118
+ 1171 stair rail stair rail 42 38 7 banister otherstructure Objects n02788148 bannister.n.02 railing 30
119
+ 170 shower curtain rod shower curtain rod 42 40 7 otherprop Objects curtain 12
120
+ 1172 tube tube 41 40 7 otherprop Objects misc 40
121
+ 1173 bathroom cabinet bathroom cabinet 39 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7
122
+ 79 papers paper 39 26 7 paper paper Objects n14974264 paper.n.01 objects 39
123
+ 221 storage container storage container 39 40 7 container otherprop Objects objects 39
124
+ 570 paper bag paper bag 39 37 7 bag bag Objects n04122825 sack.n.01 objects 39
125
+ 138 paper towel roll paper towel roll 39 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20
126
+ 168 ball ball 39 40 7 ball otherprop Objects objects 39
127
+ 276 closet doors closet door 38 8 12 door door Wall door n03221720 door.n.01 door 4
128
+ 106 laundry basket laundry basket 37 40 7 laundry basket otherprop Objects basket 2801938 n03050864 clothes_hamper.n.01 objects 39
129
+ 214 cart cart 37 40 7 cart otherprop Objects n03484083 handcart.n.01 shelving 31
130
+ 276 closet door closet door 35 8 12 door door Wall door n03221720 door.n.01 door 4
131
+ 323 dish rack dish rack 35 40 7 dish rack otherprop Objects n03207630 dish_rack.n.01 objects 39
132
+ 58 stairs stairs 35 38 7 stairs otherstructure Objects n04298308 stairway.n.01 stairs 16
133
+ 86 blinds blinds 35 13 13 blinds blinds Window n02851099 blind.n.03 blinds 32
134
+ 2 stack of chairs chair 35 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3
135
+ 399 purse purse 34 40 7 purse otherprop Objects n02774152 bag.n.04 objects 39
136
+ 121 bicycle bicycle 33 40 7 bicycle otherprop Objects bicycle 2834778 n02834778 bicycle.n.01 objects 39
137
+ 185 tray tray 32 40 7 tray otherprop Objects n04476259 tray.n.01 objects 39
138
+ 300 plunger plunger 30 40 7 otherprop Objects n03970156 plunger.n.03 objects 39
139
+ 180 paper cutter paper cutter 30 40 7 paper cutter otherprop Objects n03886940 paper_cutter.n.01 objects 39
140
+ 163 toilet paper dispenser toilet paper dispenser 29 40 7 otherprop Objects objects 39
141
+ 26 boxes box 29 29 7 box box Objects n02883344 box.n.01 objects 39
142
+ 66 bin bin 28 40 7 bin otherprop Objects n02839910 bin.n.01 objects 39
143
+ 208 toilet seat cover dispenser toilet seat cover dispenser 28 40 7 otherprop Objects objects 39
144
+ 112 guitar guitar 28 40 7 guitar otherprop Objects guitar guitar 3467517 n03467517 guitar.n.01 objects 39
145
+ 540 mailboxes mailbox 28 29 7 box box Objects mailbox 3710193 n03710193 mailbox.n.01 misc 40
146
+ 395 handicap bar handicap bar 27 38 7 bar otherstructure Objects misc 40
147
+ 166 fire extinguisher fire extinguisher 27 40 7 fire extinguisher otherprop Objects n03345837 fire_extinguisher.n.01 misc 40
148
+ 122 ladder ladder 27 39 6 ladder otherfurniture Furniture stairs n03632277 ladder.n.01 stairs 16
149
+ 120 column column 26 38 7 column otherstructure Objects n03074380 column.n.06 column 24
150
+ 107 pipe pipe 25 40 7 pipe otherprop Objects n03944672 pipe.n.02 misc 40
151
+ 283 vacuum cleaner vacuum cleaner 25 40 7 otherprop Objects n04517823 vacuum.n.04 objects 39
152
+ 88 plate plate 24 40 7 plate otherprop Objects n03959485 plate.n.04 objects 39
153
+ 90 piano piano 24 39 6 piano otherfurniture Furniture piano piano 3928116 n03928116 piano.n.01 furniture 36
154
+ 177 water cooler water cooler 24 39 6 water cooler otherfurniture Furniture n04559166 water_cooler.n.01 misc 40
155
+ 1174 cd case cd case 24 40 7 otherprop Objects objects 39
156
+ 562 bowl bowl 24 40 7 bowl otherprop Objects bowl bowl 2880940 n02880940 bowl.n.03 objects 39
157
+ 1175 closet rod closet rod 24 40 7 otherprop Objects n04100174 rod.n.01 misc 40
158
+ 1156 bathroom counter bathroom counter 24 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26
159
+ 84 oven oven 23 38 7 oven otherstructure Objects n03862676 oven.n.01 appliances 37
160
+ 104 stand stand 23 39 6 stand otherfurniture Furniture table table table 4379243 n04301000 stand.n.04 table 5
161
+ 229 scale scale 23 40 7 scale otherprop Objects n04141975 scale.n.07 objects 39
162
+ 70 washing machine washing machine 23 39 6 washing machine otherfurniture Furniture washing_machine 4554684 n04554684 washer.n.03 appliances 37
163
+ 325 broom broom 22 40 7 broom otherprop Objects n02906734 broom.n.01 objects 39
164
+ 169 hat hat 22 40 7 hat otherprop Objects n03497657 hat.n.01 clothes 38
165
+ 128 shower wall shower wall 22 1 12 wall wall Wall n04208936 shower.n.01 wall 1
166
+ 331 guitar case guitar case 21 40 7 guitar case otherprop Objects objects 39
167
+ 87 rack rack 21 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31
168
+ 488 water pitcher water pitcher 21 40 7 pitcher otherprop Objects n03950228 pitcher.n.02 objects 39
169
+ 776 laundry detergent laundry detergent 21 40 7 otherprop Objects objects 39
170
+ 370 hair dryer hair dryer 21 40 7 hair dryer otherprop Objects n03483316 hand_blower.n.01 objects 39
171
+ 191 pillar pillar 21 38 7 column otherstructure Objects n03073977 column.n.07 column 24
172
+ 748 divider divider 20 40 7 otherprop Objects wall 1
173
+ 242 power outlet power outlet 19 40 7 otherprop Objects misc 40
174
+ 45 dining table dining table 19 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5
175
+ 417 shower floor shower floor 19 2 5 floor floor Floor n04208936 shower.n.01 floor 2
176
+ 70 washing machines washing machine 19 39 6 washing machine otherfurniture Furniture washing_machine 4554684 n04554684 washer.n.03 appliances 37
177
+ 188 shower door shower door 19 8 12 door door Wall door n04208936 shower.n.01 door 4
178
+ 1176 coffee kettle coffee kettle 18 40 7 pot otherprop Objects n03612814 kettle.n.01 objects 39
179
+ 1177 wardrobe cabinet wardrobe 18 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36
180
+ 1178 structure structure 18 38 7 otherstructure Objects misc 40
181
+ 18 bookshelves bookshelf 17 10 6 bookshelf bookshelf Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31
182
+ 110 clothes dryer clothes dryer 17 39 6 otherfurniture Furniture n03251766 dryer.n.01 appliances 37
183
+ 148 toaster toaster 17 40 7 toaster otherprop Objects n04442312 toaster.n.02 appliances 37
184
+ 63 shoe shoe 17 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38
185
+ 155 ironing board ironing board 16 39 6 ironing board otherfurniture Furniture n03586090 ironing_board.n.01 objects 39
186
+ 572 alarm clock alarm clock 16 40 7 alarm clock otherprop Objects clock 3046257 n02694662 alarm_clock.n.01 objects 39
187
+ 1179 shower head shower head 15 38 7 otherstructure Objects shower 23
188
+ 28 lamp base lamp 15 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28
189
+ 392 water bottle water bottle 15 40 7 bottle otherprop Objects bottle bottle 2876657 n04557648 water_bottle.n.01 objects 39
190
+ 1180 keyboard piano keyboard piano 15 39 6 piano otherfurniture Furniture piano piano 3928116 n03928116 piano.n.01 furniture 36
191
+ 609 projector screen projector screen 15 38 7 projector screen otherstructure Objects misc 40
192
+ 1181 case of water bottles case of water bottles 15 40 7 otherprop Objects objects 39
193
+ 195 toaster oven toaster oven 14 40 7 toaster oven otherprop Objects n04442441 toaster_oven.n.01 appliances 37
194
+ 581 music stand music stand 14 39 6 music stand otherfurniture Furniture n03801760 music_stand.n.01 furniture 36
195
+ 58 staircase stairs 14 38 7 stairs otherstructure Objects n04298308 stairway.n.01 stairs 16
196
+ 1182 coat rack coat rack 14 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 3
197
+ 1183 storage organizer storage organizer 14 40 7 otherprop Objects shelving 3
198
+ 139 machine machine 14 40 7 machine otherprop Objects n03699975 machine.n.01 appliances 37
199
+ 1184 folded chair folded chair 14 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3
200
+ 1185 fire alarm fire alarm 14 40 7 otherprop Objects n03343737 fire_alarm.n.02 misc 40
201
+ 156 fireplace fireplace 13 38 7 fireplace otherstructure Objects n03346455 fireplace.n.01 fireplace 27
202
+ 408 vent vent 13 40 7 otherprop Objects n04526241 vent.n.01 misc 40
203
+ 213 furniture furniture 13 39 6 furniture otherfurniture Furniture n03405725 furniture.n.01 furniture 36
204
+ 1186 power strip power strip 13 40 7 otherprop Objects objects 39
205
+ 1187 calendar calendar 13 40 7 otherprop Objects objects 39
206
+ 1188 poster poster 13 11 8 picture picture Picture n03931044 picture.n.01 picture 6
207
+ 115 toilet paper holder toilet paper holder 13 40 7 toilet paper holder otherprop Objects objects 39
208
+ 1189 potted plant potted plant 12 40 7 plant otherprop Objects plant n00017222 plant.n.02 plant 14
209
+ 304 stuffed animal stuffed animal 12 40 7 stuffed animal otherprop Objects n04399382 teddy.n.01 objects 39
210
+ 1190 luggage luggage 12 40 7 luggage otherprop Objects n02774630 baggage.n.01 objects 39
211
+ 21 curtains curtain 12 16 13 curtain curtain Window curtain n03151077 curtain.n.01 curtain 12
212
+ 312 headphones headphones 12 40 7 otherprop Objects n03261776 earphone.n.01 objects 39
213
+ 233 crate crate 12 39 6 crate otherfurniture Furniture n03127925 crate.n.01 objects 39
214
+ 286 candle candle 12 40 7 candle otherprop Objects lamp n02948072 candle.n.01 objects 39
215
+ 264 projector projector 12 40 7 projector otherprop Objects n04009552 projector.n.02 objects 39
216
+ 110 clothes dryers clothes dryer 12 39 6 otherfurniture Furniture n03251766 dryer.n.01 appliances 37
217
+ 1191 mattress mattress 12 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11
218
+ 356 dustpan dustpan 12 40 7 otherprop Objects n03259009 dustpan.n.02 objects 39
219
+ 25 drawer drawer 11 39 6 drawer otherfurniture Furniture n03233905 drawer.n.01 furniture 36
220
+ 750 rod rod 11 40 7 otherprop Objects pistol 3948459 n03427202 gat.n.01 misc 40
221
+ 269 globe globe 11 40 7 globe otherprop Objects objects 39
222
+ 307 footrest footrest 11 39 6 foot rest otherfurniture Furniture stool n03380724 footstool.n.01 stool 19
223
+ 410 piano bench piano bench 11 39 6 piano bench otherfurniture Furniture bench bench 2828884 n02828884 bench.n.01 seating 34
224
+ 730 breakfast bar breakfast bar 11 38 7 bar otherstructure Objects counter 26
225
+ 216 step stool step stool 11 40 7 step stool otherprop Objects stool n04315713 step_stool.n.01 stool 19
226
+ 1192 hand rail hand rail 11 38 7 railing otherstructure Objects railing 30
227
+ 119 vending machine vending machine 11 40 7 machine otherprop Objects n04525305 vending_machine.n.01 appliances 37
228
+ 682 ceiling fan ceiling fan 11 40 7 fan otherprop Objects n03320046 fan.n.01 misc 40
229
+ 434 swiffer swiffer 11 40 7 otherprop Objects objects 39
230
+ 126 foosball table foosball table 11 39 6 foosball table otherfurniture Furniture table table table 4379243 n04379243 table.n.02 table 5
231
+ 919 jar jar 11 40 7 jar otherprop Objects jar 3593526 n03593526 jar.n.01 objects 39
232
+ 85 footstool footstool 11 39 6 ottoman otherfurniture Furniture stool n03380724 footstool.n.01 stool 19
233
+ 1193 folded table folded table 10 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5
234
+ 108 round table round table 10 7 10 table table Table table table table 4379243 n04114554 round_table.n.02 table 5
235
+ 135 hamper hamper 10 40 7 basket otherprop Objects basket 2801938 n03482405 hamper.n.02 objects 39
236
+ 1194 poster tube poster tube 10 40 7 otherprop Objects objects 39
237
+ 432 case case 10 40 7 case otherprop Objects objects 39
238
+ 53 carpet carpet 10 40 7 rug otherprop Objects n04118021 rug.n.01 floor 2
239
+ 1195 thermostat thermostat 10 40 7 otherprop Objects n04422875 thermostat.n.01 misc 40
240
+ 111 coat coat 10 40 7 jacket otherprop Objects n03057021 coat.n.01 clothes 38
241
+ 305 water fountain water fountain 10 38 7 water fountain otherstructure Objects n03241335 drinking_fountain.n.01 misc 40
242
+ 1125 smoke detector smoke detector 10 40 7 otherprop Objects misc 40
243
+ 13 pillows pillow 9 18 7 pillow pillow Objects pillow 3938244 n03938244 pillow.n.01 cushion 8
244
+ 1196 flip flops flip flops 9 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38
245
+ 1197 cloth cloth 9 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38
246
+ 1198 banner banner 9 40 7 otherprop Objects n02788021 banner.n.01 misc 40
247
+ 1199 clothes hanger clothes hanger 9 40 7 otherprop Objects n03057920 coat_hanger.n.01 objects 39
248
+ 1200 whiteboard eraser whiteboard eraser 9 40 7 otherprop Objects objects 39
249
+ 378 iron iron 9 40 7 otherprop Objects n03584829 iron.n.04 objects 39
250
+ 591 instrument case instrument case 9 40 7 case otherprop Objects objects 39
251
+ 49 toilet paper rolls toilet paper 9 40 7 toilet paper otherprop Objects n15075141 toilet_tissue.n.01 objects 39
252
+ 92 soap soap 9 40 7 soap otherprop Objects n04253437 soap.n.01 objects 39
253
+ 1098 block block 9 40 7 otherprop Objects misc 40
254
+ 291 wall hanging wall hanging 8 40 7 otherprop Objects n03491178 hanging.n.01 picture 6
255
+ 1063 kitchen island kitchen island 8 38 7 kitchen island otherstructure Objects n03620600 kitchen_island.n.01 counter 26
256
+ 107 pipes pipe 8 38 7 otherstructure Objects misc 40
257
+ 1135 toothbrush toothbrush 8 40 7 toothbrush otherprop Objects n04453156 toothbrush.n.01 objects 39
258
+ 189 shirt shirt 8 40 7 otherprop Objects n04197391 shirt.n.01 clothes 38
259
+ 245 cutting board cutting board 8 40 7 cutting board otherprop Objects n03025513 chopping_board.n.01 objects 39
260
+ 194 vase vase 8 40 7 vase otherprop Objects vase jar 3593526 n04522168 vase.n.01 objects 39
261
+ 1201 shower control valve shower control valve 8 38 7 otherstructure Objects n04208936 shower.n.01 shower 23
262
+ 386 exercise machine exercise machine 8 40 7 machine otherprop Objects gym_equipment 33
263
+ 1202 compost bin compost bin 8 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39
264
+ 857 shorts shorts 8 40 7 shorts otherprop Objects clothes 38
265
+ 452 tire tire 8 40 7 otherprop Objects n04440749 tire.n.01 objects 39
266
+ 1203 teddy bear teddy bear 7 40 7 stuffed animal otherprop Objects n04399382 teddy.n.01 objects 39
267
+ 346 bathrobe bathrobe 7 40 7 otherprop Objects n02807616 bathrobe.n.01 clothes 38
268
+ 152 handrail handrail 7 38 7 railing otherstructure Objects n02788148 bannister.n.02 railing 30
269
+ 83 faucet faucet 7 40 7 faucet otherprop Objects faucet 3325088 n03325088 faucet.n.01 misc 40
270
+ 1204 pantry wall pantry wall 7 1 12 wall wall Wall n04546855 wall.n.01 wall 1
271
+ 726 thermos thermos 7 40 7 flask otherprop Objects bottle bottle 2876657 n04422727 thermos.n.01 objects 39
272
+ 61 rug rug 7 40 7 rug otherprop Objects n04118021 rug.n.01 floor 2
273
+ 39 couch cushions cushion 7 18 7 pillow pillow Objects n03151500 cushion.n.03 cushion 8
274
+ 1117 tripod tripod 7 39 6 stand otherfurniture Furniture n04485082 tripod.n.01 objects 39
275
+ 540 mailbox mailbox 7 29 7 box box Objects mailbox 3710193 n03710193 mailbox.n.01 misc 40
276
+ 1205 tupperware tupperware 7 40 7 otherprop Objects objects 39
277
+ 415 shoe rack shoe rack 7 40 7 shoe rack otherprop Objects shelving 31
278
+ 31 towels towel 6 27 7 towel towel Objects n04459362 towel.n.01 towel 20
279
+ 1206 beer bottles beer bottle 6 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
280
+ 153 treadmill treadmill 6 39 6 treadmill otherfurniture Furniture n04477387 treadmill.n.01 gym_equipment 33
281
+ 1207 salt salt 6 40 7 otherprop Objects objects 39
282
+ 129 chest chest 6 39 6 chest otherfurniture Furniture dresser dresser chest_of_drawers 13
283
+ 220 dispenser dispenser 6 40 7 otherprop Objects n03210683 dispenser.n.01 objects 39
284
+ 1208 mirror doors mirror door 6 8 12 door door Wall door n03221720 door.n.01 door 4
285
+ 231 remote remote 6 40 7 otherprop Objects remote_control 4074963 n04074963 remote_control.n.01 objects 39
286
+ 1209 folded ladder folded ladder 6 39 6 ladder otherfurniture Furniture stairs n03632277 ladder.n.01 misc 40
287
+ 39 cushion cushion 6 18 7 pillow pillow Objects n03151500 cushion.n.03 cushion 8
288
+ 1210 carton carton 6 40 7 otherprop Objects objects 39
289
+ 117 step step 6 38 7 otherstructure Objects n04314914 step.n.04 misc 40
290
+ 822 drying rack drying rack 6 39 6 drying rack otherfurniture Furniture shelving 31
291
+ 238 slippers slipper 6 40 7 shoe otherprop Objects n04241394 slipper.n.01 clothes 38
292
+ 143 pool table pool table 6 39 6 pool table otherfurniture Furniture table table table 4379243 n03982430 pool_table.n.01 table 5
293
+ 1211 soda stream soda stream 6 40 7 otherprop Objects objects 39
294
+ 228 toilet brush toilet brush 6 40 7 toilet brush otherprop Objects objects 39
295
+ 494 loft bed loft bed 6 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11
296
+ 226 cooking pot cooking pot 6 40 7 pot otherprop Objects objects 39
297
+ 91 heater heater 6 39 6 heater otherfurniture Furniture n03508101 heater.n.01 misc 40
298
+ 1072 messenger bag messenger bag 6 37 7 bag bag Objects objects 39
299
+ 435 stapler stapler 6 40 7 stapler otherprop Objects n04303497 stapler.n.01 objects 39
300
+ 1165 closet walls closet wall 5 1 12 wall wall Wall n04546855 wall.n.01 wall 1
301
+ 345 scanner scanner 5 40 7 otherprop Objects appliances 37
302
+ 893 elliptical machine elliptical machine 5 40 7 machine otherprop Objects gym_equipment 33
303
+ 621 kettle kettle 5 40 7 pot otherprop Objects n03612814 kettle.n.01 objects 39
304
+ 1212 metronome metronome 5 40 7 otherprop Objects n03757604 metronome.n.01 objects 39
305
+ 297 dumbell dumbell 5 40 7 otherprop Objects objects 39
306
+ 1213 music book music book 5 23 2 book books Books n02870526 book.n.11 objects 39
307
+ 1214 rice cooker rice cooker 5 40 7 otherprop Objects objects 39
308
+ 1215 dart board dart board 5 38 7 board otherstructure Objects n03162940 dartboard.n.01 objects 39
309
+ 529 sewing machine sewing machine 5 40 7 sewing machine otherprop Objects n04179913 sewing_machine.n.01 objects 39
310
+ 1216 grab bar grab bar 5 38 7 railing otherstructure Objects railing 30
311
+ 1217 flowerpot flowerpot 5 40 7 vase otherprop Objects vase jar 3593526 n04522168 vase.n.01 objects 39
312
+ 1218 painting painting 5 11 8 picture picture Picture n03931044 picture.n.01 picture 6
313
+ 1219 railing railing 5 38 7 railing otherstructure Objects n04047401 railing.n.01 railing 30
314
+ 1220 stair stair 5 38 7 stairs otherstructure Objects stairs n04314914 step.n.04 stairs 16
315
+ 525 toolbox toolbox 5 39 6 chest otherfurniture Furniture n04452615 toolbox.n.01 objects 39
316
+ 204 nerf gun nerf gun 5 40 7 otherprop Objects objects 39
317
+ 693 binders binder 5 40 7 binder otherprop Objects objects 39
318
+ 179 desk lamp desk lamp 5 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28
319
+ 1221 quadcopter quadcopter 5 40 7 otherprop Objects objects 39
320
+ 1222 pitcher pitcher 5 40 7 pitcher otherprop Objects n03950228 pitcher.n.02 objects 39
321
+ 1223 hanging hanging 5 40 7 otherprop Objects misc 40
322
+ 1224 mail mail 5 40 7 otherprop Objects misc 40
323
+ 1225 closet ceiling closet ceiling 5 22 3 ceiling ceiling Ceiling n02990373 ceiling.n.01 ceiling 17
324
+ 1226 hoverboard hoverboard 5 40 7 otherprop Objects objects 39
325
+ 1227 beanbag chair beanbag chair 5 39 6 bean bag otherfurniture Furniture n02816656 beanbag.n.01 chair 3
326
+ 571 water heater water heater 5 40 7 water heater otherprop Objects n04560113 water_heater.n.01 misc 40
327
+ 1228 spray bottle spray bottle 5 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
328
+ 556 rope rope 5 40 7 rope otherprop Objects n04108268 rope.n.01 objects 39
329
+ 280 plastic container plastic container 5 40 7 container otherprop Objects objects 39
330
+ 1229 soap bottle soap bottle 5 40 7 soap otherprop Objects objects 39
331
+ 1230 ikea bag ikea bag 4 37 7 bag bag Objects 2773838 n02773838 bag.n.06 objects 39
332
+ 1231 sleeping bag sleeping bag 4 40 7 otherprop Objects n04235860 sleeping_bag.n.01 objects 39
333
+ 1232 duffel bag duffel bag 4 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39
334
+ 746 frying pan frying pan 4 40 7 frying pan otherprop Objects n03400231 frying_pan.n.01 objects 39
335
+ 1233 oven mitt oven mitt 4 40 7 otherprop Objects objects 39
336
+ 1234 pot pot 4 40 7 pot otherprop Objects n04235860 sleeping_bag.n.01 objects 39
337
+ 144 hand dryer hand dryer 4 40 7 otherprop Objects objects 39
338
+ 282 dollhouse dollhouse 4 39 6 doll house otherfurniture Furniture n03219483 dollhouse.n.01 objects 39
339
+ 167 shampoo bottle shampoo bottle 4 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
340
+ 1235 hair brush hair brush 4 40 7 otherprop Objects n02908217 brush.n.02 objects 39
341
+ 1236 tennis racket tennis racket 4 40 7 otherprop Objects n04409806 tennis_racket.n.01 objects 39
342
+ 1237 display case display case 4 40 7 case otherprop Objects objects 39
343
+ 234 ping pong table ping pong table 4 39 6 ping pong table otherfurniture Furniture table table table 4379243 n04379243 table.n.02 table 5
344
+ 563 boiler boiler 4 40 7 otherprop Objects misc 40
345
+ 1238 bag of coffee beans bag of coffee beans 4 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39
346
+ 1239 bananas banana 4 40 7 otherprop Objects n00021265 food.n.01 objects 39
347
+ 1240 carseat carseat 4 40 7 otherprop Objects misc 40
348
+ 366 helmet helmet 4 40 7 otherprop Objects helmet 3513137 n03513137 helmet.n.02 clothes 38
349
+ 816 umbrella umbrella 4 40 7 umbrella otherprop Objects n04507155 umbrella.n.01 objects 39
350
+ 1241 coffee box coffee box 4 40 7 otherprop Objects objects 39
351
+ 719 envelope envelope 4 40 7 envelope otherprop Objects n03291819 envelope.n.01 objects 39
352
+ 284 wet floor sign wet floor sign 4 40 7 sign otherprop Objects misc 40
353
+ 1242 clothing rack clothing rack 4 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31
354
+ 247 controller controller 4 40 7 otherprop Objects n03096960 control.n.09 objects 39
355
+ 1243 bath walls bathroom wall 4 1 12 wall wall Wall n04546855 wall.n.01 wall 1
356
+ 1244 podium podium 4 39 6 otherfurniture Furniture n03159640 dais.n.01 furniture 36
357
+ 1245 storage box storage box 4 29 7 box box Objects n02883344 box.n.01 objects 39
358
+ 1246 dolly dolly 4 40 7 otherprop Objects misc 40
359
+ 1247 shampoo shampoo 3 40 7 otherprop Objects n04183516 shampoo.n.01 objects 39
360
+ 592 paper tray paper tray 3 40 7 paper tray otherprop Objects objects 39
361
+ 385 cabinet door cabinet door 3 8 12 door door Wall door door 4
362
+ 1248 changing station changing station 3 40 7 otherprop Objects misc 40
363
+ 1249 poster printer poster printer 3 40 7 printer otherprop Objects printer 4004475 n04004475 printer.n.03 appliances 37
364
+ 133 screen screen 3 40 7 otherprop Objects n03151077 curtain.n.01 curtain 12
365
+ 301 soap bar soap bar 3 38 7 bar otherstructure Objects objects 39
366
+ 1250 crutches crutches 3 40 7 otherprop Objects n03141823 crutch.n.01 objects 39
367
+ 379 studio light studio light 3 38 7 light otherstructure Objects lighting 28
368
+ 130 stack of cups cup 3 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39
369
+ 1251 toilet flush button toilet flush button 3 40 7 otherprop Objects objects 39
370
+ 450 trunk trunk 3 40 7 otherprop Objects misc 40
371
+ 1252 grocery bag grocery bag 3 37 7 bag bag Objects suitcase 2773838 n03461288 grocery_bag.n.01 objects 39
372
+ 316 plastic bin plastic bin 3 40 7 bin otherprop Objects objects 39
373
+ 1253 pizza box pizza box 3 29 7 box box Objects objects 39
374
+ 385 cabinet doors cabinet door 3 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 door 4
375
+ 1254 legs legs 3 31 7 person person Objects person n05217688 person.n.02 misc 40
376
+ 461 car car 3 40 7 car otherprop Objects car car 2958343 n02958343 car.n.01 misc 40
377
+ 1255 shaving cream shaving cream 3 40 7 otherprop Objects n04186051 shaving_cream.n.01 objects 39
378
+ 1256 luggage stand luggage stand 3 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31
379
+ 599 shredder shredder 3 40 7 otherprop Objects n04210120 shredder.n.01 objects 39
380
+ 281 statue statue 3 40 7 sculpture otherprop Objects n04306847 statue.n.01 misc 40
381
+ 1257 urinal urinal 3 33 7 toilet toilet Objects toilet toilet n04515991 urinal.n.01 toilet 18
382
+ 1258 hose hose 3 40 7 otherprop Objects n03539875 hose.n.03 misc 40
383
+ 1259 bike pump bike pump 3 40 7 otherprop Objects objects 39
384
+ 319 coatrack coatrack 3 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31
385
+ 1260 bear bear 3 40 7 otherprop Objects objects 39
386
+ 28 wall lamp lamp 3 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28
387
+ 1261 humidifier humidifier 3 40 7 otherprop Objects objects 39
388
+ 546 toothpaste toothpaste 3 40 7 toothpaste otherprop Objects objects 39
389
+ 1262 mouthwash bottle mouthwash bottle 3 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
390
+ 1263 poster cutter poster cutter 3 40 7 otherprop Objects objects 39
391
+ 1264 golf bag golf bag 3 37 7 bag bag Objects suitcase 2773838 n03445617 golf_bag.n.01 objects 39
392
+ 1265 food container food container 3 40 7 container otherprop Objects n03094503 container.n.01 objects 39
393
+ 1266 camera camera 3 40 7 otherprop Objects objects 39
394
+ 28 table lamp lamp 3 35 7 lamp lamp Objects lamp lamp 3636649 n04380533 table_lamp.n.01 lighting 28
395
+ 1267 yoga mat yoga mat 3 20 5 floor mat floor mat Floor n03727837 mat.n.01 floor 2
396
+ 1268 card card 3 40 7 otherprop Objects objects 39
397
+ 1269 mug mug 3 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39
398
+ 188 shower doors shower door 3 38 7 otherstructure Objects n04208936 shower.n.01 door 4
399
+ 689 cardboard cardboard 3 40 7 otherprop Objects objects 39
400
+ 1270 rack stand rack stand 3 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31
401
+ 1271 boxes of paper boxes of paper 3 29 7 box box Objects n02883344 box.n.01 objects 39
402
+ 1272 flag flag 3 40 7 otherprop Objects misc 40
403
+ 354 futon futon 3 39 6 mattress otherfurniture Furniture n03408444 futon.n.01 sofa 10
404
+ 339 magazine magazine 3 40 7 magazine otherprop Objects n06595351 magazine.n.01 objects 39
405
+ 1009 exit sign exit sign 3 40 7 exit sign otherprop Objects misc 40
406
+ 1273 rolled poster rolled poster 3 40 7 otherprop Objects objects 39
407
+ 1274 wheel wheel 3 40 7 otherprop Objects objects 39
408
+ 15 pictures picture 3 11 8 picture picture Picture n03931044 picture.n.01 picture 6
409
+ 1275 blackboard eraser blackboard eraser 3 40 7 eraser otherprop Objects n03294833 eraser.n.01 objects 39
410
+ 361 organizer organizer 3 40 7 otherprop Objects n03918737 personal_digital_assistant.n.01 objects 39
411
+ 1276 doll doll 3 40 7 toy otherprop Objects n03219135 doll.n.01 objects 39
412
+ 326 book rack book rack 3 39 6 bookrack otherfurniture Furniture objects 39
413
+ 1277 laundry bag laundry bag 3 40 7 laundry basket otherprop Objects basket 2801938 n03050864 clothes_hamper.n.01 objects 39
414
+ 1278 sponge sponge 3 40 7 otherprop Objects n01906749 sponge.n.04 objects 39
415
+ 116 seating seat 3 39 6 furniture otherfurniture Furniture n04161981 seat.n.03 furniture 36
416
+ 1184 folded chairs folded chair 2 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3
417
+ 1279 lotion bottle lotion bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
418
+ 212 can can 2 40 7 can otherprop Objects can 2946921 n02946921 can.n.01 objects 39
419
+ 1280 lunch box lunch box 2 40 7 otherprop Objects objects 39
420
+ 1281 food display food display 2 40 7 otherprop Objects misc 40
421
+ 794 storage shelf storage shelf 2 40 7 otherprop Objects shelving 31
422
+ 1282 sliding wood door sliding wood door 2 40 7 otherprop Objects door 4
423
+ 955 pants pants 2 40 7 otherprop Objects n04489008 trouser.n.01 clothes 38
424
+ 387 wood wood 2 40 7 otherprop Objects misc 40
425
+ 69 boards board 2 38 7 board otherstructure Objects board_panel 35
426
+ 65 bottles bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
427
+ 523 washcloth washcloth 2 40 7 otherprop Objects n04554523 washcloth.n.01 towel 20
428
+ 389 workbench workbench 2 39 6 bench otherfurniture Furniture bench table 4379243 n04600486 workbench.n.01 table 5
429
+ 29 open kitchen cabinet kitchen cabinet 2 3 6 cabinet cabinet Furniture n02933112 cabinet.n.01 cabinet 7
430
+ 1283 organizer shelf organizer shelf 2 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31
431
+ 146 frame frame 2 38 7 otherstructure Objects misc 40
432
+ 130 cups cup 2 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39
433
+ 372 exercise ball exercise ball 2 40 7 ball otherprop Objects n04285146 sports_equipment.n.01 gym_equipment 33
434
+ 289 easel easel 2 39 6 stand otherfurniture Furniture n03262809 easel.n.01 furniture 36
435
+ 440 garbage bag garbage bag 2 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39
436
+ 321 roomba roomba 2 40 7 otherprop Objects objects 39
437
+ 976 garage door garage door 2 38 7 garage door otherstructure Objects door door 4
438
+ 1256 luggage rack luggage stand 2 39 6 stand otherfurniture Furniture n04038440 shelving 31
439
+ 1284 bike lock bike lock 2 40 7 otherprop Objects objects 39
440
+ 1285 briefcase briefcase 2 40 7 otherprop Objects n02900705 briefcase.n.01 objects 39
441
+ 357 hand towel hand towel 2 27 7 towel towel Objects n03490006 hand_towel.n.01 towel 20
442
+ 1286 bath products bath product 2 40 7 otherprop Objects objects 39
443
+ 1287 star star 2 40 7 otherprop Objects n09444783 star.n.03 misc 40
444
+ 365 map map 2 40 7 map otherprop Objects n03720163 map.n.01 misc 40
445
+ 1288 coffee bean bag coffee bean bag 2 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39
446
+ 81 headboard headboard 2 39 6 headboard otherfurniture Furniture n03502200 headboard.n.01 bed 11
447
+ 1289 ipad ipad 2 40 7 otherprop Objects objects 39
448
+ 1290 display rack display rack 2 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31
449
+ 948 traffic cone traffic cone 2 40 7 cone otherprop Objects cone objects 39
450
+ 174 toiletry toiletry 2 40 7 otherprop Objects n04447443 toiletry.n.01 objects 39
451
+ 1028 canopy canopy 2 40 7 otherprop Objects misc 40
452
+ 1291 massage chair massage chair 2 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3
453
+ 1292 paper organizer paper organizer 2 40 7 otherprop Objects objects 39
454
+ 1005 barricade barricade 2 40 7 otherprop Objects misc 40
455
+ 235 platform platform 2 38 7 otherstructure Objects misc 40
456
+ 1293 cap cap 2 40 7 hat otherprop Objects n03497657 hat.n.01 clothes 38
457
+ 1294 dumbbell plates dumbbell plates 2 40 7 otherprop Objects objects 39
458
+ 1295 elevator elevator 2 38 7 otherstructure Objects misc 40
459
+ 1296 cooking pan cooking pan 2 40 7 pan otherprop Objects n03880531 pan.n.01 objects 39
460
+ 1297 trash bag trash bag 2 37 7 bag bag Objects objects 39
461
+ 1298 santa santa 2 40 7 otherprop Objects misc 40
462
+ 1299 jewelry box jewelry box 2 29 7 box box Objects n02883344 box.n.01 objects 39
463
+ 1300 boat boat 2 40 7 otherprop Objects misc 40
464
+ 1301 sock sock 2 21 7 clothes clothes Objects n04254777 sock.n.01 clothes 38
465
+ 1051 kinect kinect 2 40 7 kinect otherprop Objects objects 39
466
+ 566 crib crib 2 39 6 crib otherfurniture Furniture furniture 36
467
+ 1302 plastic storage bin plastic storage bin 2 40 7 container otherprop Objects n03094503 container.n.01 objects 39
468
+ 1062 cooler cooler 2 24 6 refridgerator refridgerator Furniture n03102654 cooler.n.01 appliances 37
469
+ 1303 kitchen apron kitchen apron 2 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38
470
+ 1304 dishwashing soap bottle dishwashing soap bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
471
+ 1305 xbox controller xbox controller 2 40 7 otherprop Objects objects 39
472
+ 1306 banana holder banana holder 2 40 7 otherprop Objects objects 39
473
+ 298 ping pong paddle ping pong paddle 2 40 7 otherprop Objects table 5
474
+ 1307 airplane airplane 2 40 7 otherprop Objects misc 40
475
+ 1308 conditioner bottle conditioner bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
476
+ 1309 tea kettle tea kettle 2 40 7 tea kettle otherprop Objects n04397768 teakettle.n.01 objects 39
477
+ 43 bedframe bedframe 2 39 6 otherfurniture Furniture n02822579 bedstead.n.01 bed 11
478
+ 1310 wood beam wood beam 2 38 7 otherstructure Objects beam 29
479
+ 593 toilet paper package toilet paper package 2 40 7 otherprop Objects objects 39
480
+ 1311 wall mounted coat rack wall mounted coat rack 2 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31
481
+ 1312 film light film light 2 40 7 otherprop Objects lighting 28
482
+ 749 ceiling lamp ceiling lamp 1 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28
483
+ 623 chain chain 1 40 7 otherprop Objects chair 3
484
+ 1313 sofa sofa 1 6 9 sofa sofa Sofa sofa sofa sofa 4256520 n04256520 sofa.n.01 sofa 10
485
+ 99 closet wardrobe wardrobe 1 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36
486
+ 265 sweater sweater 1 40 7 otherprop Objects n04370048 sweater.n.01 clothes 38
487
+ 1314 kitchen mixer kitchen mixer 1 40 7 otherprop Objects appliances 37
488
+ 99 wardrobe wardrobe 1 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36
489
+ 1315 water softener water softener 1 40 7 otherprop Objects misc 40
490
+ 448 banister banister 1 38 7 banister otherstructure Objects n02788148 bannister.n.02 railing 30
491
+ 257 trolley trolley 1 40 7 trolley otherprop Objects n04335435 streetcar.n.01 misc 40
492
+ 1316 pantry shelf pantry shelf 1 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31
493
+ 786 sofa bed sofa bed 1 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11
494
+ 801 loofa loofa 1 40 7 otherprop Objects objects 39
495
+ 972 shower faucet handle shower faucet handle 1 40 7 handle otherprop Objects shower 23
496
+ 1317 toy piano toy piano 1 40 7 toy otherprop Objects n03964744 plaything.n.01 objects 39
497
+ 1318 fish fish 1 40 7 otherprop Objects n02512053 fish.n.01 objects 39
498
+ 75 file cabinets file cabinet 1 3 6 cabinet cabinet Furniture cabinet 2933112 n03337140 file.n.03 cabinet 7
499
+ 657 cat litter box cat litter box 1 29 7 box box Objects objects 39
500
+ 561 electric panel electric panel 1 40 7 otherprop Objects misc 40
501
+ 93 suitcases suitcase 1 40 7 luggage otherprop Objects n02774630 baggage.n.01 objects 39
502
+ 513 curtain rod curtain rod 1 38 7 curtain rod otherstructure Objects curtain 12
503
+ 411 bunk bed bunk bed 1 39 6 bunk bed otherfurniture Furniture bed bed bed 2818832 n02920259 bunk_bed.n.01 bed 11
504
+ 1122 chandelier chandelier 1 38 7 chandelier otherstructure Objects n03005285 chandelier.n.01 lighting 28
505
+ 922 tape tape 1 40 7 tape otherprop Objects objects 39
506
+ 88 plates plate 1 40 7 otherprop Objects n03959485 plate.n.04 objects 39
507
+ 518 alarm alarm 1 40 7 alarm otherprop Objects clock 3046257 n02694662 alarm_clock.n.01 objects 39
508
+ 814 fire hose fire hose 1 40 7 otherprop Objects n03346004 fire_hose.n.01 misc 40
509
+ 1319 toy dinosaur toy dinosaur 1 40 7 toy otherprop Objects n03964744 plaything.n.01 objects 39
510
+ 1320 cone cone 1 40 7 otherprop Objects objects 39
511
+ 649 glass doors glass door 1 8 12 door door Wall door n03221720 door.n.01 door 4
512
+ 607 hatrack hatrack 1 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31
513
+ 819 subwoofer subwoofer 1 40 7 speaker otherprop Objects speaker 3691459 n04349401 subwoofer.n.01 objects 39
514
+ 1321 fire sprinkler fire sprinkler 1 40 7 otherprop Objects misc 40
515
+ 1322 trash cabinet trash cabinet 1 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7
516
+ 1204 pantry walls pantry wall 1 1 12 wall wall Wall n04546855 wall.n.01 wall 1
517
+ 227 photo photo 1 40 7 photo otherprop Objects n03925226 photograph.n.01 picture 6
518
+ 817 barrier barrier 1 40 7 otherprop Objects n02796623 barrier.n.01 misc 40
519
+ 130 stacks of cups cup 1 40 7 otherprop Objects n03147509 cup.n.01 objects 39
520
+ 712 beachball beachball 1 40 7 ball otherprop Objects n02814224 beach_ball.n.01 objects 39
521
+ 1323 folded boxes folded boxes 1 40 7 otherprop Objects objects 39
522
+ 1324 contact lens solution bottle contact lens solution bottle 1 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39
523
+ 673 covered box covered box 1 29 7 box box Objects objects 39
524
+ 459 folder folder 1 40 7 folder otherprop Objects n03376279 folder.n.02 objects 39
525
+ 643 mail trays mail tray 1 40 7 mail tray otherprop Objects objects 39
526
+ 238 slipper slipper 1 40 7 otherprop Objects n04241394 slipper.n.01 clothes 38
527
+ 765 magazine rack magazine rack 1 39 6 stand otherfurniture Furniture n03704549 magazine_rack.n.01 shelving 31
528
+ 1008 sticker sticker 1 40 7 sticker otherprop Objects n07272545 gummed_label.n.01 objects 39
529
+ 225 lotion lotion 1 40 7 otherprop Objects n03690938 lotion.n.01 objects 39
530
+ 1083 buddha buddha 1 40 7 otherprop Objects objects 39
531
+ 813 file organizer file organizer 1 40 7 otherprop Objects objects 39
532
+ 138 paper towel rolls paper towel roll 1 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20
533
+ 1145 night lamp night lamp 1 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28
534
+ 796 fuse box fuse box 1 40 7 otherprop Objects misc 40
535
+ 1325 knife block knife block 1 40 7 otherprop Objects objects 39
536
+ 363 furnace furnace 1 39 6 furnace otherfurniture Furniture n03404449 furnace.n.01
537
+ 1174 cd cases cd case 1 40 7 otherprop Objects objects 39
538
+ 38 stools stool 1 40 7 stool otherprop Objects stool n04326896 stool.n.01 stool 19
539
+ 1326 hand sanitzer dispenser hand sanitzer dispenser 1 40 7 otherprop Objects n04254120 soap_dispenser.n.01 objects 39
540
+ 997 teapot teapot 1 40 7 tea pot otherprop Objects n04398044 teapot.n.01 objects 39
541
+ 1327 pen holder pen holder 1 40 7 otherprop Objects objects 39
542
+ 1328 tray rack tray rack 1 40 7 otherprop Objects objects 39
543
+ 1329 wig wig 1 40 7 otherprop Objects n04584207 wig.n.01 objects 39
544
+ 182 switch switch 1 40 7 otherprop Objects n04372370 switch.n.01 misc 40
545
+ 280 plastic containers plastic container 1 40 7 container otherprop Objects n03094503 container.n.01 objects 39
546
+ 1330 night light night light 1 40 7 otherprop Objects lighting 28
547
+ 1331 notepad notepad 1 40 7 otherprop Objects objects 39
548
+ 1332 mail bin mail bin 1 40 7 otherprop Objects misc 40
549
+ 1333 elevator button elevator button 1 40 7 otherprop Objects misc 40
550
+ 939 gaming wheel gaming wheel 1 40 7 otherprop Objects objects 39
551
+ 1334 drum set drum set 1 40 7 otherprop Objects objects 39
552
+ 480 cosmetic bag cosmetic bag 1 37 7 bag bag Objects objects 39
553
+ 907 coffee mug coffee mug 1 40 7 vessel otherprop Objects cup or mug 3797390 n03063599 coffee_mug.n.01 objects 39
554
+ 1335 closet shelf closet shelf 1 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31
555
+ 1336 baby mobile baby mobile 1 40 7 otherprop Objects objects 39
556
+ 829 diaper bin diaper bin 1 40 7 bin otherprop Objects objects 39
557
+ 947 door wall door wall 1 1 12 wall wall Wall wall 1
558
+ 1116 stepstool stepstool 1 40 7 step stool otherprop Objects objects 39
559
+ 599 paper shredder shredder 1 40 7 otherprop Objects n04210120 shredder.n.01 objects 39
560
+ 733 dress rack dress rack 1 40 7 otherprop Objects n03238762 dress_rack.n.01 misc 40
561
+ 123 cover cover 1 40 7 blanket otherprop Objects objects 39
562
+ 506 shopping bag shopping bag 1 37 7 bag bag Objects n04204081 shopping_bag.n.01 objects 39
563
+ 569 sliding door sliding door 1 8 12 door door Wall door n04239074 sliding_door.n.01 door 4
564
+ 1337 exercise bike exercise bike 1 40 7 machine otherprop Objects n04210120 shredder.n.01 gym_equipment 33
565
+ 1338 recliner chair recliner chair 1 5 4 chair chair Chair chair chair chair 3001627 n03238762 dress_rack.n.01 chair 3
566
+ 1314 kitchenaid mixer kitchen mixer 1 40 7 otherprop Objects appliances 37
567
+ 1339 soda can soda can 1 40 7 can otherprop Objects can 2946921 n02946921 can.n.01 objects 39
568
+ 1340 stovetop stovetop 1 38 7 stove otherstructure Objects stove 4330267 n04330267 stove.n.02 appliances 37
569
+ 851 stepladder stepladder 1 39 6 ladder otherfurniture Furniture stairs n04315599 step_ladder.n.01 stairs 16
570
+ 142 tap tap 1 40 7 faucet otherprop Objects faucet 3325088 n04559451 water_faucet.n.01 objects 39
571
+ 436 cable cable 1 40 7 cables otherprop Objects objects 39
572
+ 1341 baby changing station baby changing station 1 39 6 otherfurniture Furniture furniture 36
573
+ 1342 costume costume 1 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38
574
+ 885 rocking chair rocking chair 1 5 4 chair chair Chair chair chair chair 3001627 n04099969 rocking_chair.n.01 chair 3
575
+ 693 binder binder 1 40 7 binder otherprop Objects objects 39
576
+ 815 media center media center 1 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7
577
+ 401 towel rack towel rack 1 40 7 otherprop Objects n04459773 towel_rack.n.01 misc 40
578
+ 1343 medal medal 1 40 7 otherprop Objects objects 39
579
+ 1184 stack of folded chairs folded chair 1 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3
580
+ 1344 telescope telescope 1 40 7 otherprop Objects n04403638 telescope.n.01 objects 39
581
+ 1345 closet doorframe closet doorframe 1 8 12 door door Wall door door 4
582
+ 160 glass glass 1 38 7 glass otherstructure Objects n03438257 glass.n.02 misc 40
583
+ 1126 baseball cap baseball cap 1 40 7 otherprop Objects cap 2954340 n02799323 baseball_cap.n.01 clothes 38
584
+ 1346 battery disposal jar battery disposal jar 1 40 7 jar otherprop Objects jar 3593526 n03593526 jar.n.01 objects 39
585
+ 332 mop mop 1 40 7 otherprop Objects n04367480 swab.n.02 objects 39
586
+ 397 tank tank 1 40 7 otherprop Objects objects 39
587
+ 643 mail tray mail tray 1 40 7 mail tray otherprop Objects objects 39
588
+ 551 centerpiece centerpiece 1 40 7 centerpiece otherprop Objects n02994419 centerpiece.n.02 objects 39
589
+ 1163 stick object 1 40 7 stick otherprop Objects objects 39
590
+ 1347 closet floor closet floor 1 2 5 floor floor Floor n03365592 floor.n.01 floor 2
591
+ 1348 dryer sheets dryer sheets 1 40 7 otherprop Objects objects 39
592
+ 803 bycicle bycicle 1 40 7 otherprop Objects misc 40
593
+ 484 flower stand flower stand 1 39 6 stand otherfurniture Furniture furniture 36
594
+ 1349 air mattress air mattress 1 4 1 bed bed Bed bed bed bed 2818832 n02690809 air_mattress.n.01 bed 11
595
+ 1350 clip clip 1 40 7 otherprop Objects objects 39
596
+ 222 side table side table 1 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5
597
+ 1253 pizza boxes pizza box 1 29 7 box box Objects n02883344 box.n.01 objects 39
598
+ 1351 display display 1 39 7 otherfurniture Furniture n03211117 display.n.06 misc 40
599
+ 1352 postcard postcard 1 40 7 otherprop Objects objects 39
600
+ 828 display sign display sign 1 40 7 sign otherprop Objects misc 40
601
+ 1353 paper towel paper towel 1 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20
602
+ 612 boots boot 1 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38
603
+ 1354 tennis racket bag tennis racket bag 1 40 7 otherprop Objects objects 39
604
+ 1355 air hockey table air hockey table 1 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5
605
+ 1301 socks sock 1 21 7 clothes clothes Objects n04254777 sock.n.01 clothes 38
606
+ 1356 food bag food bag 1 37 7 bag bag Objects objects 39
607
+ 1199 clothes hangers clothes hanger 1 40 7 otherprop Objects n03057920 coat_hanger.n.01 misc 40
608
+ 1357 starbucks cup starbucks cup 1 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39
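Note: the rows above extend scannetv2-labels.combined.tsv, ScanNet's label-mapping table (raw category id, raw and normalized names, instance count, NYU40 id/class, and related taxonomy columns). A minimal sketch of turning such a file into an id-to-class lookup is shown below; the header names "id" and "nyu40class" are assumptions based on the public ScanNet release and should be checked against the file's actual first row.

import csv

def load_label_map(tsv_path, key_col="id", value_col="nyu40class"):
    # Parse the tab-separated label table into {raw category id -> coarse class name}.
    # key_col / value_col are assumed column names; adjust them to the real header.
    mapping = {}
    with open(tsv_path, newline="") as f:
        for row in csv.DictReader(f, delimiter="\t"):
            if row.get(key_col) and row.get(value_col):
                mapping[int(row[key_col])] = row[value_col]
    return mapping

# e.g. load_label_map("scannetv2-labels.combined.tsv").get(1357) -> the coarse class of "starbucks cup"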
src/datasets/megadepth.py ADDED
@@ -0,0 +1,125 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Dataloader for preprocessed MegaDepth
6
+ # dataset at https://www.cs.cornell.edu/projects/megadepth/
7
+ # See datasets_preprocess/preprocess_megadepth.py
8
+ # --------------------------------------------------------
9
+ import os.path as osp
10
+ import numpy as np
11
+ import sys
12
+ sys.path.append("submodules/mast3r/dust3r")
13
+ from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset
14
+ from dust3r.utils.image import imread_cv2
15
+
16
+
17
+ class MegaDepth(BaseStereoViewDataset):
18
+ def __init__(self, *args, split, ROOT, **kwargs):
19
+ self.ROOT = ROOT
20
+ super().__init__(*args, **kwargs)
21
+ self.num_views = 3 # render third view
22
+ self.loaded_data = self._load_data(self.split)
23
+
24
+ if self.split is None:
25
+ pass
26
+ elif self.split == 'train':
27
+ self.select_scene(('0015', '0022'), opposite=True)
28
+ elif self.split == 'val':
29
+ self.select_scene(('0015', '0022'))
30
+ else:
31
+ raise ValueError(f'bad {self.split=}')
32
+
33
+ def _load_data(self, split):
34
+ with np.load(osp.join(self.ROOT, 'all_metadata.npz')) as data:
35
+ self.all_scenes = data['scenes']
36
+ self.all_images = data['images']
37
+ self.pairs = data['pairs']
38
+
39
+ def __len__(self):
40
+ return len(self.pairs)
41
+
42
+ def get_stats(self):
43
+ return f'{len(self)} pairs from {len(self.all_scenes)} scenes'
44
+
45
+ def select_scene(self, scene, *instances, opposite=False):
46
+ scenes = (scene,) if isinstance(scene, str) else tuple(scene)
47
+ scene_id = [s.startswith(scenes) for s in self.all_scenes]
48
+ assert any(scene_id), 'no scene found'
49
+
50
+ valid = np.in1d(self.pairs['scene_id'], np.nonzero(scene_id)[0])
51
+ if instances:
52
+ image_id = [i.startswith(instances) for i in self.all_images]
53
+ image_id = np.nonzero(image_id)[0]
54
+ assert len(image_id), 'no instance found'
55
+ # both together?
56
+ if len(instances) == 2:
57
+ valid &= np.in1d(self.pairs['im1_id'], image_id) & np.in1d(self.pairs['im2_id'], image_id)
58
+ else:
59
+ valid &= np.in1d(self.pairs['im1_id'], image_id) | np.in1d(self.pairs['im2_id'], image_id)
60
+
61
+ if opposite:
62
+ valid = ~valid
63
+ assert valid.any()
64
+ self.pairs = self.pairs[valid]
65
+
66
+ def _get_views(self, pair_idx, resolution, rng):
67
+ scene_id, im1_id, im2_id, score = self.pairs[pair_idx]
68
+ im3_id = int((im1_id + im2_id) / 2)
69
+ scene, subscene = self.all_scenes[scene_id].split()
70
+ seq_path = osp.join(self.ROOT, scene, subscene)
71
+
72
+ views = []
73
+
74
+ for im_id in [im1_id, im2_id, im3_id]: # third (target) view is the midpoint frame computed above
75
+ img = self.all_images[im_id]
76
+ try:
77
+ image = imread_cv2(osp.join(seq_path, img + '.jpg'))
78
+ depthmap = imread_cv2(osp.join(seq_path, img + ".exr"))
79
+ camera_params = np.load(osp.join(seq_path, img + ".npz"))
80
+ except Exception as e:
81
+ raise OSError(f'cannot load {img}, got exception {e}')
82
+
83
+ intrinsics = np.float32(camera_params['intrinsics'])
84
+ camera_pose = np.float32(camera_params['cam2world'])
85
+
86
+ image, depthmap, intrinsics = self._crop_resize_if_necessary(
87
+ image, depthmap, intrinsics, resolution, rng, info=(seq_path, img))
88
+
89
+ views.append(dict(
90
+ img=image,
91
+ depthmap=depthmap,
92
+ camera_pose=camera_pose, # cam2world
93
+ camera_intrinsics=intrinsics,
94
+ dataset='MegaDepth',
95
+ label=osp.relpath(seq_path, self.ROOT),
96
+ instance=img))
97
+
98
+ return views
99
+
100
+
101
+ if __name__ == "__main__":
102
+ from dust3r.datasets.base.base_stereo_view_dataset import view_name
103
+ from dust3r.viz import SceneViz, auto_cam_size
104
+ from dust3r.utils.image import rgb
105
+
106
+ dataset = MegaDepth(split='train', ROOT="data/megadepth_processed", resolution=224, aug_crop=16)
107
+
108
+ for idx in np.random.permutation(len(dataset)):
109
+ views = dataset[idx]
110
+ assert len(views) == 3
111
+ print(idx, view_name(views[0]), view_name(views[1]), view_name(views[2]))
112
+ viz = SceneViz()
113
+ poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1, 2]]
114
+ cam_size = max(auto_cam_size(poses), 0.001)
115
+ for view_idx in [0, 1, 2]:
116
+ pts3d = views[view_idx]['pts3d']
117
+ valid_mask = views[view_idx]['valid_mask']
118
+ colors = rgb(views[view_idx]['img'])
119
+ viz.add_pointcloud(pts3d, colors, valid_mask)
120
+ viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],
121
+ focal=views[view_idx]['camera_intrinsics'][0, 0],
122
+ color=(idx * 255, (1 - idx) * 255, 0),
123
+ image=colors,
124
+ cam_size=cam_size)
125
+ viz.show()
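Note: the split above holds out MegaDepth scenes '0015' and '0022' for validation and trains on everything else (opposite=True inverts the selection). A small standalone sketch of that scene masking, using made-up scene names, is:

import numpy as np

all_scenes = np.array(['0001 0', '0015 0', '0022 1'])                    # made-up scene list
is_val = np.array([s.startswith(('0015', '0022')) for s in all_scenes])
val_scene_ids = np.nonzero(is_val)[0]                                    # kept when split == 'val'
train_scene_ids = np.nonzero(~is_val)[0]                                 # kept when split == 'train' (opposite=True)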
src/datasets/scannet.py ADDED
@@ -0,0 +1,109 @@
1
+ import os
2
+ import os.path as osp
3
+ import sys
4
+ sys.path.append("submodules/mast3r/dust3r")
5
+ from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset
6
+ import numpy as np
7
+ import cv2
8
+ from dust3r.utils.image import imread_cv2
9
+
10
+ class Scannet(BaseStereoViewDataset):
11
+ def __init__(self, *args, ROOT, **kwargs):
12
+ self.ROOT = ROOT
13
+ super().__init__(*args, **kwargs)
14
+ self.num_views = 3 # render third view
15
+ self._load_data()
16
+
17
+ def _load_data(self):
18
+ # Traverse all the folders in the data_root
19
+ scene_names = [folder for folder in os.listdir(self.ROOT) if os.path.isdir(os.path.join(self.ROOT, folder))]
20
+ # Filter out scenes without scene_data.npz
21
+ valid_scenes = []
22
+ for scene_name in scene_names:
23
+ scene_data_path = osp.join(self.ROOT, scene_name, "scene_data.npz")
24
+ if osp.exists(scene_data_path):
25
+ valid_scenes.append(scene_name)
26
+ else:
27
+ print(f"Skipping {scene_name}: scene_data.npz not found")
28
+ scene_names = valid_scenes
29
+ scene_names.sort()
30
+ if self.split == 'train':
31
+ scene_names = scene_names[:-150]
32
+ else:
33
+ scene_names = scene_names[-150:]
34
+ # merge all pairs and images
35
+ pairs = [] # (scene_name, image_idx1, image_idx2)
36
+ images = {} # (scene_name, image_idx) -> image_path
37
+ for scene_name in scene_names:
38
+ scene_path = osp.join(self.ROOT, scene_name, "scene_data.npz")
39
+ scene_data = np.load(scene_path)
40
+ pairs.extend([(scene_name, *pair) for pair in scene_data['pairs']])
41
+ images.update({(scene_name, idx): path for idx, path in enumerate(scene_data['images'])})
42
+ self.pairs = pairs
43
+ self.images = images
44
+
45
+ def __len__(self):
46
+ return len(self.pairs)
47
+
48
+ def _get_views(self, idx, resolution, rng):
49
+ scene_name, image_idx1, image_idx2, _ = self.pairs[idx]
50
+ image_idx1 = int(image_idx1)
51
+ image_idx2 = int(image_idx2)
52
+ image_idx3 = int((image_idx1 + image_idx2) / 2)
53
+ views = []
54
+ for view_idx in [image_idx1, image_idx2, image_idx3]:
55
+ basename = self.images[(scene_name, view_idx)]
56
+ # Load RGB image
57
+ rgb_path = osp.join(self.ROOT, scene_name, 'images', f'{basename}.jpg')
58
+ rgb_image = imread_cv2(rgb_path)
59
+ # Load depthmap
60
+ depthmap_path = osp.join(self.ROOT, scene_name, 'depths', f'{basename}.png')
61
+ depthmap = imread_cv2(depthmap_path, cv2.IMREAD_UNCHANGED)
62
+ depthmap = depthmap.astype(np.float32) / 1000
63
+ depthmap[~np.isfinite(depthmap)] = 0 # invalid
64
+ # Load camera parameters
65
+ meta_path = osp.join(self.ROOT, scene_name, 'images', f'{basename}.npz')
66
+ meta = np.load(meta_path)
67
+ intrinsics = meta['camera_intrinsics']
68
+ camera_pose = meta['camera_pose']
69
+ # crop if necessary
70
+ rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
71
+ rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx)
72
+ views.append(dict(
73
+ img=rgb_image,
74
+ depthmap=depthmap.astype(np.float32),
75
+ camera_pose=camera_pose.astype(np.float32),
76
+ camera_intrinsics=intrinsics.astype(np.float32),
77
+ dataset='ScanNet',
78
+ label=scene_name + '_' + basename,
79
+ instance=f'{str(idx)}_{str(view_idx)}',
80
+ ))
81
+ return views
82
+
83
+ if __name__ == "__main__":
84
+ from dust3r.datasets.base.base_stereo_view_dataset import view_name
85
+ from dust3r.viz import SceneViz, auto_cam_size
86
+ from dust3r.utils.image import rgb
87
+
88
+ dataset = Scannet(split='train', ROOT="data/scannet_processed", resolution=224, aug_crop=16)
89
+
90
+ print(len(dataset))
91
+
92
+ for idx in np.random.permutation(len(dataset)):
93
+ views = dataset[idx]
94
+ assert len(views) == 3
95
+ print(view_name(views[0]), view_name(views[1]), view_name(views[2]))
96
+ viz = SceneViz()
97
+ poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1, 2]]
98
+ cam_size = max(auto_cam_size(poses), 0.001)
99
+ for view_idx in [0, 1, 2]:
100
+ pts3d = views[view_idx]['pts3d']
101
+ valid_mask = views[view_idx]['valid_mask']
102
+ colors = rgb(views[view_idx]['img'])
103
+ viz.add_pointcloud(pts3d, colors, valid_mask)
104
+ viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],
105
+ focal=views[view_idx]['camera_intrinsics'][0, 0],
106
+ color=(idx*255, (1 - idx)*255, 0),
107
+ image=colors,
108
+ cam_size=cam_size)
109
+ viz.show()
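Note: the loader above expects one folder per scene containing images/<basename>.jpg, depths/<basename>.png (depth in millimetres), images/<basename>.npz (with camera_intrinsics and camera_pose), plus a scene_data.npz holding the frame list and the (i, j, iou) pairs produced by the preprocessing script. A rough sketch of a dummy scene that satisfies those assumptions, useful only for smoke-testing the loader, could look like this:

import os
import numpy as np

def make_dummy_scene(root, scene="scene0000_00", n_frames=3):
    # Writes only the metadata files; real RGB .jpg and 16-bit depth .png frames
    # come from the preprocessing pipeline.
    img_dir = os.path.join(root, scene, "images")
    dep_dir = os.path.join(root, scene, "depths")
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(dep_dir, exist_ok=True)
    names = [f"{i:06d}" for i in range(n_frames)]
    for name in names:
        np.savez(os.path.join(img_dir, f"{name}.npz"),
                 camera_intrinsics=np.eye(3, dtype=np.float32),
                 camera_pose=np.eye(4, dtype=np.float32))
    # a single (i, j, iou) pair, in the same format the preprocessing script emits
    np.savez_compressed(os.path.join(root, scene, "scene_data.npz"),
                        pairs=np.array([(0, 2, 0.5)], dtype=np.float32),
                        images=np.array(names))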
src/datasets/scannetpp.py ADDED
@@ -0,0 +1,107 @@
1
+ import os
2
+ import os.path as osp
3
+ import sys
4
+ sys.path.append("submodules/mast3r/dust3r")
5
+ from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset
6
+ import numpy as np
7
+ import cv2
8
+ from dust3r.utils.image import imread_cv2
9
+
10
+ class Scannetpp(BaseStereoViewDataset):
11
+ def __init__(self, *args, ROOT, **kwargs):
12
+ self.ROOT = ROOT
13
+ super().__init__(*args, **kwargs)
14
+ assert self.split == 'train' # just for training
15
+ self.num_views = 3 # render third view
16
+ self._load_data()
17
+
18
+ def _load_data(self):
19
+ # Traverse all the folders in the data_root
20
+ scene_names = [folder for folder in os.listdir(self.ROOT) if os.path.isdir(os.path.join(self.ROOT, folder))]
21
+ # Filter out scenes without scene_data.npz
22
+ valid_scenes = []
23
+ for scene_name in scene_names:
24
+ scene_data_path = osp.join(self.ROOT, scene_name, "scene_data.npz")
25
+ if osp.exists(scene_data_path):
26
+ valid_scenes.append(scene_name)
27
+ else:
28
+ print(f"Skipping {scene_name}: scene_data.npz not found")
29
+ scene_names = valid_scenes
30
+ scene_names.sort()
31
+
32
+ # merge all pairs and images
33
+ pairs = [] # (scene_name, image_idx1, image_idx2)
34
+ images = {} # (scene_name, image_idx) -> image_path
35
+ for scene_name in scene_names:
36
+ scene_path = osp.join(self.ROOT, scene_name, "scene_data.npz")
37
+ scene_data = np.load(scene_path)
38
+ pairs.extend([(scene_name, *pair) for pair in scene_data['pairs']])
39
+ images.update({(scene_name, idx): path for idx, path in enumerate(scene_data['images'])})
40
+ self.pairs = pairs
41
+ self.images = images
42
+
43
+ def __len__(self):
44
+ return len(self.pairs)
45
+
46
+ def _get_views(self, idx, resolution, rng):
47
+ scene_name, image_idx1, image_idx2, _ = self.pairs[idx]
48
+ image_idx1 = int(image_idx1)
49
+ image_idx2 = int(image_idx2)
50
+ image_idx3 = int((image_idx1 + image_idx2) / 2)
51
+ views = []
52
+ for view_idx in [image_idx1, image_idx2, image_idx3]:
53
+ basename = self.images[(scene_name, view_idx)]
54
+ # Load RGB image
55
+ rgb_path = osp.join(self.ROOT, scene_name, 'images', f'{basename}.JPG')
56
+ rgb_image = imread_cv2(rgb_path)
57
+ # Load depthmap
58
+ depthmap_path = osp.join(self.ROOT, scene_name, 'depths', f'{basename}.png')
59
+ depthmap = imread_cv2(depthmap_path, cv2.IMREAD_UNCHANGED)
60
+ depthmap = depthmap.astype(np.float32) / 1000
61
+ depthmap[~np.isfinite(depthmap)] = 0 # invalid
62
+ # Load camera parameters
63
+ meta_path = osp.join(self.ROOT, scene_name, 'images', f'{basename}.npz')
64
+ meta = np.load(meta_path)
65
+ intrinsics = meta['camera_intrinsics']
66
+ camera_pose = meta['camera_pose']
67
+ # crop if necessary
68
+ rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
69
+ rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx)
70
+ views.append(dict(
71
+ img=rgb_image,
72
+ depthmap=depthmap.astype(np.float32),
73
+ camera_pose=camera_pose.astype(np.float32),
74
+ camera_intrinsics=intrinsics.astype(np.float32),
75
+ dataset='ScanNet++',
76
+ label=scene_name + '_' + basename,
77
+ instance=f'{str(idx)}_{str(view_idx)}',
78
+ ))
79
+ return views
80
+
81
+ if __name__ == "__main__":
82
+ from dust3r.datasets.base.base_stereo_view_dataset import view_name
83
+ from dust3r.viz import SceneViz, auto_cam_size
84
+ from dust3r.utils.image import rgb
85
+
86
+ dataset = Scannetpp(split='train', ROOT="data/scannetpp_processed", resolution=224, aug_crop=16)
87
+
88
+ print(len(dataset))
89
+
90
+ for idx in np.random.permutation(len(dataset)):
91
+ views = dataset[idx]
92
+ assert len(views) == 3
93
+ print(view_name(views[0]), view_name(views[1]), view_name(views[2]))
94
+ viz = SceneViz()
95
+ poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1, 2]]
96
+ cam_size = max(auto_cam_size(poses), 0.001)
97
+ for view_idx in [0, 1, 2]:
98
+ pts3d = views[view_idx]['pts3d']
99
+ valid_mask = views[view_idx]['valid_mask']
100
+ colors = rgb(views[view_idx]['img'])
101
+ viz.add_pointcloud(pts3d, colors, valid_mask)
102
+ viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],
103
+ focal=views[view_idx]['camera_intrinsics'][0, 0],
104
+ color=(idx*255, (1 - idx)*255, 0),
105
+ image=colors,
106
+ cam_size=cam_size)
107
+ viz.show()
src/datasets_preprocess/scannet_preprocess.py ADDED
@@ -0,0 +1,209 @@
1
+ import os
2
+ import numpy as np
3
+ import cv2
4
+ import torch
5
+ import torch.multiprocessing as mp
6
+
7
+ def process_scene_on_gpu(gpu_id, scene_names, data_root, output_queue):
8
+ torch.cuda.set_device(gpu_id)
9
+ local_pairs = {}
10
+ local_images = {}
11
+
12
+ for scene_name in scene_names:
13
+ save_path = os.path.join(data_root, scene_name, "scene_data.npz")
14
+ if os.path.exists(save_path):
15
+ print(f"Scene {scene_name} already processed, skipping")
16
+ continue
17
+ pairs, images = process_scene(data_root, scene_name)
18
+ np.savez_compressed(save_path, pairs=pairs, images=images)
+ local_pairs[scene_name] = pairs # record per-scene results so the parent process can aggregate them
+ local_images[scene_name] = images
19
+
20
+ output_queue.put((local_pairs, local_images))
21
+
22
+ def preprocess_scannet(data_root, threads_per_gpu=4):
23
+ scene_names = [folder for folder in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, folder))]
24
+ num_gpus = torch.cuda.device_count()
25
+ total_threads = num_gpus * threads_per_gpu
26
+
27
+ # Distribute the scenes evenly across all worker threads
28
+ scenes_per_thread = [scene_names[i::total_threads] for i in range(total_threads)]
29
+
30
+ output_queue = mp.Queue()
31
+ processes = []
32
+
33
+ # Spawn multiple worker processes per GPU
34
+ for gpu_id in range(num_gpus):
35
+ for thread_id in range(threads_per_gpu):
36
+ process_id = gpu_id * threads_per_gpu + thread_id
37
+ p = mp.Process(
38
+ target=process_scene_on_gpu,
39
+ args=(gpu_id, scenes_per_thread[process_id], data_root, output_queue)
40
+ )
41
+ p.start()
42
+ processes.append(p)
43
+
44
+ # Collect the results from every worker process
45
+ all_pairs = {}
46
+ all_images = {}
47
+ for _ in range(total_threads):
48
+ local_pairs, local_images = output_queue.get()
49
+ all_pairs.update(local_pairs)
50
+ all_images.update(local_images)
51
+
52
+ # Wait for all processes to complete
53
+ for p in processes:
54
+ p.join()
55
+
56
+ # Save to npz file
57
+ np.savez_compressed(os.path.join(data_root, "scannet_image_pairs.npz"), **all_pairs)
58
+ np.savez_compressed(os.path.join(data_root, "scannet_images.npz"), **all_images)
59
+
60
+ # print the number of image pairs
61
+ # sum up the number of image pairs for all scenes
62
+ total_pairs = sum(len(pairs) for pairs in all_pairs.values())
63
+ print(f"Total number of image pairs: {total_pairs}")
64
+ return all_pairs, all_images
65
+
66
+ def process_scene(data_root, scene_name):
67
+ pairs = []
68
+ images_dir = os.path.join(data_root, scene_name, "images")
69
+ images = [os.path.splitext(file)[0] for file in os.listdir(images_dir) if file.endswith(".jpg")]
70
+ images.sort()
71
+
72
+ # Check validity of c2w for each image
73
+ valid_images = []
74
+ for image in images:
75
+ _, c2w, _ = load_image(data_root, scene_name, image)
76
+ if is_valid_c2w(c2w):
77
+ valid_images.append(image)
78
+ else:
79
+ print(f"Invalid c2w for image {image} in scene {scene_name}")
80
+
81
+ # generate image pairs
82
+ slide_window = 50
83
+ num_sub_intervals = 5
84
+
85
+ pairs = generate_image_pairs(data_root, scene_name, valid_images, slide_window, num_sub_intervals)
86
+ print(f"Scene {scene_name} has {len(pairs)} image pairs and {len(valid_images)} valid images out of {len(images)} total images")
87
+ return pairs, valid_images
88
+
89
+ def is_valid_c2w(c2w):
90
+ return not np.any(np.isinf(c2w)) and not np.any(np.isnan(c2w))
91
+
92
+ def generate_image_pairs(data_root, scene_name, images, slide_window, num_sub_intervals=3):
93
+ pairs = []
94
+ n = len(images)
95
+
96
+ # Define IOU sub-intervals
97
+ iou_range = (0.3, 0.8)
98
+ sub_interval_size = (iou_range[1] - iou_range[0]) / num_sub_intervals
99
+ sub_intervals = [(iou_range[0] + i * sub_interval_size, iou_range[0] + (i + 1) * sub_interval_size)
100
+ for i in range(num_sub_intervals)]
101
+
102
+ for i in range(n):
103
+ # Keep track of whether a pair has been added for each sub-interval
104
+ interval_selected = [False] * num_sub_intervals
105
+
106
+ for j in range(i+1, min(i + slide_window, n)):
107
+ # Break early if all sub-intervals have been selected
108
+ if all(interval_selected):
109
+ break
110
+
111
+ # Load image pair
112
+ depth1, c2w1, K1 = load_image(data_root, scene_name, images[i])
113
+ depth2, c2w2, K2 = load_image(data_root, scene_name, images[j])
114
+
115
+ # Calculate mean IoU
116
+ try:
117
+ iou_1 = calculate_iou(depth1, c2w1, K1, depth2, c2w2, K2)
118
+ iou_2 = calculate_iou(depth2, c2w2, K2, depth1, c2w1, K1)
119
+ except Exception as e:
120
+ print(f"Error calculating IoU for images {images[i]} and {images[j]} in scene {scene_name}: {str(e)}")
121
+ continue
122
+
123
+ mean_iou = (iou_1 + iou_2) / 2
124
+
125
+ # Check which sub-interval the mean IoU falls into
126
+ for idx, (lower, upper) in enumerate(sub_intervals):
127
+ if lower <= mean_iou <= upper and not interval_selected[idx]:
128
+ pairs.append((i, j, mean_iou))
129
+ interval_selected[idx] = True # Mark this interval as selected
130
+ break # Move to the next pair after adding one in the current sub-interval
131
+
132
+ return pairs
133
+
134
+
135
+ def load_image(data_root, scene_name, image_id):
136
+ # load depthmap
137
+ depth_path = f"{data_root}/{scene_name}/depths/{image_id}.png"
138
+ depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 1000.0
139
+ # load camera parameters
140
+ meta_path = f"{data_root}/{scene_name}/images/{image_id}.npz"
141
+ meta = np.load(meta_path)
142
+ c2w = meta['camera_pose']
143
+ K = meta['camera_intrinsics']
144
+ return depth, c2w, K
145
+
146
+ # Unproject depthmap to point cloud and project to another camera
147
+ def calculate_iou(depth1, c2w1, K1, depth2, c2w2, K2):
148
+ # Move data to GPU and ensure float32 dtype
149
+ depth1 = torch.from_numpy(depth1).cuda().float()
150
+ depth2 = torch.from_numpy(depth2).cuda().float()
151
+ c2w1 = torch.from_numpy(c2w1).cuda().float()
152
+ c2w2 = torch.from_numpy(c2w2).cuda().float()
153
+ K1 = torch.from_numpy(K1).cuda().float()
154
+ K2 = torch.from_numpy(K2).cuda().float()
155
+
156
+ # Get image dimensions
157
+ h, w = depth1.shape
158
+
159
+ # Create pixel coordinates
160
+ y, x = torch.meshgrid(torch.arange(h, device='cuda', dtype=torch.float32),
161
+ torch.arange(w, device='cuda', dtype=torch.float32))
162
+ pixels = torch.stack((x.flatten(), y.flatten(), torch.ones_like(x.flatten())), dim=-1).T
163
+
164
+ # Unproject pixels to 3D points
165
+ pixels_3d = torch.linalg.inv(K1) @ pixels
166
+ pixels_3d *= depth1.flatten().unsqueeze(0)
167
+
168
+ # Transform 3D points to world coordinates
169
+ pixels_world = c2w1[:3, :3] @ pixels_3d + c2w1[:3, 3:4]
170
+
171
+ # Check if c2w2[:3, :3] is invertible
172
+ if torch.det(c2w2[:3, :3]) == 0:
173
+ raise ValueError("c2w2 rotation is singular") # caught by the caller, which then skips this pair
174
+
175
+ # Project world points to second camera
176
+ pixels_cam2 = torch.linalg.inv(c2w2[:3, :3]) @ (pixels_world - c2w2[:3, 3:4])
177
+ pixels_img2 = K2 @ pixels_cam2
178
+
179
+ # Normalize homogeneous coordinates
180
+ pixels_img2 = pixels_img2[:2] / pixels_img2[2]
181
+ pixels_img2 = pixels_img2.T
182
+
183
+ # Filter valid pixels
184
+ valid_mask = (pixels_img2[:, 0] >= 0) & (pixels_img2[:, 0] < w) & \
185
+ (pixels_img2[:, 1] >= 0) & (pixels_img2[:, 1] < h)
186
+
187
+ pixels_img2 = pixels_img2[valid_mask].long()
188
+
189
+ # Compare depths
190
+ projected_depth = pixels_cam2[2, valid_mask]
191
+ actual_depth = depth2[pixels_img2[:, 1], pixels_img2[:, 0]]
192
+
193
+ depth_diff = torch.abs(projected_depth - actual_depth)
194
+ depth_threshold = 0.1 # 10cm threshold
195
+
196
+ overlap_mask = depth_diff < depth_threshold
197
+
198
+ # Calculate IoU
199
+ intersection = torch.sum(overlap_mask)
200
+ union = torch.sum(valid_mask) + torch.sum(depth2 > 0) - intersection
201
+
202
+ iou = intersection.float() / union.float() if union > 0 else torch.tensor(0.0, device='cuda')
203
+
204
+ return iou.item()
205
+
206
+ if __name__ == "__main__":
207
+ data_root = "data/scannet_processed"
208
+ # The number of worker threads per GPU can be set via threads_per_gpu
209
+ preprocess_scannet(data_root, threads_per_gpu=12)
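Note: for every anchor frame, generate_image_pairs keeps at most one partner per covisibility band, so each frame contributes pairs spanning low- to high-overlap viewpoints. The bands implied by the constants used above are easy to enumerate:

# IoU sub-intervals used by generate_image_pairs (constants taken from the code above)
iou_range, num_sub_intervals = (0.3, 0.8), 5
width = (iou_range[1] - iou_range[0]) / num_sub_intervals           # 0.1
bins = [(round(iou_range[0] + i * width, 2), round(iou_range[0] + (i + 1) * width, 2))
        for i in range(num_sub_intervals)]
print(bins)   # [(0.3, 0.4), (0.4, 0.5), (0.5, 0.6), (0.6, 0.7), (0.7, 0.8)]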
src/datasets_preprocess/scannetpp_preprocess.py ADDED
@@ -0,0 +1,227 @@
1
+ import os
2
+ import numpy as np
3
+ import cv2
4
+ import torch
5
+ import torch.multiprocessing as mp
6
+ import shutil
7
+
8
+ def process_scene_on_gpu(gpu_id, scene_names, data_root, target_root, output_queue):
9
+ torch.cuda.set_device(gpu_id)
10
+ local_pairs = {}
11
+ local_images = {}
12
+
13
+ for scene_name in scene_names:
14
+ save_path = os.path.join(target_root, scene_name, "scene_data.npz")
15
+ if os.path.exists(save_path):
16
+ print(f"Scene {scene_name} already processed, skipping")
17
+ continue
18
+ pairs, images = process_scene(data_root, target_root, scene_name)
19
+ np.savez_compressed(save_path, pairs=pairs, images=images)
+ local_pairs[scene_name] = pairs # record per-scene results so the parent process can aggregate them
+ local_images[scene_name] = images
20
+
21
+ output_queue.put((local_pairs, local_images))
22
+
23
+ def preprocess_scannetpp(data_root, target_root):
24
+ # Traverse all the folders in the data_root
25
+ scene_names = [folder for folder in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, folder))]
26
+
27
+ # Get the number of available GPUs
28
+ num_gpus = torch.cuda.device_count()
29
+
30
+ # Distribute scenes across GPUs
31
+ scenes_per_gpu = [scene_names[i::num_gpus] for i in range(num_gpus)]
32
+
33
+ # Create a multiprocessing queue to collect results
34
+ output_queue = mp.Queue()
35
+
36
+ # Launch parallel processes
37
+ processes = []
38
+ for gpu_id in range(num_gpus):
39
+ p = mp.Process(target=process_scene_on_gpu, args=(gpu_id, scenes_per_gpu[gpu_id], data_root, target_root, output_queue))
40
+ p.start()
41
+ processes.append(p)
42
+
43
+ # Collect results from all processes
44
+ all_pairs = {}
45
+ all_images = {}
46
+ for _ in range(num_gpus):
47
+ local_pairs, local_images = output_queue.get()
48
+ all_pairs.update(local_pairs)
49
+ all_images.update(local_images)
50
+
51
+ # Wait for all processes to complete
52
+ for p in processes:
53
+ p.join()
54
+
55
+ # Save to npz file
56
+ np.savez_compressed(os.path.join(data_root, "scannet_image_pairs.npz"), **all_pairs)
57
+ np.savez_compressed(os.path.join(data_root, "scannet_images.npz"), **all_images)
58
+
59
+ # print the number of image pairs
60
+ # sum up the number of image pairs for all scenes
61
+ total_pairs = sum(len(pairs) for pairs in all_pairs.values())
62
+ print(f"Total number of image pairs: {total_pairs}")
63
+ return all_pairs, all_images
64
+
65
+ # def preprocess_scannetpp(data_root, target_root):
66
+ # # Traverse all the folders in the data_root
67
+ # scene_names = [folder for folder in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, folder))]
68
+
69
+ # for scene_name in scene_names:
70
+ # save_path = os.path.join(target_root, scene_name, "scene_data.npz")
71
+ # if os.path.exists(save_path):
72
+ # print(f"Scene {scene_name} already processed, skipping")
73
+ # continue
74
+ # pairs, images = process_scene(data_root, target_root, scene_name)
75
+ # np.savez_compressed(save_path, pairs=pairs, images=images)
76
+
77
+ def process_scene(data_root, target_root, scene_name):
78
+ pairs = []
79
+ images_dir = os.path.join(data_root, scene_name, "images")
80
+ images = [os.path.splitext(file)[0] for file in os.listdir(images_dir) if file.endswith(".JPG")]
81
+ images.sort()
82
+ # copy images, depths, and camera parameters to target_root
83
+ os.makedirs(os.path.join(target_root, scene_name, "images"), exist_ok=True)
84
+ os.makedirs(os.path.join(target_root, scene_name, "depths"), exist_ok=True)
85
+ for image in images:
86
+ shutil.copy(os.path.join(data_root, scene_name, "images", f"{image}.JPG"), os.path.join(target_root, scene_name, "images", f"{image}.JPG"))
87
+ shutil.copy(os.path.join(data_root, scene_name, "depths", f"{image}.png"), os.path.join(target_root, scene_name, "depths", f"{image}.png"))
88
+ shutil.copy(os.path.join(data_root, scene_name, "images", f"{image}.npz"), os.path.join(target_root, scene_name, "images", f"{image}.npz"))
89
+
90
+ # Check validity of c2w for each image
91
+ valid_images = []
92
+ for image in images:
93
+ _, c2w, _ = load_image(data_root, scene_name, image)
94
+ if is_valid_c2w(c2w):
95
+ valid_images.append(image)
96
+ else:
97
+ print(f"Invalid c2w for image {image} in scene {scene_name}")
98
+
99
+ # generate image pairs
100
+ slide_window = 100
101
+ num_sub_intervals = 5
102
+
103
+ pairs = generate_image_pairs(data_root, scene_name, valid_images, slide_window, num_sub_intervals)
104
+ print(f"Scene {scene_name} has {len(pairs)} image pairs and {len(valid_images)} valid images out of {len(images)} total images")
105
+ return pairs, valid_images
106
+
107
+ def is_valid_c2w(c2w):
108
+ return not np.any(np.isinf(c2w)) and not np.any(np.isnan(c2w))
109
+
110
+ def generate_image_pairs(data_root, scene_name, images, slide_window, num_sub_intervals=3):
111
+ pairs = []
112
+ n = len(images)
113
+
114
+ # Define IOU sub-intervals
115
+ iou_range = (0.3, 0.8)
116
+ sub_interval_size = (iou_range[1] - iou_range[0]) / num_sub_intervals
117
+ sub_intervals = [(iou_range[0] + i * sub_interval_size, iou_range[0] + (i + 1) * sub_interval_size)
118
+ for i in range(num_sub_intervals)]
119
+
120
+ for i in range(n):
121
+ # Keep track of whether a pair has been added for each sub-interval
122
+ interval_selected = [False] * num_sub_intervals
123
+
124
+ for j in range(i+1, min(i + slide_window, n)):
125
+ # Break early if all sub-intervals have been selected
126
+ if all(interval_selected):
127
+ break
128
+
129
+ # Load image pair
130
+ depth1, c2w1, K1 = load_image(data_root, scene_name, images[i])
131
+ depth2, c2w2, K2 = load_image(data_root, scene_name, images[j])
132
+
133
+ # Calculate mean IoU
134
+ try:
135
+ iou_1 = calculate_iou(depth1, c2w1, K1, depth2, c2w2, K2)
136
+ iou_2 = calculate_iou(depth2, c2w2, K2, depth1, c2w1, K1)
137
+ except Exception as e:
138
+ print(f"Error calculating IoU for images {images[i]} and {images[j]} in scene {scene_name}: {str(e)}")
139
+ continue
140
+
141
+ mean_iou = (iou_1 + iou_2) / 2
142
+
143
+ # Check which sub-interval the mean IoU falls into
144
+ for idx, (lower, upper) in enumerate(sub_intervals):
145
+ if lower <= mean_iou <= upper and not interval_selected[idx]:
146
+ pairs.append((i, j, mean_iou))
147
+ interval_selected[idx] = True # Mark this interval as selected
148
+ break # Move to the next pair after adding one in the current sub-interval
149
+
150
+ return pairs
151
+
152
+
153
+ def load_image(data_root, scene_name, image_id):
154
+ # load depthmap
155
+ depth_path = f"{data_root}/{scene_name}/depths/{image_id}.png"
156
+ depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 1000.0
157
+ # load camera parameters
158
+ meta_path = f"{data_root}/{scene_name}/images/{image_id}.npz"
159
+ meta = np.load(meta_path)
160
+ c2w = meta['camera_pose']
161
+ K = meta['camera_intrinsics']
162
+ return depth, c2w, K
163
+
164
+ # Unproject depthmap to point cloud and project to another camera
165
+ def calculate_iou(depth1, c2w1, K1, depth2, c2w2, K2):
166
+ # Move data to GPU and ensure float32 dtype
167
+ depth1 = torch.from_numpy(depth1).cuda().float()
168
+ depth2 = torch.from_numpy(depth2).cuda().float()
169
+ c2w1 = torch.from_numpy(c2w1).cuda().float()
170
+ c2w2 = torch.from_numpy(c2w2).cuda().float()
171
+ K1 = torch.from_numpy(K1).cuda().float()
172
+ K2 = torch.from_numpy(K2).cuda().float()
173
+
174
+ # Get image dimensions
175
+ h, w = depth1.shape
176
+
177
+ # Create pixel coordinates
178
+ y, x = torch.meshgrid(torch.arange(h, device='cuda', dtype=torch.float32),
179
+ torch.arange(w, device='cuda', dtype=torch.float32))
180
+ pixels = torch.stack((x.flatten(), y.flatten(), torch.ones_like(x.flatten())), dim=-1).T
181
+
182
+ # Unproject pixels to 3D points
183
+ pixels_3d = torch.linalg.inv(K1) @ pixels
184
+ pixels_3d *= depth1.flatten().unsqueeze(0)
185
+
186
+ # Transform 3D points to world coordinates
187
+ pixels_world = c2w1[:3, :3] @ pixels_3d + c2w1[:3, 3:4]
188
+
189
+ # Check if c2w2[:3, :3] is invertible
190
+ if torch.det(c2w2[:3, :3]) == 0:
191
+ raise ValueError("c2w2 rotation is singular") # caught by the caller, which then skips this pair
192
+
193
+ # Project world points to second camera
194
+ pixels_cam2 = torch.linalg.inv(c2w2[:3, :3]) @ (pixels_world - c2w2[:3, 3:4])
195
+ pixels_img2 = K2 @ pixels_cam2
196
+
197
+ # Normalize homogeneous coordinates
198
+ pixels_img2 = pixels_img2[:2] / pixels_img2[2]
199
+ pixels_img2 = pixels_img2.T
200
+
201
+ # Filter valid pixels
202
+ valid_mask = (pixels_img2[:, 0] >= 0) & (pixels_img2[:, 0] < w) & \
203
+ (pixels_img2[:, 1] >= 0) & (pixels_img2[:, 1] < h)
204
+
205
+ pixels_img2 = pixels_img2[valid_mask].long()
206
+
207
+ # Compare depths
208
+ projected_depth = pixels_cam2[2, valid_mask]
209
+ actual_depth = depth2[pixels_img2[:, 1], pixels_img2[:, 0]]
210
+
211
+ depth_diff = torch.abs(projected_depth - actual_depth)
212
+ depth_threshold = 0.1 # 10cm threshold
213
+
214
+ overlap_mask = depth_diff < depth_threshold
215
+
216
+ # Calculate IoU
217
+ intersection = torch.sum(overlap_mask)
218
+ union = torch.sum(valid_mask) + torch.sum(depth2 > 0) - intersection
219
+
220
+ iou = intersection.float() / union.float() if union > 0 else torch.tensor(0.0, device='cuda')
221
+
222
+ return iou.item()
223
+
224
+ if __name__ == "__main__":
225
+ data_root = "data/scannetpp_processed"
226
+ target_root = "data/scannetpp_target"
227
+ preprocess_scannetpp(data_root, target_root)
src/gaussian_head.py ADDED
@@ -0,0 +1,142 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from einops import rearrange
4
+ from src.utils.gaussian_model import build_covariance
5
+ from simple_knn._C import distCUDA2
6
+ from src.utils.sh_utils import RGB2SH
7
+
8
+ class GaussianHead(nn.Module):
9
+ def __init__(self, d_pt_feat=64, **kwargs):
10
+ super().__init__()
11
+ # args
12
+ self.args = kwargs
13
+ self.d_means = 3
14
+ self.d_scales = 3
15
+ self.d_rotations = 4
16
+ self.d_opacities = 1
17
+ self.sh_degree = 3
18
+ self.d_view_dep_features = 3 # RGB
19
+ self.d_sh = (self.sh_degree + 1) ** 2
20
+ self.d_attr = (self.d_scales + self.d_rotations + self.d_opacities + self.d_view_dep_features * self.d_sh)
21
+ if self.args.get('d_gs_feats'):
22
+ self.d_attr += self.args['d_gs_feats']
23
+
24
+ # Create a mask for the spherical harmonics coefficients.
25
+ # This ensures that at initialization, the coefficients are biased
26
+ # towards having a large DC component and small view-dependent components.
27
+ self.register_buffer(
28
+ "sh_mask",
29
+ torch.ones((self.d_sh,), dtype=torch.float32),
30
+ persistent=False,
31
+ )
32
+ for degree in range(1, self.sh_degree + 1):
33
+ self.sh_mask[degree**2 : (degree + 1) ** 2] = 0.5 * 0.25**degree
34
+
35
+ self.gaussian_proj = nn.Linear(d_pt_feat, self.d_attr)
36
+
37
+ # Activation functions
38
+ self.scale_activation = torch.exp
39
+ self.rotation_activation = torch.nn.functional.normalize
40
+ self.opacity_activation = torch.sigmoid
41
+
42
+ def forward(self, point_transformer_output, lseg_features=None):
43
+ pred1 = {}
44
+ pred2 = {}
45
+
46
+ scene_scale = point_transformer_output['scale'] # B, 1, 1
47
+ scene_center = point_transformer_output['center'] # B, 1, 3
48
+ B, H, W, _ = point_transformer_output['shape']
49
+ normalized_means = point_transformer_output['coord'] # B * V * H * W, 3
50
+ colors = point_transformer_output['color'] # B * V * H * W, 3
51
+
52
+ # split normalized_means to 2 views
53
+ normalized_means = rearrange(normalized_means, '(b v h w) c -> v b (h w) c', v=2, b=B, h=H, w=W)
54
+ means = normalized_means * scene_scale + scene_center # V, B, H * W, 3
55
+ means = rearrange(means, 'v b (h w) c -> b (v h w) c', b=B, v=2, h=H, w=W)
56
+
57
+ # get features
58
+ feat = point_transformer_output['feat']
59
+ gaussian_attr = self.gaussian_proj(feat)
60
+
61
+ # # split gaussian attributes
62
+ # scales, rotations, opacities, sh_coeffs = torch.split(gaussian_attr,
63
+ # [
64
+ # self.d_scales,
65
+ # self.d_rotations,
66
+ # self.d_opacities,
67
+ # self.d_view_dep_features * self.d_sh
68
+ # ],
69
+ # dim=-1)
70
+
71
+ scales, rotations, opacities, sh_coeffs, gs_feats = torch.split(gaussian_attr,
72
+ [
73
+ self.d_scales,
74
+ self.d_rotations,
75
+ self.d_opacities,
76
+ self.d_view_dep_features * self.d_sh,
77
+ self.args['d_gs_feats']
78
+ ],
79
+ dim=-1)
80
+
81
+ # scales
82
+ # calculate the distance between each point and its nearest neighbor
83
+ all_dist = torch.stack([torch.sqrt(torch.clamp_min(distCUDA2(pts3d), 0.0000001)) for pts3d in means]) # B, V * H * W
84
+ median_dist = all_dist.median(dim=-1)[0][:, None, None] # B, 1, 1
85
+ scales = self.scale_activation(scales)
86
+ scales = rearrange(scales, '(b v h w) c -> b (v h w) c', b=B, v=2, h=H, w=W)
87
+ scales = scales * all_dist[..., None]
88
+ # clip scales
89
+ scales = torch.clamp(scales, min=0.1 * median_dist, max=3.0 * median_dist)
90
+ scales = rearrange(scales, 'b (v h w) c -> (b v h w) c', b=B, v=2, h=H, w=W)
91
+
92
+ # activation
93
+ rotations = self.rotation_activation(rotations)
94
+ opacities = self.opacity_activation(opacities)
95
+
96
+ # build covariance matrix
97
+ covs = build_covariance(scales, rotations)
98
+
99
+ # sh_mask
100
+ sh_coeffs = rearrange(sh_coeffs, '(b v h w) (c d) -> (b v h w) c d', b=B, v=2, h=H, w=W, c=self.d_sh, d=self.d_view_dep_features)
101
+ sh_dc = sh_coeffs[..., 0, :]
102
+ sh_rest = sh_coeffs[..., 1:, :]
103
+ if self.args.get('rgb_residual'):
104
+ # denormalize colors
105
+ colors = colors * 0.5 + 0.5
106
+ sh_rgb = RGB2SH(colors) # (B * V * H * W, 3)
107
+ # add rgb residual to dc component
108
+ sh_dc = sh_dc + sh_rgb
109
+ # concatenate dc and rest
110
+ sh_coeffs = torch.cat([sh_dc[..., None, :], sh_rest], dim=-2)
111
+ sh_coeffs = sh_coeffs * self.sh_mask[None, :, None]
112
+
113
+ # lseg_features(learning residual)
114
+ lseg_features = rearrange(lseg_features, '(v b) c h w -> (b v h w) c', b=B, v=2, h=H, w=W)
115
+ gs_feats = gs_feats + lseg_features
116
+
117
+ # split to 2 views
118
+ scales = rearrange(scales, '(b v h w) ... -> v b h w ...', v=2, b=B, h=H, w=W)
119
+ rotations = rearrange(rotations, '(b v h w) ... -> v b h w ...', v=2, b=B, h=H, w=W)
120
+ opacities = rearrange(opacities, '(b v h w) ... -> v b h w ...', v=2, b=B, h=H, w=W)
121
+ sh_coeffs = rearrange(sh_coeffs, '(b v h w) ... -> v b h w ...', v=2, b=B, h=H, w=W)
122
+ covs = rearrange(covs, '(b v h w) ... -> v b h w ...', v=2, b=B, h=H, w=W)
123
+ means = rearrange(means, 'b (v h w) ... -> v b h w ...', v=2, b=B, h=H, w=W)
124
+ gs_feats = rearrange(gs_feats, '(b v h w) ... -> v b h w ...', v=2, b=B, h=H, w=W)
125
+
126
+ pred1['scales'] = scales[0]
127
+ pred1['rotations'] = rotations[0]
128
+ pred1['covs'] = covs[0]
129
+ pred1['opacities'] = opacities[0]
130
+ pred1['sh_coeffs'] = sh_coeffs[0]
131
+ pred1['means'] = means[0]
132
+ pred1['gs_feats'] = gs_feats[0]
133
+
134
+ pred2['scales'] = scales[1]
135
+ pred2['rotations'] = rotations[1]
136
+ pred2['covs'] = covs[1]
137
+ pred2['opacities'] = opacities[1]
138
+ pred2['sh_coeffs'] = sh_coeffs[1]
139
+ pred2['means'] = means[1]
140
+ pred2['gs_feats'] = gs_feats[1]
141
+
142
+ return pred1, pred2
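Note: the sh_mask built in the constructor damps every higher-order spherical-harmonics band by 0.5 * 0.25**degree, so at initialization the predicted colour is dominated by the view-independent DC term. The per-degree factors for sh_degree = 3 can be checked in isolation:

import torch

sh_degree = 3
d_sh = (sh_degree + 1) ** 2                     # 16 SH coefficients in total
sh_mask = torch.ones(d_sh)
for degree in range(1, sh_degree + 1):
    sh_mask[degree**2:(degree + 1)**2] = 0.5 * 0.25**degree
# degree 0 -> 1.0, degree 1 -> 0.125, degree 2 -> 0.03125, degree 3 -> 0.0078125
print(sh_mask)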
src/infer.py ADDED
@@ -0,0 +1,23 @@
1
+ import argparse
2
+ import sys
3
+
4
+ sys.path.append('.')
5
+ from src.model import LSM_MASt3R
6
+ from src.utils.visualization_utils import render_video_from_file
7
+
8
+ if __name__ == '__main__':
9
+ parser = argparse.ArgumentParser()
10
+ parser.add_argument('--file_list', type=str, nargs='+', required=True,
11
+ help='List of input image files or directories')
12
+ parser.add_argument('--model_path', type=str, required=True)
13
+ parser.add_argument('--output_path', type=str, required=True)
14
+ parser.add_argument('--resolution', type=int, default=512)
15
+ parser.add_argument('--n_interp', type=int, default=90)
16
+ parser.add_argument('--fps', type=int, default=30)
17
+
18
+ args = parser.parse_args()
19
+
20
+ # 1. load model
21
+ model = LSM_MASt3R.from_pretrained(args.model_path)
22
+ # 2. render video
23
+ render_video_from_file(args.file_list, model, args.output_path, resolution=args.resolution, n_interp=args.n_interp, fps=args.fps)
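Note: a programmatic equivalent of the CLI above, e.g. for use from a notebook; all paths below are illustrative placeholders, and the call mirrors the argparse defaults defined in infer.py:

import sys
sys.path.append('.')
from src.model import LSM_MASt3R
from src.utils.visualization_utils import render_video_from_file

model = LSM_MASt3R.from_pretrained("checkpoints/pretrained_model")        # placeholder checkpoint path
render_video_from_file(
    ["examples/view_0.jpg", "examples/view_1.jpg"],                       # two input views (placeholders)
    model,
    "output/scene.mp4",                                                   # placeholder output path
    resolution=512, n_interp=90, fps=30)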
src/losses.py ADDED
@@ -0,0 +1,193 @@
1
+ from submodules.mast3r.dust3r.dust3r.losses import *
2
+ from torchmetrics import PeakSignalNoiseRatio, StructuralSimilarityIndexMeasure, JaccardIndex, Accuracy
3
+ import lpips
4
+ from src.utils.gaussian_model import GaussianModel
5
+ from src.utils.cuda_splatting import render, DummyPipeline
6
+ from einops import rearrange
7
+ from src.utils.camera_utils import get_scaled_camera
8
+ from torchvision.utils import save_image
9
+ from dust3r.inference import make_batch_symmetric
10
+
11
+ class L2Loss (LLoss):
12
+ """ Euclidean distance between 3d points """
13
+
14
+ def distance(self, a, b):
15
+ return torch.norm(a - b, dim=-1) # per-point Euclidean (L2) distance
16
+
17
+ class L1Loss (LLoss):
18
+ """ Manhattan distance between 3d points """
19
+
20
+ def distance(self, a, b):
21
+ return torch.abs(a - b).mean()  # mean absolute (L1) distance
22
+
23
+ L2 = L2Loss()
24
+ L1 = L1Loss()
25
+
26
+ def merge_and_split_predictions(pred1, pred2):
27
+ merged = {}
28
+ for key in pred1.keys():
29
+ merged_pred = torch.stack([pred1[key], pred2[key]], dim=1)
30
+ merged_pred = rearrange(merged_pred, 'b v h w ... -> b (v h w) ...')
31
+ merged[key] = merged_pred
32
+
33
+ # Split along the batch dimension
34
+ batch_size = next(iter(merged.values())).shape[0]
35
+ split = [{key: value[i] for key, value in merged.items()} for i in range(batch_size)]
36
+
37
+ return split
38
+
39
+ class GaussianLoss(MultiLoss):
40
+ def __init__(self, ssim_weight=0.2):
41
+ super().__init__()
42
+ self.ssim_weight = ssim_weight
43
+ self.ssim = StructuralSimilarityIndexMeasure(data_range=1.0).cuda()
44
+ self.psnr = PeakSignalNoiseRatio(data_range=1.0).cuda()
45
+ self.lpips_vgg = lpips.LPIPS(net='vgg').cuda()
46
+ self.pipeline = DummyPipeline()
47
+ # bg_color
48
+ self.register_buffer('bg_color', torch.tensor([0.0, 0.0, 0.0]).cuda())
49
+
50
+ def get_name(self):
51
+ return f'GaussianLoss(ssim_weight={self.ssim_weight})'
52
+
53
+ # def compute_loss(self, gt1, gt2, target_view, pred1, pred2, model):
54
+ # # render images
55
+ # # 1. merge predictions
56
+ # pred = merge_and_split_predictions(pred1, pred2)
57
+
58
+ # # 2. calculate optimal scaling
59
+ # pred_pts1 = pred1['means']
60
+ # pred_pts2 = pred2['means']
61
+ # # convert to camera1 coordinates
62
+ # # everything is normalized w.r.t. camera of view1
63
+ # valid1 = gt1['valid_mask'].clone()
64
+ # valid2 = gt2['valid_mask'].clone()
65
+ # in_camera1 = inv(gt1['camera_pose'])
66
+ # gt_pts1 = geotrf(in_camera1, gt1['pts3d'].to(in_camera1.device)) # B,H,W,3
67
+ # gt_pts2 = geotrf(in_camera1, gt2['pts3d'].to(in_camera1.device)) # B,H,W,3
68
+ # scaling = find_opt_scaling(gt_pts1, gt_pts2, pred_pts1, pred_pts2, valid1=valid1, valid2=valid2)
69
+
70
+ # # 3. render images(need gaussian model, camera, pipeline)
71
+ # rendered_images = []
72
+ # rendered_feats = []
73
+ # for i in range(len(pred)):
74
+ # # get gaussian model
75
+ # gaussians = GaussianModel.from_predictions(pred[i], sh_degree=3)
76
+ # # get camera
77
+ # ref_camera_extrinsics = gt1['camera_pose'][i]
78
+ # target_extrinsics = target_view['camera_pose'][i]
79
+ # target_intrinsics = target_view['camera_intrinsics'][i]
80
+ # image_shape = target_view['true_shape'][i]
81
+ # scale = scaling[i]
82
+ # camera = get_scaled_camera(ref_camera_extrinsics, target_extrinsics, target_intrinsics, scale, image_shape)
83
+ # # render(image and features)
84
+ # rendered_output = render(camera, gaussians, self.pipeline, self.bg_color)
85
+ # rendered_images.append(rendered_output['render'])
86
+ # rendered_feats.append(rendered_output['feature_map'])
87
+
88
+ # rendered_images = torch.stack(rendered_images, dim=0) # B, 3, H, W
89
+ # rendered_feats = torch.stack(rendered_feats, dim=0) # B, d_feats, H, W
90
+ # rendered_feats = model.feature_expansion(rendered_feats) # B, 512, H//2, W//2
91
+
92
+ # gt_images = target_view['img'] * 0.5 + 0.5
93
+ # gt_feats = model.lseg_feature_extractor.extract_features(target_view['img']) # B, 512, H//2, W//2
94
+ # image_loss = torch.abs(rendered_images - gt_images).mean()
95
+ # feature_loss = torch.abs(rendered_feats - gt_feats).mean()
96
+ # loss = image_loss + 100 * feature_loss
97
+
98
+ # # # temp
99
+ # # gt_logits = model.lseg_feature_extractor.decode_feature(gt_feats, ['wall', 'floor', 'others'])
100
+ # # gt_labels = torch.argmax(gt_logits, dim=1, keepdim=True)
101
+ # # rendered_logits = model.lseg_feature_extractor.decode_feature(rendered_feats, ['wall', 'floor', 'others'])
102
+ # # rendered_labels = torch.argmax(rendered_logits, dim=1, keepdim=True)
103
+
104
+ # # calculate metric
105
+ # with torch.no_grad():
106
+ # ssim = self.ssim(rendered_images, gt_images)
107
+ # psnr = self.psnr(rendered_images, gt_images)
108
+ # lpips = self.lpips_vgg(rendered_images, gt_images).mean()
109
+
110
+ # return loss, {'ssim': ssim, 'psnr': psnr, 'lpips': lpips, 'image_loss': image_loss, 'feature_loss': feature_loss}
111
+
112
+ def compute_loss(self, gt1, gt2, target_view, pred1, pred2, model):
113
+ # render images
114
+ # 1. merge predictions
115
+ pred = merge_and_split_predictions(pred1, pred2)
116
+
117
+ # 2. calculate optimal scaling
118
+ pred_pts1 = pred1['means']
119
+ pred_pts2 = pred2['means']
120
+ # convert to camera1 coordinates
121
+ # everything is normalized w.r.t. camera of view1
122
+ valid1 = gt1['valid_mask'].clone()
123
+ valid2 = gt2['valid_mask'].clone()
124
+ in_camera1 = inv(gt1['camera_pose'])
125
+ gt_pts1 = geotrf(in_camera1, gt1['pts3d'].to(in_camera1.device)) # B,H,W,3
126
+ gt_pts2 = geotrf(in_camera1, gt2['pts3d'].to(in_camera1.device)) # B,H,W,3
127
+ scaling = find_opt_scaling(gt_pts1, gt_pts2, pred_pts1, pred_pts2, valid1=valid1, valid2=valid2)
128
+
129
+ # 3. render images(need gaussian model, camera, pipeline)
130
+ rendered_images = []
131
+ rendered_feats = []
132
+ gt_images = []
133
+
134
+ for i in range(len(pred)):
135
+ # get gaussian model
136
+ gaussians = GaussianModel.from_predictions(pred[i], sh_degree=3)
137
+ # get camera
138
+ ref_camera_extrinsics = gt1['camera_pose'][i]
139
+ target_view_list = [gt1, gt2, target_view] # use gt1, gt2, and target_view
140
+ for j in range(len(target_view_list)):
141
+ target_extrinsics = target_view_list[j]['camera_pose'][i]
142
+ target_intrinsics = target_view_list[j]['camera_intrinsics'][i]
143
+ image_shape = target_view_list[j]['true_shape'][i]
144
+ scale = scaling[i]
145
+ camera = get_scaled_camera(ref_camera_extrinsics, target_extrinsics, target_intrinsics, scale, image_shape)
146
+ # render(image and features)
147
+ rendered_output = render(camera, gaussians, self.pipeline, self.bg_color)
148
+ rendered_images.append(rendered_output['render'])
149
+ rendered_feats.append(rendered_output['feature_map'])
150
+ gt_images.append(target_view_list[j]['img'][i] * 0.5 + 0.5)
151
+
152
+ rendered_images = torch.stack(rendered_images, dim=0) # B, 3, H, W
153
+ gt_images = torch.stack(gt_images, dim=0)
154
+ rendered_feats = torch.stack(rendered_feats, dim=0) # B, d_feats, H, W
155
+ rendered_feats = model.feature_expansion(rendered_feats) # B, 512, H//2, W//2
156
+ gt_feats = model.lseg_feature_extractor.extract_features(gt_images) # B, 512, H//2, W//2
157
+ image_loss = torch.abs(rendered_images - gt_images).mean()
158
+ feature_loss = torch.abs(rendered_feats - gt_feats).mean()
159
+ loss = image_loss + feature_loss
160
+
161
+ # calculate metric
162
+ with torch.no_grad():
163
+ ssim = self.ssim(rendered_images, gt_images)
164
+ psnr = self.psnr(rendered_images, gt_images)
165
+ lpips = self.lpips_vgg(rendered_images, gt_images).mean()
166
+
167
+ return loss, {'ssim': ssim, 'psnr': psnr, 'lpips': lpips, 'image_loss': image_loss, 'feature_loss': feature_loss}
168
+
169
+ # loss for one batch
170
+ def loss_of_one_batch(batch, model, criterion, device, symmetrize_batch=False, use_amp=False, ret=None):
171
+ view1, view2, target_view = batch
172
+ ignore_keys = set(['depthmap', 'dataset', 'label', 'instance', 'idx', 'true_shape', 'rng', 'pts3d'])
173
+ for view in batch:
174
+ for name in view.keys(): # pseudo_focal
175
+ if name in ignore_keys:
176
+ continue
177
+ view[name] = view[name].to(device, non_blocking=True)
178
+
179
+ if symmetrize_batch:
180
+ view1, view2 = make_batch_symmetric(batch)
181
+
182
+ # Get the actual model if it's distributed
183
+ actual_model = model.module if hasattr(model, 'module') else model
184
+
185
+ with torch.cuda.amp.autocast(enabled=bool(use_amp)):
186
+ pred1, pred2 = actual_model(view1, view2)
187
+
188
+ # loss is supposed to be symmetric
189
+ with torch.cuda.amp.autocast(enabled=False):
190
+ loss = criterion(view1, view2, target_view, pred1, pred2, actual_model) if criterion is not None else None
191
+
192
+ result = dict(view1=view1, view2=view2, target_view=target_view, pred1=pred1, pred2=pred2, loss=loss)
193
+ return result[ret] if ret else result
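A shape-level sketch of what merge_and_split_predictions does, with dummy tensors (illustrative only, not part of the committed file): the two per-view prediction dicts are stacked along a view axis, flattened into a single Gaussian set of size 2*H*W, and then split per batch element.

import torch
from einops import rearrange

B, H, W = 2, 4, 6
pred1 = {'means': torch.randn(B, H, W, 3), 'opacities': torch.rand(B, H, W, 1)}
pred2 = {'means': torch.randn(B, H, W, 3), 'opacities': torch.rand(B, H, W, 1)}

# stack the two views, then flatten (v h w) into one Gaussian set per sample
merged = {k: rearrange(torch.stack([pred1[k], pred2[k]], dim=1),
                       'b v h w ... -> b (v h w) ...') for k in pred1}
per_sample = [{k: v[i] for k, v in merged.items()} for i in range(B)]

assert merged['means'].shape == (B, 2 * H * W, 3)
assert per_sample[0]['opacities'].shape == (2 * H * W, 1)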
src/lseg.py ADDED
@@ -0,0 +1,171 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from submodules.lang_seg.modules.models.lseg_net import LSegNet, clip
4
+
5
+ class LSegFeatureExtractor(LSegNet):
6
+ def __init__(self, half_res=True):
7
+ super().__init__(
8
+ labels='',
9
+ backbone='clip_vitl16_384',
10
+ features=256,
11
+ crop_size=224,
12
+ arch_option=0,
13
+ block_depth=0,
14
+ activation='lrelu'
15
+ )
16
+
17
+ self.half_res = half_res
18
+
19
+ @torch.no_grad()
20
+ def extract_features(self, x):
21
+ layer_1, layer_2, layer_3, layer_4 = forward_layers(self.pretrained, x)
22
+ # layer:(b, 1024, h//16, w//16)
23
+ # image_features = torch.cat([layer_1, layer_2, layer_3, layer_4], dim=1)
24
+ # # image_features:(b, 4096, h//16, w//16)
25
+
26
+ # dense feature
27
+ # DPT head
28
+ pretrained = self.pretrained
29
+ layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
30
+ layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
31
+ layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
32
+ layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
33
+
34
+ # refinenet
35
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
36
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
37
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
38
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
39
+
40
+ path_4 = self.scratch.refinenet4(layer_4_rn)
41
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
42
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
43
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
44
+
45
+ # (b, 512, h//2, w//2)
46
+ image_features = self.scratch.head1(path_1)
47
+ if self.half_res:
48
+ return image_features
49
+
50
+ # (b, 512, h, w)
51
+ image_features = self.scratch.output_conv(image_features)
52
+
53
+ return image_features
54
+
55
+ @torch.no_grad()
56
+ def decode_feature(self, image_features, labelset=''):
57
+ # # image_features:(b, 4096, h//16, w//16)
58
+ # # split image_features into 4 parts
59
+ # layer_1, layer_2, layer_3, layer_4 = torch.split(image_features, 1024, dim=1)
60
+
61
+ # # DPT head
62
+ # pretrained = self.pretrained
63
+ # layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
64
+ # layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
65
+ # layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
66
+ # layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
67
+
68
+ # # refinenet
69
+ # layer_1_rn = self.scratch.layer1_rn(layer_1)
70
+ # layer_2_rn = self.scratch.layer2_rn(layer_2)
71
+ # layer_3_rn = self.scratch.layer3_rn(layer_3)
72
+ # layer_4_rn = self.scratch.layer4_rn(layer_4)
73
+
74
+ # path_4 = self.scratch.refinenet4(layer_4_rn)
75
+ # path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
76
+ # path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
77
+ # path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
78
+
79
+ # image_features = self.scratch.head1(path_1)
80
+ imshape = image_features.shape
81
+
82
+ # encode text
83
+ if labelset == '':
84
+ text = self.text
85
+ else:
86
+ text = clip.tokenize(labelset)
87
+
88
+ self.logit_scale = self.logit_scale.to(image_features.device)
89
+ text = text.to(image_features.device)
90
+ text_features = self.clip_pretrained.encode_text(text)
91
+ image_features = image_features.permute(0,2,3,1).reshape(-1, self.out_c)
92
+
93
+ # normalized features
94
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
95
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
96
+
97
+ logits_per_image = self.logit_scale * image_features.half() @ text_features.t()
98
+ out = logits_per_image.float().view(imshape[0], imshape[2], imshape[3], -1).permute(0,3,1,2)
99
+
100
+ if self.arch_option in [1, 2]:
101
+ for _ in range(self.block_depth - 1):
102
+ out = self.scratch.head_block(out)
103
+ out = self.scratch.head_block(out, False)
104
+
105
+ if self.half_res:
106
+ out = self.scratch.output_conv(out)
107
+
108
+ return out
109
+
110
+ @classmethod
111
+ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
112
+ print(f"Loading checkpoint from: {pretrained_model_name_or_path}")
113
+ ckpt = torch.load(pretrained_model_name_or_path, map_location='cpu')
114
+ print(f"Checkpoint loaded. Keys in checkpoint: {ckpt.keys()}")
115
+
116
+ print("Processing state dict...")
117
+ new_state_dict = {k[len("net."):]: v for k, v in ckpt['state_dict'].items() if k.startswith("net.")}
118
+ print(f"Processed state dict. Number of keys: {len(new_state_dict)}")
119
+
120
+ print("Initializing model...")
121
+ model = cls(*args, **kwargs)
122
+
123
+ print("Loading state dict into model...")
124
+ model.load_state_dict(new_state_dict, strict=True)
125
+ print("State dict loaded successfully.")
126
+
127
+ print("Cleaning up...")
128
+ del ckpt
129
+ del new_state_dict
130
+
131
+ print("Model loading complete.")
132
+ return model
133
+
134
+ def forward_layers(pretrained, x):
135
+ b, c, h, w = x.shape
136
+
137
+ # encoder
138
+ glob = pretrained.model.forward_flex(x)
139
+
140
+ layer_1 = pretrained.activations["1"]
141
+ layer_2 = pretrained.activations["2"]
142
+ layer_3 = pretrained.activations["3"]
143
+ layer_4 = pretrained.activations["4"]
144
+
145
+ layer_1 = pretrained.act_postprocess1[0:2](layer_1)
146
+ layer_2 = pretrained.act_postprocess2[0:2](layer_2)
147
+ layer_3 = pretrained.act_postprocess3[0:2](layer_3)
148
+ layer_4 = pretrained.act_postprocess4[0:2](layer_4)
149
+
150
+ unflatten = nn.Sequential(
151
+ nn.Unflatten(
152
+ 2,
153
+ torch.Size(
154
+ [
155
+ h // pretrained.model.patch_size[1],
156
+ w // pretrained.model.patch_size[0],
157
+ ]
158
+ ),
159
+ )
160
+ )
161
+
162
+ if layer_1.ndim == 3:
163
+ layer_1 = unflatten(layer_1)
164
+ if layer_2.ndim == 3:
165
+ layer_2 = unflatten(layer_2)
166
+ if layer_3.ndim == 3:
167
+ layer_3 = unflatten(layer_3)
168
+ if layer_4.ndim == 3:
169
+ layer_4 = unflatten(layer_4)
170
+
171
+ return layer_1, layer_2, layer_3, layer_4
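The label-scoring step inside decode_feature boils down to a cosine similarity between per-pixel image features and CLIP text embeddings of the label set. The sketch below is illustrative only: it uses random stand-ins for the text embeddings and an assumed value for the learned logit scale, whereas the real code obtains both from the CLIP model.

import torch

b, c, h, w = 1, 512, 8, 8
image_features = torch.randn(b, c, h, w)
text_features = torch.randn(3, c)            # stand-in for e.g. ['wall', 'floor', 'others']

img = image_features.permute(0, 2, 3, 1).reshape(-1, c)
img = img / img.norm(dim=-1, keepdim=True)   # normalize per pixel
txt = text_features / text_features.norm(dim=-1, keepdim=True)

logit_scale = 100.0                          # assumed value of the learned scale
logits = logit_scale * img @ txt.t()         # (b*h*w, num_labels)
labels = logits.view(b, h, w, -1).argmax(-1) # per-pixel label map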
src/model.py ADDED
@@ -0,0 +1,176 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import yaml
4
+ import sys
5
+ sys.path.append(".")
6
+ sys.path.append("submodules")
7
+ sys.path.append("submodules/mast3r")
8
+ from mast3r.model import AsymmetricMASt3R
9
+ from src.ptv3 import PTV3
10
+ from src.gaussian_head import GaussianHead
11
+ from src.utils.points_process import merge_points
12
+ from src.losses import GaussianLoss
13
+ from src.lseg import LSegFeatureExtractor
14
+ import argparse
15
+
16
+ class LSM_MASt3R(nn.Module):
17
+ def __init__(self,
18
+ mast3r_config,
19
+ point_transformer_config,
20
+ gaussian_head_config,
21
+ lseg_config,
22
+ ):
23
+
24
+ super().__init__()
25
+ # self.config
26
+ self.config = {
27
+ 'mast3r_config': mast3r_config,
28
+ 'point_transformer_config': point_transformer_config,
29
+ 'gaussian_head_config': gaussian_head_config,
30
+ 'lseg_config': lseg_config
31
+ }
32
+
33
+ # Initialize AsymmetricMASt3R
34
+ self.mast3r = AsymmetricMASt3R.from_pretrained(**mast3r_config)
35
+
36
+ # Freeze MASt3R parameters
37
+ for param in self.mast3r.parameters():
38
+ param.requires_grad = False
39
+ self.mast3r.eval()
40
+
41
+ # Initialize PointTransformerV3
42
+ self.point_transformer = PTV3(**point_transformer_config)
43
+
44
+ # Initialize the gaussian head
45
+ self.gaussian_head = GaussianHead(**gaussian_head_config)
46
+
47
+ # Initialize the lseg feature extractor
48
+ self.lseg_feature_extractor = LSegFeatureExtractor.from_pretrained(**lseg_config)
49
+ for param in self.lseg_feature_extractor.parameters():
50
+ param.requires_grad = False
51
+ self.lseg_feature_extractor.eval()
52
+
53
+ # Define two linear layers
54
+ d_gs_feats = gaussian_head_config.get('d_gs_feats', 32)
55
+ self.feature_reduction = nn.Sequential(
56
+ nn.Conv2d(512, d_gs_feats, kernel_size=1),
57
+ nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
58
+ ) # (b, 512, h//2, w//2) -> (b, d_features, h, w)
59
+
60
+ self.feature_expansion = nn.Sequential(
61
+ nn.Conv2d(d_gs_feats, 512, kernel_size=1),
62
+ nn.Upsample(scale_factor=0.5, mode='bilinear', align_corners=True)
63
+ ) # (b, d_features, h, w) -> (b, 512, h//2, w//2)
64
+
65
+ def forward(self, view1, view2):
66
+ # AsymmetricMASt3R forward pass
67
+ mast3r_output = self.mast3r(view1, view2)
68
+
69
+ # merge points from two views
70
+ data_dict = merge_points(mast3r_output, view1, view2)
71
+
72
+ # PointTransformerV3 forward pass
73
+ point_transformer_output = self.point_transformer(data_dict)
74
+
75
+ # extract lseg features
76
+ lseg_features = self.extract_lseg_features(view1, view2)
77
+
78
+ # Gaussian head forward pass
79
+ final_output = self.gaussian_head(point_transformer_output, lseg_features)
80
+
81
+ return final_output
82
+
83
+ def extract_lseg_features(self, view1, view2):
84
+ # concat view1 and view2
85
+ img = torch.cat([view1['img'], view2['img']], dim=0) # (v*b, 3, h, w)
86
+ # extract features
87
+ lseg_features = self.lseg_feature_extractor.extract_features(img) # (v*b, 512, h//2, w//2)
88
+ # reduce dimensions
89
+ lseg_features = self.feature_reduction(lseg_features) # (v*b, d_features, h, w)
90
+
91
+ return lseg_features
92
+
93
+ @staticmethod
94
+ def from_pretrained(checkpoint_path, device='cuda'):
95
+ # Load the checkpoint
96
+ ckpt = torch.load(checkpoint_path, map_location='cpu')
97
+
98
+ # Extract the configuration from the checkpoint
99
+ config = ckpt['args']
100
+
101
+ # Create a new instance of LSM_MASt3R
102
+ model = eval(config.model)
103
+
104
+ # Load the state dict
105
+ model.load_state_dict(ckpt['model'])
106
+
107
+ # Move the model to the specified device
108
+ model = model.to(device)
109
+
110
+ return model
111
+
112
+ def state_dict(self, destination=None, prefix='', keep_vars=False):
+ # collect the state_dict of all parameters
+ full_state_dict = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars)
+
+ # keep only the trainable parameters (frozen MASt3R and LSeg weights are excluded)
+ trainable_state_dict = {
+ k: v for k, v in full_state_dict.items()
+ if not (k.startswith('mast3r.') or k.startswith('lseg_feature_extractor.'))
+ }
+
+ return trainable_state_dict
+
+ def load_state_dict(self, state_dict, strict=True):
+ # get the full state_dict of the current model
+ model_state = super().state_dict()
+
+ # update only the trainable parameters
+ for k in list(state_dict.keys()):
+ if k in model_state and not (k.startswith('mast3r.') or k.startswith('lseg_feature_extractor.')):
+ model_state[k] = state_dict[k]
+
+ # load the model with the updated state_dict
+ super().load_state_dict(model_state, strict=False)
135
+
136
+ if __name__ == "__main__":
137
+ from torch.utils.data import DataLoader
138
+ import argparse
139
+ parser = argparse.ArgumentParser()
140
+ parser.add_argument('--checkpoint', type=str)
141
+ args = parser.parse_args()
142
+
143
+ # Load config
144
+ with open("configs/model_config.yaml", "r") as f:
145
+ config = yaml.safe_load(f)
146
+ # Initialize model
147
+ if args.checkpoint is not None:
148
+ model = LSM_MASt3R.from_pretrained(args.checkpoint, device='cuda')
149
+ else:
150
+ model = LSM_MASt3R(**config).to('cuda')
151
+
152
+ model.eval()
153
+
154
+ # Print model
155
+ print(model)
156
+ # Load dataset
157
+ from src.datasets.scannet import Scannet
158
+ dataset = Scannet(split='train', ROOT="data/scannet_processed", resolution=[(512, 384)])
159
+ # Print dataset
160
+ print(dataset)
161
+ # Test model
162
+ data_loader = DataLoader(dataset, batch_size=3, shuffle=True)
163
+ data = next(iter(data_loader))
164
+ # move data to cuda
165
+ for view in data:
166
+ view['img'] = view['img'].to('cuda')
167
+ view['depthmap'] = view['depthmap'].to('cuda')
168
+ view['camera_pose'] = view['camera_pose'].to('cuda')
169
+ view['camera_intrinsics'] = view['camera_intrinsics'].to('cuda')
170
+ # Forward pass
171
+ output = model(*data[:2])
172
+
173
+ # Loss
174
+ loss = GaussianLoss()
175
+ loss_value = loss(*data, *output, model)
176
+ print(loss_value)
src/ptv3.py ADDED
@@ -0,0 +1,13 @@
+ from PointTransformerV3.model import *
+
+ class PTV3(PointTransformerV3):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ def encode(self, data_dict):
+ point = Point(data_dict)
+ point.serialization(order=self.order, shuffle_orders=self.shuffle_orders)
+ point.sparsify()
+ point = self.embedding(point)
+ point = self.enc(point)
+ return point.feats
src/train.py ADDED
@@ -0,0 +1,73 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
3
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
4
+ #
5
+ # --------------------------------------------------------
6
+ # training executable for MASt3R
7
+ # --------------------------------------------------------
8
+ import sys
9
+ sys.path.append('.')
10
+ sys.path.append('submodules/mast3r')
11
+ from mast3r.model import AsymmetricMASt3R
12
+ from mast3r.losses import ConfMatchingLoss, MatchingLoss, APLoss, Regr3D, InfoNCE, Regr3D_ScaleShiftInv
13
+ from mast3r.datasets import ARKitScenes, BlendedMVS, Co3d, MegaDepth, ScanNetpp, StaticThings3D, Waymo, WildRGBD
14
+
15
+ import mast3r.utils.path_to_dust3r # noqa
16
+ # add mast3r classes to dust3r imports
17
+ import dust3r.training
18
+ dust3r.training.AsymmetricMASt3R = AsymmetricMASt3R
19
+ dust3r.training.Regr3D = Regr3D
20
+ dust3r.training.Regr3D_ScaleShiftInv = Regr3D_ScaleShiftInv
21
+ dust3r.training.MatchingLoss = MatchingLoss
22
+ dust3r.training.ConfMatchingLoss = ConfMatchingLoss
23
+ dust3r.training.InfoNCE = InfoNCE
24
+ dust3r.training.APLoss = APLoss
25
+
26
+ import dust3r.datasets
27
+ dust3r.datasets.ARKitScenes = ARKitScenes
28
+ dust3r.datasets.BlendedMVS = BlendedMVS
29
+ dust3r.datasets.Co3d = Co3d
30
+ dust3r.datasets.MegaDepth = MegaDepth
31
+ dust3r.datasets.ScanNetpp = ScanNetpp
32
+ dust3r.datasets.StaticThings3D = StaticThings3D
33
+ dust3r.datasets.Waymo = Waymo
34
+ dust3r.datasets.WildRGBD = WildRGBD
35
+ from src.datasets.scannet import Scannet
36
+ from src.datasets.scannetpp import Scannetpp
37
+ from src.datasets.megadepth import MegaDepth
38
+ dust3r.datasets.Scannet = Scannet
39
+ dust3r.datasets.Scannetpp = Scannetpp
40
+ dust3r.datasets.MegaDepth = MegaDepth
41
+
42
+ from src.model import LSM_MASt3R
43
+ dust3r.training.LSM_MASt3R = LSM_MASt3R
44
+ from src.losses import GaussianLoss
45
+ dust3r.training.GaussianLoss = GaussianLoss
46
+
47
+ from dust3r.training import get_args_parser as dust3r_get_args_parser # noqa
48
+ from dust3r.training import train # noqa
49
+
50
+ import yaml
51
+
52
+
53
+ def get_args_parser():
54
+ parser = dust3r_get_args_parser()
55
+ parser.prog = 'LSM_MASt3R training'
56
+
57
+ # Load the configuration
58
+ with open("configs/model_config.yaml", "r") as f:
59
+ config = yaml.safe_load(f)
60
+
61
+ # Convert the config dict to a string of keyword arguments
62
+ config_str = ", ".join(f"{k}={v}" for k, v in config.items())
63
+
64
+ # Set the default model string with parameters
65
+ parser.set_defaults(model=f"LSM_MASt3R({config_str})")
66
+
67
+ return parser
68
+
69
+
70
+ if __name__ == '__main__':
71
+ args = get_args_parser()
72
+ args = args.parse_args()
73
+ train(args)
src/utils/camera_utils.py ADDED
@@ -0,0 +1,60 @@
1
+ import math
2
+ import torch
3
+ from dust3r.utils.geometry import inv
4
+ from src.utils.cuda_splatting import DummyCamera
5
+
6
+ def get_scaled_camera(ref_camera_extrinsics, target_camera_extrinsics, target_camera_intrinsics, scale, image_shape):
7
+ """
8
+ get a scaled camera from a reference camera to a target camera
9
+
10
+ """
11
+
12
+ # get extrinsics(target_camera to ref_camera)
13
+ target_camera_extrinsics = inv(ref_camera_extrinsics) @ target_camera_extrinsics
14
+ # scale translation
15
+ target_camera_extrinsics[:3, 3] = target_camera_extrinsics[:3, 3] * scale
16
+ # invert extrinsics(ref_camera to target_camera)
17
+ target_camera_extrinsics_inv = inv(target_camera_extrinsics)
18
+ # calculate fov
19
+ fovx = 2 * math.atan(image_shape[1] / (2 * target_camera_intrinsics[0, 0]))
20
+ fovy = 2 * math.atan(image_shape[0] / (2 * target_camera_intrinsics[1, 1]))
21
+ # return camera(numpy)
22
+ R = target_camera_extrinsics_inv[:3, :3].cpu().numpy().transpose() # R.transpose() : ref_camera_2_target_camera
23
+ T = target_camera_extrinsics_inv[:3, 3].cpu().numpy() # T : ref_camera_2_target_camera
24
+ image_shape = image_shape.cpu().numpy()
25
+ return DummyCamera(R, T, fovx, fovy, image_shape[1], image_shape[0])
26
+
27
+ def move_c2w_along_z(extrinsics: torch.Tensor, distance: float) -> torch.Tensor:
+ """
+ Move a batch of Camera-to-World (C2W) matrices backwards, pushing each camera away along its own Z axis.
+
+ Args:
+ extrinsics (torch.Tensor): tensor of shape [N, 4, 4] holding N C2W matrices.
+ distance (float): how far to move each camera backwards.
+
+ Returns:
+ torch.Tensor: the updated C2W matrices, same shape as the input.
+ """
+ # make sure the input is a batch of 4x4 matrices
+ assert extrinsics.dim() == 3 and extrinsics.shape[1:] == (4, 4), \
+ "extrinsics must be a tensor of shape [N, 4, 4]"
+
+ # work on a copy so the original matrices are left untouched
+ updated_extrinsics = extrinsics.clone()
+
+ # process each C2W matrix
+ for i in range(updated_extrinsics.shape[0]):
+ # extract the rotation matrix R and translation vector t
+ R = updated_extrinsics[i, :3, :3] # shape [3, 3]
+ t = updated_extrinsics[i, :3, 3] # shape [3]
+
+ # camera Z axis direction (third column of R)
+ z_axis = R[:, 2] # shape [3]
+
+ # new translation: move backwards along the Z axis
+ t_new = t - distance * z_axis
+
+ # update the translation part of the C2W matrix
+ updated_extrinsics[i, :3, 3] = t_new
+
+ return updated_extrinsics
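get_scaled_camera above recovers the field of view from the pinhole intrinsics via fov = 2 * atan(size / (2 * focal)). A quick numeric check with illustrative values:

import math

W, H, fx, fy = 512, 384, 400.0, 400.0
fovx = 2 * math.atan(W / (2 * fx))   # ~1.139 rad
fovy = 2 * math.atan(H / (2 * fy))   # ~0.895 rad
print(math.degrees(fovx), math.degrees(fovy))  # ~65.2 and ~51.3 degrees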
src/utils/cuda_splatting.py ADDED
@@ -0,0 +1,216 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+ import numpy as np
12
+ import torch
13
+ import math
14
+ from diff_gaussian_rasterization import GaussianRasterizationSettings, GaussianRasterizer
15
+ from .gaussian_model import GaussianModel
16
+ from .sh_utils import eval_sh
17
+ from .graphics_utils import getWorld2View2, getProjectionMatrix
18
+
19
+ class DummyCamera:
20
+ def __init__(self, R, T, FoVx, FoVy, W, H):
21
+ self.projection_matrix = getProjectionMatrix(znear=0.01, zfar=100.0, fovX=FoVx, fovY=FoVy).transpose(0,1).cuda()
22
+ self.R = R
23
+ self.T = T
24
+ self.world_view_transform = torch.tensor(getWorld2View2(R, T, np.array([0,0,0]), 1.0)).transpose(0, 1).cuda()
25
+ self.full_proj_transform = (self.world_view_transform.unsqueeze(0).bmm(self.projection_matrix.unsqueeze(0))).squeeze(0)
26
+ self.camera_center = self.world_view_transform.inverse()[3, :3]
27
+ self.image_width = W
28
+ self.image_height = H
29
+ self.FoVx = FoVx
30
+ self.FoVy = FoVy
31
+
32
+ class DummyPipeline:
33
+ convert_SHs_python = False
34
+ compute_cov3D_python = False
35
+ debug = False
36
+
37
+ def calculate_fov(output_width, output_height, focal_length, aspect_ratio=1.0, invert_y=False):
38
+ fovx = 2 * math.atan((output_width / (2 * focal_length)))
39
+ fovy = 2 * math.atan((output_height / aspect_ratio) / (2 * focal_length))
40
+
41
+ if invert_y:
42
+ fovy = -fovy
43
+
44
+ return fovx, fovy
45
+
46
+ # def render(viewpoint_camera, pc : GaussianModel, pipe, bg_color : torch.Tensor, scaling_modifier = 1.0, override_color = None):
47
+ # """
48
+ # Render the scene.
49
+
50
+ # Background tensor (bg_color) must be on GPU!
51
+ # """
52
+
53
+ # # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means
54
+ # screenspace_points = torch.zeros_like(pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda") + 0
55
+ # try:
56
+ # screenspace_points.retain_grad()
57
+ # except:
58
+ # pass
59
+
60
+ # # Set up rasterization configuration
61
+ # tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
62
+ # tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)
63
+
64
+ # raster_settings = GaussianRasterizationSettings(
65
+ # image_height=int(viewpoint_camera.image_height),
66
+ # image_width=int(viewpoint_camera.image_width),
67
+ # tanfovx=tanfovx,
68
+ # tanfovy=tanfovy,
69
+ # bg=bg_color,
70
+ # scale_modifier=scaling_modifier,
71
+ # viewmatrix=viewpoint_camera.world_view_transform,
72
+ # projmatrix=viewpoint_camera.full_proj_transform,
73
+ # sh_degree=pc.active_sh_degree,
74
+ # campos=viewpoint_camera.camera_center,
75
+ # prefiltered=False,
76
+ # debug=pipe.debug
77
+ # )
78
+
79
+ # rasterizer = GaussianRasterizer(raster_settings=raster_settings)
80
+
81
+ # means3D = pc.get_xyz
82
+ # means2D = screenspace_points
83
+ # opacity = pc.get_opacity
84
+
85
+ # # If precomputed 3d covariance is provided, use it. If not, then it will be computed from
86
+ # # scaling / rotation by the rasterizer.
87
+ # scales = None
88
+ # rotations = None
89
+ # cov3D_precomp = None
90
+ # if pipe.compute_cov3D_python:
91
+ # cov3D_precomp = pc.get_covariance(scaling_modifier)
92
+ # else:
93
+ # scales = pc.get_scaling
94
+ # rotations = pc.get_rotation
95
+
96
+ # # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors
97
+ # # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer.
98
+ # shs = None
99
+ # colors_precomp = None
100
+ # if override_color is None:
101
+ # if pipe.convert_SHs_python:
102
+ # shs_view = pc.get_features.transpose(1, 2).view(-1, 3, (pc.max_sh_degree+1)**2)
103
+ # dir_pp = (pc.get_xyz - viewpoint_camera.camera_center.repeat(pc.get_features.shape[0], 1))
104
+ # dir_pp_normalized = dir_pp/dir_pp.norm(dim=1, keepdim=True)
105
+ # sh2rgb = eval_sh(pc.active_sh_degree, shs_view, dir_pp_normalized)
106
+ # colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0)
107
+ # else:
108
+ # shs = pc.get_features
109
+ # else:
110
+ # colors_precomp = override_color
111
+
112
+ # # Rasterize visible Gaussians to image, obtain their radii (on screen).
113
+ # rendered_image, radii = rasterizer(
114
+ # means3D = means3D,
115
+ # means2D = means2D,
116
+ # shs = shs,
117
+ # colors_precomp = colors_precomp,
118
+ # opacities = opacity,
119
+ # scales = scales,
120
+ # rotations = rotations,
121
+ # cov3D_precomp = cov3D_precomp)
122
+
123
+ # # Those Gaussians that were frustum culled or had a radius of 0 were not visible.
124
+ # # They will be excluded from value updates used in the splitting criteria.
125
+ # return {"render": rendered_image,
126
+ # "viewspace_points": screenspace_points,
127
+ # "visibility_filter" : radii > 0,
128
+ # "radii": radii}
129
+
130
+ def render(viewpoint_camera, pc : GaussianModel, pipe, bg_color : torch.Tensor, scaling_modifier = 1.0, override_color = None):
131
+ """
132
+ Render the scene.
133
+
134
+ Background tensor (bg_color) must be on GPU!
135
+ """
136
+
137
+ # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means
138
+ screenspace_points = torch.zeros_like(pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda") + 0
139
+ try:
140
+ screenspace_points.retain_grad()
141
+ except:
142
+ pass
143
+
144
+ # Set up rasterization configuration
145
+ tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
146
+ tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)
147
+
148
+ raster_settings = GaussianRasterizationSettings(
149
+ image_height=int(viewpoint_camera.image_height),
150
+ image_width=int(viewpoint_camera.image_width),
151
+ tanfovx=tanfovx,
152
+ tanfovy=tanfovy,
153
+ bg=bg_color,
154
+ scale_modifier=scaling_modifier,
155
+ viewmatrix=viewpoint_camera.world_view_transform,
156
+ projmatrix=viewpoint_camera.full_proj_transform,
157
+ sh_degree=pc.active_sh_degree,
158
+ campos=viewpoint_camera.camera_center,
159
+ prefiltered=False,
160
+ debug=pipe.debug
161
+ )
162
+
163
+ rasterizer = GaussianRasterizer(raster_settings=raster_settings)
164
+
165
+ means3D = pc.get_xyz
166
+ means2D = screenspace_points
167
+ opacity = pc.get_opacity
168
+
169
+ # If precomputed 3d covariance is provided, use it. If not, then it will be computed from
170
+ # scaling / rotation by the rasterizer.
171
+ scales = None
172
+ rotations = None
173
+ cov3D_precomp = None
174
+ if pipe.compute_cov3D_python:
175
+ cov3D_precomp = pc.get_covariance(scaling_modifier)
176
+ else:
177
+ scales = pc.get_scaling
178
+ rotations = pc.get_rotation
179
+
180
+ # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors
181
+ # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer.
182
+ shs = None
183
+ colors_precomp = None
184
+ if override_color is None:
185
+ if pipe.convert_SHs_python:
186
+ shs_view = pc.get_features.transpose(1, 2).view(-1, 3, (pc.max_sh_degree+1)**2)
187
+ dir_pp = (pc.get_xyz - viewpoint_camera.camera_center.repeat(pc.get_features.shape[0], 1))
188
+ dir_pp_normalized = dir_pp/dir_pp.norm(dim=1, keepdim=True)
189
+ sh2rgb = eval_sh(pc.active_sh_degree, shs_view, dir_pp_normalized)
190
+ colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0)
191
+ else:
192
+ shs = pc.get_features
193
+ else:
194
+ colors_precomp = override_color
195
+ semantic_feature = pc.get_semantic_feature
196
+
197
+ # Rasterize visible Gaussians to image, obtain their radii (on screen).
198
+ rendered_image, feature_map, radii, depth = rasterizer(
199
+ means3D = means3D,
200
+ means2D = means2D,
201
+ shs = shs,
202
+ colors_precomp = colors_precomp,
203
+ semantic_feature = semantic_feature,
204
+ opacities = opacity,
205
+ scales = scales,
206
+ rotations = rotations,
207
+ cov3D_precomp = cov3D_precomp)
208
+
209
+ # Those Gaussians that were frustum culled or had a radius of 0 were not visible.
210
+ # They will be excluded from value updates used in the splitting criteria.
211
+ return {"render": rendered_image,
212
+ "viewspace_points": screenspace_points,
213
+ "visibility_filter" : radii > 0,
214
+ "radii": radii,
215
+ 'feature_map': feature_map,
216
+ "depth": depth} ###d
src/utils/gaussian_model.py ADDED
@@ -0,0 +1,160 @@
1
+ import os
2
+ import torch
3
+ from einops import rearrange
4
+ import numpy as np
5
+ from plyfile import PlyData, PlyElement
6
+ from os import makedirs, path
7
+ from errno import EEXIST
8
+
9
+ def mkdir_p(folder_path):
10
+ # Creates a directory. equivalent to using mkdir -p on the command line
11
+ try:
12
+ makedirs(folder_path)
13
+ except OSError as exc: # Python >2.5
14
+ if exc.errno == EEXIST and path.isdir(folder_path):
15
+ pass
16
+ else:
17
+ raise
18
+
19
+ def RGB2SH(rgb):
20
+ return (rgb - 0.5) / C0
21
+
22
+ C0 = 0.28209479177387814
23
+
24
+ # https://github.com/facebookresearch/pytorch3d/blob/main/pytorch3d/transforms/rotation_conversions.py
25
+ def quaternion_to_matrix(
26
+ quaternions,
27
+ eps=1e-8,
28
+ ) :
29
+ # Order changed to match scipy format!
30
+ i, j, k, r = torch.unbind(quaternions, dim=-1)
31
+ two_s = 2 / ((quaternions * quaternions).sum(dim=-1) + eps)
32
+
33
+ o = torch.stack(
34
+ (
35
+ 1 - two_s * (j * j + k * k),
36
+ two_s * (i * j - k * r),
37
+ two_s * (i * k + j * r),
38
+ two_s * (i * j + k * r),
39
+ 1 - two_s * (i * i + k * k),
40
+ two_s * (j * k - i * r),
41
+ two_s * (i * k - j * r),
42
+ two_s * (j * k + i * r),
43
+ 1 - two_s * (i * i + j * j),
44
+ ),
45
+ -1,
46
+ )
47
+ return rearrange(o, "... (i j) -> ... i j", i=3, j=3)
48
+
49
+
50
+ def build_covariance(
51
+ scale,
52
+ rotation_xyzw,
53
+ ):
54
+ scale = scale.diag_embed()
55
+ rotation = quaternion_to_matrix(rotation_xyzw)
56
+ return (
57
+ rotation
58
+ @ scale
59
+ @ rearrange(scale, "... i j -> ... j i")
60
+ @ rearrange(rotation, "... i j -> ... j i")
61
+ )
62
+
63
+ def inverse_sigmoid(x):
64
+ return torch.log(x/(1-x))
65
+
66
+ class GaussianModel:
67
+
68
+ def __init__(self, sh_degree : int):
69
+ self.active_sh_degree = 0
70
+ self.max_sh_degree = sh_degree
71
+ self._xyz = torch.empty(0)
72
+ self._features_dc = torch.empty(0)
73
+ self._features_rest = torch.empty(0)
74
+ self._scaling = torch.empty(0)
75
+ self._rotation = torch.empty(0)
76
+ self._opacity = torch.empty(0)
77
+ self.max_radii2D = torch.empty(0)
78
+ self.xyz_gradient_accum = torch.empty(0)
79
+ self.denom = torch.empty(0)
80
+ self.optimizer = None
81
+ self.percent_dense = 0
82
+ self.spatial_lr_scale = 0
83
+ self._semantic_feature = torch.empty(0)
84
+
85
+ @property
86
+ def get_scaling(self):
87
+ return self._scaling
88
+
89
+ @property
90
+ def get_rotation(self):
91
+ return self._rotation
92
+
93
+ @property
94
+ def get_xyz(self):
95
+ return self._xyz
96
+
97
+ @property
98
+ def get_features(self):
99
+ features_dc = self._features_dc
100
+ features_rest = self._features_rest
101
+ return torch.cat((features_dc, features_rest), dim=1)
102
+
103
+ @property
104
+ def get_opacity(self):
105
+ return self._opacity
106
+
107
+ @property
108
+ def get_semantic_feature(self):
109
+ return self._semantic_feature
110
+
111
+ def construct_list_of_attributes(self):
112
+ l = ['x', 'y', 'z', 'nx', 'ny', 'nz']
113
+ # All channels except the 3 DC
114
+ for i in range(self._features_dc.shape[1]*self._features_dc.shape[2]):
115
+ l.append('f_dc_{}'.format(i))
116
+ for i in range(self._features_rest.shape[1]*self._features_rest.shape[2]):
117
+ l.append('f_rest_{}'.format(i))
118
+
119
+ l.append('opacity')
120
+ for i in range(self._scaling.shape[1]):
121
+ l.append('scale_{}'.format(i))
122
+ for i in range(self._rotation.shape[1]):
123
+ l.append('rot_{}'.format(i))
124
+ # Add semantic features
125
+ for i in range(self._semantic_feature.shape[1]*self._semantic_feature.shape[2]):
126
+ l.append('semantic_{}'.format(i))
127
+ return l
128
+
129
+ @staticmethod
130
+ def from_predictions(pred, sh_degree):
131
+ gaussians = GaussianModel(sh_degree=sh_degree)
132
+ gaussians._xyz = pred['means']
133
+ gaussians._features_dc = pred['sh_coeffs'][:, :1] # N, 1, d_sh
134
+ gaussians._features_rest = pred['sh_coeffs'][:, 1:] # N, d_sh-1, d_sh
135
+ gaussians._opacity = pred['opacities'] # N, 1
136
+ gaussians._scaling = pred['scales'] # N, 3, 3
137
+ gaussians._rotation = pred['rotations'] # N, 4
138
+ gaussians._semantic_feature = pred['gs_feats'][:, None, :] # N, 1, d_feats
139
+ return gaussians
140
+
141
+ def save_ply(self, path):
142
+ mkdir_p(os.path.dirname(path))
143
+
144
+ xyz = self._xyz.detach().cpu().numpy()
145
+ normals = np.zeros_like(xyz)
146
+ f_dc = self._features_dc.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy()
147
+ f_rest = self._features_rest.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy()
148
+ opacities = inverse_sigmoid(self._opacity).detach().cpu().numpy()
149
+ scale = torch.log(self._scaling).detach().cpu().numpy()
150
+ rotation = self._rotation.detach().cpu().numpy()
151
+ semantic_feature = self._semantic_feature.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy()
152
+
153
+ dtype_full = [(attribute, 'f4') for attribute in self.construct_list_of_attributes()]
154
+
155
+ elements = np.empty(xyz.shape[0], dtype=dtype_full)
156
+ attributes = np.concatenate((xyz, normals, f_dc, f_rest, opacities, scale, rotation, semantic_feature), axis=1)
157
+ # attributes = np.concatenate((xyz, normals, f_dc, f_rest, opacities, scale, rotation), axis=1)
158
+ elements[:] = list(map(tuple, attributes))
159
+ el = PlyElement.describe(elements, 'vertex')
160
+ PlyData([el]).write(path)
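build_covariance above implements the usual Gaussian-splatting parameterization cov = R S S^T R^T, with R built from an xyzw quaternion and S a diagonal matrix of scales. An illustrative sanity check with the identity rotation (not part of the committed file):

import torch
from src.utils.gaussian_model import build_covariance

scale = torch.tensor([[0.1, 0.2, 0.3]])
rot_xyzw = torch.tensor([[0.0, 0.0, 0.0, 1.0]])   # identity rotation
cov = build_covariance(scale, rot_xyzw)
# with R = I the covariance reduces to a diagonal of squared scales
assert torch.allclose(cov[0], torch.diag(scale[0] ** 2), atol=1e-6)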
src/utils/graphics_utils.py ADDED
@@ -0,0 +1,77 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ import torch
13
+ import math
14
+ import numpy as np
15
+ from typing import NamedTuple
16
+
17
+ class BasicPointCloud(NamedTuple):
18
+ points : np.array
19
+ colors : np.array
20
+ normals : np.array
21
+
22
+ def geom_transform_points(points, transf_matrix):
23
+ P, _ = points.shape
24
+ ones = torch.ones(P, 1, dtype=points.dtype, device=points.device)
25
+ points_hom = torch.cat([points, ones], dim=1)
26
+ points_out = torch.matmul(points_hom, transf_matrix.unsqueeze(0))
27
+
28
+ denom = points_out[..., 3:] + 0.0000001
29
+ return (points_out[..., :3] / denom).squeeze(dim=0)
30
+
31
+ def getWorld2View(R, t):
32
+ Rt = np.zeros((4, 4))
33
+ Rt[:3, :3] = R.transpose()
34
+ Rt[:3, 3] = t
35
+ Rt[3, 3] = 1.0
36
+ return np.float32(Rt)
37
+
38
+ def getWorld2View2(R, t, translate=np.array([.0, .0, .0]), scale=1.0):
39
+ Rt = np.zeros((4, 4))
40
+ Rt[:3, :3] = R.transpose()
41
+ Rt[:3, 3] = t
42
+ Rt[3, 3] = 1.0
43
+
44
+ C2W = np.linalg.inv(Rt)
45
+ cam_center = C2W[:3, 3]
46
+ cam_center = (cam_center + translate) * scale
47
+ C2W[:3, 3] = cam_center
48
+ Rt = np.linalg.inv(C2W)
49
+ return np.float32(Rt)
50
+
51
+ def getProjectionMatrix(znear, zfar, fovX, fovY):
52
+ tanHalfFovY = math.tan((fovY / 2))
53
+ tanHalfFovX = math.tan((fovX / 2))
54
+
55
+ top = tanHalfFovY * znear
56
+ bottom = -top
57
+ right = tanHalfFovX * znear
58
+ left = -right
59
+
60
+ P = torch.zeros(4, 4)
61
+
62
+ z_sign = 1.0
63
+
64
+ P[0, 0] = 2.0 * znear / (right - left)
65
+ P[1, 1] = 2.0 * znear / (top - bottom)
66
+ P[0, 2] = (right + left) / (right - left)
67
+ P[1, 2] = (top + bottom) / (top - bottom)
68
+ P[3, 2] = z_sign
69
+ P[2, 2] = z_sign * zfar / (zfar - znear)
70
+ P[2, 3] = -(zfar * znear) / (zfar - znear)
71
+ return P
72
+
73
+ def fov2focal(fov, pixels):
74
+ return pixels / (2 * math.tan(fov / 2))
75
+
76
+ def focal2fov(focal, pixels):
77
+ return 2*math.atan(pixels/(2*focal))
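fov2focal and focal2fov are exact inverses of each other for a fixed image side length; an illustrative round-trip check:

import math
from src.utils.graphics_utils import focal2fov, fov2focal

pixels, focal = 512, 400.0
fov = focal2fov(focal, pixels)                 # 2 * atan(pixels / (2 * focal))
assert abs(fov2focal(fov, pixels) - focal) < 1e-9
print(math.degrees(fov))                       # ~65.2 degrees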
src/utils/points_process.py ADDED
@@ -0,0 +1,37 @@
1
+ import torch
2
+ from einops import rearrange
3
+
4
+ # merge points from two views and add color information
5
+ def merge_points(mast3r_output, view1, view2, grid_size=0.01):
6
+ # get points from mast3r_output
7
+ points1 = mast3r_output[0]['pts3d'].detach() # B, H, W, 3
8
+ points2 = mast3r_output[1]['pts3d_in_other_view'].detach() # B, H, W, 3
9
+ shape = points1.shape
10
+ # add color information
11
+ colors = torch.stack([view1['img'], view2['img']], dim=1) # B, V, 3, H, W
12
+ colors = rearrange(colors, 'b v c h w -> b (v h w) c') # B, V * H * W, 3
13
+ # merge points
14
+ points = torch.stack([points1, points2], dim=1) # B, V, H, W, 3
15
+ points = rearrange(points, 'b v h w c -> b (v h w) c') # B, V * H * W, 3
16
+ B, N, _ = points.shape
17
+ offset = torch.arange(1, B + 1, device=points.device) * N
18
+ # Center and normalize points
19
+ center = torch.mean(points, dim=1, keepdim=True)
20
+ points = points - center
21
+ scale = torch.max(torch.norm(points, dim=2, keepdim=True), dim=1, keepdim=True)[0]
22
+ points = points / scale
23
+ # concat points and colors
24
+ feat = torch.cat([points, colors], dim=-1) # B, V * H * W, 6
25
+
26
+ data_dict = {
27
+ 'coord': rearrange(points, 'b n c -> (b n) c'),
28
+ 'color': rearrange(colors, 'b n c -> (b n) c'),
29
+ 'feat': rearrange(feat, 'b n c -> (b n) c'),
30
+ 'offset': offset,
31
+ 'grid_size': grid_size,
32
+ 'center': center,
33
+ 'scale': scale,
34
+ 'shape': shape,
35
+ }
36
+
37
+ return data_dict
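merge_points packs both views of every batch element into one flat point cloud, and `offset` stores the cumulative end index of each element, which is how PointTransformerV3 separates samples in a packed tensor. A shape sketch with illustrative sizes (not part of the committed file):

import torch

B, H, W = 2, 4, 6
N = 2 * H * W                            # points per batch element (two views)
offset = torch.arange(1, B + 1) * N      # tensor([48, 96])
coord = torch.randn(B * N, 3)            # packed (b n) x 3 coordinates
assert offset[-1].item() == coord.shape[0]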
src/utils/sh_utils.py ADDED
@@ -0,0 +1,117 @@
1
+ # Copyright 2021 The PlenOctree Authors.
2
+ # Redistribution and use in source and binary forms, with or without
3
+ # modification, are permitted provided that the following conditions are met:
4
+ #
5
+ # 1. Redistributions of source code must retain the above copyright notice,
6
+ # this list of conditions and the following disclaimer.
7
+ #
8
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
9
+ # this list of conditions and the following disclaimer in the documentation
10
+ # and/or other materials provided with the distribution.
11
+ #
12
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
13
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
14
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
15
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
16
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
17
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
18
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
19
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
20
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
21
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
22
+ # POSSIBILITY OF SUCH DAMAGE.
23
+
24
+
25
+ C0 = 0.28209479177387814
26
+ C1 = 0.4886025119029199
27
+ C2 = [
28
+ 1.0925484305920792,
29
+ -1.0925484305920792,
30
+ 0.31539156525252005,
31
+ -1.0925484305920792,
32
+ 0.5462742152960396
33
+ ]
34
+ C3 = [
35
+ -0.5900435899266435,
36
+ 2.890611442640554,
37
+ -0.4570457994644658,
38
+ 0.3731763325901154,
39
+ -0.4570457994644658,
40
+ 1.445305721320277,
41
+ -0.5900435899266435
42
+ ]
43
+ C4 = [
44
+ 2.5033429417967046,
45
+ -1.7701307697799304,
46
+ 0.9461746957575601,
47
+ -0.6690465435572892,
48
+ 0.10578554691520431,
49
+ -0.6690465435572892,
50
+ 0.47308734787878004,
51
+ -1.7701307697799304,
52
+ 0.6258357354491761,
53
+ ]
54
+
55
+
56
+ def eval_sh(deg, sh, dirs):
57
+ """
58
+ Evaluate spherical harmonics at unit directions
59
+ using hardcoded SH polynomials.
60
+ Works with torch/np/jnp.
61
+ ... Can be 0 or more batch dimensions.
62
+ Args:
63
+ deg: int SH deg. Currently, 0-3 supported
64
+ sh: jnp.ndarray SH coeffs [..., C, (deg + 1) ** 2]
65
+ dirs: jnp.ndarray unit directions [..., 3]
66
+ Returns:
67
+ [..., C]
68
+ """
69
+ assert deg <= 4 and deg >= 0
70
+ coeff = (deg + 1) ** 2
71
+ assert sh.shape[-1] >= coeff
72
+
73
+ result = C0 * sh[..., 0]
74
+ if deg > 0:
75
+ x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3]
76
+ result = (result -
77
+ C1 * y * sh[..., 1] +
78
+ C1 * z * sh[..., 2] -
79
+ C1 * x * sh[..., 3])
80
+
81
+ if deg > 1:
82
+ xx, yy, zz = x * x, y * y, z * z
83
+ xy, yz, xz = x * y, y * z, x * z
84
+ result = (result +
85
+ C2[0] * xy * sh[..., 4] +
86
+ C2[1] * yz * sh[..., 5] +
87
+ C2[2] * (2.0 * zz - xx - yy) * sh[..., 6] +
88
+ C2[3] * xz * sh[..., 7] +
89
+ C2[4] * (xx - yy) * sh[..., 8])
90
+
91
+ if deg > 2:
92
+ result = (result +
93
+ C3[0] * y * (3 * xx - yy) * sh[..., 9] +
94
+ C3[1] * xy * z * sh[..., 10] +
95
+ C3[2] * y * (4 * zz - xx - yy)* sh[..., 11] +
96
+ C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12] +
97
+ C3[4] * x * (4 * zz - xx - yy) * sh[..., 13] +
98
+ C3[5] * z * (xx - yy) * sh[..., 14] +
99
+ C3[6] * x * (xx - 3 * yy) * sh[..., 15])
100
+
101
+ if deg > 3:
102
+ result = (result + C4[0] * xy * (xx - yy) * sh[..., 16] +
103
+ C4[1] * yz * (3 * xx - yy) * sh[..., 17] +
104
+ C4[2] * xy * (7 * zz - 1) * sh[..., 18] +
105
+ C4[3] * yz * (7 * zz - 3) * sh[..., 19] +
106
+ C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20] +
107
+ C4[5] * xz * (7 * zz - 3) * sh[..., 21] +
108
+ C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22] +
109
+ C4[7] * xz * (xx - 3 * yy) * sh[..., 23] +
110
+ C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) * sh[..., 24])
111
+ return result
112
+
113
+ def RGB2SH(rgb):
114
+ return (rgb - 0.5) / C0
115
+
116
+ def SH2RGB(sh):
117
+ return sh * C0 + 0.5
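At SH degree 0 the color path is a simple affine map: RGB2SH stores (rgb - 0.5) / C0 as the DC coefficient and eval_sh returns C0 * sh0, so adding the 0.5 offset (as the renderer does before clamping) recovers the input color. An illustrative check:

import torch
from src.utils.sh_utils import RGB2SH, eval_sh

rgb = torch.tensor([[0.2, 0.5, 0.9]])
sh0 = RGB2SH(rgb)                               # (1, 3) DC coefficients
dirs = torch.tensor([[0.0, 0.0, 1.0]])          # direction is unused at degree 0
recovered = eval_sh(0, sh0.unsqueeze(-1), dirs) + 0.5
assert torch.allclose(recovered, rgb, atol=1e-6)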
src/utils/visualization_utils.py ADDED
@@ -0,0 +1,355 @@
1
+ import sys
2
+ import os
3
+ import numpy as np
4
+ import scipy.interpolate
5
+ import PIL
6
+ import torch
7
+ import matplotlib.pyplot as plt
8
+ from sklearn.preprocessing import StandardScaler
9
+ from sklearn.decomposition import PCA
10
+ import moviepy.editor as mpy
11
+
12
+ sys.path.append('submodules/mast3r/dust3r')
13
+ from dust3r.utils.image import heif_support_enabled, exif_transpose, _resize_pil_image, ImgNorm
14
+ from dust3r.image_pairs import make_pairs
15
+ from dust3r.inference import inference
16
+ from dust3r.cloud_opt import global_aligner, GlobalAlignerMode
17
+
18
+ sys.path.append('.')
19
+ from src.utils.cuda_splatting import render, DummyPipeline
20
+ from src.utils.gaussian_model import GaussianModel
21
+ from src.utils.camera_utils import get_scaled_camera
22
+ from src.losses import merge_and_split_predictions
23
+ from src.utils.camera_utils import move_c2w_along_z
24
+
25
+ from einops import rearrange
26
+ LABELS = ['wall', 'floor', 'ceiling', 'chair', 'table', 'sofa', 'bed', 'other']
27
+ NUM_LABELS = len(LABELS) + 1
28
+ PALLETE = plt.cm.get_cmap('tab10', NUM_LABELS)
29
+ COLORS_LIST = [PALLETE(i)[:3] for i in range(NUM_LABELS)]
30
+ COLORS = torch.tensor(COLORS_LIST, dtype=torch.float32)
31
+
32
+ def load_images(folder_or_list, size, square_ok=False, verbose=True, save_dir=None):
33
+ """ open and convert all images in a list or folder to proper input format for DUSt3R
34
+ """
35
+ if isinstance(folder_or_list, str):
36
+ if verbose:
37
+ print(f'>> Loading images from {folder_or_list}')
38
+ root, folder_content = folder_or_list, sorted(os.listdir(folder_or_list))
39
+
40
+ elif isinstance(folder_or_list, list):
41
+ if verbose:
42
+ print(f'>> Loading a list of {len(folder_or_list)} images')
43
+ root, folder_content = '', folder_or_list
44
+
45
+ else:
46
+ raise ValueError(f'bad {folder_or_list=} ({type(folder_or_list)})')
47
+
48
+ supported_images_extensions = ['.jpg', '.jpeg', '.png']
49
+ if heif_support_enabled:
50
+ supported_images_extensions += ['.heic', '.heif']
51
+ supported_images_extensions = tuple(supported_images_extensions)
52
+
53
+ imgs = []
54
+ for path in folder_content:
55
+ if not path.lower().endswith(supported_images_extensions):
56
+ continue
57
+ img = exif_transpose(PIL.Image.open(os.path.join(root, path))).convert('RGB')
58
+ W1, H1 = img.size
59
+ if size == 224:
60
+ # resize short side to 224 (then crop)
61
+ img = _resize_pil_image(img, round(size * max(W1/H1, H1/W1)))
62
+ else:
63
+ # resize long side to 512
64
+ img = _resize_pil_image(img, size)
65
+ W, H = img.size
66
+ cx, cy = W//2, H//2
67
+ if size == 224:
68
+ half = min(cx, cy)
69
+ img = img.crop((cx-half, cy-half, cx+half, cy+half))
70
+ else:
71
+ halfw, halfh = ((2*cx)//32)*16, ((2*cy)//32)*16
72
+ if not (square_ok) and W == H:
73
+ halfh = 3*halfw/4
74
+ img = img.crop((cx-halfw, cy-halfh, cx+halfw, cy+halfh))
75
+
76
+ W2, H2 = img.size
77
+ if verbose:
78
+ print(f' - adding {path} with resolution {W1}x{H1} --> {W2}x{H2}')
79
+
80
+ # Save the processed image if save_dir is provided
81
+ if save_dir:
82
+ os.makedirs(save_dir, exist_ok=True)
83
+ save_path = os.path.join(save_dir, f"processed_{len(imgs):03d}.png")
84
+ img.save(save_path)
85
+ if verbose:
86
+ print(f' - saved processed image to {save_path}')
87
+
88
+ imgs.append(dict(img=ImgNorm(img)[None], true_shape=np.int32(
89
+ [img.size[::-1]]), idx=len(imgs), instance=str(len(imgs))))
90
+
91
+ assert imgs, 'no images found at ' + root
92
+ if verbose:
93
+ print(f' (Found {len(imgs)} images)')
94
+ return imgs
95
+
96
+ def normalize(x):
97
+ """Normalization helper function."""
98
+ return x / np.linalg.norm(x)
99
+
100
+ def viewmatrix(lookdir, up, position):
101
+ """Construct lookat view matrix."""
102
+ vec2 = normalize(lookdir)
103
+ vec0 = normalize(np.cross(up, vec2))
104
+ vec1 = normalize(np.cross(vec2, vec0))
105
+ m = np.stack([vec0, vec1, vec2, position], axis=1)
106
+ return m
107
+
108
+ def poses_to_points(poses, dist):
109
+ """Converts from pose matrices to (position, lookat, up) format."""
110
+ pos = poses[:, :3, -1]
111
+ lookat = poses[:, :3, -1] - dist * poses[:, :3, 2]
112
+ up = poses[:, :3, -1] + dist * poses[:, :3, 1]
113
+ return np.stack([pos, lookat, up], 1)
114
+
115
+ def points_to_poses(points):
116
+ """Converts from (position, lookat, up) format to pose matrices."""
117
+ return np.array([viewmatrix(p - l, u - p, p) for p, l, u in points])
118
+
119
+ def interp(points, n, k, s):
120
+ """Runs multidimensional B-spline interpolation on the input points."""
121
+ sh = points.shape
122
+ pts = np.reshape(points, (sh[0], -1))
123
+ k = min(k, sh[0] - 1)
124
+ tck, _ = scipy.interpolate.splprep(pts.T, k=k, s=s)
125
+ u = np.linspace(0, 1, n, endpoint=False)
126
+ new_points = np.array(scipy.interpolate.splev(u, tck))
127
+ new_points = np.reshape(new_points.T, (n, sh[1], sh[2]))
128
+ return new_points
129
+
130
+ def generate_interpolated_path(poses, n_interp, spline_degree=5,
131
+ smoothness=.03, rot_weight=.1):
132
+ """Creates a smooth spline path between input keyframe camera poses.
133
+
134
+ Spline is calculated with poses in format (position, lookat-point, up-point).
135
+
136
+ Args:
137
+ poses: (n, 3, 4) array of input pose keyframes.
138
+ n_interp: returned path will have n_interp * (n - 1) total poses.
139
+ spline_degree: polynomial degree of B-spline.
140
+ smoothness: parameter for spline smoothing, 0 forces exact interpolation.
141
+ rot_weight: relative weighting of rotation/translation in spline solve.
142
+
143
+ Returns:
144
+ Array of new camera poses with shape (n_interp * (n - 1), 3, 4).
145
+ """
146
+
147
+ points = poses_to_points(poses, dist=rot_weight)
148
+ new_points = interp(points,
149
+ n_interp * (points.shape[0] - 1),
150
+ k=spline_degree,
151
+ s=smoothness)
152
+ return points_to_poses(new_points)
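Usage sketch for the path interpolation above, with illustrative keyframes only: given n keyframe camera-to-world matrices of shape (3, 4), the returned path has n_interp * (n - 1) poses.

import numpy as np

keyframes = np.tile(np.eye(4)[:3], (3, 1, 1))                             # three identity-rotation poses
keyframes[:, :, 3] = [[0.0, 0.0, 0.0], [0.2, 0.1, 0.0], [0.4, 0.0, 0.1]]  # camera centers
path = generate_interpolated_path(keyframes, n_interp=30)
assert path.shape == (30 * (len(keyframes) - 1), 3, 4)                    # (60, 3, 4)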
153
+
154
+ def batch_visualize_tensor_global_pca(tensor_batch, num_components=3):
155
+ B, C, H, W = tensor_batch.shape
156
+
157
+ tensor_flat_all = tensor_batch.reshape(B, C, -1).permute(1, 0, 2).reshape(C, -1).T
158
+
159
+ tensor_flat_all_np = tensor_flat_all.cpu().numpy()
160
+
161
+ scaler = StandardScaler()
162
+ tensor_flat_all_np = scaler.fit_transform(tensor_flat_all_np)
163
+
164
+ pca = PCA(n_components=num_components)
165
+ tensor_reduced_all_np = pca.fit_transform(tensor_flat_all_np)
166
+
167
+ tensor_reduced_all = torch.tensor(tensor_reduced_all_np, dtype=tensor_batch.dtype).T.reshape(num_components, B, H * W).permute(1, 0, 2)
168
+
169
+ output_tensor = torch.zeros((B, 3, H, W))
170
+
171
+ for i in range(B):
172
+ tensor_reduced = tensor_reduced_all[i].reshape(num_components, H, W)
173
+ tensor_reduced -= tensor_reduced.min()
174
+ tensor_reduced /= tensor_reduced.max()
175
+ output_tensor[i] = tensor_reduced[:3]
176
+
177
+ return output_tensor
178
+
179
+ def depth_to_colormap(depth_tensor, colormap='jet'):
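+ """Normalize a (B, 1, H, W) depth batch with a global min/max and map it through a matplotlib colormap, returning a (B, 3, H, W) tensor."""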
180
+ B, _, _, _ = depth_tensor.shape
181
+
182
+ depth_tensor = (depth_tensor - depth_tensor.min()) / (depth_tensor.max() - depth_tensor.min())
183
+
184
+ depth_np = depth_tensor.squeeze(1).cpu().numpy()
185
+
186
+ cmap = plt.get_cmap(colormap)
187
+ colored_images = []
188
+
189
+ for i in range(B):
190
+ colored_image = cmap(depth_np[i])
191
+ colored_images.append(colored_image[..., :3])
192
+
193
+ colored_tensor = torch.tensor(np.array(colored_images), dtype=torch.float32).permute(0, 3, 1, 2)
194
+
195
+ return colored_tensor
196
+
197
+ def save_video(frames, video_path, fps=24):
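+ """Encode a list of H x W x 3 uint8 frames into a video file with moviepy."""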
198
+ clips = [mpy.ImageClip(frame).set_duration(1/fps) for frame in frames]
199
+ video = mpy.concatenate_videoclips(clips, method="compose")
200
+ video.write_videofile(video_path, fps=fps)
201
+
202
+ def tensors_to_videos(all_images, all_depth_vis, all_fmap_vis, all_sems_vis, video_dir='videos', fps=24):
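+ """Write the rendered images, depth visualizations, and feature-map visualizations as mp4 videos under video_dir."""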
203
+ B, C, H, W = all_images.shape
204
+ assert all_depth_vis.shape == (B, C, H, W)
205
+ assert all_fmap_vis.shape == (B, C, H, W)
206
+ assert all_sems_vis.shape == (B, C, H, W)
207
+ os.makedirs(video_dir, exist_ok=True)
208
+
209
+ all_images = (all_images.permute(0, 2, 3, 1).cpu().numpy() * 255).astype(np.uint8)
210
+ all_depth_vis = (all_depth_vis.permute(0, 2, 3, 1).cpu().numpy() * 255).astype(np.uint8)
211
+ all_fmap_vis = (all_fmap_vis.permute(0, 2, 3, 1).cpu().numpy() * 255).astype(np.uint8)
212
+ all_sems_vis = (all_sems_vis.permute(0, 2, 3, 1).cpu().numpy() * 255).astype(np.uint8)
213
+
214
+ save_video(all_images, os.path.join(video_dir, 'output_images_video.mp4'), fps=fps)
215
+ save_video(all_depth_vis, os.path.join(video_dir, 'output_depth_video.mp4'), fps=fps)
216
+ save_video(all_fmap_vis, os.path.join(video_dir, 'output_fmap_video.mp4'), fps=fps)
217
+ # save_video(all_sems_vis, os.path.join(video_dir, 'output_sems_video.mp4'), fps=fps)
218
+
219
+ print(f'Videos saved to {video_dir}')
220
+
221
+ def transfer_images_to_device(images, device):
222
+ """
223
+ Transfer the loaded images to the specified device.
224
+
225
+ Args:
226
+ images (list): List of dictionaries containing image data.
227
+ device (str or torch.device): The device to transfer the data to.
228
+
229
+ Returns:
230
+ list: List of dictionaries with image data transferred to the specified device.
231
+ """
232
+ transferred_images = []
233
+ for img_dict in images:
234
+ transferred_dict = {
235
+ 'img': img_dict['img'].to(device),
236
+ 'true_shape': torch.tensor(img_dict['true_shape'], device=device),
237
+ 'idx': img_dict['idx'],
238
+ 'instance': img_dict['instance']
239
+ }
240
+ transferred_images.append(transferred_dict)
241
+ return transferred_images
242
+
243
+ def render_camera_path(video_poses, camera_params, gaussians, model, device, pipeline, bg_color, image_shape):
244
+ """Helper function for rendering a camera path.
245
+
246
+ Args:
247
+ video_poses: list of camera poses.
248
+ camera_params: camera parameters containing extrinsics and intrinsics.
249
+ gaussians: Gaussian model.
250
+ model: feature extraction model.
251
+ device: compute device.
252
+ pipeline: rendering pipeline.
253
+ bg_color: background color.
254
+ image_shape: image size.
255
+
256
+ Returns:
257
+ rendered_images: rendered images.
258
+ rendered_feats: rendered feature maps.
259
+ rendered_depths: rendered depth maps.
260
+ rendered_sems: rendered semantic maps.
261
+ """
262
+ extrinsics, intrinsics = camera_params
263
+ rendered_images = []
264
+ rendered_feats = []
265
+ rendered_depths = []
266
+ rendered_sems = []
267
+
268
+ for i in range(len(video_poses)):
269
+ target_extrinsics = torch.zeros(4, 4).to(device)
270
+ target_extrinsics[3, 3] = 1.0
271
+ target_extrinsics[:3, :4] = torch.tensor(video_poses[i], device=device)
272
+ camera = get_scaled_camera(extrinsics[0], target_extrinsics, intrinsics[0], 1.0, image_shape)
273
+
274
+ rendered_output = render(camera, gaussians, pipeline, bg_color)
275
+ rendered_images.append(rendered_output['render'])
276
+
277
+ # Process the feature map
278
+ feature_map = rendered_output['feature_map']
279
+ feature_map = model.feature_expansion(feature_map[None, ...])
280
+
281
+ # Decode the semantic map
282
+ logits = model.lseg_feature_extractor.decode_feature(feature_map, labelset=LABELS)
283
+ semantic_map = torch.argmax(logits, dim=1) + 1
284
+ mask = COLORS[semantic_map.cpu()]
285
+ mask = rearrange(mask, 'b h w c -> b c h w')
286
+ rendered_sems.append(mask.squeeze(0))
287
+
288
+ # Subsample feature channels, then upsample the feature map spatially
289
+ feature_map = feature_map[:, ::16, ...]
290
+ feature_map = torch.nn.functional.interpolate(feature_map, scale_factor=2, mode='bilinear', align_corners=True)
291
+ rendered_feats.append(feature_map[0])
292
+ del feature_map
293
+
294
+ rendered_depths.append(rendered_output['depth'])
295
+
296
+ # Stack and post-process the results
297
+ rendered_images = torch.clamp(torch.stack(rendered_images, dim=0), 0, 1)
298
+ rendered_feats = torch.stack(rendered_feats, dim=0)
299
+ rendered_depths = torch.stack(rendered_depths, dim=0)
300
+ rendered_sems = torch.stack(rendered_sems, dim=0)
301
+
302
+ return rendered_images, rendered_feats, rendered_depths, rendered_sems
303
+
304
+ @torch.no_grad()
305
+ def render_video_from_file(file_list, model, output_path, device='cuda', resolution=224, n_interp=90, fps=30, path_type='default'):
306
+ # 1. load images
307
+ images = load_images(file_list, resolution, save_dir=os.path.join(output_path, 'processed_images'))
308
+ images = transfer_images_to_device(images, device) # Transfer images to the specified device
309
+ image_shape = images[0]['true_shape'][0]
310
+ # 2. get camera pose
311
+ pairs = make_pairs(images, prefilter=None, symmetrize=True)
312
+ output = inference(pairs, model.mast3r, device, batch_size=1)
313
+ mode = GlobalAlignerMode.PairViewer
314
+ scene = global_aligner(output, device=device, mode=mode)
315
+ extrinsics = scene.get_im_poses()
316
+ intrinsics = scene.get_intrinsics()
317
+ video_poses = generate_interpolated_path(extrinsics[:, :3, :].cpu().numpy(), n_interp=n_interp) # extrinsics: (b, 3, 4)
318
+ # 3. get gaussians
319
+ pred1, pred2 = model(*images)
320
+ pred = merge_and_split_predictions(pred1, pred2)
321
+ gaussians = GaussianModel.from_predictions(pred[0], sh_degree=3)
322
+ # 4. Render from the original viewpoints
323
+ pipeline = DummyPipeline()
324
+ bg_color = torch.tensor([0.0, 0.0, 0.0]).to(device)
325
+ camera_params = (extrinsics, intrinsics)
326
+
327
+ rendered_images, rendered_feats, rendered_depths, rendered_sems = render_camera_path(
328
+ video_poses, camera_params, gaussians, model, device, pipeline, bg_color, image_shape)
329
+
330
+ # 5. Visualize
331
+ all_fmap_vis = batch_visualize_tensor_global_pca(rendered_feats)
332
+ all_depth_vis = depth_to_colormap(rendered_depths)
333
+ all_sems_vis = rendered_sems
334
+
335
+ # 6. Save videos and the Gaussian point cloud
336
+ tensors_to_videos(rendered_images, all_depth_vis, all_fmap_vis, all_sems_vis, output_path, fps=fps)
337
+ gaussians.save_ply(os.path.join(output_path, 'gaussians.ply'))
338
+
339
+ # 7. Render from the moved viewpoints
340
+ moved_extrinsics = move_c2w_along_z(extrinsics, 2.0)
341
+ moved_video_poses = generate_interpolated_path(moved_extrinsics[:, :3, :].cpu().numpy(), n_interp=n_interp)
342
+ camera_params = (extrinsics, intrinsics)
343
+
344
+ moved_rendered_images, moved_rendered_feats, moved_rendered_depths, moved_rendered_sems = render_camera_path(
345
+ moved_video_poses, camera_params, gaussians, model, device, pipeline, bg_color, image_shape)
346
+
347
+ # 8. Visualize and save the moved-view results
348
+ moved_all_fmap_vis = batch_visualize_tensor_global_pca(moved_rendered_feats)
349
+ moved_all_depth_vis = depth_to_colormap(moved_rendered_depths)
350
+ moved_all_sems_vis = moved_rendered_sems
351
+
352
+ moved_output_path = os.path.join(output_path, 'moved')
353
+ os.makedirs(moved_output_path, exist_ok=True)
354
+ tensors_to_videos(moved_rendered_images, moved_all_depth_vis, moved_all_fmap_vis, moved_all_sems_vis,
355
+ moved_output_path, fps=fps)
submodules/PointTransformerV3/.gitmodules ADDED
@@ -0,0 +1,3 @@
1
+ [submodule "Pointcept"]
2
+ path = Pointcept
3
+ url = https://github.com/Pointcept/Pointcept
submodules/PointTransformerV3/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Pointcept
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
submodules/PointTransformerV3/Pointcept/.github/workflows/formatter.yml ADDED
@@ -0,0 +1,20 @@
1
+ name: Formatter
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ push:
6
+ branches:
7
+ - main
8
+ pull_request:
9
+ types: [opened, reopened, synchronize]
10
+
11
+ concurrency:
12
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
13
+ cancel-in-progress: true
14
+
15
+ jobs:
16
+ formatter:
17
+ runs-on: ubuntu-latest
18
+ steps:
19
+ - uses: actions/checkout@v3
20
+ - uses: psf/black@stable
submodules/PointTransformerV3/Pointcept/.gitignore ADDED
@@ -0,0 +1,16 @@
1
+ image/
2
+ __pycache__
3
+ **/build/
4
+ **/*.egg-info/
5
+ **/dist/
6
+ *.so
7
+ exp
8
+ weights
9
+ data
10
+ log
11
+ outputs/
12
+ .vscode
13
+ .idea
14
+ */.DS_Store
15
+ **/*.out
16
+ Dockerfile
submodules/PointTransformerV3/Pointcept/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Pointcept
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
submodules/PointTransformerV3/Pointcept/README.md ADDED
@@ -0,0 +1,896 @@
1
+ <p align="center">
2
+ <!-- pypi-strip -->
3
+ <picture>
4
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/Pointcept/Pointcept/main/docs/logo_dark.png">
5
+ <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/Pointcept/Pointcept/main/docs/logo.png">
6
+ <!-- /pypi-strip -->
7
+ <img alt="pointcept" src="https://raw.githubusercontent.com/Pointcept/Pointcept/main/docs/logo.png" width="400">
8
+ <!-- pypi-strip -->
9
+ </picture><br>
10
+ <!-- /pypi-strip -->
11
+ </p>
12
+
13
+ [![Formatter](https://github.com/pointcept/pointcept/actions/workflows/formatter.yml/badge.svg)](https://github.com/pointcept/pointcept/actions/workflows/formatter.yml)
14
+
15
+ **Pointcept** is a powerful and flexible codebase for point cloud perception research. It is also an official implementation of the following papers:
16
+ - **Point Transformer V3: Simpler, Faster, Stronger**
17
+ *Xiaoyang Wu, Li Jiang, Peng-Shuai Wang, Zhijian Liu, Xihui Liu, Yu Qiao, Wanli Ouyang, Tong He, Hengshuang Zhao*
18
+ IEEE Conference on Computer Vision and Pattern Recognition (**CVPR**) 2024 - Oral
19
+ [ Backbone ] [PTv3] - [ [arXiv](https://arxiv.org/abs/2312.10035) ] [ [Bib](https://xywu.me/research/ptv3/bib.txt) ] [ [Project](https://github.com/Pointcept/PointTransformerV3) ] &rarr; [here](https://github.com/Pointcept/PointTransformerV3)
20
+
21
+ - **OA-CNNs: Omni-Adaptive Sparse CNNs for 3D Semantic Segmentation**
22
+ *Bohao Peng, Xiaoyang Wu, Li Jiang, Yukang Chen, Hengshuang Zhao, Zhuotao Tian, Jiaya Jia*
23
+ IEEE Conference on Computer Vision and Pattern Recognition (**CVPR**) 2024
24
+ [ Backbone ] [ OA-CNNs ] - [ [arXiv](https://arxiv.org/abs/2403.14418) ] [ [Bib](https://xywu.me/research/oacnns/bib.txt) ] &rarr; [here](#oa-cnns)
25
+
26
+ - **PonderV2: Pave the Way for 3D Foundation Model with A Universal Pre-training Paradigm**
27
+ *Haoyi Zhu\*, Honghui Yang\*, Xiaoyang Wu\*, Di Huang\*, Sha Zhang, Xianglong He, Tong He, Hengshuang Zhao, Chunhua Shen, Yu Qiao, Wanli Ouyang*
28
+ arXiv Preprint 2023
29
+ [ Pretrain ] [PonderV2] - [ [arXiv](https://arxiv.org/abs/2310.08586) ] [ [Bib](https://xywu.me/research/ponderv2/bib.txt) ] [ [Project](https://github.com/OpenGVLab/PonderV2) ] &rarr; [here](https://github.com/OpenGVLab/PonderV2)
30
+
31
+
32
+ - **Towards Large-scale 3D Representation Learning with Multi-dataset Point Prompt Training**
33
+ *Xiaoyang Wu, Zhuotao Tian, Xin Wen, Bohao Peng, Xihui Liu, Kaicheng Yu, Hengshuang Zhao*
34
+ IEEE Conference on Computer Vision and Pattern Recognition (**CVPR**) 2024
35
+ [ Pretrain ] [PPT] - [ [arXiv](https://arxiv.org/abs/2308.09718) ] [ [Bib](https://xywu.me/research/ppt/bib.txt) ] &rarr; [here](#point-prompt-training-ppt)
36
+
37
+ - **Masked Scene Contrast: A Scalable Framework for Unsupervised 3D Representation Learning**
38
+ *Xiaoyang Wu, Xin Wen, Xihui Liu, Hengshuang Zhao*
39
+ IEEE Conference on Computer Vision and Pattern Recognition (**CVPR**) 2023
40
+ [ Pretrain ] [ MSC ] - [ [arXiv](https://arxiv.org/abs/2303.14191) ] [ [Bib](https://xywu.me/research/msc/bib.txt) ] &rarr; [here](#masked-scene-contrast-msc)
41
+
42
+
43
+ - **Learning Context-aware Classifier for Semantic Segmentation** (3D Part)
44
+ *Zhuotao Tian, Jiequan Cui, Li Jiang, Xiaojuan Qi, Xin Lai, Yixin Chen, Shu Liu, Jiaya Jia*
45
+ AAAI Conference on Artificial Intelligence (**AAAI**) 2023 - Oral
46
+ [ SemSeg ] [ CAC ] - [ [arXiv](https://arxiv.org/abs/2303.11633) ] [ [Bib](https://xywu.me/research/cac/bib.txt) ] [ [2D Part](https://github.com/tianzhuotao/CAC) ] &rarr; [here](#context-aware-classifier)
47
+
48
+
49
+ - **Point Transformer V2: Grouped Vector Attention and Partition-based Pooling**
50
+ *Xiaoyang Wu, Yixing Lao, Li Jiang, Xihui Liu, Hengshuang Zhao*
51
+ Conference on Neural Information Processing Systems (**NeurIPS**) 2022
52
+ [ Backbone ] [ PTv2 ] - [ [arXiv](https://arxiv.org/abs/2210.05666) ] [ [Bib](https://xywu.me/research/ptv2/bib.txt) ] &rarr; [here](#point-transformers)
53
+
54
+
55
+ - **Point Transformer**
56
+ *Hengshuang Zhao, Li Jiang, Jiaya Jia, Philip Torr, Vladlen Koltun*
57
+ IEEE International Conference on Computer Vision (**ICCV**) 2021 - Oral
58
+ [ Backbone ] [ PTv1 ] - [ [arXiv](https://arxiv.org/abs/2012.09164) ] [ [Bib](https://hszhao.github.io/papers/iccv21_pointtransformer_bib.txt) ] &rarr; [here](#point-transformers)
59
+
60
+ Additionally, **Pointcept** integrates the following excellent work (including the above):
61
+ Backbone:
62
+ [MinkUNet](https://github.com/NVIDIA/MinkowskiEngine) ([here](#sparseunet)),
63
+ [SpUNet](https://github.com/traveller59/spconv) ([here](#sparseunet)),
64
+ [SPVCNN](https://github.com/mit-han-lab/spvnas) ([here](#spvcnn)),
65
+ [OACNNs](https://arxiv.org/abs/2403.14418) ([here](#oa-cnns)),
66
+ [PTv1](https://arxiv.org/abs/2012.09164) ([here](#point-transformers)),
67
+ [PTv2](https://arxiv.org/abs/2210.05666) ([here](#point-transformers)),
68
+ [PTv3](https://arxiv.org/abs/2312.10035) ([here](#point-transformers)),
69
+ [StratifiedFormer](https://github.com/dvlab-research/Stratified-Transformer) ([here](#stratified-transformer)),
70
+ [OctFormer](https://github.com/octree-nn/octformer) ([here](#octformer)),
71
+ [Swin3D](https://github.com/microsoft/Swin3D) ([here](#swin3d));
72
+ Semantic Segmentation:
73
+ [Mix3d](https://github.com/kumuji/mix3d) ([here](https://github.com/Pointcept/Pointcept/blob/main/configs/scannet/semseg-spunet-v1m1-0-base.py#L5)),
74
+ [CAC](https://arxiv.org/abs/2303.11633) ([here](#context-aware-classifier));
75
+ Instance Segmentation:
76
+ [PointGroup](https://github.com/dvlab-research/PointGroup) ([here](#pointgroup));
77
+ Pre-training:
78
+ [PointContrast](https://github.com/facebookresearch/PointContrast) ([here](#pointcontrast)),
79
+ [Contrastive Scene Contexts](https://github.com/facebookresearch/ContrastiveSceneContexts) ([here](#contrastive-scene-contexts)),
80
+ [Masked Scene Contrast](https://arxiv.org/abs/2303.14191) ([here](#masked-scene-contrast-msc)),
81
+ [Point Prompt Training](https://arxiv.org/abs/2308.09718) ([here](#point-prompt-training-ppt));
82
+ Datasets:
83
+ [ScanNet](http://www.scan-net.org/) ([here](#scannet-v2)),
84
+ [ScanNet200](http://www.scan-net.org/) ([here](#scannet-v2)),
85
+ [ScanNet++](https://kaldir.vc.in.tum.de/scannetpp/) ([here](#scannet)),
86
+ [S3DIS](https://docs.google.com/forms/d/e/1FAIpQLScDimvNMCGhy_rmBA2gHfDu3naktRm6A8BPwAWWDv-Uhm6Shw/viewform?c=0&w=1) ([here](#s3dis)),
87
+ [Matterport3D](https://niessner.github.io/Matterport/) ([here](#matterport3d)),
88
+ [ArkitScene](https://github.com/apple/ARKitScenes),
89
+ [Structured3D](https://structured3d-dataset.org/) ([here](#structured3d)),
90
+ [SemanticKITTI](http://www.semantic-kitti.org/) ([here](#semantickitti)),
91
+ [nuScenes](https://www.nuscenes.org/nuscenes) ([here](#nuscenes)),
92
+ [ModelNet40](https://modelnet.cs.princeton.edu/) ([here](#modelnet)),
93
+ [Waymo](https://waymo.com/open/) ([here](#waymo)).
94
+
95
+
96
+ ## Highlights
97
+ - *May, 2024*: In v1.5.2, we redesigned the default structure for each dataset for better performance. Please **re-preprocess** datasets or **download** our preprocessed datasets from **[here](https://huggingface.co/Pointcept)**.
98
+ - *Apr, 2024*: **PTv3** is selected as one of the 90 **Oral** papers (3.3% accepted papers, 0.78% submissions) by CVPR'24!
99
+ - *Mar, 2024*: We released the code for **OA-CNNs**, accepted by CVPR'24. Issues related to **OA-CNNs** can be directed to @Pbihao.
100
+ - *Feb, 2024*: **PTv3** and **PPT** are accepted by CVPR'24, another **two** papers by our Pointcept team have also been accepted by CVPR'24 🎉🎉🎉. We will make them publicly available soon!
101
+ - *Dec, 2023*: **PTv3** is released on arXiv, and the code is available in Pointcept. PTv3 is an efficient backbone model that achieves SOTA performances across indoor and outdoor scenarios.
102
+ - *Aug, 2023*: **PPT** is released on arXiv. PPT presents a multi-dataset pre-training framework that achieves SOTA performance in both **indoor** and **outdoor** scenarios. It is compatible with various existing pre-training frameworks and backbones. A **pre-release** version of the code is accessible; for those interested, please feel free to contact me directly for access.
103
+ - *Mar, 2023*: We released our codebase, **Pointcept**, a highly potent tool for point cloud representation learning and perception. We welcome new work to join the _Pointcept_ family and highly recommend reading [Quick Start](#quick-start) before starting your trial.
104
+ - *Feb, 2023*: **MSC** and **CeCo** accepted by CVPR 2023. _MSC_ is a highly efficient and effective pretraining framework that facilitates cross-dataset large-scale pretraining, while _CeCo_ is a segmentation method specifically designed for long-tail datasets. Both approaches are compatible with all existing backbone models in our codebase, and we will soon make the code available for public use.
105
+ - *Jan, 2023*: **CAC**, oral work of AAAI 2023, has expanded its 3D result with the incorporation of Pointcept. This addition will allow CAC to serve as a pluggable segmentor within our codebase.
106
+ - *Sep, 2022*: **PTv2** accepted by NeurIPS 2022. It is a continuation of the Point Transformer. The proposed GVA theory can apply to most existing attention mechanisms, while Grid Pooling is also a practical addition to existing pooling methods.
107
+
108
+ ## Citation
109
+ If you find _Pointcept_ useful to your research, please cite our work as encouragement. (੭ˊ꒳​ˋ)੭✧
110
+ ```
111
+ @misc{pointcept2023,
112
+ title={Pointcept: A Codebase for Point Cloud Perception Research},
113
+ author={Pointcept Contributors},
114
+ howpublished = {\url{https://github.com/Pointcept/Pointcept}},
115
+ year={2023}
116
+ }
117
+ ```
118
+
119
+ ## Overview
120
+
121
+ - [Installation](#installation)
122
+ - [Data Preparation](#data-preparation)
123
+ - [Quick Start](#quick-start)
124
+ - [Model Zoo](#model-zoo)
125
+ - [Citation](#citation)
126
+ - [Acknowledgement](#acknowledgement)
127
+
128
+ ## Installation
129
+
130
+ ### Requirements
131
+ - Ubuntu: 18.04 and above.
132
+ - CUDA: 11.3 and above.
133
+ - PyTorch: 1.10.0 and above.
134
+
135
+ ### Conda Environment
136
+
137
+ ```bash
138
+ conda create -n pointcept python=3.8 -y
139
+ conda activate pointcept
140
+ conda install ninja -y
141
+ # Choose version you want here: https://pytorch.org/get-started/previous-versions/
142
+ conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch -y
143
+ conda install h5py pyyaml -c anaconda -y
144
+ conda install sharedarray tensorboard tensorboardx yapf addict einops scipy plyfile termcolor timm -c conda-forge -y
145
+ conda install pytorch-cluster pytorch-scatter pytorch-sparse -c pyg -y
146
+ pip install torch-geometric
147
+
148
+ # spconv (SparseUNet)
149
+ # refer https://github.com/traveller59/spconv
150
+ pip install spconv-cu113
151
+
152
+ # PPT (clip)
153
+ pip install ftfy regex tqdm
154
+ pip install git+https://github.com/openai/CLIP.git
155
+
156
+ # PTv1 & PTv2 or precise eval
157
+ cd libs/pointops
158
+ # usual
159
+ python setup.py install
160
+ # docker & multi GPU arch
161
+ TORCH_CUDA_ARCH_LIST="ARCH LIST" python setup.py install
162
+ # e.g. 7.5: RTX 3000; 8.0: a100 More available in: https://developer.nvidia.com/cuda-gpus
163
+ TORCH_CUDA_ARCH_LIST="7.5 8.0" python setup.py install
164
+ cd ../..
165
+
166
+ # Open3D (visualization, optional)
167
+ pip install open3d
168
+ ```
169
+
170
+ ## Data Preparation
171
+
172
+ ### ScanNet v2
173
+
174
+ The preprocessing supports semantic and instance segmentation for `ScanNet20`, `ScanNet200`, and `ScanNet Data Efficient`.
175
+ - Download the [ScanNet](http://www.scan-net.org/) v2 dataset.
176
+ - Run preprocessing code for raw ScanNet as follows:
177
+
178
+ ```bash
179
+ # RAW_SCANNET_DIR: the directory of downloaded ScanNet v2 raw dataset.
180
+ # PROCESSED_SCANNET_DIR: the directory of the processed ScanNet dataset (output dir).
181
+ python pointcept/datasets/preprocessing/scannet/preprocess_scannet.py --dataset_root ${RAW_SCANNET_DIR} --output_root ${PROCESSED_SCANNET_DIR}
182
+ ```
183
+ - (Optional) Download ScanNet Data Efficient files:
184
+ ```bash
185
+ # download-scannet.py is the official download script
186
+ # or follow instructions here: https://kaldir.vc.in.tum.de/scannet_benchmark/data_efficient/documentation#download
187
+ python download-scannet.py --data_efficient -o ${RAW_SCANNET_DIR}
188
+ # unzip downloads
189
+ cd ${RAW_SCANNET_DIR}/tasks
190
+ unzip limited-annotation-points.zip
191
+ unzip limited-reconstruction-scenes.zip
192
+ # copy files to processed dataset folder
193
+ mkdir ${PROCESSED_SCANNET_DIR}/tasks
194
+ cp -r ${RAW_SCANNET_DIR}/tasks/points ${PROCESSED_SCANNET_DIR}/tasks
195
+ cp -r ${RAW_SCANNET_DIR}/tasks/scenes ${PROCESSED_SCANNET_DIR}/tasks
196
+ ```
197
+ - (Alternative) Our preprocessed data can be downloaded directly [[here](https://huggingface.co/datasets/Pointcept/scannet-compressed)]; please agree to the official license before downloading it.
198
+
199
+ - Link processed dataset to codebase:
200
+ ```bash
201
+ # PROCESSED_SCANNET_DIR: the directory of the processed ScanNet dataset.
202
+ mkdir data
203
+ ln -s ${PROCESSED_SCANNET_DIR} ${CODEBASE_DIR}/data/scannet
204
+ ```
205
+
206
+ ### ScanNet++
207
+ - Download the [ScanNet++](https://kaldir.vc.in.tum.de/scannetpp/) dataset.
208
+ - Run preprocessing code for raw ScanNet++ as follows:
209
+ ```bash
210
+ # RAW_SCANNETPP_DIR: the directory of downloaded ScanNet++ raw dataset.
211
+ # PROCESSED_SCANNETPP_DIR: the directory of the processed ScanNet++ dataset (output dir).
212
+ # NUM_WORKERS: the number of workers for parallel preprocessing.
213
+ python pointcept/datasets/preprocessing/scannetpp/preprocess_scannetpp.py --dataset_root ${RAW_SCANNETPP_DIR} --output_root ${PROCESSED_SCANNETPP_DIR} --num_workers ${NUM_WORKERS}
214
+ ```
215
+ - Sampling and chunking large point cloud data in train/val split as follows (only used for training):
216
+ ```bash
217
+ # PROCESSED_SCANNETPP_DIR: the directory of the processed ScanNet++ dataset (output dir).
218
+ # NUM_WORKERS: the number of workers for parallel preprocessing.
219
+ python pointcept/datasets/preprocessing/sampling_chunking_data.py --dataset_root ${PROCESSED_SCANNETPP_DIR} --grid_size 0.01 --chunk_range 6 6 --chunk_stride 3 3 --split train --num_workers ${NUM_WORKERS}
220
+ python pointcept/datasets/preprocessing/sampling_chunking_data.py --dataset_root ${PROCESSED_SCANNETPP_DIR} --grid_size 0.01 --chunk_range 6 6 --chunk_stride 3 3 --split val --num_workers ${NUM_WORKERS}
221
+ ```
222
+ - (Alternative) Our preprocessed data can be downloaded directly [[here](https://huggingface.co/datasets/Pointcept/scannetpp-compressed)]; please agree to the official license before downloading it.
223
+ - Link processed dataset to codebase:
224
+ ```bash
225
+ # PROCESSED_SCANNETPP_DIR: the directory of the processed ScanNet++ dataset.
226
+ mkdir data
227
+ ln -s ${PROCESSED_SCANNETPP_DIR} ${CODEBASE_DIR}/data/scannetpp
228
+ ```
229
+
230
+ ### S3DIS
231
+
232
+ - Download S3DIS data by filling this [Google form](https://docs.google.com/forms/d/e/1FAIpQLScDimvNMCGhy_rmBA2gHfDu3naktRm6A8BPwAWWDv-Uhm6Shw/viewform?c=0&w=1). Download the `Stanford3dDataset_v1.2.zip` file and unzip it.
233
+ - Fix error in `Area_5/office_19/Annotations/ceiling` Line 323474 (103.0�0000 => 103.000000).
234
+ - (Optional) Download Full 2D-3D S3DIS dataset (no XYZ) from [here](https://github.com/alexsax/2D-3D-Semantics) for parsing normal.
235
+ - Run preprocessing code for S3DIS as follows:
236
+
237
+ ```bash
238
+ # S3DIS_DIR: the directory of downloaded Stanford3dDataset_v1.2 dataset.
239
+ # RAW_S3DIS_DIR: the directory of Stanford2d3dDataset_noXYZ dataset. (optional, for parsing normal)
240
+ # PROCESSED_S3DIS_DIR: the directory of processed S3DIS dataset (output dir).
241
+
242
+ # S3DIS without aligned angle
243
+ python pointcept/datasets/preprocessing/s3dis/preprocess_s3dis.py --dataset_root ${S3DIS_DIR} --output_root ${PROCESSED_S3DIS_DIR}
244
+ # S3DIS with aligned angle
245
+ python pointcept/datasets/preprocessing/s3dis/preprocess_s3dis.py --dataset_root ${S3DIS_DIR} --output_root ${PROCESSED_S3DIS_DIR} --align_angle
246
+ # S3DIS with normal vector (recommended, normal is helpful)
247
+ python pointcept/datasets/preprocessing/s3dis/preprocess_s3dis.py --dataset_root ${S3DIS_DIR} --output_root ${PROCESSED_S3DIS_DIR} --raw_root ${RAW_S3DIS_DIR} --parse_normal
248
+ python pointcept/datasets/preprocessing/s3dis/preprocess_s3dis.py --dataset_root ${S3DIS_DIR} --output_root ${PROCESSED_S3DIS_DIR} --raw_root ${RAW_S3DIS_DIR} --align_angle --parse_normal
249
+ ```
250
+
251
+ - (Alternative) Our preprocessed data can also be downloaded [[here](https://huggingface.co/datasets/Pointcept/s3dis-compressed
252
+ )] (with normal vector and aligned angle); please agree to the official license before downloading it.
253
+
254
+ - Link processed dataset to codebase.
255
+ ```bash
256
+ # PROCESSED_S3DIS_DIR: the directory of processed S3DIS dataset.
257
+ mkdir data
258
+ ln -s ${PROCESSED_S3DIS_DIR} ${CODEBASE_DIR}/data/s3dis
259
+ ```
260
+ ### Structured3D
261
+
262
+ - Download Structured3D panorama related and perspective (full) related zip files by filling this [Google form](https://docs.google.com/forms/d/e/1FAIpQLSc0qtvh4vHSoZaW6UvlXYy79MbcGdZfICjh4_t4bYofQIVIdw/viewform?pli=1) (no need to unzip them).
263
+ - Organize all downloaded zip file in one folder (`${STRUCT3D_DIR}`).
264
+ - Run preprocessing code for Structured3D as follows:
265
+ ```bash
266
+ # STRUCT3D_DIR: the directory of downloaded Structured3D dataset.
267
+ # PROCESSED_STRUCT3D_DIR: the directory of processed Structured3D dataset (output dir).
268
+ # NUM_WORKERS: number of workers for preprocessing; defaults to the CPU count (might OOM).
269
+ export PYTHONPATH=./
270
+ python pointcept/datasets/preprocessing/structured3d/preprocess_structured3d.py --dataset_root ${STRUCT3D_DIR} --output_root ${PROCESSED_STRUCT3D_DIR} --num_workers ${NUM_WORKERS} --grid_size 0.01 --fuse_prsp --fuse_pano
271
+ ```
272
+ Following the instruction of [Swin3D](https://arxiv.org/abs/2304.06906), we keep 25 categories with frequencies of more than 0.001, out of the original 40 categories.
273
+
274
+ [//]: # (- &#40;Alternative&#41; Our preprocess data can also be downloaded [[here]&#40;&#41;], please agree the official license before download it.)
275
+
276
+ - (Alternative) Our preprocessed data can also be downloaded [[here](https://huggingface.co/datasets/Pointcept/structured3d-compressed
277
+ )] (with perspective views and panorama view, 471.7G after unzipping); please agree to the official license before downloading it.
278
+
279
+ - Link processed dataset to codebase.
280
+ ```bash
281
+ # PROCESSED_STRUCT3D_DIR: the directory of processed Structured3D dataset (output dir).
282
+ mkdir data
283
+ ln -s ${PROCESSED_STRUCT3D_DIR} ${CODEBASE_DIR}/data/structured3d
284
+ ```
285
+ ### Matterport3D
286
+ - Follow [this page](https://niessner.github.io/Matterport/#download) to request access to the dataset.
287
+ - Download the "region_segmentation" type, which represents the division of a scene into individual rooms.
288
+ ```bash
289
+ # download-mp.py is the official download script
290
+ # MATTERPORT3D_DIR: the directory of downloaded Matterport3D dataset.
291
+ python download-mp.py -o {MATTERPORT3D_DIR} --type region_segmentations
292
+ ```
293
+ - Unzip the region_segmentations data
294
+ ```bash
295
+ # MATTERPORT3D_DIR: the directory of downloaded Matterport3D dataset.
296
+ python pointcept/datasets/preprocessing/matterport3d/unzip_matterport3d_region_segmentation.py --dataset_root {MATTERPORT3D_DIR}
297
+ ```
298
+ - Run preprocessing code for Matterport3D as follows:
299
+ ```bash
300
+ # MATTERPORT3D_DIR: the directory of downloaded Matterport3D dataset.
301
+ # PROCESSED_MATTERPORT3D_DIR: the directory of processed Matterport3D dataset (output dir).
302
+ # NUM_WORKERS: the number of workers for this preprocessing.
303
+ python pointcept/datasets/preprocessing/matterport3d/preprocess_matterport3d_mesh.py --dataset_root ${MATTERPORT3D_DIR} --output_root ${PROCESSED_MATTERPORT3D_DIR} --num_workers ${NUM_WORKERS}
304
+ ```
305
+ - Link processed dataset to codebase.
306
+ ```bash
307
+ # PROCESSED_MATTERPORT3D_DIR: the directory of processed Matterport3D dataset (output dir).
308
+ mkdir data
309
+ ln -s ${PROCESSED_MATTERPORT3D_DIR} ${CODEBASE_DIR}/data/matterport3d
310
+ ```
311
+
312
+ Following the instruction of [OpenRooms](https://github.com/ViLab-UCSD/OpenRooms), we remapped Matterport3D's categories to ScanNet 20 semantic categories with the addition of a ceiling category.
313
+ * (Alternative) Our preprocessed data can also be downloaded [here](https://huggingface.co/datasets/Pointcept/matterport3d-compressed); please agree to the official license before downloading it.
314
+
315
+ ### SemanticKITTI
316
+ - Download [SemanticKITTI](http://www.semantic-kitti.org/dataset.html#download) dataset.
317
+ - Link dataset to codebase.
318
+ ```bash
319
+ # SEMANTIC_KITTI_DIR: the directory of SemanticKITTI dataset.
320
+ # |- SEMANTIC_KITTI_DIR
321
+ # |- dataset
322
+ # |- sequences
323
+ # |- 00
324
+ # |- 01
325
+ # |- ...
326
+
327
+ mkdir -p data
328
+ ln -s ${SEMANTIC_KITTI_DIR} ${CODEBASE_DIR}/data/semantic_kitti
329
+ ```
330
+
331
+ ### nuScenes
332
+ - Download the official [NuScene](https://www.nuscenes.org/nuscenes#download) dataset (with Lidar Segmentation) and organize the downloaded files as follows:
333
+ ```bash
334
+ NUSCENES_DIR
335
+ │── samples
336
+ │── sweeps
337
+ │── lidarseg
338
+ ...
339
+ │── v1.0-trainval
340
+ │── v1.0-test
341
+ ```
342
+ - Run information preprocessing code (modified from OpenPCDet) for nuScenes as follows:
343
+ ```bash
344
+ # NUSCENES_DIR: the directory of downloaded nuScenes dataset.
345
+ # PROCESSED_NUSCENES_DIR: the directory of processed nuScenes dataset (output dir).
346
+ # MAX_SWEEPS: Max number of sweeps. Default: 10.
347
+ pip install nuscenes-devkit pyquaternion
348
+ python pointcept/datasets/preprocessing/nuscenes/preprocess_nuscenes_info.py --dataset_root ${NUSCENES_DIR} --output_root ${PROCESSED_NUSCENES_DIR} --max_sweeps ${MAX_SWEEPS} --with_camera
349
+ ```
350
+ - (Alternative) Our preprocessed nuScenes information data can also be downloaded [[here](
351
+ https://huggingface.co/datasets/Pointcept/nuscenes-compressed)] (only the processed information; you still need to download the raw dataset and link it to the folder); please agree to the official license before downloading it.
352
+
353
+ - Link raw dataset to processed NuScene dataset folder:
354
+ ```bash
355
+ # NUSCENES_DIR: the directory of downloaded nuScenes dataset.
356
+ # PROCESSED_NUSCENES_DIR: the directory of processed nuScenes dataset (output dir).
357
+ ln -s ${NUSCENES_DIR} ${PROCESSED_NUSCENES_DIR}/raw
358
+ ```
359
+ then the processed nuscenes folder is organized as follows:
360
+ ```bash
361
+ nuscene
362
+ |── raw
363
+ │── samples
364
+ │── sweeps
365
+ │── lidarseg
366
+ ...
367
+ │── v1.0-trainval
368
+ │── v1.0-test
369
+ |── info
370
+ ```
371
+
372
+ - Link processed dataset to codebase.
373
+ ```bash
374
+ # PROCESSED_NUSCENES_DIR: the directory of processed nuScenes dataset (output dir).
375
+ mkdir data
376
+ ln -s ${PROCESSED_NUSCENES_DIR} ${CODEBASE_DIR}/data/nuscenes
377
+ ```
378
+
379
+ ### Waymo
380
+ - Download the official [Waymo](https://waymo.com/open/download/) dataset (v1.4.3) and organize the downloaded files as follows:
381
+ ```bash
382
+ WAYMO_RAW_DIR
383
+ │── training
384
+ │── validation
385
+ │── testing
386
+ ```
387
+ - Install the following dependencies:
388
+ ```bash
389
+ # If shows "No matching distribution found", download whl directly from Pypi and install the package.
390
+ conda create -n waymo python=3.10 -y
391
+ conda activate waymo
392
+ pip install waymo-open-dataset-tf-2-12-0
393
+ ```
394
+ - Run the preprocessing code as follows:
395
+ ```bash
396
+ # WAYMO_DIR: the directory of the downloaded Waymo dataset.
397
+ # PROCESSED_WAYMO_DIR: the directory of the processed Waymo dataset (output dir).
398
+ # NUM_WORKERS: num workers for preprocessing
399
+ python pointcept/datasets/preprocessing/waymo/preprocess_waymo.py --dataset_root ${WAYMO_DIR} --output_root ${PROCESSED_WAYMO_DIR} --splits training validation --num_workers ${NUM_WORKERS}
400
+ ```
401
+
402
+ - Link processed dataset to the codebase.
403
+ ```bash
404
+ # PROCESSED_WAYMO_DIR: the directory of the processed Waymo dataset (output dir).
405
+ mkdir data
406
+ ln -s ${PROCESSED_WAYMO_DIR} ${CODEBASE_DIR}/data/waymo
407
+ ```
408
+
409
+ ### ModelNet
410
+ - Download [modelnet40_normal_resampled.zip](https://shapenet.cs.stanford.edu/media/modelnet40_normal_resampled.zip) and unzip it.
411
+ - Link dataset to the codebase.
412
+ ```bash
413
+ mkdir -p data
414
+ ln -s ${MODELNET_DIR} ${CODEBASE_DIR}/data/modelnet40_normal_resampled
415
+ ```
416
+
417
+ ## Quick Start
418
+
419
+ ### Training
420
+ **Train from scratch.** The training process is based on the configs in the `configs` folder.
421
+ The training script will generate an experiment folder in the `exp` folder and back up essential code in the experiment folder.
422
+ Training config, log, tensorboard, and checkpoints will also be saved into the experiment folder during the training process.
423
+ ```bash
424
+ export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}
425
+ # Script (Recommended)
426
+ sh scripts/train.sh -p ${INTERPRETER_PATH} -g ${NUM_GPU} -d ${DATASET_NAME} -c ${CONFIG_NAME} -n ${EXP_NAME}
427
+ # Direct
428
+ export PYTHONPATH=./
429
+ python tools/train.py --config-file ${CONFIG_PATH} --num-gpus ${NUM_GPU} --options save_path=${SAVE_PATH}
430
+ ```
431
+
432
+ For example:
433
+ ```bash
434
+ # By script (Recommended)
435
+ # -p is default set as python and can be ignored
436
+ sh scripts/train.sh -p python -d scannet -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
437
+ # Direct
438
+ export PYTHONPATH=./
439
+ python tools/train.py --config-file configs/scannet/semseg-pt-v2m2-0-base.py --options save_path=exp/scannet/semseg-pt-v2m2-0-base
440
+ ```
441
+ **Resume training from checkpoint.** If the training process is interrupted by accident, the following script can resume training from a given checkpoint.
442
+ ```bash
443
+ export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}
444
+ # Script (Recommended)
445
+ # simply add "-r true"
446
+ sh scripts/train.sh -p ${INTERPRETER_PATH} -g ${NUM_GPU} -d ${DATASET_NAME} -c ${CONFIG_NAME} -n ${EXP_NAME} -r true
447
+ # Direct
448
+ export PYTHONPATH=./
449
+ python tools/train.py --config-file ${CONFIG_PATH} --num-gpus ${NUM_GPU} --options save_path=${SAVE_PATH} resume=True weight=${CHECKPOINT_PATH}
450
+ ```
451
+
452
+ ### Testing
453
+ During training, model evaluation is performed on point clouds after grid sampling (voxelization), providing an initial assessment of model performance. However, to obtain precise evaluation results, testing is **essential**. The testing process involves subsampling a dense point cloud into a sequence of voxelized point clouds, ensuring comprehensive coverage of all points. These sub-results are then predicted and collected to form a complete prediction of the entire point cloud. This approach yields higher evaluation results compared to simply mapping/interpolating the prediction. In addition, our testing code supports TTA (test time augmentation) testing, which further enhances the stability of evaluation performance.
454
+
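+ A rough sketch of this gather-and-vote idea (illustrative only; the names below are hypothetical and the real logic lives in the tester classes):
+
+ ```python
+ import torch
+
+ def fragment_inference(model, fragments, num_points, num_classes):
+     # Every original point appears in at least one voxelized fragment;
+     # frag["index"] maps fragment points back to dense point indices.
+     logits = torch.zeros(num_points, num_classes)
+     for frag in fragments:
+         pred = model(frag)                      # (n_frag, num_classes) logits
+         logits[frag["index"]] += pred.softmax(-1).cpu()
+     return logits.argmax(-1)                    # one label per original point
+ ```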
455
+ ```bash
456
+ # By script (Based on experiment folder created by training script)
457
+ sh scripts/test.sh -p ${INTERPRETER_PATH} -g ${NUM_GPU} -d ${DATASET_NAME} -n ${EXP_NAME} -w ${CHECKPOINT_NAME}
458
+ # Direct
459
+ export PYTHONPATH=./
460
+ python tools/test.py --config-file ${CONFIG_PATH} --num-gpus ${NUM_GPU} --options save_path=${SAVE_PATH} weight=${CHECKPOINT_PATH}
461
+ ```
462
+ For example:
463
+ ```bash
464
+ # By script (Based on experiment folder created by training script)
465
+ # -p is default set as python and can be ignored
466
+ # -w is default set as model_best and can be ignored
467
+ sh scripts/test.sh -p python -d scannet -n semseg-pt-v2m2-0-base -w model_best
468
+ # Direct
469
+ export PYTHONPATH=./
470
+ python tools/test.py --config-file configs/scannet/semseg-pt-v2m2-0-base.py --options save_path=exp/scannet/semseg-pt-v2m2-0-base weight=exp/scannet/semseg-pt-v2m2-0-base/model/model_best.pth
471
+ ```
472
+
473
+ TTA can be disabled by replacing `data.test.test_cfg.aug_transform = [...]` with:
474
+
475
+ ```python
476
+ data = dict(
477
+ train = dict(...),
478
+ val = dict(...),
479
+ test = dict(
480
+ ...,
481
+ test_cfg = dict(
482
+ ...,
483
+ aug_transform = [
484
+ [dict(type="RandomRotateTargetAngle", angle=[0], axis="z", center=[0, 0, 0], p=1)]
485
+ ]
486
+ )
487
+ )
488
+ )
489
+ ```
490
+
491
+ ### Offset
492
+ `Offset` is the separator of point clouds in batch data, and it is similar to the concept of `Batch` in PyG.
493
+ A visual illustration of batch and offset is as follows:
494
+ <p align="center">
495
+ <!-- pypi-strip -->
496
+ <picture>
497
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/Pointcept/Pointcept/main/docs/offset_dark.png">
498
+ <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/Pointcept/Pointcept/main/docs/offset.png">
499
+ <!-- /pypi-strip -->
500
+ <img alt="pointcept" src="https://raw.githubusercontent.com/Pointcept/Pointcept/main/docs/offset.png" width="480">
501
+ <!-- pypi-strip -->
502
+ </picture><br>
503
+ <!-- /pypi-strip -->
504
+ </p>
505
+
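+ As a rough sketch (plain PyTorch, not tied to a specific Pointcept utility), the two formats convert into each other as follows:
+
+ ```python
+ import torch
+
+ # batch: per-point sample index, e.g. 3 point clouds with 2, 3, and 1 points
+ batch = torch.tensor([0, 0, 1, 1, 1, 2])
+
+ # offset: cumulative end index of each cloud in the concatenated point list
+ offset = torch.cumsum(torch.bincount(batch), dim=0)        # tensor([2, 5, 6])
+
+ # back again: repeat each sample index by the size of its cloud
+ counts = torch.diff(offset, prepend=torch.zeros(1, dtype=offset.dtype))
+ assert torch.equal(batch, torch.repeat_interleave(torch.arange(len(offset)), counts))
+ ```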
506
+ ## Model Zoo
507
+ ### 1. Backbones and Semantic Segmentation
508
+ #### SparseUNet
509
+
510
+ _Pointcept_ provides `SparseUNet` implemented by `SpConv` and `MinkowskiEngine`. The SpConv version is recommended since SpConv is easy to install and faster than MinkowskiEngine. Meanwhile, SpConv is also widely applied in outdoor perception.
511
+
512
+ - **SpConv (recommended)**
513
+
514
+ The SpConv version of `SparseUNet` in the codebase was fully rewritten from the `MinkowskiEngine` version; example running scripts are as follows:
515
+
516
+ ```bash
517
+ # ScanNet val
518
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-0-base -n semseg-spunet-v1m1-0-base
519
+ # ScanNet200
520
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-spunet-v1m1-0-base -n semseg-spunet-v1m1-0-base
521
+ # S3DIS
522
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-spunet-v1m1-0-base -n semseg-spunet-v1m1-0-base
523
+ # S3DIS (with normal)
524
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-spunet-v1m1-0-cn-base -n semseg-spunet-v1m1-0-cn-base
525
+ # SemanticKITTI
526
+ sh scripts/train.sh -g 4 -d semantic_kitti -c semseg-spunet-v1m1-0-base -n semseg-spunet-v1m1-0-base
527
+ # nuScenes
528
+ sh scripts/train.sh -g 4 -d nuscenes -c semseg-spunet-v1m1-0-base -n semseg-spunet-v1m1-0-base
529
+ # ModelNet40
530
+ sh scripts/train.sh -g 2 -d modelnet40 -c cls-spunet-v1m1-0-base -n cls-spunet-v1m1-0-base
531
+
532
+ # ScanNet Data Efficient
533
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-la20 -n semseg-spunet-v1m1-2-efficient-la20
534
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-la50 -n semseg-spunet-v1m1-2-efficient-la50
535
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-la100 -n semseg-spunet-v1m1-2-efficient-la100
536
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-la200 -n semseg-spunet-v1m1-2-efficient-la200
537
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-lr1 -n semseg-spunet-v1m1-2-efficient-lr1
538
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-lr5 -n semseg-spunet-v1m1-2-efficient-lr5
539
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-lr10 -n semseg-spunet-v1m1-2-efficient-lr10
540
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-2-efficient-lr20 -n semseg-spunet-v1m1-2-efficient-lr20
541
+
542
+ # Profile model run time
543
+ sh scripts/train.sh -g 4 -d scannet -c semseg-spunet-v1m1-0-enable-profiler -n semseg-spunet-v1m1-0-enable-profiler
544
+ ```
545
+
546
+ - **MinkowskiEngine**
547
+
548
+ The MinkowskiEngine version of `SparseUNet` in the codebase was modified from the original MinkowskiEngine repo; example running scripts are as follows:
549
+ 1. Install MinkowskiEngine, refer https://github.com/NVIDIA/MinkowskiEngine
550
+ 2. Training with the following example scripts:
551
+ ```bash
552
+ # Uncomment "# from .sparse_unet import *" in "pointcept/models/__init__.py"
553
+ # Uncomment "# from .mink_unet import *" in "pointcept/models/sparse_unet/__init__.py"
554
+ # ScanNet
555
+ sh scripts/train.sh -g 4 -d scannet -c semseg-minkunet34c-0-base -n semseg-minkunet34c-0-base
556
+ # ScanNet200
557
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-minkunet34c-0-base -n semseg-minkunet34c-0-base
558
+ # S3DIS
559
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-minkunet34c-0-base -n semseg-minkunet34c-0-base
560
+ # SemanticKITTI
561
+ sh scripts/train.sh -g 2 -d semantic_kitti -c semseg-minkunet34c-0-base -n semseg-minkunet34c-0-base
562
+ ```
563
+
564
+ #### OA-CNNs
565
+ Introducing Omni-Adaptive 3D CNNs (**OA-CNNs**), a family of networks that integrates a lightweight module to greatly enhance the adaptivity of sparse CNNs at minimal computational cost. Without any self-attention modules, **OA-CNNs** favorably surpass point transformers in terms of accuracy in both indoor and outdoor scenes, with much less latency and memory cost. Issues related to **OA-CNNs** can be directed to @Pbihao.
566
+ ```bash
567
+ # ScanNet
568
+ sh scripts/train.sh -g 4 -d scannet -c semseg-oacnns-v1m1-0-base -n semseg-oacnns-v1m1-0-base
569
+ ```
570
+
571
+ #### Point Transformers
572
+ - **PTv3**
573
+
574
+ [PTv3](https://arxiv.org/abs/2312.10035) is an efficient backbone model that achieves SOTA performance across indoor and outdoor scenarios. The full PTv3 relies on FlashAttention, which in turn requires CUDA 11.6 or above, so make sure your local Pointcept environment satisfies this requirement.
575
+
576
+ If you cannot upgrade your local environment to satisfy the requirement (CUDA >= 11.6), you can disable FlashAttention by setting the model parameter `enable_flash` to `false` and reducing `enc_patch_size` and `dec_patch_size` to a smaller value (e.g. 128).
577
+
578
+ FlashAttention forcibly disables RPE and reduces the attention precision to fp16. If you require these features, please disable `enable_flash` and adjust `enable_rpe`, `upcast_attention`, and `upcast_softmax`.
579
+
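+ A hedged sketch of such an override (the parameter names are the PTv3 model parameters mentioned above; the patch-size tuple lengths must follow the encoder/decoder stage counts of the base config you inherit from):
+
+ ```python
+ # Derived-config sketch: assumes the base config already defines the full PTv3 model dict.
+ model = dict(
+     backbone=dict(
+         enable_flash=False,      # drop the FlashAttention / CUDA >= 11.6 requirement
+         enable_rpe=True,         # RPE only takes effect when FlashAttention is disabled
+         upcast_attention=True,
+         upcast_softmax=True,
+         enc_patch_size=(128, 128, 128, 128, 128),
+         dec_patch_size=(128, 128, 128, 128),
+     ),
+ )
+ ```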
580
+ Detailed instructions and experiment records (containing weights) are available on the [project repository](https://github.com/Pointcept/PointTransformerV3). Example running scripts are as follows:
581
+ ```bash
582
+ # Scratched ScanNet
583
+ sh scripts/train.sh -g 4 -d scannet -c semseg-pt-v3m1-0-base -n semseg-pt-v3m1-0-base
584
+ # PPT joint training (ScanNet + Structured3D) and evaluate in ScanNet
585
+ sh scripts/train.sh -g 8 -d scannet -c semseg-pt-v3m1-1-ppt-extreme -n semseg-pt-v3m1-1-ppt-extreme
586
+
587
+ # Scratched ScanNet200
588
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-pt-v3m1-0-base -n semseg-pt-v3m1-0-base
589
+ # Fine-tuning from PPT joint training (ScanNet + Structured3D) with ScanNet200
590
+ # PTV3_PPT_WEIGHT_PATH: Path to model weight trained by PPT multi-dataset joint training
591
+ # e.g. exp/scannet/semseg-pt-v3m1-1-ppt-extreme/model/model_best.pth
592
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-pt-v3m1-1-ppt-ft -n semseg-pt-v3m1-1-ppt-ft -w ${PTV3_PPT_WEIGHT_PATH}
593
+
594
+ # Scratched ScanNet++
595
+ sh scripts/train.sh -g 4 -d scannetpp -c semseg-pt-v3m1-0-base -n semseg-pt-v3m1-0-base
596
+ # Scratched ScanNet++ test
597
+ sh scripts/train.sh -g 4 -d scannetpp -c semseg-pt-v3m1-1-submit -n semseg-pt-v3m1-1-submit
598
+
599
+
600
+ # Scratched S3DIS
601
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-pt-v3m1-0-base -n semseg-pt-v3m1-0-base
602
+ # an example for disbale flash_attention and enable rpe.
603
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-pt-v3m1-1-rpe -n semseg-pt-v3m1-0-rpe
604
+ # PPT joint training (ScanNet + S3DIS + Structured3D) and evaluate in ScanNet
605
+ sh scripts/train.sh -g 8 -d s3dis -c semseg-pt-v3m1-1-ppt-extreme -n semseg-pt-v3m1-1-ppt-extreme
606
+ # S3DIS 6-fold cross validation
607
+ # 1. The default configs are evaluated on Area_5, modify the "data.train.split", "data.val.split", and "data.test.split" to make the config evaluated on Area_1 ~ Area_6 respectively.
608
+ # 2. Train and evaluate the model on each split of areas and gather result files located in "exp/s3dis/EXP_NAME/result/Area_x.pth" in one single folder, noted as RECORD_FOLDER.
609
+ # 3. Run the following script to get S3DIS 6-fold cross validation performance:
610
+ export PYTHONPATH=./
611
+ python tools/test_s3dis_6fold.py --record_root ${RECORD_FOLDER}
612
+
613
+ # Scratched nuScenes
614
+ sh scripts/train.sh -g 4 -d nuscenes -c semseg-pt-v3m1-0-base -n semseg-pt-v3m1-0-base
615
+ # Scratched Waymo
616
+ sh scripts/train.sh -g 4 -d waymo -c semseg-pt-v3m1-0-base -n semseg-pt-v3m1-0-base
617
+
618
+ # More configs and exp records for PTv3 will be available soon.
619
+ ```
620
+
621
+ Indoor semantic segmentation
622
+ | Model | Benchmark | Additional Data | Num GPUs | Val mIoU | Config | Tensorboard | Exp Record |
623
+ | :---: | :---: |:---------------:| :---: | :---: | :---: | :---: | :---: |
624
+ | PTv3 | ScanNet | &cross; | 4 | 77.6% | [link](https://github.com/Pointcept/Pointcept/blob/main/configs/scannet/semseg-pt-v3m1-0-base.py) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tensorboard) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tree/main/scannet-semseg-pt-v3m1-0-base) |
625
+ | PTv3 + PPT | ScanNet | &check; | 8 | 78.5% | [link](https://github.com/Pointcept/Pointcept/blob/main/configs/scannet/semseg-pt-v3m1-1-ppt-extreme.py) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tensorboard) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tree/main/scannet-semseg-pt-v3m1-1-ppt-extreme) |
626
+ | PTv3 | ScanNet200 | &cross; | 4 | 35.3% | [link](https://github.com/Pointcept/Pointcept/blob/main/configs/scannet200/semseg-pt-v3m1-0-base.py) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tensorboard) |[link](https://huggingface.co/Pointcept/PointTransformerV3/tree/main/scannet200-semseg-pt-v3m1-0-base)|
627
+ | PTv3 + PPT | ScanNet200 | &check; (f.t.) | 4 | | | | |
628
+ | PTv3 | S3DIS (Area5) | &cross; | 4 | 73.6% | [link](https://github.com/Pointcept/Pointcept/blob/main/configs/s3dis/semseg-pt-v3m1-0-rpe.py) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tensorboard) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tree/main/s3dis-semseg-pt-v3m1-0-rpe) |
629
+ | PTv3 + PPT | S3DIS (Area5) | &check; | 8 | 75.4% | [link](https://github.com/Pointcept/Pointcept/blob/main/configs/s3dis/semseg-pt-v3m1-1-ppt-extreme.py) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tensorboard) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tree/main/s3dis-semseg-pt-v3m1-1-ppt-extreme) |
630
+
631
+ Outdoor semantic segmentation
632
+ | Model | Benchmark | Additional Data | Num GPUs | Val mIoU | Config | Tensorboard | Exp Record |
633
+ | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
634
+ | PTv3 | nuScenes | &cross; | 4 | 80.3 | [link](https://github.com/Pointcept/Pointcept/blob/main/configs/nuscenes/semseg-pt-v3m1-0-base.py) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tensorboard)|[link](https://huggingface.co/Pointcept/PointTransformerV3/tree/main/nuscenes-semseg-pt-v3m1-0-base) |
635
+ | PTv3 + PPT | nuScenes | &check; | 8 | | | | |
636
+ | PTv3 | SemanticKITTI | &cross; | 4 | | | | |
637
+ | PTv3 + PPT | SemanticKITTI | &check; | 8 | | | | |
638
+ | PTv3 | Waymo | &cross; | 4 | 71.2 | [link](https://github.com/Pointcept/Pointcept/blob/main/configs/waymo/semseg-pt-v3m1-0-base.py) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tensorboard) | [link](https://huggingface.co/Pointcept/PointTransformerV3/tree/main/waymo-semseg-pt-v3m1-0-base) (log only) |
639
+ | PTv3 + PPT | Waymo | &check; | 8 | | | | |
640
+
641
+ _**\*Released model weights are trained for v1.5.1; weights for v1.5.2 and later are still in progress.**_
642
+
643
+ - **PTv2 mode2**
644
+
645
+ The original PTv2 was trained on 4 * RTX A6000 (48G memory). Even with AMP enabled, the memory cost of the original PTv2 is slightly larger than 24G. Considering GPUs with 24G memory are much more accessible, I tuned PTv2 on the latest Pointcept and made it runnable on 4 * RTX 3090 machines.
646
+
647
+ `PTv2 Mode2` enables AMP and disables _Position Encoding Multiplier_ & _Grouped Linear_. During our further research, we found that precise coordinates are not necessary for point cloud understanding (replacing precise coordinates with grid coordinates does not affect performance; SparseUNet is another example). As for Grouped Linear, my implementation of Grouped Linear seems to cost more memory than the Linear layer provided by PyTorch. Benefiting from the codebase and better parameter tuning, we also alleviated the overfitting problem. The reproduced performance is even better than the results reported in our paper.
648
+
649
+ Example running scripts are as follows:
650
+
651
+ ```bash
652
+ # ptv2m2: PTv2 mode2, disable PEM & Grouped Linear, GPU memory cost < 24G (recommend)
653
+ # ScanNet
654
+ sh scripts/train.sh -g 4 -d scannet -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
655
+ sh scripts/train.sh -g 4 -d scannet -c semseg-pt-v2m2-3-lovasz -n semseg-pt-v2m2-3-lovasz
656
+
657
+ # ScanNet test
658
+ sh scripts/train.sh -g 4 -d scannet -c semseg-pt-v2m2-1-submit -n semseg-pt-v2m2-1-submit
659
+ # ScanNet200
660
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
661
+ # ScanNet++
662
+ sh scripts/train.sh -g 4 -d scannetpp -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
663
+ # ScanNet++ test
664
+ sh scripts/train.sh -g 4 -d scannetpp -c semseg-pt-v2m2-1-submit -n semseg-pt-v2m2-1-submit
665
+ # S3DIS
666
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
667
+ # SemanticKITTI
668
+ sh scripts/train.sh -g 4 -d semantic_kitti -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
669
+ # nuScenes
670
+ sh scripts/train.sh -g 4 -d nuscenes -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
671
+ ```
672
+
673
+ - **PTv2 mode1**
674
+
675
+ `PTv2 mode1` is the original PTv2 we reported in our paper, example running scripts are as follows:
676
+
677
+ ```bash
678
+ # ptv2m1: PTv2 mode1, Original PTv2, GPU memory cost > 24G
679
+ # ScanNet
680
+ sh scripts/train.sh -g 4 -d scannet -c semseg-pt-v2m1-0-base -n semseg-pt-v2m1-0-base
681
+ # ScanNet200
682
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-pt-v2m1-0-base -n semseg-pt-v2m1-0-base
683
+ # S3DIS
684
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-pt-v2m1-0-base -n semseg-pt-v2m1-0-base
685
+ ```
686
+
687
+ - **PTv1**
688
+
689
+ The original PTv1 is also available in our Pointcept codebase. I haven't run PTv1 for a long time, but I have ensured that the example running script works well.
690
+
691
+ ```bash
692
+ # ScanNet
693
+ sh scripts/train.sh -g 4 -d scannet -c semseg-pt-v1-0-base -n semseg-pt-v1-0-base
694
+ # ScanNet200
695
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-pt-v1-0-base -n semseg-pt-v1-0-base
696
+ # S3DIS
697
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-pt-v1-0-base -n semseg-pt-v1-0-base
698
+ ```
699
+
700
+
701
+ #### Stratified Transformer
702
+ 1. Additional requirements:
703
+ ```bash
704
+ pip install torch-points3d
705
+ # Fix dependency issue caused by installing torch-points3d
706
+ pip uninstall SharedArray
707
+ pip install SharedArray==3.2.1
708
+
709
+ cd libs/pointops2
710
+ python setup.py install
711
+ cd ../..
712
+ ```
713
+ 2. Uncomment `# from .stratified_transformer import *` in `pointcept/models/__init__.py`.
714
+ 3. Refer to [Optional Installation](installation) to install dependencies.
715
+ 4. Training with the following example scripts:
716
+ ```bash
717
+ # stv1m1: Stratified Transformer mode1, modified from the original Stratified Transformer code.
718
+ # stv1m2: Stratified Transformer mode2, my rewritten version (recommended).
719
+
720
+ # ScanNet
721
+ sh scripts/train.sh -g 4 -d scannet -c semseg-st-v1m2-0-refined -n semseg-st-v1m2-0-refined
722
+ sh scripts/train.sh -g 4 -d scannet -c semseg-st-v1m1-0-origin -n semseg-st-v1m1-0-origin
723
+ # ScanNet200
724
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-st-v1m2-0-refined -n semseg-st-v1m2-0-refined
725
+ # S3DIS
726
+ sh scripts/train.sh -g 4 -d s3dis -c semseg-st-v1m2-0-refined -n semseg-st-v1m2-0-refined
727
+ ```
728
+
729
+ #### SPVCNN
730
+ `SPVCNN` is the baseline model of [SPVNAS](https://github.com/mit-han-lab/spvnas) and a practical baseline for outdoor datasets.
731
+ 1. Install torchsparse:
732
+ ```bash
733
+ # refer https://github.com/mit-han-lab/torchsparse
734
+ # install method without sudo apt install
735
+ conda install google-sparsehash -c bioconda
736
+ export C_INCLUDE_PATH=${CONDA_PREFIX}/include:$C_INCLUDE_PATH
737
+ export CPLUS_INCLUDE_PATH=${CONDA_PREFIX}/include:$CPLUS_INCLUDE_PATH
738
+ pip install --upgrade git+https://github.com/mit-han-lab/torchsparse.git
739
+ ```
740
+ 2. Training with the following example scripts:
741
+ ```bash
742
+ # SemanticKITTI
743
+ sh scripts/train.sh -g 2 -d semantic_kitti -c semseg-spvcnn-v1m1-0-base -n semseg-spvcnn-v1m1-0-base
744
+ ```
745
+
746
+ #### OctFormer
747
+ OctFormer from _OctFormer: Octree-based Transformers for 3D Point Clouds_.
748
+ 1. Additional requirements:
749
+ ```bash
750
+ cd libs
751
+ git clone https://github.com/octree-nn/dwconv.git
752
+ pip install ./dwconv
753
+ pip install ocnn
754
+ ```
755
+ 2. Uncomment `# from .octformer import *` in `pointcept/models/__init__.py`.
756
+ 3. Training with the following example scripts:
757
+ ```bash
758
+ # ScanNet
759
+ sh scripts/train.sh -g 4 -d scannet -c semseg-octformer-v1m1-0-base -n semseg-octformer-v1m1-0-base
760
+ ```
761
+
762
+ #### Swin3D
763
+ Swin3D from _Swin3D: A Pretrained Transformer Backbone for 3D Indoor Scene Understanding_.
764
+ 1. Additional requirements:
765
+ ```bash
766
+ # 1. Install MinkEngine v0.5.4, follow readme in https://github.com/NVIDIA/MinkowskiEngine;
767
+ # 2. Install Swin3D, mainly for cuda operation:
768
+ cd libs
769
+ git clone https://github.com/microsoft/Swin3D.git
770
+ cd Swin3D
771
+ pip install ./
772
+ ```
773
+ 2. Uncomment `# from .swin3d import *` in `pointcept/models/__init__.py`.
774
+ 3. Pre-training with the following example scripts (for Structured3D preprocessing, refer [here](#structured3d)):
775
+ ```bash
776
+ # Structured3D + Swin-S
777
+ sh scripts/train.sh -g 4 -d structured3d -c semseg-swin3d-v1m1-0-small -n semseg-swin3d-v1m1-0-small
778
+ # Structured3D + Swin-L
779
+ sh scripts/train.sh -g 4 -d structured3d -c semseg-swin3d-v1m1-1-large -n semseg-swin3d-v1m1-1-large
780
+
781
+ # Addition
782
+ # Structured3D + SpUNet
783
+ sh scripts/train.sh -g 4 -d structured3d -c semseg-spunet-v1m1-0-base -n semseg-spunet-v1m1-0-base
784
+ # Structured3D + PTv2
785
+ sh scripts/train.sh -g 4 -d structured3d -c semseg-pt-v2m2-0-base -n semseg-pt-v2m2-0-base
786
+ ```
787
+ 4. Fine-tuning with the following example scripts:
788
+ ```bash
789
+ # ScanNet + Swin-S
790
+ sh scripts/train.sh -g 4 -d scannet -w exp/structured3d/semseg-swin3d-v1m1-1-large/model/model_last.pth -c semseg-swin3d-v1m1-0-small -n semseg-swin3d-v1m1-0-small
791
+ # ScanNet + Swin-L
792
+ sh scripts/train.sh -g 4 -d scannet -w exp/structured3d/semseg-swin3d-v1m1-1-large/model/model_last.pth -c semseg-swin3d-v1m1-1-large -n semseg-swin3d-v1m1-1-large
793
+
794
+ # S3DIS + Swin-S (here we provide a config supporting the S3DIS normal vector)
795
+ sh scripts/train.sh -g 4 -d s3dis -w exp/structured3d/semseg-swin3d-v1m1-1-large/model/model_last.pth -c semseg-swin3d-v1m1-0-small -n semseg-swin3d-v1m1-0-small
796
+ # S3DIS + Swin-L (here we provide a config supporting the S3DIS normal vector)
797
+ sh scripts/train.sh -g 4 -d s3dis -w exp/structured3d/semseg-swin3d-v1m1-1-large/model/model_last.pth -c semseg-swin3d-v1m1-1-large -n semseg-swin3d-v1m1-1-large
798
+ ```
799
+
800
+ #### Context-Aware Classifier
801
+ `Context-Aware Classifier` is a segmentor that can further boost the performance of each backbone, serving as a replacement for the `Default Segmentor`; a conceptual sketch of the idea follows the scripts below. Train with the following example scripts:
802
+ ```bash
803
+ # ScanNet
804
+ sh scripts/train.sh -g 4 -d scannet -c semseg-cac-v1m1-0-spunet-base -n semseg-cac-v1m1-0-spunet-base
805
+ sh scripts/train.sh -g 4 -d scannet -c semseg-cac-v1m1-1-spunet-lovasz -n semseg-cac-v1m1-1-spunet-lovasz
806
+ sh scripts/train.sh -g 4 -d scannet -c semseg-cac-v1m1-2-ptv2-lovasz -n semseg-cac-v1m1-2-ptv2-lovasz
807
+
808
+ # ScanNet200
809
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-cac-v1m1-0-spunet-base -n semseg-cac-v1m1-0-spunet-base
810
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-cac-v1m1-1-spunet-lovasz -n semseg-cac-v1m1-1-spunet-lovasz
811
+ sh scripts/train.sh -g 4 -d scannet200 -c semseg-cac-v1m1-2-ptv2-lovasz -n semseg-cac-v1m1-2-ptv2-lovasz
812
+ ```
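+
+ For intuition, the following is a rough, framework-agnostic sketch of the general context-aware classifier idea (refining a static classifier with class centers pooled from the current sample); it is only an illustration, not the Pointcept `CAC` module.
+
+ ```python
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class ContextAwareHead(nn.Module):
+     """Sketch: fuse static classifier weights with per-sample class centers."""
+
+     def __init__(self, channels: int, num_classes: int):
+         super().__init__()
+         self.static_cls = nn.Linear(channels, num_classes)
+         self.fuse = nn.Linear(2 * channels, channels)
+
+     def forward(self, feat: torch.Tensor) -> torch.Tensor:
+         coarse = self.static_cls(feat)            # (N, K) coarse logits
+         weight = F.softmax(coarse, dim=0)         # soft assignment of points to classes
+         centers = weight.t() @ feat               # (K, C) contextual class centers
+         dynamic = self.fuse(torch.cat([centers, self.static_cls.weight], dim=-1))
+         return feat @ dynamic.t()                 # (N, K) context-aware logits
+
+ head = ContextAwareHead(channels=96, num_classes=20)
+ logits = head(torch.randn(8192, 96))              # toy per-point features
+ ```
+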
813
+
814
+
815
+ ### 2. Instance Segmentation
816
+ #### PointGroup
817
+ [PointGroup](https://github.com/dvlab-research/PointGroup) is a baseline framework for point cloud instance segmentation.
818
+ 1. Additional requirements:
819
+ ```bash
820
+ conda install -c bioconda google-sparsehash
821
+ cd libs/pointgroup_ops
822
+ python setup.py install --include_dirs=${CONDA_PREFIX}/include
823
+ cd ../..
824
+ ```
825
+ 2. Uncomment `# from .point_group import *` in `pointcept/models/__init__.py`.
826
+ 3. Training with the following example scripts:
827
+ ```bash
828
+ # ScanNet
829
+ sh scripts/train.sh -g 4 -d scannet -c insseg-pointgroup-v1m1-0-spunet-base -n insseg-pointgroup-v1m1-0-spunet-base
830
+ # S3DIS
831
+ sh scripts/train.sh -g 4 -d s3dis -c insseg-pointgroup-v1m1-0-spunet-base -n insseg-pointgroup-v1m1-0-spunet-base
832
+ ```
833
+
834
+ ### 3. Pre-training
835
+ #### Masked Scene Contrast (MSC)
836
+ 1. Pre-training with the following example scripts:
837
+ ```bash
838
+ # ScanNet
839
+ sh scripts/train.sh -g 8 -d scannet -c pretrain-msc-v1m1-0-spunet-base -n pretrain-msc-v1m1-0-spunet-base
840
+ ```
841
+
842
+ 2. Fine-tuning with the following example scripts:
843
+ Enable PointGroup ([here](#pointgroup)) before fine-tuning on the instance segmentation task.
844
+ ```bash
845
+ # ScanNet20 Semantic Segmentation
846
+ sh scripts/train.sh -g 8 -d scannet -w exp/scannet/pretrain-msc-v1m1-0-spunet-base/model/model_last.pth -c semseg-spunet-v1m1-4-ft -n semseg-msc-v1m1-0f-spunet-base
847
+ # ScanNet20 Instance Segmentation (enable PointGroup before running the script)
848
+ sh scripts/train.sh -g 4 -d scannet -w exp/scannet/pretrain-msc-v1m1-0-spunet-base/model/model_last.pth -c insseg-pointgroup-v1m1-0-spunet-base -n insseg-msc-v1m1-0f-pointgroup-spunet-base
849
+ ```
850
+ 3. Example log and weight: [[Pretrain](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wuxy_connect_hku_hk/EYvNV4XUJ_5Mlk-g15RelN4BW_P8lVBfC_zhjC_BlBDARg?e=UoGFWH)] [[Semseg](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wuxy_connect_hku_hk/EQkDiv5xkOFKgCpGiGtAlLwBon7i8W6my3TIbGVxuiTttQ?e=tQFnbr)]
851
+
852
+ #### Point Prompt Training (PPT)
853
+ PPT presents a multi-dataset pre-training framework that is compatible with various existing pre-training frameworks and backbones.
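+
+ For intuition, the snippet below is a rough sketch of the decoupled-normalization idea behind the `conditions=(...)` / `norm_decouple=True` options that appear in the PPT configs; it is only an illustration, not the actual `PPT-v1m1` implementation.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class ConditionalBatchNorm(nn.Module):
+     """Decoupled norm: one set of statistics/affine params per source dataset."""
+
+     def __init__(self, num_features, conditions=("SemanticKITTI", "nuScenes", "Waymo")):
+         super().__init__()
+         self.norms = nn.ModuleDict({c: nn.BatchNorm1d(num_features) for c in conditions})
+
+     def forward(self, feat: torch.Tensor, condition: str) -> torch.Tensor:
+         # Route the batch through the norm layer matching its source dataset,
+         # so the statistics of different datasets do not interfere with each other.
+         return self.norms[condition](feat)
+
+ norm = ConditionalBatchNorm(96)
+ feat = torch.randn(4096, 96)                 # per-point features of one batch
+ out = norm(feat, condition="nuScenes")       # condition comes from the data dict
+ ```
+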
854
+ 1. PPT supervised joint training with the following example scripts:
855
+ ```bash
856
+ # ScanNet + Structured3D, validate on ScanNet (S3DIS might cause long data loading time; leave it out for a quick validation), >= 3090 * 8
857
+ sh scripts/train.sh -g 8 -d scannet -c semseg-ppt-v1m1-0-sc-st-spunet -n semseg-ppt-v1m1-0-sc-st-spunet
858
+ sh scripts/train.sh -g 8 -d scannet -c semseg-ppt-v1m1-1-sc-st-spunet-submit -n semseg-ppt-v1m1-1-sc-st-spunet-submit
859
+ # ScanNet + S3DIS + Structured3D, validate on S3DIS (>= A100 * 8)
860
+ sh scripts/train.sh -g 8 -d s3dis -c semseg-ppt-v1m1-0-s3-sc-st-spunet -n semseg-ppt-v1m1-0-s3-sc-st-spunet
861
+ # SemanticKITTI + nuScenes + Waymo, validate on SemanticKITTI (bs12 >= 3090 * 4; bs24 >= 3090 * 8, v1m1-0 is still being tuned)
862
+ sh scripts/train.sh -g 4 -d semantic_kitti -c semseg-ppt-v1m1-0-nu-sk-wa-spunet -n semseg-ppt-v1m1-0-nu-sk-wa-spunet
863
+ sh scripts/train.sh -g 4 -d semantic_kitti -c semseg-ppt-v1m2-0-sk-nu-wa-spunet -n semseg-ppt-v1m2-0-sk-nu-wa-spunet
864
+ sh scripts/train.sh -g 4 -d semantic_kitti -c semseg-ppt-v1m2-1-sk-nu-wa-spunet-submit -n semseg-ppt-v1m2-1-sk-nu-wa-spunet-submit
865
+ # SemanticKITTI + nuScenes + Waymo, validate on nuScenes (bs12 >= 3090 * 4; bs24 >= 3090 * 8, v1m1-0 is still being tuned)
866
+ sh scripts/train.sh -g 4 -d nuscenes -c semseg-ppt-v1m1-0-nu-sk-wa-spunet -n semseg-ppt-v1m1-0-nu-sk-wa-spunet
867
+ sh scripts/train.sh -g 4 -d nuscenes -c semseg-ppt-v1m2-0-nu-sk-wa-spunet -n semseg-ppt-v1m2-0-nu-sk-wa-spunet
868
+ sh scripts/train.sh -g 4 -d nuscenes -c semseg-ppt-v1m2-1-nu-sk-wa-spunet-submit -n semseg-ppt-v1m2-1-nu-sk-wa-spunet-submit
869
+ ```
870
+
871
+ #### PointContrast
872
+ 1. Preprocess and link the ScanNet-Pair dataset (pair-wise matching over raw ScanNet RGB-D frames, ~1.5T):
873
+ ```bash
874
+ # RAW_SCANNET_DIR: the directory of downloaded ScanNet v2 raw dataset.
875
+ # PROCESSED_SCANNET_PAIR_DIR: the directory of processed ScanNet pair dataset (output dir).
876
+ python pointcept/datasets/preprocessing/scannet/scannet_pair/preprocess.py --dataset_root ${RAW_SCANNET_DIR} --output_root ${PROCESSED_SCANNET_PAIR_DIR}
877
+ ln -s ${PROCESSED_SCANNET_PAIR_DIR} ${CODEBASE_DIR}/data/scannet
878
+ ```
879
+ 2. Pre-training with the following example scripts:
880
+ ```bash
881
+ # ScanNet
882
+ sh scripts/train.sh -g 8 -d scannet -c pretrain-msc-v1m1-1-spunet-pointcontrast -n pretrain-msc-v1m1-1-spunet-pointcontrast
883
+ ```
884
+ 3. For fine-tuning, refer to [MSC](#masked-scene-contrast-msc).
885
+
886
+ #### Contrastive Scene Contexts
887
+ 1. Preprocess and link the ScanNet-Pair dataset (refer to [PointContrast](#pointcontrast)).
888
+ 2. Pre-training with the following example scripts:
889
+ ```bash
890
+ # ScanNet
891
+ sh scripts/train.sh -g 8 -d scannet -c pretrain-msc-v1m2-0-spunet-csc -n pretrain-msc-v1m2-0-spunet-csc
892
+ ```
893
+ 3. For fine-tuning, refer to [MSC](#masked-scene-contrast-msc).
894
+
895
+ ## Acknowledgement
896
+ _Pointcept_ is designed by [Xiaoyang](https://xywu.me/), named by [Yixing](https://github.com/yxlao), and the logo is created by [Yuechen](https://julianjuaner.github.io/). It is derived from [Hengshuang](https://hszhao.github.io/)'s [Semseg](https://github.com/hszhao/semseg) and inspired by several repos, e.g., [MinkowskiEngine](https://github.com/NVIDIA/MinkowskiEngine), [pointnet2](https://github.com/charlesq34/pointnet2), [mmcv](https://github.com/open-mmlab/mmcv/tree/master/mmcv), and [Detectron2](https://github.com/facebookresearch/detectron2).
submodules/PointTransformerV3/Pointcept/configs/_base_/dataset/scannetpp.py ADDED
@@ -0,0 +1,104 @@
1
+ data = dict(
2
+ names=[
3
+ "wall",
4
+ "ceiling",
5
+ "floor",
6
+ "table",
7
+ "door",
8
+ "ceiling lamp",
9
+ "cabinet",
10
+ "blinds",
11
+ "curtain",
12
+ "chair",
13
+ "storage cabinet",
14
+ "office chair",
15
+ "bookshelf",
16
+ "whiteboard",
17
+ "window",
18
+ "box",
19
+ "window frame",
20
+ "monitor",
21
+ "shelf",
22
+ "doorframe",
23
+ "pipe",
24
+ "heater",
25
+ "kitchen cabinet",
26
+ "sofa",
27
+ "windowsill",
28
+ "bed",
29
+ "shower wall",
30
+ "trash can",
31
+ "book",
32
+ "plant",
33
+ "blanket",
34
+ "tv",
35
+ "computer tower",
36
+ "kitchen counter",
37
+ "refrigerator",
38
+ "jacket",
39
+ "electrical duct",
40
+ "sink",
41
+ "bag",
42
+ "picture",
43
+ "pillow",
44
+ "towel",
45
+ "suitcase",
46
+ "backpack",
47
+ "crate",
48
+ "keyboard",
49
+ "rack",
50
+ "toilet",
51
+ "paper",
52
+ "printer",
53
+ "poster",
54
+ "painting",
55
+ "microwave",
56
+ "board",
57
+ "shoes",
58
+ "socket",
59
+ "bottle",
60
+ "bucket",
61
+ "cushion",
62
+ "basket",
63
+ "shoe rack",
64
+ "telephone",
65
+ "file folder",
66
+ "cloth",
67
+ "blind rail",
68
+ "laptop",
69
+ "plant pot",
70
+ "exhaust fan",
71
+ "cup",
72
+ "coat hanger",
73
+ "light switch",
74
+ "speaker",
75
+ "table lamp",
76
+ "air vent",
77
+ "clothes hanger",
78
+ "kettle",
79
+ "smoke detector",
80
+ "container",
81
+ "power strip",
82
+ "slippers",
83
+ "paper bag",
84
+ "mouse",
85
+ "cutting board",
86
+ "toilet paper",
87
+ "paper towel",
88
+ "pot",
89
+ "clock",
90
+ "pan",
91
+ "tap",
92
+ "jar",
93
+ "soap dispenser",
94
+ "binder",
95
+ "bowl",
96
+ "tissue box",
97
+ "whiteboard eraser",
98
+ "toilet brush",
99
+ "spray bottle",
100
+ "headphones",
101
+ "stapler",
102
+ "marker",
103
+ ]
104
+ )
submodules/PointTransformerV3/Pointcept/configs/_base_/default_runtime.py ADDED
@@ -0,0 +1,39 @@
1
+ weight = None # path to model weight
2
+ resume = False # whether to resume training process
3
+ evaluate = True # evaluate after each epoch training process
4
+ test_only = False # test process
5
+
6
+ seed = None # train process will init a random seed and record
7
+ save_path = "exp/default"
8
+ num_worker = 16 # total worker in all gpu
9
+ batch_size = 16 # total batch size in all gpu
10
+ batch_size_val = None # auto adapt to bs 1 for each gpu
11
+ batch_size_test = None # auto adapt to bs 1 for each gpu
12
+ epoch = 100 # total epoch, data loop = epoch // eval_epoch
13
+ eval_epoch = 100 # scheduled total eval & checkpoint epochs
14
+ clip_grad = None # disable with None, enable with a float
15
+
16
+ sync_bn = False
17
+ enable_amp = False
18
+ empty_cache = False
19
+ empty_cache_per_epoch = False
20
+ find_unused_parameters = False
21
+
22
+ mix_prob = 0
23
+ param_dicts = None # example: param_dicts = [dict(keyword="block", lr_scale=0.1)]
24
+
25
+ # hook
26
+ hooks = [
27
+ dict(type="CheckpointLoader"),
28
+ dict(type="IterationTimer", warmup_iter=2),
29
+ dict(type="InformationWriter"),
30
+ dict(type="SemSegEvaluator"),
31
+ dict(type="CheckpointSaver", save_freq=None),
32
+ dict(type="PreciseEvaluator", test_last=False),
33
+ ]
34
+
35
+ # Trainer
36
+ train = dict(type="DefaultTrainer")
37
+
38
+ # Tester
39
+ test = dict(type="SemSegTester", verbose=True)
submodules/PointTransformerV3/Pointcept/configs/matterport3d/semseg-pt-v3m1-0-base.py ADDED
@@ -0,0 +1,313 @@
1
+ _base_ = ["../_base_/default_runtime.py"]
2
+
3
+ # misc custom setting
4
+ batch_size = 12 # bs: total bs in all gpus
5
+ num_worker = 24
6
+ mix_prob = 0.8
7
+ empty_cache = False
8
+ enable_amp = True
9
+
10
+ # model settings
11
+ model = dict(
12
+ type="DefaultSegmentorV2",
13
+ num_classes=21,
14
+ backbone_out_channels=64,
15
+ backbone=dict(
16
+ type="PT-v3m1",
17
+ in_channels=6,
18
+ order=("z", "z-trans", "hilbert", "hilbert-trans"),
19
+ stride=(2, 2, 2, 2),
20
+ enc_depths=(2, 2, 2, 6, 2),
21
+ enc_channels=(32, 64, 128, 256, 512),
22
+ enc_num_head=(2, 4, 8, 16, 32),
23
+ enc_patch_size=(1024, 1024, 1024, 1024, 1024),
24
+ dec_depths=(2, 2, 2, 2),
25
+ dec_channels=(64, 64, 128, 256),
26
+ dec_num_head=(4, 4, 8, 16),
27
+ dec_patch_size=(1024, 1024, 1024, 1024),
28
+ mlp_ratio=4,
29
+ qkv_bias=True,
30
+ qk_scale=None,
31
+ attn_drop=0.0,
32
+ proj_drop=0.0,
33
+ drop_path=0.3,
34
+ shuffle_orders=True,
35
+ pre_norm=True,
36
+ enable_rpe=False,
37
+ enable_flash=True,
38
+ upcast_attention=False,
39
+ upcast_softmax=False,
40
+ cls_mode=False,
41
+ pdnorm_bn=False,
42
+ pdnorm_ln=False,
43
+ pdnorm_decouple=True,
44
+ pdnorm_adaptive=False,
45
+ pdnorm_affine=True,
46
+ pdnorm_conditions=("ScanNet", "S3DIS", "Structured3D"),
47
+ ),
48
+ criteria=[
49
+ dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=-1),
50
+ dict(type="LovaszLoss", mode="multiclass", loss_weight=1.0, ignore_index=-1),
51
+ ],
52
+ )
53
+
54
+ # scheduler settings
55
+ epoch = 800
56
+ optimizer = dict(type="AdamW", lr=0.006, weight_decay=0.05)
57
+ scheduler = dict(
58
+ type="OneCycleLR",
59
+ max_lr=[0.006, 0.0006],
60
+ pct_start=0.05,
61
+ anneal_strategy="cos",
62
+ div_factor=10.0,
63
+ final_div_factor=1000.0,
64
+ )
65
+ param_dicts = [dict(keyword="block", lr=0.0006)]
66
+
67
+ # dataset settings
68
+ dataset_type = "DefaultDataset"
69
+ data_root = "data/matterport3d"
70
+
71
+ data = dict(
72
+ num_classes=21,
73
+ ignore_index=-1,
74
+ names=(
75
+ "wall",
76
+ "floor",
77
+ "cabinet",
78
+ "bed",
79
+ "chair",
80
+ "sofa",
81
+ "table",
82
+ "door",
83
+ "window",
84
+ "bookshelf",
85
+ "picture",
86
+ "counter",
87
+ "desk",
88
+ "curtain",
89
+ "refrigerator",
90
+ "shower curtain",
91
+ "toilet",
92
+ "sink",
93
+ "bathtub",
94
+ "other",
95
+ "ceiling",
96
+ ),
97
+ train=dict(
98
+ type=dataset_type,
99
+ split="train",
100
+ data_root=data_root,
101
+ transform=[
102
+ dict(type="CenterShift", apply_z=True),
103
+ dict(
104
+ type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2
105
+ ),
106
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
107
+ dict(type="RandomRotate", angle=[-1, 1], axis="z", center=[0, 0, 0], p=0.5),
108
+ dict(type="RandomRotate", angle=[-1 / 64, 1 / 64], axis="x", p=0.5),
109
+ dict(type="RandomRotate", angle=[-1 / 64, 1 / 64], axis="y", p=0.5),
110
+ dict(type="RandomScale", scale=[0.9, 1.1]),
111
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
112
+ dict(type="RandomFlip", p=0.5),
113
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
114
+ dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
115
+ dict(type="ChromaticAutoContrast", p=0.2, blend_factor=None),
116
+ dict(type="ChromaticTranslation", p=0.95, ratio=0.05),
117
+ dict(type="ChromaticJitter", p=0.95, std=0.05),
118
+ # dict(type="HueSaturationTranslation", hue_max=0.2, saturation_max=0.2),
119
+ # dict(type="RandomColorDrop", p=0.2, color_augment=0.0),
120
+ dict(
121
+ type="GridSample",
122
+ grid_size=0.02,
123
+ hash_type="fnv",
124
+ mode="train",
125
+ return_grid_coord=True,
126
+ ),
127
+ dict(type="SphereCrop", point_max=102400, mode="random"),
128
+ dict(type="CenterShift", apply_z=False),
129
+ dict(type="NormalizeColor"),
130
+ # dict(type="ShufflePoint"),
131
+ dict(type="ToTensor"),
132
+ dict(
133
+ type="Collect",
134
+ keys=("coord", "grid_coord", "segment"),
135
+ feat_keys=("color", "normal"),
136
+ ),
137
+ ],
138
+ test_mode=False,
139
+ ),
140
+ val=dict(
141
+ type=dataset_type,
142
+ split="val",
143
+ data_root=data_root,
144
+ transform=[
145
+ dict(type="CenterShift", apply_z=True),
146
+ dict(
147
+ type="GridSample",
148
+ grid_size=0.02,
149
+ hash_type="fnv",
150
+ mode="train",
151
+ return_grid_coord=True,
152
+ ),
153
+ dict(type="CenterShift", apply_z=False),
154
+ dict(type="NormalizeColor"),
155
+ dict(type="ToTensor"),
156
+ dict(
157
+ type="Collect",
158
+ keys=("coord", "grid_coord", "segment"),
159
+ feat_keys=("color", "normal"),
160
+ ),
161
+ ],
162
+ test_mode=False,
163
+ ),
164
+ test=dict(
165
+ type=dataset_type,
166
+ split="val",
167
+ data_root=data_root,
168
+ transform=[
169
+ dict(type="CenterShift", apply_z=True),
170
+ dict(type="NormalizeColor"),
171
+ ],
172
+ test_mode=True,
173
+ test_cfg=dict(
174
+ voxelize=dict(
175
+ type="GridSample",
176
+ grid_size=0.02,
177
+ hash_type="fnv",
178
+ mode="test",
179
+ keys=("coord", "color", "normal"),
180
+ return_grid_coord=True,
181
+ ),
182
+ crop=None,
183
+ post_transform=[
184
+ dict(type="CenterShift", apply_z=False),
185
+ dict(type="ToTensor"),
186
+ dict(
187
+ type="Collect",
188
+ keys=("coord", "grid_coord", "index"),
189
+ feat_keys=("color", "normal"),
190
+ ),
191
+ ],
192
+ aug_transform=[
193
+ [
194
+ dict(
195
+ type="RandomRotateTargetAngle",
196
+ angle=[0],
197
+ axis="z",
198
+ center=[0, 0, 0],
199
+ p=1,
200
+ )
201
+ ],
202
+ [
203
+ dict(
204
+ type="RandomRotateTargetAngle",
205
+ angle=[1 / 2],
206
+ axis="z",
207
+ center=[0, 0, 0],
208
+ p=1,
209
+ )
210
+ ],
211
+ [
212
+ dict(
213
+ type="RandomRotateTargetAngle",
214
+ angle=[1],
215
+ axis="z",
216
+ center=[0, 0, 0],
217
+ p=1,
218
+ )
219
+ ],
220
+ [
221
+ dict(
222
+ type="RandomRotateTargetAngle",
223
+ angle=[3 / 2],
224
+ axis="z",
225
+ center=[0, 0, 0],
226
+ p=1,
227
+ )
228
+ ],
229
+ [
230
+ dict(
231
+ type="RandomRotateTargetAngle",
232
+ angle=[0],
233
+ axis="z",
234
+ center=[0, 0, 0],
235
+ p=1,
236
+ ),
237
+ dict(type="RandomScale", scale=[0.95, 0.95]),
238
+ ],
239
+ [
240
+ dict(
241
+ type="RandomRotateTargetAngle",
242
+ angle=[1 / 2],
243
+ axis="z",
244
+ center=[0, 0, 0],
245
+ p=1,
246
+ ),
247
+ dict(type="RandomScale", scale=[0.95, 0.95]),
248
+ ],
249
+ [
250
+ dict(
251
+ type="RandomRotateTargetAngle",
252
+ angle=[1],
253
+ axis="z",
254
+ center=[0, 0, 0],
255
+ p=1,
256
+ ),
257
+ dict(type="RandomScale", scale=[0.95, 0.95]),
258
+ ],
259
+ [
260
+ dict(
261
+ type="RandomRotateTargetAngle",
262
+ angle=[3 / 2],
263
+ axis="z",
264
+ center=[0, 0, 0],
265
+ p=1,
266
+ ),
267
+ dict(type="RandomScale", scale=[0.95, 0.95]),
268
+ ],
269
+ [
270
+ dict(
271
+ type="RandomRotateTargetAngle",
272
+ angle=[0],
273
+ axis="z",
274
+ center=[0, 0, 0],
275
+ p=1,
276
+ ),
277
+ dict(type="RandomScale", scale=[1.05, 1.05]),
278
+ ],
279
+ [
280
+ dict(
281
+ type="RandomRotateTargetAngle",
282
+ angle=[1 / 2],
283
+ axis="z",
284
+ center=[0, 0, 0],
285
+ p=1,
286
+ ),
287
+ dict(type="RandomScale", scale=[1.05, 1.05]),
288
+ ],
289
+ [
290
+ dict(
291
+ type="RandomRotateTargetAngle",
292
+ angle=[1],
293
+ axis="z",
294
+ center=[0, 0, 0],
295
+ p=1,
296
+ ),
297
+ dict(type="RandomScale", scale=[1.05, 1.05]),
298
+ ],
299
+ [
300
+ dict(
301
+ type="RandomRotateTargetAngle",
302
+ angle=[3 / 2],
303
+ axis="z",
304
+ center=[0, 0, 0],
305
+ p=1,
306
+ ),
307
+ dict(type="RandomScale", scale=[1.05, 1.05]),
308
+ ],
309
+ [dict(type="RandomFlip", p=1)],
310
+ ],
311
+ ),
312
+ ),
313
+ )
submodules/PointTransformerV3/Pointcept/configs/matterport3d/semseg-spunet-v1m1-0-base.py ADDED
@@ -0,0 +1,282 @@
1
+ _base_ = ["../_base_/default_runtime.py"]
2
+
3
+ # misc custom setting
4
+ batch_size = 12 # bs: total bs in all gpus
5
+ mix_prob = 0.8
6
+ empty_cache = False
7
+ enable_amp = True
8
+
9
+ # model settings
10
+ model = dict(
11
+ type="DefaultSegmentor",
12
+ backbone=dict(
13
+ type="SpUNet-v1m1",
14
+ in_channels=6,
15
+ num_classes=21,
16
+ channels=(32, 64, 128, 256, 256, 128, 96, 96),
17
+ layers=(2, 3, 4, 6, 2, 2, 2, 2),
18
+ ),
19
+ criteria=[dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=-1)],
20
+ )
21
+
22
+
23
+ # scheduler settings
24
+ epoch = 800
25
+ optimizer = dict(type="SGD", lr=0.05, momentum=0.9, weight_decay=0.0001, nesterov=True)
26
+ scheduler = dict(
27
+ type="OneCycleLR",
28
+ max_lr=optimizer["lr"],
29
+ pct_start=0.05,
30
+ anneal_strategy="cos",
31
+ div_factor=10.0,
32
+ final_div_factor=10000.0,
33
+ )
34
+
35
+ # dataset settings
36
+ dataset_type = "DefaultDataset"
37
+ data_root = "data/matterport3d"
38
+
39
+ data = dict(
40
+ num_classes=21,
41
+ ignore_index=-1,
42
+ names=(
43
+ "wall",
44
+ "floor",
45
+ "cabinet",
46
+ "bed",
47
+ "chair",
48
+ "sofa",
49
+ "table",
50
+ "door",
51
+ "window",
52
+ "bookshelf",
53
+ "picture",
54
+ "counter",
55
+ "desk",
56
+ "curtain",
57
+ "refrigerator",
58
+ "shower curtain",
59
+ "toilet",
60
+ "sink",
61
+ "bathtub",
62
+ "other",
63
+ "ceiling",
64
+ ),
65
+ train=dict(
66
+ type=dataset_type,
67
+ split="train",
68
+ data_root=data_root,
69
+ transform=[
70
+ dict(type="CenterShift", apply_z=True),
71
+ dict(
72
+ type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2
73
+ ),
74
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
75
+ dict(type="RandomRotate", angle=[-1, 1], axis="z", center=[0, 0, 0], p=0.5),
76
+ dict(type="RandomRotate", angle=[-1 / 64, 1 / 64], axis="x", p=0.5),
77
+ dict(type="RandomRotate", angle=[-1 / 64, 1 / 64], axis="y", p=0.5),
78
+ dict(type="RandomScale", scale=[0.9, 1.1]),
79
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
80
+ dict(type="RandomFlip", p=0.5),
81
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
82
+ dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
83
+ dict(type="ChromaticAutoContrast", p=0.2, blend_factor=None),
84
+ dict(type="ChromaticTranslation", p=0.95, ratio=0.05),
85
+ dict(type="ChromaticJitter", p=0.95, std=0.05),
86
+ # dict(type="HueSaturationTranslation", hue_max=0.2, saturation_max=0.2),
87
+ # dict(type="RandomColorDrop", p=0.2, color_augment=0.0),
88
+ dict(
89
+ type="GridSample",
90
+ grid_size=0.02,
91
+ hash_type="fnv",
92
+ mode="train",
93
+ return_grid_coord=True,
94
+ ),
95
+ dict(type="SphereCrop", point_max=100000, mode="random"),
96
+ dict(type="CenterShift", apply_z=False),
97
+ dict(type="NormalizeColor"),
98
+ dict(type="ShufflePoint"),
99
+ dict(type="ToTensor"),
100
+ dict(
101
+ type="Collect",
102
+ keys=("coord", "grid_coord", "segment"),
103
+ feat_keys=("color", "normal"),
104
+ ),
105
+ ],
106
+ test_mode=False,
107
+ ),
108
+ val=dict(
109
+ type=dataset_type,
110
+ split="val",
111
+ data_root=data_root,
112
+ transform=[
113
+ dict(type="CenterShift", apply_z=True),
114
+ dict(
115
+ type="GridSample",
116
+ grid_size=0.02,
117
+ hash_type="fnv",
118
+ mode="train",
119
+ return_grid_coord=True,
120
+ ),
121
+ # dict(type="SphereCrop", point_max=1000000, mode="center"),
122
+ dict(type="CenterShift", apply_z=False),
123
+ dict(type="NormalizeColor"),
124
+ dict(type="ToTensor"),
125
+ dict(
126
+ type="Collect",
127
+ keys=("coord", "grid_coord", "segment"),
128
+ feat_keys=("color", "normal"),
129
+ ),
130
+ ],
131
+ test_mode=False,
132
+ ),
133
+ test=dict(
134
+ type=dataset_type,
135
+ split="val",
136
+ data_root=data_root,
137
+ transform=[
138
+ dict(type="CenterShift", apply_z=True),
139
+ dict(type="NormalizeColor"),
140
+ ],
141
+ test_mode=True,
142
+ test_cfg=dict(
143
+ voxelize=dict(
144
+ type="GridSample",
145
+ grid_size=0.02,
146
+ hash_type="fnv",
147
+ mode="test",
148
+ return_grid_coord=True,
149
+ keys=("coord", "color", "normal"),
150
+ ),
151
+ crop=None,
152
+ post_transform=[
153
+ dict(type="CenterShift", apply_z=False),
154
+ dict(type="ToTensor"),
155
+ dict(
156
+ type="Collect",
157
+ keys=("coord", "grid_coord", "index"),
158
+ feat_keys=("color", "normal"),
159
+ ),
160
+ ],
161
+ aug_transform=[
162
+ [
163
+ dict(
164
+ type="RandomRotateTargetAngle",
165
+ angle=[0],
166
+ axis="z",
167
+ center=[0, 0, 0],
168
+ p=1,
169
+ )
170
+ ],
171
+ [
172
+ dict(
173
+ type="RandomRotateTargetAngle",
174
+ angle=[1 / 2],
175
+ axis="z",
176
+ center=[0, 0, 0],
177
+ p=1,
178
+ )
179
+ ],
180
+ [
181
+ dict(
182
+ type="RandomRotateTargetAngle",
183
+ angle=[1],
184
+ axis="z",
185
+ center=[0, 0, 0],
186
+ p=1,
187
+ )
188
+ ],
189
+ [
190
+ dict(
191
+ type="RandomRotateTargetAngle",
192
+ angle=[3 / 2],
193
+ axis="z",
194
+ center=[0, 0, 0],
195
+ p=1,
196
+ )
197
+ ],
198
+ [
199
+ dict(
200
+ type="RandomRotateTargetAngle",
201
+ angle=[0],
202
+ axis="z",
203
+ center=[0, 0, 0],
204
+ p=1,
205
+ ),
206
+ dict(type="RandomScale", scale=[0.95, 0.95]),
207
+ ],
208
+ [
209
+ dict(
210
+ type="RandomRotateTargetAngle",
211
+ angle=[1 / 2],
212
+ axis="z",
213
+ center=[0, 0, 0],
214
+ p=1,
215
+ ),
216
+ dict(type="RandomScale", scale=[0.95, 0.95]),
217
+ ],
218
+ [
219
+ dict(
220
+ type="RandomRotateTargetAngle",
221
+ angle=[1],
222
+ axis="z",
223
+ center=[0, 0, 0],
224
+ p=1,
225
+ ),
226
+ dict(type="RandomScale", scale=[0.95, 0.95]),
227
+ ],
228
+ [
229
+ dict(
230
+ type="RandomRotateTargetAngle",
231
+ angle=[3 / 2],
232
+ axis="z",
233
+ center=[0, 0, 0],
234
+ p=1,
235
+ ),
236
+ dict(type="RandomScale", scale=[0.95, 0.95]),
237
+ ],
238
+ [
239
+ dict(
240
+ type="RandomRotateTargetAngle",
241
+ angle=[0],
242
+ axis="z",
243
+ center=[0, 0, 0],
244
+ p=1,
245
+ ),
246
+ dict(type="RandomScale", scale=[1.05, 1.05]),
247
+ ],
248
+ [
249
+ dict(
250
+ type="RandomRotateTargetAngle",
251
+ angle=[1 / 2],
252
+ axis="z",
253
+ center=[0, 0, 0],
254
+ p=1,
255
+ ),
256
+ dict(type="RandomScale", scale=[1.05, 1.05]),
257
+ ],
258
+ [
259
+ dict(
260
+ type="RandomRotateTargetAngle",
261
+ angle=[1],
262
+ axis="z",
263
+ center=[0, 0, 0],
264
+ p=1,
265
+ ),
266
+ dict(type="RandomScale", scale=[1.05, 1.05]),
267
+ ],
268
+ [
269
+ dict(
270
+ type="RandomRotateTargetAngle",
271
+ angle=[3 / 2],
272
+ axis="z",
273
+ center=[0, 0, 0],
274
+ p=1,
275
+ ),
276
+ dict(type="RandomScale", scale=[1.05, 1.05]),
277
+ ],
278
+ [dict(type="RandomFlip", p=1)],
279
+ ],
280
+ ),
281
+ ),
282
+ )
submodules/PointTransformerV3/Pointcept/configs/modelnet40/cls-ptv3-v1m1-0-base.py ADDED
@@ -0,0 +1,232 @@
1
+ _base_ = ["../_base_/default_runtime.py"]
2
+ # misc custom setting
3
+ batch_size = 32 # bs: total bs in all gpus
4
+ num_worker = 16
5
+ batch_size_val = 8
6
+ empty_cache = False
7
+ enable_amp = False
8
+
9
+ # model settings
10
+ model = dict(
11
+ type="DefaultClassifier",
12
+ num_classes=40,
13
+ backbone_embed_dim=512,
14
+ backbone=dict(
15
+ type="PT-v3m1",
16
+ in_channels=6,
17
+ order=("z", "z-trans", "hilbert", "hilbert-trans"),
18
+ stride=(2, 2, 2, 2),
19
+ enc_depths=(2, 2, 2, 6, 2),
20
+ enc_channels=(32, 64, 128, 256, 512),
21
+ enc_num_head=(2, 4, 8, 16, 32),
22
+ enc_patch_size=(1024, 1024, 1024, 1024, 1024),
23
+ dec_depths=(2, 2, 2, 2),
24
+ dec_channels=(64, 64, 128, 256),
25
+ dec_num_head=(4, 4, 8, 16),
26
+ dec_patch_size=(1024, 1024, 1024, 1024),
27
+ mlp_ratio=4,
28
+ qkv_bias=True,
29
+ qk_scale=None,
30
+ attn_drop=0.0,
31
+ proj_drop=0.0,
32
+ drop_path=0.3,
33
+ shuffle_orders=True,
34
+ pre_norm=True,
35
+ enable_rpe=False,
36
+ enable_flash=True,
37
+ upcast_attention=False,
38
+ upcast_softmax=False,
39
+ cls_mode=True,
40
+ pdnorm_bn=False,
41
+ pdnorm_ln=False,
42
+ pdnorm_decouple=True,
43
+ pdnorm_adaptive=False,
44
+ pdnorm_affine=True,
45
+ pdnorm_conditions=("ScanNet", "S3DIS", "Structured3D"),
46
+ ),
47
+ criteria=[
48
+ dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=-1),
49
+ dict(type="LovaszLoss", mode="multiclass", loss_weight=1.0, ignore_index=-1),
50
+ ],
51
+ )
52
+
53
+ # scheduler settings
54
+ epoch = 300
55
+ # optimizer = dict(type="SGD", lr=0.1, momentum=0.9, weight_decay=0.0001, nesterov=True)
56
+ # scheduler = dict(type="MultiStepLR", milestones=[0.6, 0.8], gamma=0.1)
57
+ optimizer = dict(type="AdamW", lr=0.001, weight_decay=0.01)
58
+ scheduler = dict(
59
+ type="OneCycleLR",
60
+ max_lr=[0.001, 0.0001],
61
+ pct_start=0.05,
62
+ anneal_strategy="cos",
63
+ div_factor=10.0,
64
+ final_div_factor=1000.0,
65
+ )
66
+ param_dicts = [dict(keyword="block", lr=0.0001)]
67
+
68
+ # dataset settings
69
+ dataset_type = "ModelNetDataset"
70
+ data_root = "data/modelnet40_normal_resampled"
71
+ cache_data = False
72
+ class_names = [
73
+ "airplane",
74
+ "bathtub",
75
+ "bed",
76
+ "bench",
77
+ "bookshelf",
78
+ "bottle",
79
+ "bowl",
80
+ "car",
81
+ "chair",
82
+ "cone",
83
+ "cup",
84
+ "curtain",
85
+ "desk",
86
+ "door",
87
+ "dresser",
88
+ "flower_pot",
89
+ "glass_box",
90
+ "guitar",
91
+ "keyboard",
92
+ "lamp",
93
+ "laptop",
94
+ "mantel",
95
+ "monitor",
96
+ "night_stand",
97
+ "person",
98
+ "piano",
99
+ "plant",
100
+ "radio",
101
+ "range_hood",
102
+ "sink",
103
+ "sofa",
104
+ "stairs",
105
+ "stool",
106
+ "table",
107
+ "tent",
108
+ "toilet",
109
+ "tv_stand",
110
+ "vase",
111
+ "wardrobe",
112
+ "xbox",
113
+ ]
114
+
115
+ data = dict(
116
+ num_classes=40,
117
+ ignore_index=-1,
118
+ names=class_names,
119
+ train=dict(
120
+ type=dataset_type,
121
+ split="train",
122
+ data_root=data_root,
123
+ class_names=class_names,
124
+ transform=[
125
+ dict(type="NormalizeCoord"),
126
+ # dict(type="CenterShift", apply_z=True),
127
+ # dict(type="RandomRotate", angle=[-1, 1], axis="z", center=[0, 0, 0], p=0.5),
128
+ # dict(type="RandomRotate", angle=[-1/24, 1/24], axis="x", p=0.5),
129
+ # dict(type="RandomRotate", angle=[-1/24, 1/24], axis="y", p=0.5),
130
+ dict(type="RandomScale", scale=[0.7, 1.5], anisotropic=True),
131
+ dict(type="RandomShift", shift=((-0.2, 0.2), (-0.2, 0.2), (-0.2, 0.2))),
132
+ # dict(type="RandomFlip", p=0.5),
133
+ # dict(type="RandomJitter", sigma=0.005, clip=0.02),
134
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
135
+ dict(
136
+ type="GridSample",
137
+ grid_size=0.01,
138
+ hash_type="fnv",
139
+ mode="train",
140
+ keys=("coord", "normal"),
141
+ return_grid_coord=True,
142
+ ),
143
+ # dict(type="SphereCrop", point_max=10000, mode="random"),
144
+ # dict(type="CenterShift", apply_z=True),
145
+ dict(type="ShufflePoint"),
146
+ dict(type="ToTensor"),
147
+ dict(
148
+ type="Collect",
149
+ keys=("coord", "grid_coord", "category"),
150
+ feat_keys=["coord", "normal"],
151
+ ),
152
+ ],
153
+ test_mode=False,
154
+ ),
155
+ val=dict(
156
+ type=dataset_type,
157
+ split="test",
158
+ data_root=data_root,
159
+ class_names=class_names,
160
+ transform=[
161
+ dict(type="NormalizeCoord"),
162
+ dict(
163
+ type="GridSample",
164
+ grid_size=0.01,
165
+ hash_type="fnv",
166
+ mode="train",
167
+ keys=("coord", "normal"),
168
+ return_grid_coord=True,
169
+ ),
170
+ dict(type="ToTensor"),
171
+ dict(
172
+ type="Collect",
173
+ keys=("coord", "grid_coord", "category"),
174
+ feat_keys=["coord", "normal"],
175
+ ),
176
+ ],
177
+ test_mode=False,
178
+ ),
179
+ test=dict(
180
+ type=dataset_type,
181
+ split="test",
182
+ data_root=data_root,
183
+ class_names=class_names,
184
+ transform=[
185
+ dict(type="NormalizeCoord"),
186
+ ],
187
+ test_mode=True,
188
+ test_cfg=dict(
189
+ post_transform=[
190
+ dict(
191
+ type="GridSample",
192
+ grid_size=0.01,
193
+ hash_type="fnv",
194
+ mode="train",
195
+ keys=("coord", "normal"),
196
+ return_grid_coord=True,
197
+ ),
198
+ dict(type="ToTensor"),
199
+ dict(
200
+ type="Collect",
201
+ keys=("coord", "grid_coord"),
202
+ feat_keys=["coord", "normal"],
203
+ ),
204
+ ],
205
+ aug_transform=[
206
+ [dict(type="RandomScale", scale=[1, 1], anisotropic=True)], # 1
207
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 2
208
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 3
209
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 4
210
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 5
211
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 5
212
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 6
213
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 7
214
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 8
215
+ [dict(type="RandomScale", scale=[0.8, 1.2], anisotropic=True)], # 9
216
+ ],
217
+ ),
218
+ ),
219
+ )
220
+
221
+ # hooks
222
+ hooks = [
223
+ dict(type="CheckpointLoader"),
224
+ dict(type="IterationTimer", warmup_iter=2),
225
+ dict(type="InformationWriter"),
226
+ dict(type="ClsEvaluator"),
227
+ dict(type="CheckpointSaver", save_freq=None),
228
+ dict(type="PreciseEvaluator", test_last=False),
229
+ ]
230
+
231
+ # tester
232
+ test = dict(type="ClsVotingTester", num_repeat=100)
submodules/PointTransformerV3/Pointcept/configs/modelnet40/cls-spunet-v1m1-0-base.py ADDED
@@ -0,0 +1,176 @@
1
+ _base_ = ["../_base_/default_runtime.py"]
2
+ # misc custom setting
3
+ batch_size = 16 # bs: total bs in all gpus
4
+ # batch_size_val = 8
5
+ empty_cache = False
6
+ enable_amp = False
7
+
8
+ # model settings
9
+ model = dict(
10
+ type="DefaultClassifier",
11
+ num_classes=40,
12
+ backbone_embed_dim=256,
13
+ backbone=dict(
14
+ type="SpUNet-v1m1",
15
+ in_channels=6,
16
+ num_classes=0,
17
+ channels=(32, 64, 128, 256, 256, 128, 96, 96),
18
+ layers=(2, 3, 4, 6, 2, 2, 2, 2),
19
+ cls_mode=True,
20
+ ),
21
+ criteria=[dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=-1)],
22
+ )
23
+
24
+ # scheduler settings
25
+ epoch = 200
26
+ optimizer = dict(type="SGD", lr=0.1, momentum=0.9, weight_decay=0.0001, nesterov=True)
27
+ scheduler = dict(type="MultiStepLR", milestones=[0.6, 0.8], gamma=0.1)
28
+
29
+ # dataset settings
30
+ dataset_type = "ModelNetDataset"
31
+ data_root = "data/modelnet40_normal_resampled"
32
+ cache_data = False
33
+ class_names = [
34
+ "airplane",
35
+ "bathtub",
36
+ "bed",
37
+ "bench",
38
+ "bookshelf",
39
+ "bottle",
40
+ "bowl",
41
+ "car",
42
+ "chair",
43
+ "cone",
44
+ "cup",
45
+ "curtain",
46
+ "desk",
47
+ "door",
48
+ "dresser",
49
+ "flower_pot",
50
+ "glass_box",
51
+ "guitar",
52
+ "keyboard",
53
+ "lamp",
54
+ "laptop",
55
+ "mantel",
56
+ "monitor",
57
+ "night_stand",
58
+ "person",
59
+ "piano",
60
+ "plant",
61
+ "radio",
62
+ "range_hood",
63
+ "sink",
64
+ "sofa",
65
+ "stairs",
66
+ "stool",
67
+ "table",
68
+ "tent",
69
+ "toilet",
70
+ "tv_stand",
71
+ "vase",
72
+ "wardrobe",
73
+ "xbox",
74
+ ]
75
+
76
+ data = dict(
77
+ num_classes=40,
78
+ ignore_index=-1,
79
+ names=class_names,
80
+ train=dict(
81
+ type=dataset_type,
82
+ split="train",
83
+ data_root=data_root,
84
+ class_names=class_names,
85
+ transform=[
86
+ dict(type="NormalizeCoord"),
87
+ # dict(type="CenterShift", apply_z=True),
88
+ # dict(type="RandomRotate", angle=[-1, 1], axis="z", center=[0, 0, 0], p=0.5),
89
+ # dict(type="RandomRotate", angle=[-1/24, 1/24], axis="x", p=0.5),
90
+ # dict(type="RandomRotate", angle=[-1/24, 1/24], axis="y", p=0.5),
91
+ dict(type="RandomScale", scale=[0.9, 1.1]),
92
+ dict(type="RandomShift", shift=((-0.2, 0.2), (-0.2, 0.2), (-0.2, 0.2))),
93
+ # dict(type="RandomFlip", p=0.5),
94
+ # dict(type="RandomJitter", sigma=0.005, clip=0.02),
95
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
96
+ dict(
97
+ type="GridSample",
98
+ grid_size=0.01,
99
+ hash_type="fnv",
100
+ mode="train",
101
+ keys=("coord", "normal"),
102
+ return_grid_coord=True,
103
+ ),
104
+ # dict(type="SphereCrop", point_max=10000, mode="random"),
105
+ # dict(type="CenterShift", apply_z=True),
106
+ dict(type="ShufflePoint"),
107
+ dict(type="ToTensor"),
108
+ dict(
109
+ type="Collect",
110
+ keys=("coord", "grid_coord", "category"),
111
+ feat_keys=["coord", "normal"],
112
+ ),
113
+ ],
114
+ test_mode=False,
115
+ ),
116
+ val=dict(
117
+ type=dataset_type,
118
+ split="test",
119
+ data_root=data_root,
120
+ class_names=class_names,
121
+ transform=[
122
+ dict(type="NormalizeCoord"),
123
+ dict(
124
+ type="GridSample",
125
+ grid_size=0.01,
126
+ hash_type="fnv",
127
+ mode="train",
128
+ keys=("coord", "normal"),
129
+ return_grid_coord=True,
130
+ ),
131
+ dict(type="ToTensor"),
132
+ dict(
133
+ type="Collect",
134
+ keys=("coord", "grid_coord", "category"),
135
+ feat_keys=["coord", "normal"],
136
+ ),
137
+ ],
138
+ test_mode=False,
139
+ ),
140
+ test=dict(
141
+ type=dataset_type,
142
+ split="test",
143
+ data_root=data_root,
144
+ class_names=class_names,
145
+ transform=[
146
+ dict(type="NormalizeCoord"),
147
+ dict(
148
+ type="GridSample",
149
+ grid_size=0.01,
150
+ hash_type="fnv",
151
+ mode="train",
152
+ keys=("coord", "normal"),
153
+ return_grid_coord=True,
154
+ ),
155
+ dict(type="ToTensor"),
156
+ dict(
157
+ type="Collect",
158
+ keys=("coord", "grid_coord", "category"),
159
+ feat_keys=["coord", "normal"],
160
+ ),
161
+ ],
162
+ test_mode=True,
163
+ ),
164
+ )
165
+
166
+ # hooks
167
+ hooks = [
168
+ dict(type="CheckpointLoader"),
169
+ dict(type="IterationTimer", warmup_iter=2),
170
+ dict(type="InformationWriter"),
171
+ dict(type="ClsEvaluator"),
172
+ dict(type="CheckpointSaver", save_freq=None),
173
+ ]
174
+
175
+ # tester
176
+ test = dict(type="ClsTester")
submodules/PointTransformerV3/Pointcept/configs/nuscenes/semseg-ppt-v1m1-0-nu-sk-wa-spunet.py ADDED
@@ -0,0 +1,342 @@
1
+ _base_ = ["../_base_/default_runtime.py"]
2
+
3
+ # misc custom setting
4
+ batch_size = 12 # bs: total bs in all gpus
5
+ num_worker = 24
6
+ mix_prob = 0.8
7
+ empty_cache = False
8
+ enable_amp = True
9
+ find_unused_parameters = True
10
+
11
+ # trainer
12
+ train = dict(
13
+ type="MultiDatasetTrainer",
14
+ )
15
+
16
+ # model settings
17
+ model = dict(
18
+ type="PPT-v1m1",
19
+ backbone=dict(
20
+ type="SpUNet-v1m3",
21
+ in_channels=4,
22
+ num_classes=0,
23
+ base_channels=32,
24
+ context_channels=256,
25
+ channels=(32, 64, 128, 256, 256, 128, 96, 96),
26
+ layers=(2, 3, 4, 6, 2, 2, 2, 2),
27
+ cls_mode=False,
28
+ conditions=("SemanticKITTI", "nuScenes", "Waymo"),
29
+ zero_init=False,
30
+ norm_decouple=True,
31
+ norm_adaptive=False,
32
+ norm_affine=True,
33
+ ),
34
+ criteria=[
35
+ dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=-1),
36
+ dict(type="LovaszLoss", mode="multiclass", loss_weight=1.0, ignore_index=-1),
37
+ ],
38
+ backbone_out_channels=96,
39
+ context_channels=256,
40
+ conditions=("SemanticKITTI", "nuScenes", "Waymo"),
41
+ template="[x]",
42
+ clip_model="ViT-B/16",
43
+ # fmt: off
44
+ class_name=(
45
+ # SemanticKITTI
46
+ "car", "bicycle", "motorcycle", "truck", "other vehicle",
47
+ "person", "person who rides a bicycle", "person who rides a motorcycle", "road", "parking",
48
+ "path for pedestrians at the side of a road", "other ground", "building", "fence", "vegetation",
49
+ "trunk", "terrain", "pole", "traffic sign",
50
+ # nuScenes
51
+ "barrier", "bicycle", "bus", "car", "construction vehicle",
52
+ "motorcycle", "pedestrian", "traffic cone", "trailer", "truck",
53
+ "path suitable or safe for driving", "other flat", "sidewalk", "terrain", "man made", "vegetation",
54
+ # waymo
55
+ "car", "truck", "bus", "other vehicle", "person who rides a motorcycle",
56
+ "person who rides a bicycle", "pedestrian", "sign", "traffic light", "pole",
57
+ "construction cone", "bicycle", "motorcycle", "building", "vegetation",
58
+ "tree trunk", "curb", "road", "lane marker", "other ground", "horizontal surface that can not drive",
59
+ "surface when pedestrians most likely to walk on",
60
+ ),
61
+ valid_index=(
62
+ [i for i in range(19)],
63
+ [i for i in range(19, 19 + 16)],
64
+ [i for i in range(19 + 16, 19 + 16 + 22)],
65
+ ),
66
+ # fmt: on
67
+ backbone_mode=False,
68
+ )
69
+
70
+ # scheduler settings
71
+ epoch = 50
72
+ eval_epoch = 50
73
+ optimizer = dict(type="AdamW", lr=0.002, weight_decay=0.005)
74
+ scheduler = dict(
75
+ type="OneCycleLR",
76
+ max_lr=optimizer["lr"],
77
+ pct_start=0.04,
78
+ anneal_strategy="cos",
79
+ div_factor=10.0,
80
+ final_div_factor=100.0,
81
+ )
82
+ # param_dicts = [dict(keyword="modulation", lr=0.0002)]
83
+
84
+ # dataset settings
85
+ data = dict(
86
+ num_classes=16,
87
+ ignore_index=-1,
88
+ names=[
89
+ "barrier",
90
+ "bicycle",
91
+ "bus",
92
+ "car",
93
+ "construction_vehicle",
94
+ "motorcycle",
95
+ "pedestrian",
96
+ "traffic_cone",
97
+ "trailer",
98
+ "truck",
99
+ "driveable_surface",
100
+ "other_flat",
101
+ "sidewalk",
102
+ "terrain",
103
+ "manmade",
104
+ "vegetation",
105
+ ],
106
+ train=dict(
107
+ type="ConcatDataset",
108
+ datasets=[
109
+ # nuScenes
110
+ dict(
111
+ type="NuScenesDataset",
112
+ split="train",
113
+ data_root="data/nuscenes",
114
+ transform=[
115
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
116
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis='z', p=0.75),
117
+ dict(
118
+ type="RandomRotate",
119
+ angle=[-1, 1],
120
+ axis="z",
121
+ center=[0, 0, 0],
122
+ p=0.5,
123
+ ),
124
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis='x', p=0.5),
125
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis='y', p=0.5),
126
+ dict(
127
+ type="PointClip",
128
+ point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2),
129
+ ),
130
+ dict(type="RandomScale", scale=[0.9, 1.1]),
131
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
132
+ dict(type="RandomFlip", p=0.5),
133
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
134
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
135
+ dict(
136
+ type="GridSample",
137
+ grid_size=0.05,
138
+ hash_type="fnv",
139
+ mode="train",
140
+ keys=("coord", "strength", "segment"),
141
+ return_grid_coord=True,
142
+ ),
143
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
144
+ # dict(type="CenterShift", apply_z=False),
145
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
146
+ dict(type="ToTensor"),
147
+ dict(
148
+ type="Collect",
149
+ keys=("coord", "grid_coord", "segment", "condition"),
150
+ feat_keys=("coord", "strength"),
151
+ ),
152
+ ],
153
+ test_mode=False,
154
+ ignore_index=-1,
155
+ loop=1,
156
+ ),
157
+ # SemanticKITTI
158
+ dict(
159
+ type="SemanticKITTIDataset",
160
+ split="train",
161
+ data_root="data/semantic_kitti",
162
+ transform=[
163
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
164
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
165
+ dict(
166
+ type="RandomRotate",
167
+ angle=[-1, 1],
168
+ axis="z",
169
+ center=[0, 0, 0],
170
+ p=0.5,
171
+ ),
172
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5),
173
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5),
174
+ dict(
175
+ type="PointClip",
176
+ point_cloud_range=(-75.2, -75.2, -4, 75.2, 75.2, 2),
177
+ ),
178
+ dict(type="RandomScale", scale=[0.9, 1.1]),
179
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
180
+ dict(type="RandomFlip", p=0.5),
181
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
182
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
183
+ dict(
184
+ type="GridSample",
185
+ grid_size=0.05,
186
+ hash_type="fnv",
187
+ mode="train",
188
+ keys=("coord", "strength", "segment"),
189
+ return_grid_coord=True,
190
+ ),
191
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
192
+ # dict(type="CenterShift", apply_z=False),
193
+ dict(type="Add", keys_dict={"condition": "SemanticKITTI"}),
194
+ dict(type="ToTensor"),
195
+ dict(
196
+ type="Collect",
197
+ keys=("coord", "grid_coord", "segment", "condition"),
198
+ feat_keys=("coord", "strength"),
199
+ ),
200
+ ],
201
+ test_mode=False,
202
+ ignore_index=-1,
203
+ loop=1,
204
+ ),
205
+ # Waymo
206
+ dict(
207
+ type="WaymoDataset",
208
+ split="training",
209
+ data_root="data/waymo",
210
+ transform=[
211
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
212
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
213
+ dict(
214
+ type="RandomRotate",
215
+ angle=[-1, 1],
216
+ axis="z",
217
+ center=[0, 0, 0],
218
+ p=0.5,
219
+ ),
220
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5),
221
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5),
222
+ dict(
223
+ type="PointClip",
224
+ point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2),
225
+ ),
226
+ dict(type="RandomScale", scale=[0.9, 1.1]),
227
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
228
+ dict(type="RandomFlip", p=0.5),
229
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
230
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
231
+ dict(
232
+ type="GridSample",
233
+ grid_size=0.05,
234
+ hash_type="fnv",
235
+ mode="train",
236
+ keys=("coord", "strength", "segment"),
237
+ return_grid_coord=True,
238
+ ),
239
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
240
+ # dict(type="CenterShift", apply_z=False),
241
+ dict(type="Add", keys_dict={"condition": "Waymo"}),
242
+ dict(type="ToTensor"),
243
+ dict(
244
+ type="Collect",
245
+ keys=("coord", "grid_coord", "segment", "condition"),
246
+ feat_keys=("coord", "strength"),
247
+ ),
248
+ ],
249
+ test_mode=False,
250
+ ignore_index=-1,
251
+ loop=1,
252
+ ),
253
+ ],
254
+ ),
255
+ val=dict(
256
+ type="NuScenesDataset",
257
+ split="val",
258
+ data_root="data/nuscenes",
259
+ transform=[
260
+ dict(type="PointClip", point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2)),
261
+ dict(
262
+ type="GridSample",
263
+ grid_size=0.05,
264
+ hash_type="fnv",
265
+ mode="train",
266
+ keys=("coord", "strength", "segment"),
267
+ return_grid_coord=True,
268
+ ),
269
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
270
+ dict(type="ToTensor"),
271
+ dict(
272
+ type="Collect",
273
+ keys=("coord", "grid_coord", "segment", "condition"),
274
+ feat_keys=("coord", "strength"),
275
+ ),
276
+ ],
277
+ test_mode=False,
278
+ ignore_index=-1,
279
+ ),
280
+ test=dict(
281
+ type="NuScenesDataset",
282
+ split="val",
283
+ data_root="data/nuscenes",
284
+ transform=[
285
+ dict(type="Copy", keys_dict={"segment": "origin_segment"}),
286
+ dict(
287
+ type="GridSample",
288
+ grid_size=0.025,
289
+ hash_type="fnv",
290
+ mode="train",
291
+ keys=("coord", "strength", "segment"),
292
+ return_inverse=True,
293
+ ),
294
+ ],
295
+ test_mode=True,
296
+ test_cfg=dict(
297
+ voxelize=dict(
298
+ type="GridSample",
299
+ grid_size=0.05,
300
+ hash_type="fnv",
301
+ mode="test",
302
+ return_grid_coord=True,
303
+ keys=("coord", "strength"),
304
+ ),
305
+ crop=None,
306
+ post_transform=[
307
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
308
+ dict(type="ToTensor"),
309
+ dict(
310
+ type="Collect",
311
+ keys=("coord", "grid_coord", "index", "condition"),
312
+ feat_keys=("coord", "strength"),
313
+ ),
314
+ ],
315
+ aug_transform=[
316
+ [dict(type="RandomScale", scale=[0.9, 0.9])],
317
+ [dict(type="RandomScale", scale=[0.95, 0.95])],
318
+ [dict(type="RandomScale", scale=[1, 1])],
319
+ [dict(type="RandomScale", scale=[1.05, 1.05])],
320
+ [dict(type="RandomScale", scale=[1.1, 1.1])],
321
+ [
322
+ dict(type="RandomScale", scale=[0.9, 0.9]),
323
+ dict(type="RandomFlip", p=1),
324
+ ],
325
+ [
326
+ dict(type="RandomScale", scale=[0.95, 0.95]),
327
+ dict(type="RandomFlip", p=1),
328
+ ],
329
+ [dict(type="RandomScale", scale=[1, 1]), dict(type="RandomFlip", p=1)],
330
+ [
331
+ dict(type="RandomScale", scale=[1.05, 1.05]),
332
+ dict(type="RandomFlip", p=1),
333
+ ],
334
+ [
335
+ dict(type="RandomScale", scale=[1.1, 1.1]),
336
+ dict(type="RandomFlip", p=1),
337
+ ],
338
+ ],
339
+ ),
340
+ ignore_index=-1,
341
+ ),
342
+ )
submodules/PointTransformerV3/Pointcept/configs/nuscenes/semseg-ppt-v1m2-0-nu-sk-wa-spunet.py ADDED
@@ -0,0 +1,316 @@
1
+ _base_ = ["../_base_/default_runtime.py"]
2
+
3
+ # misc custom setting
4
+ batch_size = 12 # bs: total bs in all gpus
5
+ num_worker = 24
6
+ mix_prob = 0.8
7
+ empty_cache = False
8
+ enable_amp = True
9
+ find_unused_parameters = True
10
+
11
+ # trainer
12
+ train = dict(
13
+ type="MultiDatasetTrainer",
14
+ )
15
+
16
+ # model settings
17
+ model = dict(
18
+ type="PPT-v1m2",
19
+ backbone=dict(
20
+ type="SpUNet-v1m3",
21
+ in_channels=4,
22
+ num_classes=0,
23
+ base_channels=32,
24
+ context_channels=256,
25
+ channels=(32, 64, 128, 256, 256, 128, 96, 96),
26
+ layers=(2, 3, 4, 6, 2, 2, 2, 2),
27
+ cls_mode=False,
28
+ conditions=("SemanticKITTI", "nuScenes", "Waymo"),
29
+ zero_init=False,
30
+ norm_decouple=True,
31
+ norm_adaptive=False,
32
+ norm_affine=True,
33
+ ),
34
+ criteria=[
35
+ dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=-1),
36
+ dict(type="LovaszLoss", mode="multiclass", loss_weight=1.0, ignore_index=-1),
37
+ ],
38
+ backbone_out_channels=96,
39
+ context_channels=256,
40
+ conditions=("SemanticKITTI", "nuScenes", "Waymo"),
41
+ num_classes=(19, 16, 22),
42
+ )
43
+
44
+ # scheduler settings
45
+ epoch = 50
46
+ eval_epoch = 50
47
+ optimizer = dict(type="AdamW", lr=0.002, weight_decay=0.005)
48
+ scheduler = dict(
49
+ type="OneCycleLR",
50
+ max_lr=optimizer["lr"],
51
+ pct_start=0.04,
52
+ anneal_strategy="cos",
53
+ div_factor=10.0,
54
+ final_div_factor=100.0,
55
+ )
56
+ # param_dicts = [dict(keyword="modulation", lr=0.0002)]
57
+
58
+ # dataset settings
59
+ data = dict(
60
+ num_classes=16,
61
+ ignore_index=-1,
62
+ names=[
63
+ "barrier",
64
+ "bicycle",
65
+ "bus",
66
+ "car",
67
+ "construction_vehicle",
68
+ "motorcycle",
69
+ "pedestrian",
70
+ "traffic_cone",
71
+ "trailer",
72
+ "truck",
73
+ "driveable_surface",
74
+ "other_flat",
75
+ "sidewalk",
76
+ "terrain",
77
+ "manmade",
78
+ "vegetation",
79
+ ],
80
+ train=dict(
81
+ type="ConcatDataset",
82
+ datasets=[
83
+ # nuScenes
84
+ dict(
85
+ type="NuScenesDataset",
86
+ split="train",
87
+ data_root="data/nuscenes",
88
+ transform=[
89
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
90
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis='z', p=0.75),
91
+ dict(
92
+ type="RandomRotate",
93
+ angle=[-1, 1],
94
+ axis="z",
95
+ center=[0, 0, 0],
96
+ p=0.5,
97
+ ),
98
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis='x', p=0.5),
99
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis='y', p=0.5),
100
+ dict(
101
+ type="PointClip",
102
+ point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2),
103
+ ),
104
+ dict(type="RandomScale", scale=[0.9, 1.1]),
105
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
106
+ dict(type="RandomFlip", p=0.5),
107
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
108
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
109
+ dict(
110
+ type="GridSample",
111
+ grid_size=0.05,
112
+ hash_type="fnv",
113
+ mode="train",
114
+ keys=("coord", "strength", "segment"),
115
+ return_grid_coord=True,
116
+ ),
117
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
118
+ # dict(type="CenterShift", apply_z=False),
119
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
120
+ dict(type="ToTensor"),
121
+ dict(
122
+ type="Collect",
123
+ keys=("coord", "grid_coord", "segment", "condition"),
124
+ feat_keys=("coord", "strength"),
125
+ ),
126
+ ],
127
+ test_mode=False,
128
+ ignore_index=-1,
129
+ loop=1,
130
+ ),
131
+ # SemanticKITTI
132
+ dict(
133
+ type="SemanticKITTIDataset",
134
+ split="train",
135
+ data_root="data/semantic_kitti",
136
+ transform=[
137
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
138
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
139
+ dict(
140
+ type="RandomRotate",
141
+ angle=[-1, 1],
142
+ axis="z",
143
+ center=[0, 0, 0],
144
+ p=0.5,
145
+ ),
146
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5),
147
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5),
148
+ dict(
149
+ type="PointClip",
150
+ point_cloud_range=(-75.2, -75.2, -4, 75.2, 75.2, 2),
151
+ ),
152
+ dict(type="RandomScale", scale=[0.9, 1.1]),
153
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
154
+ dict(type="RandomFlip", p=0.5),
155
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
156
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
157
+ dict(
158
+ type="GridSample",
159
+ grid_size=0.05,
160
+ hash_type="fnv",
161
+ mode="train",
162
+ keys=("coord", "strength", "segment"),
163
+ return_grid_coord=True,
164
+ ),
165
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
166
+ # dict(type="CenterShift", apply_z=False),
167
+ dict(type="Add", keys_dict={"condition": "SemanticKITTI"}),
168
+ dict(type="ToTensor"),
169
+ dict(
170
+ type="Collect",
171
+ keys=("coord", "grid_coord", "segment", "condition"),
172
+ feat_keys=("coord", "strength"),
173
+ ),
174
+ ],
175
+ test_mode=False,
176
+ ignore_index=-1,
177
+ loop=1,
178
+ ),
179
+ # Waymo
180
+ dict(
181
+ type="WaymoDataset",
182
+ split="training",
183
+ data_root="data/waymo",
184
+ transform=[
185
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
186
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
187
+ dict(
188
+ type="RandomRotate",
189
+ angle=[-1, 1],
190
+ axis="z",
191
+ center=[0, 0, 0],
192
+ p=0.5,
193
+ ),
194
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5),
195
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5),
196
+ dict(
197
+ type="PointClip",
198
+ point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2),
199
+ ),
200
+ dict(type="RandomScale", scale=[0.9, 1.1]),
201
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
202
+ dict(type="RandomFlip", p=0.5),
203
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
204
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
205
+ dict(
206
+ type="GridSample",
207
+ grid_size=0.05,
208
+ hash_type="fnv",
209
+ mode="train",
210
+ keys=("coord", "strength", "segment"),
211
+ return_grid_coord=True,
212
+ ),
213
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
214
+ # dict(type="CenterShift", apply_z=False),
215
+ dict(type="Add", keys_dict={"condition": "Waymo"}),
216
+ dict(type="ToTensor"),
217
+ dict(
218
+ type="Collect",
219
+ keys=("coord", "grid_coord", "segment", "condition"),
220
+ feat_keys=("coord", "strength"),
221
+ ),
222
+ ],
223
+ test_mode=False,
224
+ ignore_index=-1,
225
+ loop=1,
226
+ ),
227
+ ],
228
+ ),
229
+ val=dict(
230
+ type="NuScenesDataset",
231
+ split="val",
232
+ data_root="data/nuscenes",
233
+ transform=[
234
+ dict(type="PointClip", point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2)),
235
+ dict(
236
+ type="GridSample",
237
+ grid_size=0.05,
238
+ hash_type="fnv",
239
+ mode="train",
240
+ keys=("coord", "strength", "segment"),
241
+ return_grid_coord=True,
242
+ ),
243
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
244
+ dict(type="ToTensor"),
245
+ dict(
246
+ type="Collect",
247
+ keys=("coord", "grid_coord", "segment", "condition"),
248
+ feat_keys=("coord", "strength"),
249
+ ),
250
+ ],
251
+ test_mode=False,
252
+ ignore_index=-1,
253
+ ),
254
+ test=dict(
255
+ type="NuScenesDataset",
256
+ split="val",
257
+ data_root="data/nuscenes",
258
+ transform=[
259
+ dict(type="Copy", keys_dict={"segment": "origin_segment"}),
260
+ dict(
261
+ type="GridSample",
262
+ grid_size=0.025,
263
+ hash_type="fnv",
264
+ mode="train",
265
+ keys=("coord", "strength", "segment"),
266
+ return_inverse=True,
267
+ ),
268
+ ],
269
+ test_mode=True,
270
+ test_cfg=dict(
271
+ voxelize=dict(
272
+ type="GridSample",
273
+ grid_size=0.05,
274
+ hash_type="fnv",
275
+ mode="test",
276
+ return_grid_coord=True,
277
+ keys=("coord", "strength"),
278
+ ),
279
+ crop=None,
280
+ post_transform=[
281
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
282
+ dict(type="ToTensor"),
283
+ dict(
284
+ type="Collect",
285
+ keys=("coord", "grid_coord", "index", "condition"),
286
+ feat_keys=("coord", "strength"),
287
+ ),
288
+ ],
289
+ aug_transform=[
290
+ [dict(type="RandomScale", scale=[0.9, 0.9])],
291
+ [dict(type="RandomScale", scale=[0.95, 0.95])],
292
+ [dict(type="RandomScale", scale=[1, 1])],
293
+ [dict(type="RandomScale", scale=[1.05, 1.05])],
294
+ [dict(type="RandomScale", scale=[1.1, 1.1])],
295
+ [
296
+ dict(type="RandomScale", scale=[0.9, 0.9]),
297
+ dict(type="RandomFlip", p=1),
298
+ ],
299
+ [
300
+ dict(type="RandomScale", scale=[0.95, 0.95]),
301
+ dict(type="RandomFlip", p=1),
302
+ ],
303
+ [dict(type="RandomScale", scale=[1, 1]), dict(type="RandomFlip", p=1)],
304
+ [
305
+ dict(type="RandomScale", scale=[1.05, 1.05]),
306
+ dict(type="RandomFlip", p=1),
307
+ ],
308
+ [
309
+ dict(type="RandomScale", scale=[1.1, 1.1]),
310
+ dict(type="RandomFlip", p=1),
311
+ ],
312
+ ],
313
+ ),
314
+ ignore_index=-1,
315
+ ),
316
+ )
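Every dataset branch in this config injects its name through dict(type="Add", keys_dict={"condition": ...}), and the SpUNet-v1m3 backbone is declared with matching conditions and norm_decouple=True. A rough sketch of that decoupled-normalization idea, assuming one norm layer per dataset selected by the sample's condition string (illustrative only, not the actual PPT implementation):

import torch.nn as nn

class ConditionalBN(nn.Module):
    # One BatchNorm per dataset; the incoming condition picks which one is applied.
    def __init__(self, channels, conditions=("SemanticKITTI", "nuScenes", "Waymo")):
        super().__init__()
        self.conditions = conditions
        self.norms = nn.ModuleList(nn.BatchNorm1d(channels) for _ in conditions)

    def forward(self, feat, condition):
        return self.norms[self.conditions.index(condition)](feat)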
submodules/PointTransformerV3/Pointcept/configs/nuscenes/semseg-ppt-v1m2-1-nu-sk-wa-spunet-submit.py ADDED
@@ -0,0 +1,292 @@
1
+ _base_ = ["../_base_/default_runtime.py"]
2
+
3
+ # misc custom setting
4
+ batch_size = 12 # bs: total bs in all gpus
5
+ num_worker = 24
6
+ mix_prob = 0.8
7
+ empty_cache = False
8
+ enable_amp = True
9
+ find_unused_parameters = True
10
+ evaluate = False
11
+
12
+ # trainer
13
+ train = dict(
14
+ type="MultiDatasetTrainer",
15
+ )
16
+
17
+ # model settings
18
+ model = dict(
19
+ type="PPT-v1m2",
20
+ backbone=dict(
21
+ type="SpUNet-v1m3",
22
+ in_channels=4,
23
+ num_classes=0,
24
+ base_channels=32,
25
+ context_channels=256,
26
+ channels=(32, 64, 128, 256, 256, 128, 96, 96),
27
+ layers=(2, 3, 4, 6, 2, 2, 2, 2),
28
+ cls_mode=False,
29
+ conditions=("SemanticKITTI", "nuScenes", "Waymo"),
30
+ zero_init=False,
31
+ norm_decouple=True,
32
+ norm_adaptive=False,
33
+ norm_affine=True,
34
+ ),
35
+ criteria=[
36
+ dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=-1),
37
+ dict(type="LovaszLoss", mode="multiclass", loss_weight=1.0, ignore_index=-1),
38
+ ],
39
+ backbone_out_channels=96,
40
+ context_channels=256,
41
+ conditions=("SemanticKITTI", "nuScenes", "Waymo"),
42
+ num_classes=(19, 16, 22),
43
+ )
44
+
45
+ # scheduler settings
46
+ epoch = 50
47
+ eval_epoch = 50
48
+ optimizer = dict(type="AdamW", lr=0.002, weight_decay=0.005)
49
+ scheduler = dict(
50
+ type="OneCycleLR",
51
+ max_lr=optimizer["lr"],
52
+ pct_start=0.04,
53
+ anneal_strategy="cos",
54
+ div_factor=10.0,
55
+ final_div_factor=100.0,
56
+ )
57
+ # param_dicts = [dict(keyword="modulation", lr=0.0002)]
58
+
59
+ # dataset settings
60
+ data = dict(
61
+ num_classes=16,
62
+ ignore_index=-1,
63
+ names=[
64
+ "barrier",
65
+ "bicycle",
66
+ "bus",
67
+ "car",
68
+ "construction_vehicle",
69
+ "motorcycle",
70
+ "pedestrian",
71
+ "traffic_cone",
72
+ "trailer",
73
+ "truck",
74
+ "driveable_surface",
75
+ "other_flat",
76
+ "sidewalk",
77
+ "terrain",
78
+ "manmade",
79
+ "vegetation",
80
+ ],
81
+ train=dict(
82
+ type="ConcatDataset",
83
+ datasets=[
84
+ # nuScenes
85
+ dict(
86
+ type="NuScenesDataset",
87
+ split=["train", "val"],
88
+ data_root="data/nuscenes",
89
+ transform=[
90
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
91
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis='z', p=0.75),
92
+ dict(
93
+ type="RandomRotate",
94
+ angle=[-1, 1],
95
+ axis="z",
96
+ center=[0, 0, 0],
97
+ p=0.5,
98
+ ),
99
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis='x', p=0.5),
100
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis='y', p=0.5),
101
+ dict(
102
+ type="PointClip",
103
+ point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2),
104
+ ),
105
+ dict(type="RandomScale", scale=[0.9, 1.1]),
106
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
107
+ dict(type="RandomFlip", p=0.5),
108
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
109
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
110
+ dict(
111
+ type="GridSample",
112
+ grid_size=0.05,
113
+ hash_type="fnv",
114
+ mode="train",
115
+ keys=("coord", "strength", "segment"),
116
+ return_grid_coord=True,
117
+ ),
118
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
119
+ # dict(type="CenterShift", apply_z=False),
120
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
121
+ dict(type="ToTensor"),
122
+ dict(
123
+ type="Collect",
124
+ keys=("coord", "grid_coord", "segment", "condition"),
125
+ feat_keys=("coord", "strength"),
126
+ ),
127
+ ],
128
+ test_mode=False,
129
+ ignore_index=-1,
130
+ loop=1,
131
+ ),
132
+ # SemanticKITTI
133
+ dict(
134
+ type="SemanticKITTIDataset",
135
+ split=["train", "val"],
136
+ data_root="data/semantic_kitti",
137
+ transform=[
138
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
139
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
140
+ dict(
141
+ type="RandomRotate",
142
+ angle=[-1, 1],
143
+ axis="z",
144
+ center=[0, 0, 0],
145
+ p=0.5,
146
+ ),
147
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5),
148
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5),
149
+ dict(
150
+ type="PointClip",
151
+ point_cloud_range=(-75.2, -75.2, -4, 75.2, 75.2, 2),
152
+ ),
153
+ dict(type="RandomScale", scale=[0.9, 1.1]),
154
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
155
+ dict(type="RandomFlip", p=0.5),
156
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
157
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
158
+ dict(
159
+ type="GridSample",
160
+ grid_size=0.05,
161
+ hash_type="fnv",
162
+ mode="train",
163
+ keys=("coord", "strength", "segment"),
164
+ return_grid_coord=True,
165
+ ),
166
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
167
+ # dict(type="CenterShift", apply_z=False),
168
+ dict(type="Add", keys_dict={"condition": "SemanticKITTI"}),
169
+ dict(type="ToTensor"),
170
+ dict(
171
+ type="Collect",
172
+ keys=("coord", "grid_coord", "segment", "condition"),
173
+ feat_keys=("coord", "strength"),
174
+ ),
175
+ ],
176
+ test_mode=False,
177
+ ignore_index=-1,
178
+ loop=1,
179
+ ),
180
+ # Waymo
181
+ dict(
182
+ type="WaymoDataset",
183
+ split=["training", "validation"],
184
+ data_root="data/waymo",
185
+ transform=[
186
+ # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
187
+ # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
188
+ dict(
189
+ type="RandomRotate",
190
+ angle=[-1, 1],
191
+ axis="z",
192
+ center=[0, 0, 0],
193
+ p=0.5,
194
+ ),
195
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5),
196
+ # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5),
197
+ dict(
198
+ type="PointClip",
199
+ point_cloud_range=(-35.2, -35.2, -4, 35.2, 35.2, 2),
200
+ ),
201
+ dict(type="RandomScale", scale=[0.9, 1.1]),
202
+ # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
203
+ dict(type="RandomFlip", p=0.5),
204
+ dict(type="RandomJitter", sigma=0.005, clip=0.02),
205
+ # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
206
+ dict(
207
+ type="GridSample",
208
+ grid_size=0.05,
209
+ hash_type="fnv",
210
+ mode="train",
211
+ keys=("coord", "strength", "segment"),
212
+ return_grid_coord=True,
213
+ ),
214
+ # dict(type="SphereCrop", point_max=1000000, mode="random"),
215
+ # dict(type="CenterShift", apply_z=False),
216
+ dict(type="Add", keys_dict={"condition": "Waymo"}),
217
+ dict(type="ToTensor"),
218
+ dict(
219
+ type="Collect",
220
+ keys=("coord", "grid_coord", "segment", "condition"),
221
+ feat_keys=("coord", "strength"),
222
+ ),
223
+ ],
224
+ test_mode=False,
225
+ ignore_index=-1,
226
+ loop=1,
227
+ ),
228
+ ],
229
+ ),
230
+ test=dict(
231
+ type="NuScenesDataset",
232
+ split="test",
233
+ data_root="data/nuscenes",
234
+ transform=[
235
+ dict(type="Copy", keys_dict={"segment": "origin_segment"}),
236
+ dict(
237
+ type="GridSample",
238
+ grid_size=0.025,
239
+ hash_type="fnv",
240
+ mode="train",
241
+ keys=("coord", "strength", "segment"),
242
+ return_inverse=True,
243
+ ),
244
+ ],
245
+ test_mode=True,
246
+ test_cfg=dict(
247
+ voxelize=dict(
248
+ type="GridSample",
249
+ grid_size=0.05,
250
+ hash_type="fnv",
251
+ mode="test",
252
+ return_grid_coord=True,
253
+ keys=("coord", "strength"),
254
+ ),
255
+ crop=None,
256
+ post_transform=[
257
+ dict(type="Add", keys_dict={"condition": "nuScenes"}),
258
+ dict(type="ToTensor"),
259
+ dict(
260
+ type="Collect",
261
+ keys=("coord", "grid_coord", "index", "condition"),
262
+ feat_keys=("coord", "strength"),
263
+ ),
264
+ ],
265
+ aug_transform=[
266
+ [dict(type="RandomScale", scale=[0.9, 0.9])],
267
+ [dict(type="RandomScale", scale=[0.95, 0.95])],
268
+ [dict(type="RandomScale", scale=[1, 1])],
269
+ [dict(type="RandomScale", scale=[1.05, 1.05])],
270
+ [dict(type="RandomScale", scale=[1.1, 1.1])],
271
+ [
272
+ dict(type="RandomScale", scale=[0.9, 0.9]),
273
+ dict(type="RandomFlip", p=1),
274
+ ],
275
+ [
276
+ dict(type="RandomScale", scale=[0.95, 0.95]),
277
+ dict(type="RandomFlip", p=1),
278
+ ],
279
+ [dict(type="RandomScale", scale=[1, 1]), dict(type="RandomFlip", p=1)],
280
+ [
281
+ dict(type="RandomScale", scale=[1.05, 1.05]),
282
+ dict(type="RandomFlip", p=1),
283
+ ],
284
+ [
285
+ dict(type="RandomScale", scale=[1.1, 1.1]),
286
+ dict(type="RandomFlip", p=1),
287
+ ],
288
+ ],
289
+ ),
290
+ ignore_index=-1,
291
+ ),
292
+ )
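All three configs pair AdamW (lr=0.002, weight_decay=0.005) with OneCycleLR and reuse optimizer["lr"] as max_lr, so the warmup starts at max_lr / div_factor = 2e-4 and decays toward 2e-4 / final_div_factor = 2e-6. A plain-PyTorch sketch of the same schedule (Pointcept builds it from these dicts internally; `model` and `steps` below are placeholders):

import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR

model = torch.nn.Linear(4, 16)   # stand-in for the real backbone
steps = 50 * 1000                # epoch * iterations per epoch (placeholder)

optimizer = AdamW(model.parameters(), lr=0.002, weight_decay=0.005)
scheduler = OneCycleLR(
    optimizer,
    max_lr=0.002,            # mirrors max_lr=optimizer["lr"]
    total_steps=steps,
    pct_start=0.04,
    anneal_strategy="cos",
    div_factor=10.0,         # initial lr = 0.002 / 10 = 2e-4
    final_div_factor=100.0,  # final lr   = 2e-4 / 100 = 2e-6
)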