project-monai committed (verified)
Commit b6d2bca · Parent: 8354516

Upload renalStructures_UNEST_segmentation version 0.2.6
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ docs/demos.png filter=lfs diff=lfs merge=lfs -text
37
+ docs/renal.png filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
configs/inference.json ADDED
@@ -0,0 +1,137 @@
1
+ {
2
+ "imports": [
3
+ "$import glob",
4
+ "$import os"
5
+ ],
6
+ "bundle_root": "/models/renalStructures_UNEST_segmentation",
7
+ "output_dir": "$@bundle_root + '/eval'",
8
+ "dataset_dir": "$@bundle_root + './dataset/spleen'",
9
+ "datalist": "$list(sorted(glob.glob(@dataset_dir + '/*.nii.gz')))",
10
+ "device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
11
+ "network_def": {
12
+ "_target_": "scripts.networks.unest.UNesT",
13
+ "in_channels": 1,
14
+ "out_channels": 4
15
+ },
16
+ "network": "$@network_def.to(@device)",
17
+ "preprocessing": {
18
+ "_target_": "Compose",
19
+ "transforms": [
20
+ {
21
+ "_target_": "LoadImaged",
22
+ "keys": "image"
23
+ },
24
+ {
25
+ "_target_": "EnsureChannelFirstd",
26
+ "keys": "image",
27
+ "channel_dim": "no_channel"
28
+ },
29
+ {
30
+ "_target_": "Orientationd",
31
+ "keys": "image",
32
+ "axcodes": "RAS"
33
+ },
34
+ {
35
+ "_target_": "Spacingd",
36
+ "keys": "image",
37
+ "pixdim": [
38
+ 1.0,
39
+ 1.0,
40
+ 1.0
41
+ ],
42
+ "mode": "bilinear"
43
+ },
44
+ {
45
+ "_target_": "ScaleIntensityRanged",
46
+ "keys": "image",
47
+ "a_min": -175,
48
+ "a_max": 250,
49
+ "b_min": 0.0,
50
+ "b_max": 1.0,
51
+ "clip": true
52
+ },
53
+ {
54
+ "_target_": "EnsureTyped",
55
+ "keys": "image"
56
+ }
57
+ ]
58
+ },
59
+ "dataset": {
60
+ "_target_": "Dataset",
61
+ "data": "$[{'image': i} for i in @datalist]",
62
+ "transform": "@preprocessing"
63
+ },
64
+ "dataloader": {
65
+ "_target_": "DataLoader",
66
+ "dataset": "@dataset",
67
+ "batch_size": 1,
68
+ "shuffle": false,
69
+ "num_workers": 4
70
+ },
71
+ "inferer": {
72
+ "_target_": "SlidingWindowInferer",
73
+ "roi_size": [
74
+ 96,
75
+ 96,
76
+ 96
77
+ ],
78
+ "sw_batch_size": 4,
79
+ "overlap": 0.5
80
+ },
81
+ "postprocessing": {
82
+ "_target_": "Compose",
83
+ "transforms": [
84
+ {
85
+ "_target_": "Activationsd",
86
+ "keys": "pred",
87
+ "softmax": true
88
+ },
89
+ {
90
+ "_target_": "Invertd",
91
+ "keys": "pred",
92
+ "transform": "@preprocessing",
93
+ "orig_keys": "image",
94
+ "nearest_interp": false,
95
+ "to_tensor": true
96
+ },
97
+ {
98
+ "_target_": "AsDiscreted",
99
+ "keys": "pred",
100
+ "argmax": true
101
+ },
102
+ {
103
+ "_target_": "SaveImaged",
104
+ "keys": "pred",
105
+ "output_dir": "@output_dir"
106
+ }
107
+ ]
108
+ },
109
+ "handlers": [
110
+ {
111
+ "_target_": "CheckpointLoader",
112
+ "load_path": "$@bundle_root + '/models/model.pt'",
113
+ "load_dict": {
114
+ "model": "@network"
115
+ },
116
+ "strict": "True"
117
+ },
118
+ {
119
+ "_target_": "StatsHandler",
120
+ "iteration_log": false
121
+ }
122
+ ],
123
+ "evaluator": {
124
+ "_target_": "SupervisedEvaluator",
125
+ "device": "@device",
126
+ "val_data_loader": "@dataloader",
127
+ "network": "@network",
128
+ "inferer": "@inferer",
129
+ "postprocessing": "@postprocessing",
130
+ "val_handlers": "@handlers",
131
+ "amp": false
132
+ },
133
+ "evaluating": [
134
+ "$setattr(torch.backends.cudnn, 'benchmark', True)",
135
+ "$@evaluator.run()"
136
+ ]
137
+ }
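The config above is normally executed with the `python -m monai.bundle run evaluating ...` command shown in docs/README.md. As a rough illustration only (not part of the bundle), the same workflow can be driven from Python with `monai.bundle.ConfigParser`; the relative paths and the `bundle_root`/`dataset_dir` overrides below are assumptions about a local checkout with the checkpoint and input volumes already in place.

```python
# Illustrative sketch: build and run the evaluator defined in configs/inference.json.
# Assumes ./models/model.pt and a local folder of *.nii.gz CT volumes exist.
from monai.bundle import ConfigParser

parser = ConfigParser()
parser.read_config("configs/inference.json")
parser.read_meta("configs/metadata.json")
parser["bundle_root"] = "."          # override the hard-coded /models/... location
parser["dataset_dir"] = "./dataset"  # assumed local folder of *.nii.gz CT volumes

evaluator = parser.get_parsed_content("evaluator")  # resolves @network, @dataloader, handlers
evaluator.run()  # equivalent to the "evaluating" expressions above
```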
configs/logging.conf ADDED
@@ -0,0 +1,21 @@
1
+ [loggers]
2
+ keys=root
3
+
4
+ [handlers]
5
+ keys=consoleHandler
6
+
7
+ [formatters]
8
+ keys=fullFormatter
9
+
10
+ [logger_root]
11
+ level=INFO
12
+ handlers=consoleHandler
13
+
14
+ [handler_consoleHandler]
15
+ class=StreamHandler
16
+ level=INFO
17
+ formatter=fullFormatter
18
+ args=(sys.stdout,)
19
+
20
+ [formatter_fullFormatter]
21
+ format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
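This is a standard `logging.config` file that `monai.bundle run` applies when it is passed via `--logging_file`; the plain-Python equivalent is a single `fileConfig` call (sketch below, path assumed relative to the bundle root).

```python
# Minimal equivalent of passing --logging_file configs/logging.conf to monai.bundle.
import logging
import logging.config

logging.config.fileConfig("configs/logging.conf", disable_existing_loggers=False)
logging.getLogger(__name__).info("console logging configured at INFO level")
```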
configs/metadata.json ADDED
@@ -0,0 +1,94 @@
1
+ {
2
+ "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
3
+ "version": "0.2.6",
4
+ "changelog": {
5
+ "0.2.6": "update to huggingface hosting",
6
+ "0.2.5": "update large files",
7
+ "0.2.4": "fix black 24.1 format error",
8
+ "0.2.3": "update AddChanneld with EnsureChannelFirstd and remove meta_dict",
9
+ "0.2.2": "add name tag",
10
+ "0.2.1": "fix license Copyright error",
11
+ "0.2.0": "update license files",
12
+ "0.1.3": "Add training pipeline for fine-tuning models, support MONAI Label active learning",
13
+ "0.1.2": "fixed the dimension in convolution according to MONAI 1.0 update",
14
+ "0.1.1": "fixed the model state dict name",
15
+ "0.1.0": "complete the model package"
16
+ },
17
+ "monai_version": "1.4.0",
18
+ "pytorch_version": "2.4.0",
19
+ "numpy_version": "1.24.4",
20
+ "optional_packages_version": {
21
+ "nibabel": "5.2.1",
22
+ "pytorch-ignite": "0.4.11",
23
+ "einops": "0.7.0",
24
+ "fire": "0.6.0",
25
+ "timm": "0.6.7",
26
+ "torchvision": "0.19.0",
27
+ "tensorboard": "2.17.0"
28
+ },
29
+ "name": "Renal structures UNEST segmentation",
30
+ "task": "Renal segmentation",
31
+ "description": "A transformer-based model for renal segmentation from CT image",
32
+ "authors": "Vanderbilt University + MONAI team",
33
+ "copyright": "Copyright (c) MONAI Consortium",
34
+ "data_source": "RawData.zip",
35
+ "data_type": "nibabel",
36
+ "image_classes": "single channel data, intensity scaled to [0, 1]",
37
+ "label_classes": "1: Kideny Cortex, 2:Medulla, 3:Pelvicalyceal system",
38
+ "pred_classes": "1: Kideny Cortex, 2:Medulla, 3:Pelvicalyceal system",
39
+ "eval_metrics": {
40
+ "mean_dice": 0.85
41
+ },
42
+ "intended_use": "This is an example, not to be used for diagnostic purposes",
43
+ "references": [
44
+ "Tang, Yucheng, et al. 'Self-supervised pre-training of swin transformers for 3d medical image analysis. arXiv preprint arXiv:2111.14791 (2021). https://arxiv.org/abs/2111.14791."
45
+ ],
46
+ "network_data_format": {
47
+ "inputs": {
48
+ "image": {
49
+ "type": "image",
50
+ "format": "hounsfield",
51
+ "modality": "CT",
52
+ "num_channels": 1,
53
+ "spatial_shape": [
54
+ 96,
55
+ 96,
56
+ 96
57
+ ],
58
+ "dtype": "float32",
59
+ "value_range": [
60
+ 0,
61
+ 1
62
+ ],
63
+ "is_patch_data": true,
64
+ "channel_def": {
65
+ "0": "image"
66
+ }
67
+ }
68
+ },
69
+ "outputs": {
70
+ "pred": {
71
+ "type": "image",
72
+ "format": "segmentation",
73
+ "num_channels": 4,
74
+ "spatial_shape": [
75
+ 96,
76
+ 96,
77
+ 96
78
+ ],
79
+ "dtype": "float32",
80
+ "value_range": [
81
+ 0,
82
+ 1
83
+ ],
84
+ "is_patch_data": true,
85
+ "channel_def": {
86
+ "0": "background",
87
+ "1": "kidney cortex",
88
+ "2": "medulla",
89
+ "3": "pelvicalyceal system"
90
+ }
91
+ }
92
+ }
93
+ }
94
+ }
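Because the metadata is plain JSON, the declared version, dependency pins, and expected tensor layout can be inspected without MONAI installed; a small illustrative sketch (path assumed relative to the bundle root):

```python
# Print a few metadata fields that the rest of the bundle relies on.
import json

with open("configs/metadata.json") as f:
    meta = json.load(f)

print(meta["version"], "-", meta["changelog"][meta["version"]])
print("MONAI / PyTorch:", meta["monai_version"], "/", meta["pytorch_version"])
print("input patch shape:", meta["network_data_format"]["inputs"]["image"]["spatial_shape"])
print("output channels:", meta["network_data_format"]["outputs"]["pred"]["channel_def"])
```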
configs/multi_gpu_train.json ADDED
@@ -0,0 +1,36 @@
1
+ {
2
+ "device": "$torch.device(f'cuda:{dist.get_rank()}')",
3
+ "network": {
4
+ "_target_": "torch.nn.parallel.DistributedDataParallel",
5
+ "module": "$@network_def.to(@device)",
6
+ "device_ids": [
7
+ "@device"
8
+ ]
9
+ },
10
+ "train#sampler": {
11
+ "_target_": "DistributedSampler",
12
+ "dataset": "@train#dataset",
13
+ "even_divisible": true,
14
+ "shuffle": true
15
+ },
16
+ "train#dataloader#sampler": "@train#sampler",
17
+ "train#dataloader#shuffle": false,
18
+ "train#trainer#train_handlers": "$@train#handlers[: -2 if dist.get_rank() > 0 else None]",
19
+ "validate#sampler": {
20
+ "_target_": "DistributedSampler",
21
+ "dataset": "@validate#dataset",
22
+ "even_divisible": false,
23
+ "shuffle": false
24
+ },
25
+ "validate#dataloader#sampler": "@validate#sampler",
26
+ "validate#evaluator#val_handlers": "$None if dist.get_rank() > 0 else @validate#handlers",
27
+ "training": [
28
+ "$import torch.distributed as dist",
29
+ "$dist.init_process_group(backend='nccl')",
30
+ "$torch.cuda.set_device(@device)",
31
+ "$monai.utils.set_determinism(seed=123)",
32
+ "$setattr(torch.backends.cudnn, 'benchmark', True)",
33
+ "$@train#trainer.run()",
34
+ "$dist.destroy_process_group()"
35
+ ]
36
+ }
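This file is not a standalone config: it is an override layered on top of configs/train.json (wrapping the network in DistributedDataParallel and switching the dataloaders to DistributedSampler) when both files are passed to `monai.bundle`. The merge can be reproduced for inspection with `ConfigParser` as sketched below; actually running the merged config additionally requires a `torch.distributed` launcher, which is assumed and not shown here.

```python
# Inspect (without running) how multi_gpu_train.json overrides configs/train.json.
from monai.bundle import ConfigParser

parser = ConfigParser()
parser.read_config(["configs/train.json", "configs/multi_gpu_train.json"])

# The later file wins for shared top-level keys such as "network":
print(parser["network"]["_target_"])  # torch.nn.parallel.DistributedDataParallel
```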
configs/train.json ADDED
@@ -0,0 +1,321 @@
1
+ {
2
+ "imports": [
3
+ "$import glob",
4
+ "$import os",
5
+ "$import ignite"
6
+ ],
7
+ "bundle_root": "/models/renalStructures_UNEST_segmentation",
8
+ "ckpt_dir": "$@bundle_root + '/models'",
9
+ "output_dir": "$@bundle_root + '/eval'",
10
+ "dataset_dir": "$@bundle_root + './dataset'",
11
+ "images": "$list(sorted(glob.glob(@dataset_dir + '/imagesTr/*.nii.gz')))",
12
+ "labels": "$list(sorted(glob.glob(@dataset_dir + '/labelsTr/*.nii.gz')))",
13
+ "val_interval": 5,
14
+ "device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
15
+ "network_def": {
16
+ "_target_": "scripts.networks.unest.UNesT",
17
+ "in_channels": 1,
18
+ "out_channels": 4
19
+ },
20
+ "network": "$@network_def.to(@device)",
21
+ "loss": {
22
+ "_target_": "DiceCELoss",
23
+ "to_onehot_y": true,
24
+ "softmax": true,
25
+ "squared_pred": true,
26
+ "batch": true
27
+ },
28
+ "optimizer": {
29
+ "_target_": "torch.optim.Adam",
30
+ "params": "[email protected]()",
31
+ "lr": 0.0002
32
+ },
33
+ "train": {
34
+ "deterministic_transforms": [
35
+ {
36
+ "_target_": "LoadImaged",
37
+ "keys": [
38
+ "image",
39
+ "label"
40
+ ]
41
+ },
42
+ {
43
+ "_target_": "EnsureChannelFirstd",
44
+ "keys": [
45
+ "image",
46
+ "label"
47
+ ]
48
+ },
49
+ {
50
+ "_target_": "Orientationd",
51
+ "keys": [
52
+ "image",
53
+ "label"
54
+ ],
55
+ "axcodes": "RAS"
56
+ },
57
+ {
58
+ "_target_": "Spacingd",
59
+ "keys": [
60
+ "image",
61
+ "label"
62
+ ],
63
+ "pixdim": [
64
+ 1.0,
65
+ 1.0,
66
+ 1.0
67
+ ],
68
+ "mode": [
69
+ "bilinear",
70
+ "nearest"
71
+ ]
72
+ },
73
+ {
74
+ "_target_": "ScaleIntensityRanged",
75
+ "keys": "image",
76
+ "a_min": -175,
77
+ "a_max": 250,
78
+ "b_min": 0.0,
79
+ "b_max": 1.0,
80
+ "clip": true
81
+ },
82
+ {
83
+ "_target_": "EnsureTyped",
84
+ "keys": [
85
+ "image",
86
+ "label"
87
+ ]
88
+ }
89
+ ],
90
+ "random_transforms": [
91
+ {
92
+ "_target_": "RandCropByPosNegLabeld",
93
+ "keys": [
94
+ "image",
95
+ "label"
96
+ ],
97
+ "label_key": "label",
98
+ "spatial_size": [
99
+ 96,
100
+ 96,
101
+ 96
102
+ ],
103
+ "pos": 1,
104
+ "neg": 1,
105
+ "num_samples": 4,
106
+ "image_key": "image",
107
+ "image_threshold": 0
108
+ },
109
+ {
110
+ "_target_": "RandFlipd",
111
+ "keys": [
112
+ "image",
113
+ "label"
114
+ ],
115
+ "spatial_axis": [
116
+ 0
117
+ ],
118
+ "prob": 0.1
119
+ },
120
+ {
121
+ "_target_": "RandFlipd",
122
+ "keys": [
123
+ "image",
124
+ "label"
125
+ ],
126
+ "spatial_axis": [
127
+ 1
128
+ ],
129
+ "prob": 0.1
130
+ },
131
+ {
132
+ "_target_": "RandFlipd",
133
+ "keys": [
134
+ "image",
135
+ "label"
136
+ ],
137
+ "spatial_axis": [
138
+ 2
139
+ ],
140
+ "prob": 0.1
141
+ },
142
+ {
143
+ "_target_": "RandRotate90d",
144
+ "keys": [
145
+ "image",
146
+ "label"
147
+ ],
148
+ "max_k": 3,
149
+ "prob": 0.1
150
+ },
151
+ {
152
+ "_target_": "RandShiftIntensityd",
153
+ "keys": "image",
154
+ "offsets": 0.1,
155
+ "prob": 0.5
156
+ }
157
+ ],
158
+ "preprocessing": {
159
+ "_target_": "Compose",
160
+ "transforms": "$@train#deterministic_transforms + @train#random_transforms"
161
+ },
162
+ "dataset": {
163
+ "_target_": "CacheDataset",
164
+ "data": "$[{'image': i, 'label': l} for i, l in zip(@images[:-9], @labels[:-9])]",
165
+ "transform": "@train#preprocessing",
166
+ "cache_rate": 1.0,
167
+ "num_workers": 4
168
+ },
169
+ "dataloader": {
170
+ "_target_": "DataLoader",
171
+ "dataset": "@train#dataset",
172
+ "batch_size": 2,
173
+ "shuffle": true,
174
+ "num_workers": 4
175
+ },
176
+ "inferer": {
177
+ "_target_": "SimpleInferer"
178
+ },
179
+ "postprocessing": {
180
+ "_target_": "Compose",
181
+ "transforms": [
182
+ {
183
+ "_target_": "Activationsd",
184
+ "keys": "pred",
185
+ "softmax": true
186
+ },
187
+ {
188
+ "_target_": "AsDiscreted",
189
+ "keys": [
190
+ "pred",
191
+ "label"
192
+ ],
193
+ "argmax": [
194
+ true,
195
+ false
196
+ ],
197
+ "to_onehot": 4
198
+ }
199
+ ]
200
+ },
201
+ "handlers": [
202
+ {
203
+ "_target_": "ValidationHandler",
204
+ "validator": "@validate#evaluator",
205
+ "epoch_level": true,
206
+ "interval": "@val_interval"
207
+ },
208
+ {
209
+ "_target_": "StatsHandler",
210
+ "tag_name": "train_loss",
211
+ "output_transform": "$monai.handlers.from_engine(['loss'], first=True)"
212
+ },
213
+ {
214
+ "_target_": "TensorBoardStatsHandler",
215
+ "log_dir": "@output_dir",
216
+ "tag_name": "train_loss",
217
+ "output_transform": "$monai.handlers.from_engine(['loss'], first=True)"
218
+ }
219
+ ],
220
+ "key_metric": {
221
+ "train_accuracy": {
222
+ "_target_": "ignite.metrics.Accuracy",
223
+ "output_transform": "$monai.handlers.from_engine(['pred', 'label'])"
224
+ }
225
+ },
226
+ "trainer": {
227
+ "_target_": "SupervisedTrainer",
228
+ "max_epochs": 1000,
229
+ "device": "@device",
230
+ "train_data_loader": "@train#dataloader",
231
+ "network": "@network",
232
+ "loss_function": "@loss",
233
+ "optimizer": "@optimizer",
234
+ "inferer": "@train#inferer",
235
+ "postprocessing": "@train#postprocessing",
236
+ "key_train_metric": "@train#key_metric",
237
+ "train_handlers": "@train#handlers",
238
+ "amp": true
239
+ }
240
+ },
241
+ "validate": {
242
+ "preprocessing": {
243
+ "_target_": "Compose",
244
+ "transforms": "%train#deterministic_transforms"
245
+ },
246
+ "dataset": {
247
+ "_target_": "CacheDataset",
248
+ "data": "$[{'image': i, 'label': l} for i, l in zip(@images[-9:], @labels[-9:])]",
249
+ "transform": "@validate#preprocessing",
250
+ "cache_rate": 1.0
251
+ },
252
+ "dataloader": {
253
+ "_target_": "DataLoader",
254
+ "dataset": "@validate#dataset",
255
+ "batch_size": 1,
256
+ "shuffle": false,
257
+ "num_workers": 4
258
+ },
259
+ "inferer": {
260
+ "_target_": "SlidingWindowInferer",
261
+ "roi_size": [
262
+ 96,
263
+ 96,
264
+ 96
265
+ ],
266
+ "sw_batch_size": 4,
267
+ "overlap": 0.5
268
+ },
269
+ "postprocessing": "%train#postprocessing",
270
+ "handlers": [
271
+ {
272
+ "_target_": "StatsHandler",
273
+ "iteration_log": false
274
+ },
275
+ {
276
+ "_target_": "TensorBoardStatsHandler",
277
+ "log_dir": "@output_dir",
278
+ "iteration_log": false
279
+ },
280
+ {
281
+ "_target_": "CheckpointSaver",
282
+ "save_dir": "@ckpt_dir",
283
+ "save_dict": {
284
+ "model": "@network"
285
+ },
286
+ "save_key_metric": true,
287
+ "key_metric_filename": "model.pt"
288
+ }
289
+ ],
290
+ "key_metric": {
291
+ "val_mean_dice": {
292
+ "_target_": "MeanDice",
293
+ "include_background": false,
294
+ "output_transform": "$monai.handlers.from_engine(['pred', 'label'])"
295
+ }
296
+ },
297
+ "additional_metrics": {
298
+ "val_accuracy": {
299
+ "_target_": "ignite.metrics.Accuracy",
300
+ "output_transform": "$monai.handlers.from_engine(['pred', 'label'])"
301
+ }
302
+ },
303
+ "evaluator": {
304
+ "_target_": "SupervisedEvaluator",
305
+ "device": "@device",
306
+ "val_data_loader": "@validate#dataloader",
307
+ "network": "@network",
308
+ "inferer": "@validate#inferer",
309
+ "postprocessing": "@validate#postprocessing",
310
+ "key_val_metric": "@validate#key_metric",
311
+ "additional_metrics": "@validate#additional_metrics",
312
+ "val_handlers": "@validate#handlers",
313
+ "amp": true
314
+ }
315
+ },
316
+ "training": [
317
+ "$monai.utils.set_determinism(seed=123)",
318
+ "$setattr(torch.backends.cudnn, 'benchmark', True)",
319
+ "$@train#trainer.run()"
320
+ ]
321
+ }
docs/README.md ADDED
@@ -0,0 +1,103 @@
1
+ # Description
2
+ A pre-trained model for training and inference of volumetric (3D) kidney substructure segmentation from contrast-enhanced CT images (arterial/portal venous phase). A training pipeline is provided to support model fine-tuning with the bundle and MONAI Label active learning.
3
+
4
+ A tutorial and model release for kidney cortex, medulla, and collecting system segmentation.
5
+
6
+ Authors: Yinchi Zhou ([email protected]) | Xin Yu ([email protected]) | Yucheng Tang ([email protected]) |
7
+
8
+
9
+ # Model Overview
10
+ A pre-trained UNEST base model [1] for volumetric (3D) renal structure segmentation using dynamic contrast-enhanced arterial- or venous-phase CT images.
11
+
12
+ ## Data
13
+ The training data comes from the [ImageVU RenalSeg dataset] collected at Vanderbilt University and Vanderbilt University Medical Center.
14
+ (The training data is not publicly available yet.)
15
+
16
+ - Target: Renal Cortex | Medulla | Pelvis Collecting System
17
+ - Task: Segmentation
18
+ - Modality: CT (Arterial | Venous phase)
19
+ - Size: 96 3D volumes
20
+
21
+
22
+ The data and a segmentation demonstration are shown below:
23
+
24
+ ![](./renal.png) <br>
25
+
26
+ ## Method and Network
27
+
28
+ The UNEST model is a 3D hierarchical transformer-based segmentation network.
29
+
30
+ Details of the architecture:
31
+ ![](./unest.png) <br>
32
+
33
+ ## Training configuration
34
+ The training was performed with at least one 16GB-memory GPU.
35
+
36
+ Actual Model Input: 96 x 96 x 96
37
+
38
+ ## Input and output formats
39
+ Input: 1 channel CT image
40
+
41
+ Output: 4 channels: 0: Background, 1: Renal Cortex, 2: Medulla, 3: Pelvicalyceal System
42
+
43
+ ## Performance
44
+ A graph showing the validation mean Dice for 5000 epochs.
45
+
46
+ ![](./val_dice.png) <br>
47
+
48
+ This model achieves the following Dice score on the validation data (our own split from the training dataset):
49
+
50
+ Mean Validation Dice = 0.8523
51
+
52
+ Note that mean dice is computed in the original spacing of the input data.
53
+
54
+ ## Commands example
55
+ Download the trained checkpoint to ./models/model.pt:
56
+
57
+
58
+ Add the scripts component: to run the workflow with customized components, PYTHONPATH should be updated to include the path to the customized components:
59
+
60
+ ```
61
+ export PYTHONPATH=$PYTHONPATH:"<path to the bundle root dir>/scripts"
62
+
63
+ ```
64
+ Execute Training:
65
+
66
+ ```
67
+ python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
68
+ ```
69
+
70
+ Execute inference:
71
+
72
+ ```
73
+ python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file configs/inference.json --logging_file configs/logging.conf
74
+ ```
75
+
76
+
77
+ ## More example outputs
78
+
79
+ ![](./demos.png) <br>
80
+
81
+
82
+ # Disclaimer
83
+ This is an example, not to be used for diagnostic purposes.
84
+
85
+ # References
86
+ [1] Yu, Xin, Yinchi Zhou, Yucheng Tang et al. "Characterizing Renal Structures with 3D Block Aggregate Transformers." arXiv preprint arXiv:2203.02430 (2022). https://arxiv.org/pdf/2203.02430.pdf
87
+
88
+ [2] Zizhao Zhang et al. "Nested Hierarchical Transformer: Towards Accurate, Data-Efficient and Interpretable Visual Understanding." AAAI Conference on Artificial Intelligence (AAAI) 2022
89
+
90
+ # License
91
+ Copyright (c) MONAI Consortium
92
+
93
+ Licensed under the Apache License, Version 2.0 (the "License");
94
+ you may not use this file except in compliance with the License.
95
+ You may obtain a copy of the License at
96
+
97
+ http://www.apache.org/licenses/LICENSE-2.0
98
+
99
+ Unless required by applicable law or agreed to in writing, software
100
+ distributed under the License is distributed on an "AS IS" BASIS,
101
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
102
+ See the License for the specific language governing permissions and
103
+ limitations under the License.
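To complement the CLI commands in the README above, the sketch below shows direct sliding-window inference with the bundled network in Python. The import path `scripts.networks.unest.UNesT` matches the `network_def` in the configs and assumes the bundle root is on `PYTHONPATH` as the README describes; the checkpoint-unwrapping line is a hedge, since the saved file may or may not nest the weights under a `"model"` key depending on how `CheckpointSaver` wrote it.

```python
# Illustration only: sliding-window inference with the bundled UNesT network.
import torch
from monai.inferers import SlidingWindowInferer
from scripts.networks.unest import UNesT  # requires the bundle root on PYTHONPATH

net = UNesT(in_channels=1, out_channels=4)
state = torch.load("models/model.pt", map_location="cpu")
net.load_state_dict(state.get("model", state))  # handle wrapped or plain state dicts
net.eval()

# 96^3 ROI, matching configs/inference.json; the input is a dummy preprocessed volume.
inferer = SlidingWindowInferer(roi_size=(96, 96, 96), sw_batch_size=4, overlap=0.5)
with torch.no_grad():
    logits = inferer(torch.rand(1, 1, 128, 128, 160), net)
labels = logits.argmax(dim=1, keepdim=True)  # 0 background, 1 cortex, 2 medulla, 3 pelvicalyceal
print(labels.shape)  # torch.Size([1, 1, 128, 128, 160])
```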
docs/demos.png ADDED

Git LFS Details

  • SHA256: fe4fb5b171619b0c3a1eacf404acab7bfae1b42ca7cc1991e442e6d622d1af00
  • Pointer size: 131 Bytes
  • Size of remote file: 377 kB
docs/renal.png ADDED

Git LFS Details

  • SHA256: fa598f7b3176d1570c323866d710522cfe8ca41d295e20cd2908e481e06d631d
  • Pointer size: 131 Bytes
  • Size of remote file: 162 kB
docs/unest.png ADDED
docs/val_dice.png ADDED
models/model.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8928e88771d31945c51d1b302a8448825e6f9861a543a6e1023acb9576840962
3
+ size 348887167
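The three lines above are a Git LFS pointer rather than the checkpoint itself; after `git lfs pull` (or a download via `huggingface_hub`), the roughly 349 MB file should hash to the recorded oid. A small verification sketch:

```python
# Verify the downloaded checkpoint against the sha256 recorded in the LFS pointer.
import hashlib

expected = "8928e88771d31945c51d1b302a8448825e6f9861a543a6e1023acb9576840962"
h = hashlib.sha256()
with open("models/model.pt", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
print(h.hexdigest() == expected)
```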
scripts/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
scripts/networks/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
scripts/networks/nest/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env python3
2
+ from .utils import (
3
+ Conv3dSame,
4
+ DropPath,
5
+ Linear,
6
+ Mlp,
7
+ _assert,
8
+ conv3d_same,
9
+ create_conv3d,
10
+ create_pool3d,
11
+ get_padding,
12
+ get_same_padding,
13
+ pad_same,
14
+ to_ntuple,
15
+ trunc_normal_,
16
+ )
scripts/networks/nest/utils.py ADDED
@@ -0,0 +1,481 @@
1
+ #!/usr/bin/env python3
2
+
3
+
4
+ import collections.abc
5
+ import math
6
+ import warnings
7
+ from itertools import repeat
8
+ from typing import List, Optional, Tuple
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+
14
+ try:
15
+ from torch import _assert
16
+ except ImportError:
17
+
18
+ def _assert(condition: bool, message: str):
19
+ assert condition, message
20
+
21
+
22
+ def drop_block_2d(
23
+ x,
24
+ drop_prob: float = 0.1,
25
+ block_size: int = 7,
26
+ gamma_scale: float = 1.0,
27
+ with_noise: bool = False,
28
+ inplace: bool = False,
29
+ batchwise: bool = False,
30
+ ):
31
+ """DropBlock. See https://arxiv.org/pdf/1810.12890.pdf
32
+
33
+ DropBlock with an experimental gaussian noise option. This layer has been tested on a few training
34
+ runs with success, but needs further validation and possibly optimization for lower runtime impact.
35
+ """
36
+ b, c, h, w = x.shape
37
+ total_size = w * h
38
+ clipped_block_size = min(block_size, min(w, h))
39
+ # seed_drop_rate, the gamma parameter
40
+ gamma = gamma_scale * drop_prob * total_size / clipped_block_size**2 / ((w - block_size + 1) * (h - block_size + 1))
41
+
42
+ # Forces the block to be inside the feature map.
43
+ w_i, h_i = torch.meshgrid(torch.arange(w).to(x.device), torch.arange(h).to(x.device))
44
+ valid_block = ((w_i >= clipped_block_size // 2) & (w_i < w - (clipped_block_size - 1) // 2)) & (
45
+ (h_i >= clipped_block_size // 2) & (h_i < h - (clipped_block_size - 1) // 2)
46
+ )
47
+ valid_block = torch.reshape(valid_block, (1, 1, h, w)).to(dtype=x.dtype)
48
+
49
+ if batchwise:
50
+ # one mask for whole batch, quite a bit faster
51
+ uniform_noise = torch.rand((1, c, h, w), dtype=x.dtype, device=x.device)
52
+ else:
53
+ uniform_noise = torch.rand_like(x)
54
+ block_mask = ((2 - gamma - valid_block + uniform_noise) >= 1).to(dtype=x.dtype)
55
+ block_mask = -F.max_pool2d(
56
+ -block_mask, kernel_size=clipped_block_size, stride=1, padding=clipped_block_size // 2 # block_size,
57
+ )
58
+
59
+ if with_noise:
60
+ normal_noise = torch.randn((1, c, h, w), dtype=x.dtype, device=x.device) if batchwise else torch.randn_like(x)
61
+ if inplace:
62
+ x.mul_(block_mask).add_(normal_noise * (1 - block_mask))
63
+ else:
64
+ x = x * block_mask + normal_noise * (1 - block_mask)
65
+ else:
66
+ normalize_scale = (block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-7)).to(x.dtype)
67
+ if inplace:
68
+ x.mul_(block_mask * normalize_scale)
69
+ else:
70
+ x = x * block_mask * normalize_scale
71
+ return x
72
+
73
+
74
+ def drop_block_fast_2d(
75
+ x: torch.Tensor,
76
+ drop_prob: float = 0.1,
77
+ block_size: int = 7,
78
+ gamma_scale: float = 1.0,
79
+ with_noise: bool = False,
80
+ inplace: bool = False,
81
+ ):
82
+ """DropBlock. See https://arxiv.org/pdf/1810.12890.pdf
83
+
84
+ DropBlock with an experimental gaussian noise option. Simplified from above without concern for valid
85
+ block mask at edges.
86
+ """
87
+ b, c, h, w = x.shape
88
+ total_size = w * h
89
+ clipped_block_size = min(block_size, min(w, h))
90
+ gamma = gamma_scale * drop_prob * total_size / clipped_block_size**2 / ((w - block_size + 1) * (h - block_size + 1))
91
+
92
+ block_mask = torch.empty_like(x).bernoulli_(gamma)
93
+ block_mask = F.max_pool2d(
94
+ block_mask.to(x.dtype), kernel_size=clipped_block_size, stride=1, padding=clipped_block_size // 2
95
+ )
96
+
97
+ if with_noise:
98
+ normal_noise = torch.empty_like(x).normal_()
99
+ if inplace:
100
+ x.mul_(1.0 - block_mask).add_(normal_noise * block_mask)
101
+ else:
102
+ x = x * (1.0 - block_mask) + normal_noise * block_mask
103
+ else:
104
+ block_mask = 1 - block_mask
105
+ normalize_scale = (block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-6)).to(dtype=x.dtype)
106
+ if inplace:
107
+ x.mul_(block_mask * normalize_scale)
108
+ else:
109
+ x = x * block_mask * normalize_scale
110
+ return x
111
+
112
+
113
+ class DropBlock2d(nn.Module):
114
+ """DropBlock. See https://arxiv.org/pdf/1810.12890.pdf"""
115
+
116
+ def __init__(
117
+ self, drop_prob=0.1, block_size=7, gamma_scale=1.0, with_noise=False, inplace=False, batchwise=False, fast=True
118
+ ):
119
+ super(DropBlock2d, self).__init__()
120
+ self.drop_prob = drop_prob
121
+ self.gamma_scale = gamma_scale
122
+ self.block_size = block_size
123
+ self.with_noise = with_noise
124
+ self.inplace = inplace
125
+ self.batchwise = batchwise
126
+ self.fast = fast # FIXME finish comparisons of fast vs not
127
+
128
+ def forward(self, x):
129
+ if not self.training or not self.drop_prob:
130
+ return x
131
+ if self.fast:
132
+ return drop_block_fast_2d(
133
+ x, self.drop_prob, self.block_size, self.gamma_scale, self.with_noise, self.inplace
134
+ )
135
+ else:
136
+ return drop_block_2d(
137
+ x, self.drop_prob, self.block_size, self.gamma_scale, self.with_noise, self.inplace, self.batchwise
138
+ )
139
+
140
+
141
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True):
142
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
143
+
144
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
145
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
146
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
147
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
148
+ 'survival rate' as the argument.
149
+
150
+ """
151
+ if drop_prob == 0.0 or not training:
152
+ return x
153
+ keep_prob = 1 - drop_prob
154
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
155
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
156
+ if keep_prob > 0.0 and scale_by_keep:
157
+ random_tensor.div_(keep_prob)
158
+ return x * random_tensor
159
+
160
+
161
+ class DropPath(nn.Module):
162
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
163
+
164
+ def __init__(self, drop_prob=None, scale_by_keep=True):
165
+ super(DropPath, self).__init__()
166
+ self.drop_prob = drop_prob
167
+ self.scale_by_keep = scale_by_keep
168
+
169
+ def forward(self, x):
170
+ return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
171
+
172
+
173
+ def create_conv3d(in_channels, out_channels, kernel_size, **kwargs):
174
+ """Select a 2d convolution implementation based on arguments
175
+ Creates and returns one of torch.nn.Conv2d, Conv2dSame, MixedConv3d, or CondConv2d.
176
+
177
+ Used extensively by EfficientNet, MobileNetv3 and related networks.
178
+ """
179
+
180
+ depthwise = kwargs.pop("depthwise", False)
181
+ # for DW out_channels must be multiple of in_channels as must have out_channels % groups == 0
182
+ groups = in_channels if depthwise else kwargs.pop("groups", 1)
183
+
184
+ m = create_conv3d_pad(in_channels, out_channels, kernel_size, groups=groups, **kwargs)
185
+ return m
186
+
187
+
188
+ def conv3d_same(
189
+ x,
190
+ weight: torch.Tensor,
191
+ bias: Optional[torch.Tensor] = None,
192
+ stride: Tuple[int, int, int] = (1, 1, 1),
193
+ padding: Tuple[int, int, int] = (0, 0, 0),
194
+ dilation: Tuple[int, int, int] = (1, 1, 1),
195
+ groups: int = 1,
196
+ ):
197
+ x = pad_same(x, weight.shape[-3:], stride, dilation)
198
+ return F.conv3d(x, weight, bias, stride, (0, 0, 0), dilation, groups)
199
+
200
+
201
+ class Conv3dSame(nn.Conv2d):
202
+ """Tensorflow like 'SAME' convolution wrapper for 2D convolutions"""
203
+
204
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True):
205
+ super(Conv3dSame, self).__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
206
+
207
+ def forward(self, x):
208
+ return conv3d_same(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
209
+
210
+
211
+ def create_conv3d_pad(in_chs, out_chs, kernel_size, **kwargs):
212
+ padding = kwargs.pop("padding", "")
213
+ kwargs.setdefault("bias", False)
214
+ padding, is_dynamic = get_padding_value(padding, kernel_size, **kwargs)
215
+ if is_dynamic:
216
+ return Conv3dSame(in_chs, out_chs, kernel_size, **kwargs)
217
+ else:
218
+ return nn.Conv3d(in_chs, out_chs, kernel_size, padding=padding, **kwargs)
219
+
220
+
221
+ # Calculate symmetric padding for a convolution
222
+ def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **_) -> int:
223
+ padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
224
+ return padding
225
+
226
+
227
+ # Calculate asymmetric TensorFlow-like 'SAME' padding for a convolution
228
+ def get_same_padding(x: int, k: int, s: int, d: int):
229
+ return max((math.ceil(x / s) - 1) * s + (k - 1) * d + 1 - x, 0)
230
+
231
+
232
+ # Can SAME padding for given args be done statically?
233
+ def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_):
234
+ return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0
235
+
236
+
237
+ # Dynamically pad input x with 'SAME' padding for conv with specified args
238
+ def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1, 1), value: float = 0):
239
+ id, ih, iw = x.size()[-3:]
240
+ pad_d, pad_h, pad_w = (
241
+ get_same_padding(id, k[0], s[0], d[0]),
242
+ get_same_padding(ih, k[1], s[1], d[1]),
243
+ get_same_padding(iw, k[2], s[2], d[2]),
244
+ )
245
+ if pad_d > 0 or pad_h > 0 or pad_w > 0:
246
+ x = F.pad(
247
+ x,
248
+ [pad_d // 2, pad_d - pad_d // 2, pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2],
249
+ value=value,
250
+ )
251
+ return x
252
+
253
+
254
+ def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]:
255
+ dynamic = False
256
+ if isinstance(padding, str):
257
+ # for any string padding, the padding will be calculated for you, one of three ways
258
+ padding = padding.lower()
259
+ if padding == "same":
260
+ # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact
261
+ if is_static_pad(kernel_size, **kwargs):
262
+ # static case, no extra overhead
263
+ padding = get_padding(kernel_size, **kwargs)
264
+ else:
265
+ # dynamic 'SAME' padding, has runtime/GPU memory overhead
266
+ padding = 0
267
+ dynamic = True
268
+ elif padding == "valid":
269
+ # 'VALID' padding, same as padding=0
270
+ padding = 0
271
+ else:
272
+ # Default to PyTorch style 'same'-ish symmetric padding
273
+ padding = get_padding(kernel_size, **kwargs)
274
+ return padding, dynamic
275
+
276
+
277
+ # From PyTorch internals
278
+ def _ntuple(n):
279
+ def parse(x):
280
+ if isinstance(x, collections.abc.Iterable):
281
+ return x
282
+ return tuple(repeat(x, n))
283
+
284
+ return parse
285
+
286
+
287
+ to_1tuple = _ntuple(1)
288
+ to_2tuple = _ntuple(2)
289
+ to_3tuple = _ntuple(3)
290
+ to_4tuple = _ntuple(4)
291
+ to_ntuple = _ntuple
292
+
293
+
294
+ def make_divisible(v, divisor=8, min_value=None, round_limit=0.9):
295
+ min_value = min_value or divisor
296
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
297
+ # Make sure that round down does not go down by more than 10%.
298
+ if new_v < round_limit * v:
299
+ new_v += divisor
300
+ return new_v
301
+
302
+
303
+ class Linear(nn.Linear):
304
+ r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`
305
+
306
+ Wraps torch.nn.Linear to support AMP + torchscript usage by manually casting
307
+ weight & bias to input.dtype to work around an issue w/ torch.addmm in this use case.
308
+ """
309
+
310
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
311
+ if torch.jit.is_scripting():
312
+ bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None
313
+ return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias)
314
+ else:
315
+ return F.linear(input, self.weight, self.bias)
316
+
317
+
318
+ class Mlp(nn.Module):
319
+ """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
320
+
321
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0):
322
+ super().__init__()
323
+ out_features = out_features or in_features
324
+ hidden_features = hidden_features or in_features
325
+ drop_probs = to_2tuple(drop)
326
+
327
+ self.fc1 = nn.Linear(in_features, hidden_features)
328
+ self.act = act_layer()
329
+ self.drop1 = nn.Dropout(drop_probs[0])
330
+ self.fc2 = nn.Linear(hidden_features, out_features)
331
+ self.drop2 = nn.Dropout(drop_probs[1])
332
+
333
+ def forward(self, x):
334
+ x = self.fc1(x)
335
+ x = self.act(x)
336
+ x = self.drop1(x)
337
+ x = self.fc2(x)
338
+ x = self.drop2(x)
339
+ return x
340
+
341
+
342
+ def avg_pool3d_same(
343
+ x,
344
+ kernel_size: List[int],
345
+ stride: List[int],
346
+ padding: List[int] = (0, 0, 0),
347
+ ceil_mode: bool = False,
348
+ count_include_pad: bool = True,
349
+ ):
350
+ # FIXME how to deal with count_include_pad vs not for external padding?
351
+ x = pad_same(x, kernel_size, stride)
352
+ return F.avg_pool3d(x, kernel_size, stride, (0, 0, 0), ceil_mode, count_include_pad)
353
+
354
+
355
+ class AvgPool3dSame(nn.AvgPool2d):
356
+ """Tensorflow like 'SAME' wrapper for 2D average pooling"""
357
+
358
+ def __init__(self, kernel_size: int, stride=None, padding=0, ceil_mode=False, count_include_pad=True):
359
+ kernel_size = to_2tuple(kernel_size)
360
+ stride = to_2tuple(stride)
361
+ super(AvgPool3dSame, self).__init__(kernel_size, stride, (0, 0, 0), ceil_mode, count_include_pad)
362
+
363
+ def forward(self, x):
364
+ x = pad_same(x, self.kernel_size, self.stride)
365
+ return F.avg_pool3d(x, self.kernel_size, self.stride, self.padding, self.ceil_mode, self.count_include_pad)
366
+
367
+
368
+ def max_pool3d_same(
369
+ x,
370
+ kernel_size: List[int],
371
+ stride: List[int],
372
+ padding: List[int] = (0, 0, 0),
373
+ dilation: List[int] = (1, 1, 1),
374
+ ceil_mode: bool = False,
375
+ ):
376
+ x = pad_same(x, kernel_size, stride, value=-float("inf"))
377
+ return F.max_pool3d(x, kernel_size, stride, (0, 0, 0), dilation, ceil_mode)
378
+
379
+
380
+ class MaxPool3dSame(nn.MaxPool2d):
381
+ """Tensorflow like 'SAME' wrapper for 3D max pooling"""
382
+
383
+ def __init__(self, kernel_size: int, stride=None, padding=0, dilation=1, ceil_mode=False):
384
+ kernel_size = to_2tuple(kernel_size)
385
+ stride = to_2tuple(stride)
386
+ dilation = to_2tuple(dilation)
387
+ super(MaxPool3dSame, self).__init__(kernel_size, stride, (0, 0, 0), dilation, ceil_mode)
388
+
389
+ def forward(self, x):
390
+ x = pad_same(x, self.kernel_size, self.stride, value=-float("inf"))
391
+ return F.max_pool3d(x, self.kernel_size, self.stride, (0, 0, 0), self.dilation, self.ceil_mode)
392
+
393
+
394
+ def create_pool3d(pool_type, kernel_size, stride=None, **kwargs):
395
+ stride = stride or kernel_size
396
+ padding = kwargs.pop("padding", "")
397
+ padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, **kwargs)
398
+ if is_dynamic:
399
+ if pool_type == "avg":
400
+ return AvgPool3dSame(kernel_size, stride=stride, **kwargs)
401
+ elif pool_type == "max":
402
+ return MaxPool3dSame(kernel_size, stride=stride, **kwargs)
403
+ else:
404
+ raise AssertionError()
405
+
406
+ # assert False, f"Unsupported pool type {pool_type}"
407
+ else:
408
+ if pool_type == "avg":
409
+ return nn.AvgPool3d(kernel_size, stride=stride, padding=padding, **kwargs)
410
+ elif pool_type == "max":
411
+ return nn.MaxPool3d(kernel_size, stride=stride, padding=padding, **kwargs)
412
+ else:
413
+ raise AssertionError()
414
+
415
+ # assert False, f"Unsupported pool type {pool_type}"
416
+
417
+
418
+ def _float_to_int(x: float) -> int:
419
+ """
420
+ Symbolic tracing helper to substitute for inbuilt `int`.
421
+ Hint: Inbuilt `int` can't accept an argument of type `Proxy`
422
+ """
423
+ return int(x)
424
+
425
+
426
+ def _no_grad_trunc_normal_(tensor, mean, std, a, b):
427
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
428
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
429
+ def norm_cdf(x):
430
+ # Computes standard normal cumulative distribution function
431
+ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
432
+
433
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
434
+ warnings.warn(
435
+ "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
436
+ "The distribution of values may be incorrect.",
437
+ stacklevel=2,
438
+ )
439
+
440
+ with torch.no_grad():
441
+ # Values are generated by using a truncated uniform distribution and
442
+ # then using the inverse CDF for the normal distribution.
443
+ # Get upper and lower cdf values
444
+ l = norm_cdf((a - mean) / std)
445
+ u = norm_cdf((b - mean) / std)
446
+
447
+ # Uniformly fill tensor with values from [l, u], then translate to
448
+ # [2l-1, 2u-1].
449
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
450
+
451
+ # Use inverse cdf transform for normal distribution to get truncated
452
+ # standard normal
453
+ tensor.erfinv_()
454
+
455
+ # Transform to proper mean, std
456
+ tensor.mul_(std * math.sqrt(2.0))
457
+ tensor.add_(mean)
458
+
459
+ # Clamp to ensure it's in the proper range
460
+ tensor.clamp_(min=a, max=b)
461
+ return tensor
462
+
463
+
464
+ def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
465
+ r"""Fills the input Tensor with values drawn from a truncated
466
+ normal distribution. The values are effectively drawn from the
467
+ normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
468
+ with values outside :math:`[a, b]` redrawn until they are within
469
+ the bounds. The method used for generating the random values works
470
+ best when :math:`a \leq \text{mean} \leq b`.
471
+ Args:
472
+ tensor: an n-dimensional `torch.Tensor`
473
+ mean: the mean of the normal distribution
474
+ std: the standard deviation of the normal distribution
475
+ a: the minimum cutoff value
476
+ b: the maximum cutoff value
477
+ Examples:
478
+ >>> w = torch.empty(3, 5)
479
+ >>> nn.init.trunc_normal_(w)
480
+ """
481
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
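Most of utils.py consists of 3D ports of timm helpers. A tiny shape check (illustration only; assumes the bundle root is on PYTHONPATH) of the TensorFlow-style 'SAME' padding helpers used by the NesT blocks:

```python
# A stride-1 'SAME' conv keeps the spatial size; get_same_padding reports how much
# padding one dimension needs for a given kernel/stride/dilation.
import torch
from scripts.networks.nest.utils import create_conv3d, get_same_padding

conv = create_conv3d(8, 16, kernel_size=3, stride=1, padding="same")  # static-padding path
out = conv(torch.rand(2, 8, 24, 24, 24))
print(out.shape)                              # torch.Size([2, 16, 24, 24, 24])
print(get_same_padding(x=24, k=3, s=2, d=1))  # 1 -> one extra voxel of padding needed
```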
scripts/networks/nest_transformer_3D.py ADDED
@@ -0,0 +1,489 @@
1
+ #!/usr/bin/env python3
2
+
3
+ # =========================================================================
4
+ # Adapted from https://github.com/google-research/nested-transformer.
5
+ # which has the following license...
6
+ # https://github.com/pytorch/vision/blob/main/LICENSE
7
+ #
8
+ # BSD 3-Clause License
9
+
10
+
11
+ # Redistribution and use in source and binary forms, with or without
12
+ # modification, are permitted provided that the following conditions are met:
13
+
14
+ # * Redistributions of source code must retain the above copyright notice, this
15
+ # list of conditions and the following disclaimer.
16
+
17
+ # * Redistributions in binary form must reproduce the above copyright notice,
18
+ # this list of conditions and the following disclaimer in the documentation
19
+ # and/or other materials provided with the distribution.
20
+
21
+ # * Neither the name of the copyright holder nor the names of its
22
+ # contributors may be used to endorse or promote products derived from
23
+ # this software without specific prior written permission.
24
+
25
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
26
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
28
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
29
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
31
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
32
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
34
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35
+
36
+ """ Nested Transformer (NesT) in PyTorch
37
+ A PyTorch implementation of Aggregating Nested Transformers as described in:
38
+ 'Aggregating Nested Transformers'
39
+ - https://arxiv.org/abs/2105.12723
40
+ The official Jax code is released and available at https://github.com/google-research/nested-transformer.
41
+ The weights have been converted with convert/convert_nest_flax.py
42
+ Acknowledgments:
43
+ * The paper authors for sharing their research, code, and model weights
44
+ * Ross Wightman's existing code, on which this implementation is based
45
+ Copyright 2021 Alexander Soare
46
+
47
+ """
48
+
49
+ import collections.abc
50
+ import logging
51
+ import math
52
+ from functools import partial
53
+ from typing import Callable, Sequence
54
+
55
+ import torch
56
+ import torch.nn.functional as F
57
+ from torch import nn
58
+
59
+ from .nest import DropPath, Mlp, _assert, create_conv3d, create_pool3d, to_ntuple, trunc_normal_
60
+ from .patchEmbed3D import PatchEmbed3D
61
+
62
+ _logger = logging.getLogger(__name__)
63
+
64
+
65
+ class Attention(nn.Module):
66
+ """
67
+ This is much like `.vision_transformer.Attention` but uses *localised* self attention by accepting an input with
68
+ an extra "image block" dim
69
+ """
70
+
71
+ def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0):
72
+ super().__init__()
73
+ self.num_heads = num_heads
74
+ head_dim = dim // num_heads
75
+ self.scale = head_dim**-0.5
76
+
77
+ self.qkv = nn.Linear(dim, 3 * dim, bias=qkv_bias)
78
+ self.attn_drop = nn.Dropout(attn_drop)
79
+ self.proj = nn.Linear(dim, dim)
80
+ self.proj_drop = nn.Dropout(proj_drop)
81
+
82
+ def forward(self, x):
83
+ """
84
+ x is shape: B (batch_size), T (image blocks), N (seq length per image block), C (embed dim)
85
+ """
86
+ b, t, n, c = x.shape
87
+ # result of next line is (qkv, B, num (H)eads, T, N, (C')hannels per head)
88
+ qkv = self.qkv(x).reshape(b, t, n, 3, self.num_heads, c // self.num_heads).permute(3, 0, 4, 1, 2, 5)
89
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
90
+
91
+ attn = (q @ k.transpose(-2, -1)) * self.scale # (B, H, T, N, N)
92
+ attn = attn.softmax(dim=-1)
93
+ attn = self.attn_drop(attn)
94
+
95
+ x = (attn @ v).permute(0, 2, 3, 4, 1).reshape(b, t, n, c)
96
+ x = self.proj(x)
97
+ x = self.proj_drop(x)
98
+ return x # (B, T, N, C)
99
+
100
+
101
+ class TransformerLayer(nn.Module):
102
+ """
103
+ This is much like `.vision_transformer.Block` but:
104
+ - Called TransformerLayer here to allow for "block" as defined in the paper ("non-overlapping image blocks")
105
+ - Uses modified Attention layer that handles the "block" dimension
106
+ """
107
+
108
+ def __init__(
109
+ self,
110
+ dim,
111
+ num_heads,
112
+ mlp_ratio=4.0,
113
+ qkv_bias=False,
114
+ drop=0.0,
115
+ attn_drop=0.0,
116
+ drop_path=0.0,
117
+ act_layer=nn.GELU,
118
+ norm_layer=nn.LayerNorm,
119
+ ):
120
+ super().__init__()
121
+ self.norm1 = norm_layer(dim)
122
+ self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
123
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
124
+ self.norm2 = norm_layer(dim)
125
+ mlp_hidden_dim = int(dim * mlp_ratio)
126
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
127
+
128
+ def forward(self, x):
129
+ y = self.norm1(x)
130
+ x = x + self.drop_path(self.attn(y))
131
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
132
+ return x
133
+
134
+
135
+ class ConvPool(nn.Module):
136
+ def __init__(self, in_channels, out_channels, norm_layer, pad_type=""):
137
+ super().__init__()
138
+ self.conv = create_conv3d(in_channels, out_channels, kernel_size=3, padding=pad_type, bias=True)
139
+ self.norm = norm_layer(out_channels)
140
+ self.pool = create_pool3d("max", kernel_size=3, stride=2, padding=pad_type)
141
+
142
+ def forward(self, x):
143
+ """
144
+ x is expected to have shape (B, C, D, H, W)
145
+ """
146
+ _assert(x.shape[-3] % 2 == 0, "BlockAggregation requires even input spatial dims")
147
+ _assert(x.shape[-2] % 2 == 0, "BlockAggregation requires even input spatial dims")
148
+ _assert(x.shape[-1] % 2 == 0, "BlockAggregation requires even input spatial dims")
149
+
150
+ # print('In ConvPool x : {}'.format(x.shape))
151
+ x = self.conv(x)
152
+ # Layer norm done over channel dim only
153
+ x = self.norm(x.permute(0, 2, 3, 4, 1)).permute(0, 4, 1, 2, 3)
154
+ x = self.pool(x)
155
+ return x # (B, C, D//2, H//2, W//2)
156
+
157
+
158
+ def blockify(x, block_size: int):
159
+ """image to blocks
160
+ Args:
161
+ x (Tensor): with shape (B, D, H, W, C)
162
+ block_size (int): edge length of a single cubic block in units of D, H, W
163
+ """
164
+ b, d, h, w, c = x.shape
165
+ _assert(d % block_size == 0, "`block_size` must divide input depth evenly")
166
+ _assert(h % block_size == 0, "`block_size` must divide input height evenly")
167
+ _assert(w % block_size == 0, "`block_size` must divide input width evenly")
168
+ grid_depth = d // block_size
169
+ grid_height = h // block_size
170
+ grid_width = w // block_size
171
+ x = x.reshape(b, grid_depth, block_size, grid_height, block_size, grid_width, block_size, c)
172
+
173
+ x = x.permute(0, 1, 3, 5, 2, 4, 6, 7).reshape(
174
+ b, grid_depth * grid_height * grid_width, -1, c
175
+ ) # shape [2, 512, 27, 128]
176
+
177
+ return x # (B, T, N, C)
178
+
179
+
180
+ # @register_notrace_function # reason: int receives Proxy
181
+ def deblockify(x, block_size: int):
182
+ """blocks to image
183
+ Args:
184
+ x (Tensor): with shape (B, T, N, C) where T is number of blocks and N is sequence size per block
185
+ block_size (int): edge length of a single cubic block in units of the desired D, H, W
186
+ """
187
+ b, t, _, c = x.shape
188
+ grid_size = round(math.pow(t, 1 / 3))
189
+ depth = height = width = grid_size * block_size
190
+ x = x.reshape(b, grid_size, grid_size, grid_size, block_size, block_size, block_size, c)
191
+
192
+ x = x.permute(0, 1, 4, 2, 5, 3, 6, 7).reshape(b, depth, height, width, c)
193
+
194
+ return x # (B, D, H, W, C)
195
+
196
+
197
+ class NestLevel(nn.Module):
198
+ """Single hierarchical level of a Nested Transformer"""
199
+
200
+ def __init__(
201
+ self,
202
+ num_blocks,
203
+ block_size,
204
+ seq_length,
205
+ num_heads,
206
+ depth,
207
+ embed_dim,
208
+ prev_embed_dim=None,
209
+ mlp_ratio=4.0,
210
+ qkv_bias=True,
211
+ drop_rate=0.0,
212
+ attn_drop_rate=0.0,
213
+ drop_path_rates: Sequence[float] = (),
214
+ norm_layer=None,
215
+ act_layer=None,
216
+ pad_type="",
217
+ ):
218
+ super().__init__()
219
+ self.block_size = block_size
220
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_blocks, seq_length, embed_dim))
221
+
222
+ if prev_embed_dim is not None:
223
+ self.pool = ConvPool(prev_embed_dim, embed_dim, norm_layer=norm_layer, pad_type=pad_type)
224
+ else:
225
+ self.pool = nn.Identity()
226
+
227
+ # Transformer encoder
228
+ if len(drop_path_rates):
229
+ assert len(drop_path_rates) == depth, "Must provide as many drop path rates as there are transformer layers"
230
+ self.transformer_encoder = nn.Sequential(
231
+ *[
232
+ TransformerLayer(
233
+ dim=embed_dim,
234
+ num_heads=num_heads,
235
+ mlp_ratio=mlp_ratio,
236
+ qkv_bias=qkv_bias,
237
+ drop=drop_rate,
238
+ attn_drop=attn_drop_rate,
239
+ drop_path=drop_path_rates[i],
240
+ norm_layer=norm_layer,
241
+ act_layer=act_layer,
242
+ )
243
+ for i in range(depth)
244
+ ]
245
+ )
246
+
247
+ def forward(self, x):
248
+ """
249
+ expects x as (B, C, D, H, W)
250
+ """
251
+ x = self.pool(x)
252
+ x = x.permute(0, 2, 3, 4, 1) # (B, D', H', W', C), switch to channels last for transformer
253
+
254
+ x = blockify(x, self.block_size) # (B, T, N, C')
255
+ x = x + self.pos_embed
256
+
257
+ x = self.transformer_encoder(x) # (B, T, N, C')
258
+
259
+ x = deblockify(x, self.block_size) # (B, D', H', W', C') [2, 24, 24, 24, 128]
260
+ # Channel-first for block aggregation, and generally to replicate convnet feature map at each stage
261
+ return x.permute(0, 4, 1, 2, 3) # (B, C, D', H', W')
262
+
263
+
264
+ class NestTransformer3D(nn.Module):
265
+ """Nested Transformer (NesT)
266
+ A PyTorch impl of : `Aggregating Nested Transformers`
267
+ - https://arxiv.org/abs/2105.12723
268
+ """
269
+
270
+ def __init__(
271
+ self,
272
+ img_size=96,
273
+ in_chans=1,
274
+ patch_size=2,
275
+ num_levels=3,
276
+ embed_dims=(128, 256, 512),
277
+ num_heads=(4, 8, 16),
278
+ depths=(2, 2, 20),
279
+ num_classes=1000,
280
+ mlp_ratio=4.0,
281
+ qkv_bias=True,
282
+ drop_rate=0.0,
283
+ attn_drop_rate=0.0,
284
+ drop_path_rate=0.5,
285
+ norm_layer=None,
286
+ act_layer=None,
287
+ pad_type="",
288
+ weight_init="",
289
+ global_pool="avg",
290
+ ):
291
+ """
292
+ Args:
293
+ img_size (int, tuple): input image size
294
+ in_chans (int): number of input channels
295
+ patch_size (int): patch size
296
+ num_levels (int): number of block hierarchies (T_d in the paper)
297
+ embed_dims (int, tuple): embedding dimensions of each level
298
+ num_heads (int, tuple): number of attention heads for each level
299
+ depths (int, tuple): number of transformer layers for each level
300
+ num_classes (int): number of classes for classification head
301
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim for MLP of transformer layers
302
+ qkv_bias (bool): enable bias for qkv if True
303
+ drop_rate (float): dropout rate for MLP of transformer layers, MSA final projection layer, and classifier
304
+ attn_drop_rate (float): attention dropout rate
305
+ drop_path_rate (float): stochastic depth rate
306
+ norm_layer: (nn.Module): normalization layer for transformer layers
307
+ act_layer: (nn.Module): activation layer in MLP of transformer layers
308
+ pad_type: (str): type of padding to use: '' for PyTorch symmetric, 'same' for TF SAME
309
+ weight_init: (str): weight init scheme
310
+ global_pool: (str): type of pooling operation to apply to final feature map
311
+ Notes:
312
+ - Default values follow NesT-B from the original Jax code.
313
+ - `embed_dims`, `num_heads`, `depths` should be ints or tuples with length `num_levels`.
314
+ - For those following the paper, Table A1 may have errors!
315
+ - https://github.com/google-research/nested-transformer/issues/2
316
+ """
317
+ super().__init__()
318
+
319
+ for param_name in ["embed_dims", "num_heads", "depths"]:
320
+ param_value = locals()[param_name]
321
+ if isinstance(param_value, collections.abc.Sequence):
322
+ assert len(param_value) == num_levels, f"Require `len({param_name}) == num_levels`"
323
+
324
+ embed_dims = to_ntuple(num_levels)(embed_dims)
325
+ num_heads = to_ntuple(num_levels)(num_heads)
326
+ depths = to_ntuple(num_levels)(depths)
327
+ self.num_classes = num_classes
328
+ self.num_features = embed_dims[-1]
329
+ self.feature_info = []
330
+ norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
331
+ act_layer = act_layer or nn.GELU
332
+ self.drop_rate = drop_rate
333
+ self.num_levels = num_levels
334
+ if isinstance(img_size, collections.abc.Sequence):
335
+ assert img_size[0] == img_size[1] == img_size[2], "Model only handles cubic inputs"
336
+ img_size = img_size[0]
337
+ assert img_size % patch_size == 0, "`patch_size` must divide `img_size` evenly"
338
+ self.patch_size = patch_size
339
+
340
+ # Number of blocks at each level
341
+ self.num_blocks = (8 ** torch.arange(num_levels)).flip(0).tolist()
342
+ assert (img_size // patch_size) % round(
343
+ math.pow(self.num_blocks[0], 1 / 3)
344
+ ) == 0, "First level blocks don't fit evenly. Check `img_size`, `patch_size`, and `num_levels`"
345
+
346
+ # Block edge size in units of patches
347
+ # Hint: (img_size // patch_size) gives the number of patches along each edge of the image. The cube root of self.num_blocks[0] is the
348
+ # number of blocks along edge of image
349
+ self.block_size = int((img_size // patch_size) // round(math.pow(self.num_blocks[0], 1 / 3)))
350
+
351
+ # Patch embedding
352
+ self.patch_embed = PatchEmbed3D(
353
+ img_size=[img_size, img_size, img_size],
354
+ patch_size=[patch_size, patch_size, patch_size],
355
+ in_chans=in_chans,
356
+ embed_dim=embed_dims[0],
357
+ )
358
+ self.num_patches = self.patch_embed.num_patches
359
+ self.seq_length = self.num_patches // self.num_blocks[0]
360
+ # Build up each hierarchical level
361
+ levels = []
362
+
363
+ dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
364
+ prev_dim = None
365
+ curr_stride = 4
366
+ for i in range(len(self.num_blocks)):
367
+ dim = embed_dims[i]
368
+ levels.append(
369
+ NestLevel(
370
+ self.num_blocks[i],
371
+ self.block_size,
372
+ self.seq_length,
373
+ num_heads[i],
374
+ depths[i],
375
+ dim,
376
+ prev_dim,
377
+ mlp_ratio,
378
+ qkv_bias,
379
+ drop_rate,
380
+ attn_drop_rate,
381
+ dp_rates[i],
382
+ norm_layer,
383
+ act_layer,
384
+ pad_type=pad_type,
385
+ )
386
+ )
387
+ self.feature_info += [dict(num_chs=dim, reduction=curr_stride, module=f"levels.{i}")]
388
+ prev_dim = dim
389
+ curr_stride *= 2
390
+
391
+ self.levels = nn.ModuleList([levels[i] for i in range(num_levels)])
392
+
393
+ # Final normalization layer
394
+ self.norm = norm_layer(embed_dims[-1])
395
+
396
+ self.init_weights(weight_init)
397
+
398
+ def init_weights(self, mode=""):
399
+ assert mode in ("nlhb", "")
400
+ head_bias = -math.log(self.num_classes) if "nlhb" in mode else 0.0
401
+ for level in self.levels:
402
+ trunc_normal_(level.pos_embed, std=0.02, a=-2, b=2)
403
+ named_apply(partial(_init_nest_weights, head_bias=head_bias), self)
404
+
405
+ @torch.jit.ignore
406
+ def no_weight_decay(self):
407
+ return {f"level.{i}.pos_embed" for i in range(len(self.levels))}
408
+
409
+ def get_classifier(self):
410
+ return self.head
411
+
412
+ def forward_features(self, x):
413
+ """x shape (B, C, D, H, W)"""
414
+ x = self.patch_embed(x)
415
+
416
+ hidden_states_out = [x]
417
+
418
+ for _, level in enumerate(self.levels):
419
+ x = level(x)
420
+ hidden_states_out.append(x)
421
+ # Layer norm done over channel dim only (to NDHWC and back)
422
+ x = self.norm(x.permute(0, 2, 3, 4, 1)).permute(0, 4, 1, 2, 3)
423
+ return x, hidden_states_out
424
+
425
+ def forward(self, x):
426
+ """x shape (B, C, D, H, W)"""
427
+ x = self.forward_features(x)
428
+
429
+ if self.drop_rate > 0.0:
430
+ x = F.dropout(x, p=self.drop_rate, training=self.training)
431
+ return x
432
+
433
+
434
+ def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
435
+ if not depth_first and include_root:
436
+ fn(module=module, name=name)
437
+ for child_name, child_module in module.named_children():
438
+ child_name = ".".join((name, child_name)) if name else child_name
439
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
440
+ if depth_first and include_root:
441
+ fn(module=module, name=name)
442
+ return module
443
+
444
+
445
+ def _init_nest_weights(module: nn.Module, name: str = "", head_bias: float = 0.0):
446
+ """NesT weight initialization
447
+ Can replicate Jax implementation. Otherwise follows vision_transformer.py
448
+ """
449
+ if isinstance(module, nn.Linear):
450
+ if name.startswith("head"):
451
+ trunc_normal_(module.weight, std=0.02, a=-2, b=2)
452
+ nn.init.constant_(module.bias, head_bias)
453
+ else:
454
+ trunc_normal_(module.weight, std=0.02, a=-2, b=2)
455
+ if module.bias is not None:
456
+ nn.init.zeros_(module.bias)
457
+ elif isinstance(module, nn.Conv2d):
458
+ trunc_normal_(module.weight, std=0.02, a=-2, b=2)
459
+ if module.bias is not None:
460
+ nn.init.zeros_(module.bias)
461
+ elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm2d)):
462
+ nn.init.zeros_(module.bias)
463
+ nn.init.ones_(module.weight)
464
+
465
+
466
+ def resize_pos_embed(posemb, posemb_new):
467
+ """
468
+ Rescale the grid of position embeddings when loading from state_dict
469
+ Expected shape of position embeddings is (1, T, N, C), and considers only square images
470
+ """
471
+ _logger.info("Resized position embedding: %s to %s", posemb.shape, posemb_new.shape)
472
+ seq_length_old = posemb.shape[2]
473
+ num_blocks_new, seq_length_new = posemb_new.shape[1:3]
474
+ size_new = int(math.sqrt(num_blocks_new * seq_length_new))
475
+ # First change to (1, C, H, W)
476
+ posemb = deblockify(posemb, int(math.sqrt(seq_length_old))).permute(0, 3, 1, 2)
477
+ posemb = F.interpolate(posemb, size=[size_new, size_new], mode="bicubic", align_corners=False)
478
+ # Now change to new (1, T, N, C)
479
+ posemb = blockify(posemb.permute(0, 2, 3, 1), int(math.sqrt(seq_length_new)))
480
+ return posemb
481
+
482
+
483
+ def checkpoint_filter_fn(state_dict, model):
484
+ """resize positional embeddings of pretrained weights"""
485
+ pos_embed_keys = [k for k in state_dict.keys() if k.startswith("pos_embed_")]
486
+ for k in pos_embed_keys:
487
+ if state_dict[k].shape != getattr(model, k).shape:
488
+ state_dict[k] = resize_pos_embed(state_dict[k], getattr(model, k))
489
+ return state_dict
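A shape-check sketch for the backbone above (illustrative only; it assumes the bundle's `scripts` package is on the Python path and uses the same configuration that the UNesT wrapper later in this upload passes to `NestTransformer3D`):

    import torch
    from scripts.networks.nest_transformer_3D import NestTransformer3D

    backbone = NestTransformer3D(
        img_size=96, in_chans=1, patch_size=4, num_levels=3,
        embed_dims=(128, 256, 512), num_heads=(4, 8, 16), depths=(2, 2, 8),
    )
    backbone.eval()
    with torch.no_grad():
        feats, hidden = backbone(torch.randn(1, 1, 96, 96, 96))
    print(feats.shape)                  # expected: (1, 512, 6, 6, 6)
    print([h.shape for h in hidden])    # patch embedding output, then one feature map per level

`forward` returns the normalized final feature map together with the list of intermediate feature maps, which is what the UNesT decoder consumes.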
scripts/networks/patchEmbed3D.py ADDED
@@ -0,0 +1,190 @@
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright 2020 - 2021 MONAI Consortium
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+
14
+
15
+ import math
16
+ from typing import Sequence, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+ from monai.utils import optional_import
22
+
23
+ Rearrange, _ = optional_import("einops.layers.torch", name="Rearrange")
24
+
25
+
26
+ class PatchEmbeddingBlock(nn.Module):
27
+ """
28
+ A patch embedding block, based on: "Dosovitskiy et al.,
29
+ An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>"
30
+ """
31
+
32
+ def __init__(
33
+ self,
34
+ in_channels: int,
35
+ img_size: Tuple[int, int, int],
36
+ patch_size: Tuple[int, int, int],
37
+ hidden_size: int,
38
+ num_heads: int,
39
+ pos_embed: str,
40
+ dropout_rate: float = 0.0,
41
+ ) -> None:
42
+ """
43
+ Args:
44
+ in_channels: dimension of input channels.
45
+ img_size: dimension of input image.
46
+ patch_size: dimension of patch size.
47
+ hidden_size: dimension of hidden layer.
48
+ num_heads: number of attention heads.
49
+ pos_embed: position embedding layer type.
50
+ dropout_rate: faction of the input units to drop.
51
+
52
+ """
53
+
54
+ super().__init__()
55
+
56
+ if not (0 <= dropout_rate <= 1):
57
+ raise AssertionError("dropout_rate should be between 0 and 1.")
58
+
59
+ if hidden_size % num_heads != 0:
60
+ raise AssertionError("hidden size should be divisible by num_heads.")
61
+
62
+ for m, p in zip(img_size, patch_size):
63
+ if m < p:
64
+ raise AssertionError("patch_size should be smaller than img_size.")
65
+
66
+ if pos_embed not in ["conv", "perceptron"]:
67
+ raise KeyError(f"Position embedding layer of type {pos_embed} is not supported.")
68
+
69
+ if pos_embed == "perceptron":
70
+ if img_size[0] % patch_size[0] != 0:
71
+ raise AssertionError("img_size should be divisible by patch_size for perceptron patch embedding.")
72
+
73
+ self.n_patches = (
74
+ (img_size[0] // patch_size[0]) * (img_size[1] // patch_size[1]) * (img_size[2] // patch_size[2])
75
+ )
76
+ self.patch_dim = in_channels * patch_size[0] * patch_size[1] * patch_size[2]
77
+
78
+ self.pos_embed = pos_embed
79
+ self.patch_embeddings: Union[nn.Conv3d, nn.Sequential]
80
+ if self.pos_embed == "conv":
81
+ self.patch_embeddings = nn.Conv3d(
82
+ in_channels=in_channels, out_channels=hidden_size, kernel_size=patch_size, stride=patch_size
83
+ )
84
+ elif self.pos_embed == "perceptron":
85
+ self.patch_embeddings = nn.Sequential(
86
+ Rearrange(
87
+ "b c (h p1) (w p2) (d p3)-> b (h w d) (p1 p2 p3 c)",
88
+ p1=patch_size[0],
89
+ p2=patch_size[1],
90
+ p3=patch_size[2],
91
+ ),
92
+ nn.Linear(self.patch_dim, hidden_size),
93
+ )
94
+ self.position_embeddings = nn.Parameter(torch.zeros(1, self.n_patches, hidden_size))
95
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size))
96
+ self.dropout = nn.Dropout(dropout_rate)
97
+ self.trunc_normal_(self.position_embeddings, mean=0.0, std=0.02, a=-2.0, b=2.0)
98
+ self.apply(self._init_weights)
99
+
100
+ def _init_weights(self, m):
101
+ if isinstance(m, nn.Linear):
102
+ self.trunc_normal_(m.weight, mean=0.0, std=0.02, a=-2.0, b=2.0)
103
+ if isinstance(m, nn.Linear) and m.bias is not None:
104
+ nn.init.constant_(m.bias, 0)
105
+ elif isinstance(m, nn.LayerNorm):
106
+ nn.init.constant_(m.bias, 0)
107
+ nn.init.constant_(m.weight, 1.0)
108
+
109
+ def trunc_normal_(self, tensor, mean, std, a, b):
110
+ # From PyTorch official master until it's in a few official releases - RW
111
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
112
+ def norm_cdf(x):
113
+ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
114
+
115
+ with torch.no_grad():
116
+ l = norm_cdf((a - mean) / std)
117
+ u = norm_cdf((b - mean) / std)
118
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
119
+ tensor.erfinv_()
120
+ tensor.mul_(std * math.sqrt(2.0))
121
+ tensor.add_(mean)
122
+ tensor.clamp_(min=a, max=b)
123
+ return tensor
124
+
125
+ def forward(self, x):
126
+ if self.pos_embed == "conv":
127
+ x = self.patch_embeddings(x)
128
+ x = x.flatten(2)
129
+ x = x.transpose(-1, -2)
130
+ elif self.pos_embed == "perceptron":
131
+ x = self.patch_embeddings(x)
132
+ embeddings = x + self.position_embeddings
133
+ embeddings = self.dropout(embeddings)
134
+ return embeddings
135
+
136
+
137
+ class PatchEmbed3D(nn.Module):
138
+ """Video to Patch Embedding.
139
+
140
+ Args:
141
+ patch_size (int): Patch token size. Default: (4,4,4).
142
+ in_chans (int): Number of input volume channels. Default: 1.
143
+ embed_dim (int): Number of linear projection output channels. Default: 96.
144
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
145
+ """
146
+
147
+ def __init__(
148
+ self,
149
+ img_size: Sequence[int] = (96, 96, 96),
150
+ patch_size=(4, 4, 4),
151
+ in_chans: int = 1,
152
+ embed_dim: int = 96,
153
+ norm_layer=None,
154
+ ):
155
+ super().__init__()
156
+ self.patch_size = patch_size
157
+
158
+ self.in_chans = in_chans
159
+ self.embed_dim = embed_dim
160
+
161
+ self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1], img_size[2] // patch_size[2])
162
+ self.num_patches = self.grid_size[0] * self.grid_size[1] * self.grid_size[2]
163
+
164
+ self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
165
+
166
+ if norm_layer is not None:
167
+ self.norm = norm_layer(embed_dim)
168
+ else:
169
+ self.norm = None
170
+
171
+ def forward(self, x):
172
+ """Forward function."""
173
+ # padding
174
+ _, _, d, h, w = x.size()
175
+ if w % self.patch_size[2] != 0:
176
+ x = F.pad(x, (0, self.patch_size[2] - w % self.patch_size[2]))
177
+ if h % self.patch_size[1] != 0:
178
+ x = F.pad(x, (0, 0, 0, self.patch_size[1] - h % self.patch_size[1]))
179
+ if d % self.patch_size[0] != 0:
180
+ x = F.pad(x, (0, 0, 0, 0, 0, self.patch_size[0] - d % self.patch_size[0]))
181
+
182
+ x = self.proj(x) # B C D Wh Ww
183
+ if self.norm is not None:
184
+ d, wh, ww = x.size(2), x.size(3), x.size(4)
185
+ x = x.flatten(2).transpose(1, 2)
186
+ x = self.norm(x)
187
+ x = x.transpose(1, 2).view(-1, self.embed_dim, d, wh, ww)
188
+ # pdb.set_trace()
189
+
190
+ return x
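A small shape sketch for `PatchEmbed3D` (illustrative, same import-path assumption as above); this is the embedding block consumed by `NestTransformer3D`, projecting non-overlapping patches to the first-level embedding dimension:

    import torch
    from scripts.networks.patchEmbed3D import PatchEmbed3D

    embed = PatchEmbed3D(img_size=(96, 96, 96), patch_size=(4, 4, 4), in_chans=1, embed_dim=128)
    with torch.no_grad():
        grid = embed(torch.randn(2, 1, 96, 96, 96))
    print(grid.shape)  # expected: (2, 128, 24, 24, 24), a channels-first grid of patch embeddings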
scripts/networks/unest.py ADDED
@@ -0,0 +1,274 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ The 3D NesT transformer-based segmentation model (UNesT)
5
+
6
+ MASI Lab, Vanderbilt University
7
+
8
+
9
+ Authors: Xin Yu, Yinchi Zhou, Yucheng Tang, Bennett Landman
10
+
11
+
12
+ The NEST code is partly from
13
+
14
+ Nested Hierarchical Transformer: Towards Accurate, Data-Efficient and
15
+ Interpretable Visual Understanding
16
+ https://arxiv.org/pdf/2105.12723.pdf
17
+
18
+
19
+ """
20
+
21
+
22
+ # limitations under the License.
23
+ from typing import Sequence, Tuple, Union
24
+
25
+ import torch
26
+ import torch.nn as nn
27
+ from monai.networks.blocks import Convolution
28
+ from monai.networks.blocks.dynunet_block import UnetOutBlock
29
+
30
+ # from scripts.networks.swin_transformer_3d import SwinTransformer3D
31
+ from scripts.networks.nest_transformer_3D import NestTransformer3D
32
+ from scripts.networks.unest_block import UNesTBlock, UNesTConvBlock, UNestUpBlock
33
+
34
+ # from monai.networks.blocks.unetr_block import UnetstrBasicBlock, UnetrPrUpBlock, UnetResBlock
35
+
36
+
37
+ class UNesT(nn.Module):
38
+ """
39
+ UNesT model implementation
40
+ """
41
+
42
+ def __init__(
43
+ self,
44
+ in_channels: int,
45
+ out_channels: int,
46
+ img_size: Sequence[int] = (96, 96, 96),
47
+ feature_size: int = 16,
48
+ patch_size: int = 2,
49
+ depths: Sequence[int] = (2, 2, 2, 2),
50
+ num_heads: Sequence[int] = (3, 6, 12, 24),
51
+ window_size: Sequence[int] = (7, 7, 7),
52
+ norm_name: Union[Tuple, str] = "instance",
53
+ conv_block: bool = False,
54
+ res_block: bool = True,
55
+ # featResBlock: bool = False,
56
+ dropout_rate: float = 0.0,
57
+ ) -> None:
58
+ """
59
+ Args:
60
+ in_channels: dimension of input channels.
61
+ out_channels: dimension of output channels.
62
+ img_size: dimension of input image.
63
+ feature_size: dimension of network feature size.
64
+ patch_size: dimension of patch size.
+ depths: number of transformer layers at each hierarchy level.
+ num_heads: number of attention heads at each hierarchy level.
+ window_size: local window size (not used by this implementation).
68
+ norm_name: feature normalization type and arguments.
69
+ conv_block: bool argument to determine if convolutional block is used.
70
+ res_block: bool argument to determine if residual block is used.
71
+ dropout_rate: fraction of the input units to drop.
72
+
73
+ """
74
+
75
+ super().__init__()
76
+
77
+ if not (0 <= dropout_rate <= 1):
78
+ raise AssertionError("dropout_rate should be between 0 and 1.")
79
+
80
+ self.embed_dim = [128, 256, 512]
81
+
82
+ self.nestViT = NestTransformer3D(
83
+ img_size=96,
84
+ in_chans=1,
85
+ patch_size=4,
86
+ num_levels=3,
87
+ embed_dims=(128, 256, 512),
88
+ num_heads=(4, 8, 16),
89
+ depths=(2, 2, 8),
90
+ num_classes=1000,
91
+ mlp_ratio=4.0,
92
+ qkv_bias=True,
93
+ drop_rate=0.0,
94
+ attn_drop_rate=0.0,
95
+ drop_path_rate=0.5,
96
+ norm_layer=None,
97
+ act_layer=None,
98
+ pad_type="",
99
+ weight_init="",
100
+ global_pool="avg",
101
+ )
102
+
103
+ self.encoder1 = UNesTConvBlock(
104
+ spatial_dims=3,
105
+ in_channels=1,
106
+ out_channels=feature_size * 2,
107
+ kernel_size=3,
108
+ stride=1,
109
+ norm_name=norm_name,
110
+ res_block=res_block,
111
+ )
112
+ self.encoder2 = UNestUpBlock(
113
+ spatial_dims=3,
114
+ in_channels=self.embed_dim[0],
115
+ out_channels=feature_size * 4,
116
+ num_layer=1,
117
+ kernel_size=3,
118
+ stride=1,
119
+ upsample_kernel_size=2,
120
+ norm_name=norm_name,
121
+ conv_block=False,
122
+ res_block=False,
123
+ )
124
+
125
+ self.encoder3 = UNesTConvBlock(
126
+ spatial_dims=3,
127
+ in_channels=self.embed_dim[0],
128
+ out_channels=8 * feature_size,
129
+ kernel_size=3,
130
+ stride=1,
131
+ norm_name=norm_name,
132
+ res_block=res_block,
133
+ )
134
+
135
+ self.encoder4 = UNesTConvBlock(
136
+ spatial_dims=3,
137
+ in_channels=self.embed_dim[1],
138
+ out_channels=16 * feature_size,
139
+ kernel_size=3,
140
+ stride=1,
141
+ norm_name=norm_name,
142
+ res_block=res_block,
143
+ )
144
+ self.decoder5 = UNesTBlock(
145
+ spatial_dims=3,
146
+ in_channels=2 * self.embed_dim[2],
147
+ out_channels=feature_size * 32,
148
+ stride=1,
149
+ kernel_size=3,
150
+ upsample_kernel_size=2,
151
+ norm_name=norm_name,
152
+ res_block=res_block,
153
+ )
154
+ self.decoder4 = UNesTBlock(
155
+ spatial_dims=3,
156
+ in_channels=self.embed_dim[2],
157
+ out_channels=feature_size * 16,
158
+ stride=1,
159
+ kernel_size=3,
160
+ upsample_kernel_size=2,
161
+ norm_name=norm_name,
162
+ res_block=res_block,
163
+ )
164
+ self.decoder3 = UNesTBlock(
165
+ spatial_dims=3,
166
+ in_channels=feature_size * 16,
167
+ out_channels=feature_size * 8,
168
+ stride=1,
169
+ kernel_size=3,
170
+ upsample_kernel_size=2,
171
+ norm_name=norm_name,
172
+ res_block=res_block,
173
+ )
174
+ self.decoder2 = UNesTBlock(
175
+ spatial_dims=3,
176
+ in_channels=feature_size * 8,
177
+ out_channels=feature_size * 4,
178
+ stride=1,
179
+ kernel_size=3,
180
+ upsample_kernel_size=2,
181
+ norm_name=norm_name,
182
+ res_block=res_block,
183
+ )
184
+
185
+ self.decoder1 = UNesTBlock(
186
+ spatial_dims=3,
187
+ in_channels=feature_size * 4,
188
+ out_channels=feature_size * 2,
189
+ stride=1,
190
+ kernel_size=3,
191
+ upsample_kernel_size=2,
192
+ norm_name=norm_name,
193
+ res_block=res_block,
194
+ )
195
+
196
+ self.encoder10 = Convolution(
197
+ spatial_dims=3,
198
+ in_channels=32 * feature_size,
199
+ out_channels=64 * feature_size,
200
+ strides=2,
201
+ adn_ordering="ADN",
202
+ dropout=0.0,
203
+ )
204
+
205
+ self.out = UnetOutBlock(spatial_dims=3, in_channels=feature_size * 2, out_channels=out_channels) # type: ignore
206
+
207
+ def proj_feat(self, x, hidden_size, feat_size):
208
+ x = x.view(x.size(0), feat_size[0], feat_size[1], feat_size[2], hidden_size)
209
+ x = x.permute(0, 4, 1, 2, 3).contiguous()
210
+ return x
211
+
212
+ def load_from(self, weights):
213
+ with torch.no_grad():
214
+ # copy weights from patch embedding
215
+ for i in weights["state_dict"]:
216
+ print(i)
217
+ self.vit.patch_embedding.position_embeddings.copy_(
218
+ weights["state_dict"]["module.transformer.patch_embedding.position_embeddings_3d"]
219
+ )
220
+ self.vit.patch_embedding.cls_token.copy_(
221
+ weights["state_dict"]["module.transformer.patch_embedding.cls_token"]
222
+ )
223
+ self.vit.patch_embedding.patch_embeddings[1].weight.copy_(
224
+ weights["state_dict"]["module.transformer.patch_embedding.patch_embeddings_3d.1.weight"]
225
+ )
226
+ self.vit.patch_embedding.patch_embeddings[1].bias.copy_(
227
+ weights["state_dict"]["module.transformer.patch_embedding.patch_embeddings_3d.1.bias"]
228
+ )
229
+
230
+ # copy weights from encoding blocks (default: num of blocks: 12)
231
+ for bname, block in self.vit.blocks.named_children():
232
+ print(block)
233
+ block.loadFrom(weights, n_block=bname)
234
+ # last norm layer of transformer
235
+ self.vit.norm.weight.copy_(weights["state_dict"]["module.transformer.norm.weight"])
236
+ self.vit.norm.bias.copy_(weights["state_dict"]["module.transformer.norm.bias"])
237
+
238
+ def forward(self, x_in):
239
+ x, hidden_states_out = self.nestViT(x_in)
240
+
241
+ enc0 = self.encoder1(x_in) # 2, 32, 96, 96, 96
242
+
243
+ x1 = hidden_states_out[0] # 2, 128, 24, 24, 24
244
+
245
+ enc1 = self.encoder2(x1) # 2, 64, 48, 48, 48
246
+
247
+ x2 = hidden_states_out[1] # 2, 128, 24, 24, 24
248
+
249
+ enc2 = self.encoder3(x2) # 2, 128, 24, 24, 24
250
+
251
+ x3 = hidden_states_out[2] # 2, 256, 12, 12, 12
252
+
253
+ enc3 = self.encoder4(x3) # 2, 256, 12, 12, 12
254
+
255
+ x4 = hidden_states_out[3]
256
+
257
+ enc4 = x4 # 2, 512, 6, 6, 6
258
+
259
+ dec4 = x # 2, 512, 6, 6, 6
260
+
261
+ dec4 = self.encoder10(dec4) # 2, 1024, 3, 3, 3
262
+
263
+ dec3 = self.decoder5(dec4, enc4) # 2, 512, 6, 6, 6
264
+
265
+ dec2 = self.decoder4(dec3, enc3) # 2, 256, 12, 12, 12
266
+
267
+ dec1 = self.decoder3(dec2, enc2) # 2, 128, 24, 24, 24
268
+
269
+ dec0 = self.decoder2(dec1, enc1) # 2, 64, 48, 48, 48
270
+
271
+ out = self.decoder1(dec0, enc0) # 2, 32, 96, 96, 96
272
+
273
+ logits = self.out(out)
274
+ return logits
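An end-to-end sketch of the segmentation network defined above (illustrative only; `out_channels=4` is an arbitrary example label count, not the bundle's configured value):

    import torch
    from scripts.networks.unest import UNesT

    model = UNesT(in_channels=1, out_channels=4, img_size=(96, 96, 96))
    model.eval()
    with torch.no_grad():
        logits = model(torch.randn(1, 1, 96, 96, 96))
    print(logits.shape)  # expected: (1, 4, 96, 96, 96), one channel per segmentation class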
scripts/networks/unest_block.py ADDED
@@ -0,0 +1,245 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from typing import Sequence, Tuple, Union
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from monai.networks.blocks.dynunet_block import UnetBasicBlock, UnetResBlock, get_conv_layer
8
+
9
+
10
+ class UNesTBlock(nn.Module):
11
+ """ """
12
+
13
+ def __init__(
14
+ self,
15
+ spatial_dims: int,
16
+ in_channels: int,
17
+ out_channels: int, # type: ignore
18
+ kernel_size: Union[Sequence[int], int],
19
+ stride: Union[Sequence[int], int],
20
+ upsample_kernel_size: Union[Sequence[int], int],
21
+ norm_name: Union[Tuple, str],
22
+ res_block: bool = False,
23
+ ) -> None:
24
+ """
25
+ Args:
26
+ spatial_dims: number of spatial dimensions.
27
+ in_channels: number of input channels.
28
+ out_channels: number of output channels.
29
+ kernel_size: convolution kernel size.
30
+ stride: convolution stride.
31
+ upsample_kernel_size: convolution kernel size for transposed convolution layers.
32
+ norm_name: feature normalization type and arguments.
33
+ res_block: bool argument to determine if residual block is used.
34
+
35
+ """
36
+
37
+ super(UNesTBlock, self).__init__()
38
+ upsample_stride = upsample_kernel_size
39
+ self.transp_conv = get_conv_layer(
40
+ spatial_dims,
41
+ in_channels,
42
+ out_channels,
43
+ kernel_size=upsample_kernel_size,
44
+ stride=upsample_stride,
45
+ conv_only=True,
46
+ is_transposed=True,
47
+ )
48
+
49
+ if res_block:
50
+ self.conv_block = UnetResBlock(
51
+ spatial_dims,
52
+ out_channels + out_channels,
53
+ out_channels,
54
+ kernel_size=kernel_size,
55
+ stride=1,
56
+ norm_name=norm_name,
57
+ )
58
+ else:
59
+ self.conv_block = UnetBasicBlock( # type: ignore
60
+ spatial_dims,
61
+ out_channels + out_channels,
62
+ out_channels,
63
+ kernel_size=kernel_size,
64
+ stride=1,
65
+ norm_name=norm_name,
66
+ )
67
+
68
+ def forward(self, inp, skip):
69
+ # the number of channels in skip should equal out_channels
70
+ out = self.transp_conv(inp)
71
+ # print(out.shape)
72
+ # print(skip.shape)
73
+ out = torch.cat((out, skip), dim=1)
74
+ out = self.conv_block(out)
75
+ return out
76
+
77
+
78
+ class UNestUpBlock(nn.Module):
79
+ """ """
80
+
81
+ def __init__(
82
+ self,
83
+ spatial_dims: int,
84
+ in_channels: int,
85
+ out_channels: int,
86
+ num_layer: int,
87
+ kernel_size: Union[Sequence[int], int],
88
+ stride: Union[Sequence[int], int],
89
+ upsample_kernel_size: Union[Sequence[int], int],
90
+ norm_name: Union[Tuple, str],
91
+ conv_block: bool = False,
92
+ res_block: bool = False,
93
+ ) -> None:
94
+ """
95
+ Args:
96
+ spatial_dims: number of spatial dimensions.
97
+ in_channels: number of input channels.
98
+ out_channels: number of output channels.
99
+ num_layer: number of upsampling blocks.
100
+ kernel_size: convolution kernel size.
101
+ stride: convolution stride.
102
+ upsample_kernel_size: convolution kernel size for transposed convolution layers.
103
+ norm_name: feature normalization type and arguments.
104
+ conv_block: bool argument to determine if convolutional block is used.
105
+ res_block: bool argument to determine if residual block is used.
106
+
107
+ """
108
+
109
+ super().__init__()
110
+
111
+ upsample_stride = upsample_kernel_size
112
+ self.transp_conv_init = get_conv_layer(
113
+ spatial_dims,
114
+ in_channels,
115
+ out_channels,
116
+ kernel_size=upsample_kernel_size,
117
+ stride=upsample_stride,
118
+ conv_only=True,
119
+ is_transposed=True,
120
+ )
121
+ if conv_block:
122
+ if res_block:
123
+ self.blocks = nn.ModuleList(
124
+ [
125
+ nn.Sequential(
126
+ get_conv_layer(
127
+ spatial_dims,
128
+ out_channels,
129
+ out_channels,
130
+ kernel_size=upsample_kernel_size,
131
+ stride=upsample_stride,
132
+ conv_only=True,
133
+ is_transposed=True,
134
+ ),
135
+ UnetResBlock(
136
+ spatial_dims=3,
137
+ in_channels=out_channels,
138
+ out_channels=out_channels,
139
+ kernel_size=kernel_size,
140
+ stride=stride,
141
+ norm_name=norm_name,
142
+ ),
143
+ )
144
+ for i in range(num_layer)
145
+ ]
146
+ )
147
+ else:
148
+ self.blocks = nn.ModuleList(
149
+ [
150
+ nn.Sequential(
151
+ get_conv_layer(
152
+ spatial_dims,
153
+ out_channels,
154
+ out_channels,
155
+ kernel_size=upsample_kernel_size,
156
+ stride=upsample_stride,
157
+ conv_only=True,
158
+ is_transposed=True,
159
+ ),
160
+ UnetBasicBlock(
161
+ spatial_dims=3,
162
+ in_channels=out_channels,
163
+ out_channels=out_channels,
164
+ kernel_size=kernel_size,
165
+ stride=stride,
166
+ norm_name=norm_name,
167
+ ),
168
+ )
169
+ for i in range(num_layer)
170
+ ]
171
+ )
172
+ else:
173
+ self.blocks = nn.ModuleList(
174
+ [
175
+ get_conv_layer(
176
+ spatial_dims,
177
+ out_channels,
178
+ out_channels,
179
+ kernel_size=1,
180
+ stride=1,
181
+ conv_only=True,
182
+ is_transposed=True,
183
+ )
184
+ for i in range(num_layer)
185
+ ]
186
+ )
187
+
188
+ def forward(self, x):
189
+ x = self.transp_conv_init(x)
190
+ for blk in self.blocks:
191
+ x = blk(x)
192
+ return x
193
+
194
+
195
+ class UNesTConvBlock(nn.Module):
196
+ """
197
+ UNesT block with skip connections
198
+ """
199
+
200
+ def __init__(
201
+ self,
202
+ spatial_dims: int,
203
+ in_channels: int,
204
+ out_channels: int,
205
+ kernel_size: Union[Sequence[int], int],
206
+ stride: Union[Sequence[int], int],
207
+ norm_name: Union[Tuple, str],
208
+ res_block: bool = False,
209
+ ) -> None:
210
+ """
211
+ Args:
212
+ spatial_dims: number of spatial dimensions.
213
+ in_channels: number of input channels.
214
+ out_channels: number of output channels.
215
+ kernel_size: convolution kernel size.
216
+ stride: convolution stride.
217
+ norm_name: feature normalization type and arguments.
218
+ res_block: bool argument to determine if residual block is used.
219
+
220
+ """
221
+
222
+ super().__init__()
223
+
224
+ if res_block:
225
+ self.layer = UnetResBlock(
226
+ spatial_dims=spatial_dims,
227
+ in_channels=in_channels,
228
+ out_channels=out_channels,
229
+ kernel_size=kernel_size,
230
+ stride=stride,
231
+ norm_name=norm_name,
232
+ )
233
+ else:
234
+ self.layer = UnetBasicBlock( # type: ignore
235
+ spatial_dims=spatial_dims,
236
+ in_channels=in_channels,
237
+ out_channels=out_channels,
238
+ kernel_size=kernel_size,
239
+ stride=stride,
240
+ norm_name=norm_name,
241
+ )
242
+
243
+ def forward(self, inp):
244
+ out = self.layer(inp)
245
+ return out
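A brief sketch of the decoder-block contract defined in this file (shapes chosen to match the comments in unest.py): the transposed convolution doubles the spatial size of `inp`, the result is concatenated with `skip` along the channel axis, and the conv block reduces the channels back to `out_channels`.

    import torch
    from scripts.networks.unest_block import UNesTBlock

    blk = UNesTBlock(
        spatial_dims=3, in_channels=1024, out_channels=512,
        kernel_size=3, stride=1, upsample_kernel_size=2,
        norm_name="instance", res_block=True,
    )
    inp = torch.randn(1, 1024, 3, 3, 3)   # low-resolution decoder feature
    skip = torch.randn(1, 512, 6, 6, 6)   # matching encoder feature
    with torch.no_grad():
        out = blk(inp, skip)
    print(out.shape)  # expected: (1, 512, 6, 6, 6)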