diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..939e5341e74dc2371c8b47f0e27b50581bed5f63
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,28 @@
+# How to Contribute
+
+We'd love to accept your patches and contributions to this project. There are
+just a few small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted
+one (even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose. Consult
+[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+information on using pull requests.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google/conduct/).
+article = "<a href='https://github.com/google-research/deeplab2' target='_blank'>DeepLab2: A TensorFlow Library for Deep Labeling | Github Repo</a>"
+gr.Interface(
+    inference,
+    [gr.inputs.Image(type="pil", label="Input")],
+    gr.outputs.Image(type="plot", label="Output"),
+    title=title,
+    description=description,
+    article=article,
+    examples=[
+        ["city1.jpg"],
+        ["city2.jpg"]
+    ]).launch()
diff --git a/common.py b/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..447ddea710a3f3dcdf49219a4940b8bc0ae7694e
--- /dev/null
+++ b/common.py
@@ -0,0 +1,152 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains common methods and constants used across this framework."""
+
+# Prediction keys used by the model output dictionary.
+PRED_PANOPTIC_KEY = 'panoptic_pred'
+PRED_SEMANTIC_KEY = 'semantic_pred'
+PRED_INSTANCE_KEY = 'instance_pred'
+PRED_INSTANCE_CENTER_KEY = 'instance_center_pred'
+
+
+PRED_SEMANTIC_LOGITS_KEY = 'semantic_logits'
+PRED_SEMANTIC_PROBS_KEY = 'semantic_probs'
+PRED_INSTANCE_SCORES_KEY = 'instance_scores'
+PRED_CENTER_HEATMAP_KEY = 'center_heatmap'
+PRED_OFFSET_MAP_KEY = 'offset_map'
+PRED_FRAME_OFFSET_MAP_KEY = 'frame_offset_map'
+PRED_NEXT_OFFSET_MAP_KEY = 'next_offset_map'
+PRED_NEXT_PANOPTIC_KEY = 'next_panoptic_pred'
+PRED_CONCAT_NEXT_PANOPTIC_KEY = 'concat_next_panoptic_pred'
+
+PRED_PIXEL_SPACE_NORMALIZED_FEATURE_KEY = 'pixel_space_normalized_feature'
+PRED_PIXEL_SPACE_MASK_LOGITS_KEY = 'pixel_space_mask_logits'
+PRED_TRANSFORMER_CLASS_LOGITS_KEY = 'transformer_class_logits'
+
+# Ground-truth keys used by the model.
+GT_PANOPTIC_KEY = 'panoptic_gt'
+GT_SEMANTIC_KEY = 'semantic_gt'
+GT_INSTANCE_CENTER_KEY = 'instance_center_gt'
+GT_INSTANCE_REGRESSION_KEY = 'instance_regression_gt'
+GT_FRAME_OFFSET_KEY = 'frame_offset_gt'
+GT_IS_CROWD = 'is_crowd_gt'
+GT_THING_ID_MASK_KEY = 'thing_id_mask_gt'
+GT_THING_ID_CLASS_KEY = 'thing_id_class_gt'
+GT_NEXT_INSTANCE_REGRESSION_KEY = 'next_instance_regression_gt'
+
+# Raw labels.
+GT_PANOPTIC_RAW = 'panoptic_raw'
+GT_SEMANTIC_RAW = 'semantic_raw'
+GT_IS_CROWD_RAW = 'is_crowd_raw'
+GT_SIZE_RAW = 'size_raw'
+GT_NEXT_PANOPTIC_RAW = 'next_panoptic_raw'
+
+# Loss keys.
+SEMANTIC_LOSS = 'semantic_loss'
+CENTER_LOSS = 'center_loss'
+REGRESSION_LOSS = 'regression_loss'
+MOTION_LOSS = 'motion_loss'
+NEXT_REGRESSION_LOSS = 'next_regression_loss'
+PQ_STYLE_LOSS = 'pq_style_loss'
+# The PQ-style loss consists of a class term and a mask dice term.
+PQ_STYLE_LOSS_CLASS_TERM = 'pq_style_loss_class_term'
+PQ_STYLE_LOSS_MASK_DICE_TERM = 'pq_style_loss_mask_dice_term'
+MASK_ID_CROSS_ENTROPY_LOSS = 'mask_id_cross_entropy_loss'
+INSTANCE_DISCRIMINATION_LOSS = 'instance_discrimination_loss'
+TOTAL_LOSS = 'total_loss'
+
+# Weight keys used by the model.
+SEMANTIC_LOSS_WEIGHT_KEY = 'semantic_loss_weight'
+CENTER_LOSS_WEIGHT_KEY = 'center_loss_weight'
+REGRESSION_LOSS_WEIGHT_KEY = 'regression_loss_weight'
+FRAME_REGRESSION_LOSS_WEIGHT_KEY = 'frame_regression_loss_weight'
+NEXT_REGRESSION_LOSS_WEIGHT_KEY = 'next_regression_loss_weight'
+
+# Misc.
+RESIZED_IMAGE = 'resized_image' +IMAGE = 'image' +IMAGE_NAME = 'image_name' +SEQUENCE_ID = 'sequence_id' +NEXT_IMAGE = 'next_image' + +# TfExample keys. +KEY_ENCODED_IMAGE = 'image/encoded' +KEY_ENCODED_PREV_IMAGE = 'prev_image/encoded' +KEY_ENCODED_NEXT_IMAGE = 'next_image/encoded' +KEY_IMAGE_FILENAME = 'image/filename' +KEY_IMAGE_FORMAT = 'image/format' +KEY_IMAGE_HEIGHT = 'image/height' +KEY_IMAGE_WIDTH = 'image/width' +KEY_IMAGE_CHANNELS = 'image/channels' +KEY_ENCODED_LABEL = 'image/segmentation/class/encoded' +KEY_ENCODED_PREV_LABEL = 'prev_image/segmentation/class/encoded' +KEY_ENCODED_NEXT_LABEL = 'next_image/segmentation/class/encoded' +KEY_LABEL_FORMAT = 'image/segmentation/class/format' +KEY_SEQUENCE_ID = 'video/sequence_id' +KEY_FRAME_ID = 'video/frame_id' +KEY_ENCODED_DEPTH = 'image/depth/encoded' +KEY_DEPTH_FORMAT = 'image/depth/format' + +# Checkpoint Items +# All models +CKPT_SEMANTIC_LAST_LAYER = 'semantic_last_layer' + +# DeepLabV3 +CKPT_DEEPLABV3_ASPP = 'deeplab_v3_aspp' +CKPT_DEEPLABV3_CLASSIFIER_CONV_BN_ACT = 'classifier_conv_bn_act' + +# DeepLabV3+ +CKPT_DEEPLABV3PLUS_ASPP = 'deeplab_v3plus_aspp' +CKPT_DEEPLABV3PLUS_PROJECT_CONV_BN_ACT = 'deeplab_v3plus_project_conv_bn_act' +CKPT_DEEPLABV3PLUS_FUSE = 'deeplab_v3plus_fuse' + +# Panoptic-DeepLab +CKPT_SEMANTIC_DECODER = 'semantic_decoder' +CKPT_SEMANTIC_HEAD_WITHOUT_LAST_LAYER = 'semantic_head_without_last_layer' + +CKPT_INSTANCE_DECODER = 'instance_decoder' +CKPT_INSTANCE_CENTER_HEAD_WITHOUT_LAST_LAYER = ('instance_center_head' + '_without_last_layer') +CKPT_INSTANCE_CENTER_HEAD_LAST_LAYER = 'instance_center_head_last_layer' +CKPT_INSTANCE_REGRESSION_HEAD_WITHOUT_LAST_LAYER = ('instance_regression_head' + '_without_last_layer') +CKPT_INSTANCE_REGRESSION_HEAD_LAST_LAYER = 'instance_regression_head_last_layer' + +# Motion-DeepLab +CKPT_MOTION_REGRESSION_HEAD_WITHOUT_LAST_LAYER = ('motion_regression_head' + '_without_last_layer') +CKPT_MOTION_REGRESSION_HEAD_LAST_LAYER = 'motion_regression_head_last_layer' + +# ViP-DeepLab +CKPT_NEXT_INSTANCE_DECODER = 'next_instance_decoder' +CKPT_NEXT_INSTANCE_REGRESSION_HEAD_WITHOUT_LAST_LAYER = ( + 'next_instance_regression_head_without_last_layer') +CKPT_NEXT_INSTANCE_REGRESSION_HEAD_LAST_LAYER = ( + 'next_instance_regression_head_last_layer') + +# MaX-DeepLab +CKPT_PIXEL_SPACE_HEAD = 'pixel_space_head' +CKPT_TRANSFORMER_MASK_HEAD = 'transformer_mask_head' +CKPT_TRANSFORMER_CLASS_HEAD = 'transformer_class_head' +CKPT_PIXEL_SPACE_FEATURE_BATCH_NORM = 'pixel_space_feature_batch_norm' +CKPT_PIXEL_SPACE_MASK_BATCH_NORM = 'pixel_space_mask_batch_norm' + +# Supported Tasks +TASK_PANOPTIC_SEGMENTATION = 'panoptic_segmentation' +TASK_INSTANCE_SEGMENTATION = 'instance_segmentation' +TASK_VIDEO_PANOPTIC_SEGMENTATION = 'video_panoptic_segmentation' +TASK_DEPTH_AWARE_VIDEO_PANOPTIC_SEGMENTATION = ( + 'depth_aware_video_panoptic_segmentation') diff --git a/common_test.py b/common_test.py new file mode 100644 index 0000000000000000000000000000000000000000..54587e52fc6555ffa20146b55dfb8615c8132877 --- /dev/null +++ b/common_test.py @@ -0,0 +1,74 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
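The `KEY_*` constants above define the tf.Example feature schema that DeepLab2's data pipelines read. A sketch of writing one record under that schema (the byte payloads, filename, and sizes are placeholders, not real Cityscapes data):

```python
import tensorflow as tf

from deeplab2 import common


def _bytes_feature(value):
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _int64_feature(value):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


example = tf.train.Example(features=tf.train.Features(feature={
    common.KEY_ENCODED_IMAGE: _bytes_feature(b'<png bytes>'),
    common.KEY_ENCODED_LABEL: _bytes_feature(b'<png bytes>'),
    common.KEY_IMAGE_FILENAME: _bytes_feature(b'frankfurt_000000_000294'),
    common.KEY_IMAGE_FORMAT: _bytes_feature(b'png'),
    common.KEY_IMAGE_HEIGHT: _int64_feature(1024),
    common.KEY_IMAGE_WIDTH: _int64_feature(2048),
}))
serialized = example.SerializeToString()
```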
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for common.py.""" +import tensorflow as tf + +from deeplab2 import common + + +class CommonTest(tf.test.TestCase): + + def test_constants_keys(self): + self.assertEqual(common.PRED_PANOPTIC_KEY, 'panoptic_pred') + self.assertEqual(common.PRED_SEMANTIC_KEY, 'semantic_pred') + self.assertEqual(common.PRED_INSTANCE_CENTER_KEY, 'instance_center_pred') + self.assertEqual(common.PRED_INSTANCE_KEY, 'instance_pred') + + self.assertEqual(common.PRED_SEMANTIC_LOGITS_KEY, 'semantic_logits') + self.assertEqual(common.PRED_CENTER_HEATMAP_KEY, 'center_heatmap') + self.assertEqual(common.PRED_OFFSET_MAP_KEY, 'offset_map') + self.assertEqual(common.PRED_FRAME_OFFSET_MAP_KEY, 'frame_offset_map') + + self.assertEqual(common.GT_PANOPTIC_KEY, 'panoptic_gt') + self.assertEqual(common.GT_SEMANTIC_KEY, 'semantic_gt') + self.assertEqual(common.GT_INSTANCE_CENTER_KEY, 'instance_center_gt') + self.assertEqual(common.GT_FRAME_OFFSET_KEY, 'frame_offset_gt') + self.assertEqual(common.GT_INSTANCE_REGRESSION_KEY, + 'instance_regression_gt') + self.assertEqual(common.GT_PANOPTIC_RAW, 'panoptic_raw') + self.assertEqual(common.GT_SEMANTIC_RAW, 'semantic_raw') + self.assertEqual(common.GT_SIZE_RAW, 'size_raw') + + self.assertEqual(common.SEMANTIC_LOSS_WEIGHT_KEY, 'semantic_loss_weight') + self.assertEqual(common.CENTER_LOSS_WEIGHT_KEY, 'center_loss_weight') + self.assertEqual(common.REGRESSION_LOSS_WEIGHT_KEY, + 'regression_loss_weight') + self.assertEqual(common.FRAME_REGRESSION_LOSS_WEIGHT_KEY, + 'frame_regression_loss_weight') + + self.assertEqual(common.RESIZED_IMAGE, 'resized_image') + self.assertEqual(common.IMAGE, 'image') + self.assertEqual(common.IMAGE_NAME, 'image_name') + self.assertEqual(common.SEQUENCE_ID, 'sequence_id') + + self.assertEqual(common.KEY_FRAME_ID, 'video/frame_id') + self.assertEqual(common.KEY_SEQUENCE_ID, 'video/sequence_id') + self.assertEqual(common.KEY_LABEL_FORMAT, 'image/segmentation/class/format') + self.assertEqual(common.KEY_ENCODED_PREV_LABEL, + 'prev_image/segmentation/class/encoded') + self.assertEqual(common.KEY_ENCODED_LABEL, + 'image/segmentation/class/encoded') + self.assertEqual(common.KEY_IMAGE_CHANNELS, 'image/channels') + self.assertEqual(common.KEY_IMAGE_WIDTH, 'image/width') + self.assertEqual(common.KEY_IMAGE_HEIGHT, 'image/height') + self.assertEqual(common.KEY_IMAGE_FORMAT, 'image/format') + self.assertEqual(common.KEY_IMAGE_FILENAME, 'image/filename') + self.assertEqual(common.KEY_ENCODED_PREV_IMAGE, 'prev_image/encoded') + self.assertEqual(common.KEY_ENCODED_IMAGE, 'image/encoded') + + +if __name__ == '__main__': + tf.test.main() diff --git a/compile.sh b/compile.sh new file mode 100644 index 0000000000000000000000000000000000000000..2afdcf2afc04835e81bc57f877a65bc6903d1ba1 --- /dev/null +++ b/compile.sh @@ -0,0 +1,114 @@ +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Quick-start command line to set up deeplab2 (Linux only).
+# Example command to run:
+#   deeplab2/compile.sh [cpu|gpu]
+#
+# This script assumes the following folder structure:
+#
+#   + root
+#     + deeplab2
+#     + models
+#       + orbit
+#     + cocoapi
+#       + PythonAPI
+#
+# The script also assumes that `protoc` can be accessed from the command
+# line.
+
+#!/bin/bash
+
+set -e
+
+# cpu or gpu
+CONFIG="cpu"
+
+function tolower() {
+  echo "${1,,}"
+}
+
+if [[ ! -z "$1" ]]
+then
+  echo "Setting configuration from argument($1)..."
+  CONFIG=$(tolower "$1")
+  if [ "$CONFIG" != "cpu" ] && [ "$CONFIG" != "gpu" ]
+  then
+    echo "Configuration must be either \"cpu\" or \"gpu\", exiting..."
+    exit 1
+  fi
+fi
+
+echo "Running configuration with $CONFIG."
+
+# Protobuf compilation
+# Replace `protoc` with `${PATH_TO_PROTOC}` if the protobuf compiler was
+# downloaded from the web.
+echo "-----------------------------------------------------------------------"
+echo "Compiling protobuf..."
+echo "-----------------------------------------------------------------------"
+protoc deeplab2/*.proto --python_out=.
+
+# Compile custom ops
+# See details in
+# https://www.tensorflow.org/guide/create_op#compile_the_op_using_your_system_compiler_tensorflow_binary_installation
+TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
+TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
+OP_NAME='deeplab2/tensorflow_ops/kernels/merge_semantic_and_instance_maps_op'
+
+if [ "$CONFIG" == "cpu" ]
+then
+  # CPU
+  echo "-----------------------------------------------------------------------"
+  echo "Compiling the custom cc op: merge_semantic_and_instance_maps_op (CPU)..."
+  echo "-----------------------------------------------------------------------"
+  g++ -std=c++14 -shared \
+    ${OP_NAME}.cc ${OP_NAME}_kernel.cc -o ${OP_NAME}.so -fPIC ${TF_CFLAGS[@]} ${TF_LFLAGS[@]} -O2
+else
+  # GPU
+  # (https://www.tensorflow.org/guide/create_op#compiling_the_kernel_for_the_gpu_device)
+  echo "-----------------------------------------------------------------------"
+  echo "Compiling the custom cc op: merge_semantic_and_instance_maps_op (GPU)..."
+  echo "-----------------------------------------------------------------------"
+  nvcc -std=c++14 -c -o ${OP_NAME}_kernel.cu.o \
+    ${OP_NAME}_kernel.cu.cc \
+    ${TF_CFLAGS[@]} -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC --expt-relaxed-constexpr
+
+  g++ -std=c++14 -shared -o ${OP_NAME}.so ${OP_NAME}.cc ${OP_NAME}_kernel.cc \
+    ${OP_NAME}_kernel.cu.o ${TF_CFLAGS[@]} -fPIC -lcudart ${TF_LFLAGS[@]}
+fi
+
+# PYTHONPATH
+export PYTHONPATH=$PYTHONPATH:`pwd`:`pwd`/models:`pwd`/cocoapi/PythonAPI
+
+# Run tests
+echo "-----------------------------------------------------------------------"
+echo "Running tests for merge_semantic_and_instance_maps_op..."
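Once the g++/nvcc invocations above succeed, the kernel can be loaded from Python with `tf.load_op_library`. A sketch under the file layout assumed by this script; the exact Python wrapper and op signature live in `deeplab2/tensorflow_ops/python`, so treat the attribute name below as an assumption:

```python
import tensorflow as tf

# Shared library produced by the compile step above.
_SO_PATH = ('deeplab2/tensorflow_ops/kernels/'
            'merge_semantic_and_instance_maps_op.so')

gen_ops = tf.load_op_library(_SO_PATH)

# The op is exposed as the snake_case form of its C++ name; assumed here to
# be merge_semantic_and_instance_maps. Consult the generated module (or the
# kernel test run below) for the authoritative signature.
merge_fn = gen_ops.merge_semantic_and_instance_maps
```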
+echo "-----------------------------------------------------------------------" +python deeplab2/tensorflow_ops/python/kernel_tests/merge_semantic_and_instance_maps_op_test.py + +# End-to-end tests +echo "-----------------------------------------------------------------------" +echo "Running end-to-end tests..." +echo "-----------------------------------------------------------------------" + +# Model training test (test for custom ops, protobug) +python deeplab2/model/deeplab_test.py + +# Model evaluation test (test for other packages such as orbit, cocoapi, etc) +python deeplab2/trainer/evaluator_test.py + +echo "------------------------" +echo "Done with configuration!" +echo "------------------------" + diff --git a/config.proto b/config.proto new file mode 100644 index 0000000000000000000000000000000000000000..f126375293957817ec9b848327614a7611276969 --- /dev/null +++ b/config.proto @@ -0,0 +1,40 @@ +// Copyright 2021 The Deeplab2 Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package deeplab2; + +import public 'deeplab2/dataset.proto'; +import public 'deeplab2/evaluator.proto'; +import public 'deeplab2/model.proto'; +import public 'deeplab2/trainer.proto'; + +option java_multiple_files = true; + +// Configure experiment options. +message ExperimentOptions { + // Set the experiment name. + optional string experiment_name = 1; + // Set the options for the model. + optional ModelOptions model_options = 2; + // Set the options for the trainer. + optional TrainerOptions trainer_options = 3; + // Set the options for the training dataset. + optional DatasetOptions train_dataset_options = 4; + // Set the options for the evaluator. + optional EvaluatorOptions evaluator_options = 5; + // Set the options for the validation dataset. + optional DatasetOptions eval_dataset_options = 6; +} diff --git a/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_1_os16.textproto b/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_1_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..de31258e40781fc848a5bee1f386091f841c1a87 --- /dev/null +++ b/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_1_os16.textproto @@ -0,0 +1,162 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with Axial-SWideRNet-(1, 1, 1) and output stride 16. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. 
e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# Axial-SWideRNet-(1, 1, 1) applies the axial attention blocks (instead of +# convolutional blocks) to the last two stages of SWideRNet-(1, 1, 1). +# +# For axial attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For SWideRNet, see +# - Liang-Chieh Chen, et al. "Scaling Wide Residual Networks for Panoptic +# Segmentation." arXiv: 2011.11675. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "axial_swidernet" + output_stride: 16 + stem_width_multiplier: 1 + backbone_width_multiplier: 1 + backbone_layer_multiplier: 1 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + aspp_use_only_1x1_proj_conv: true + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + aspp_use_only_1x1_proj_conv: true + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0001 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. 
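Each of these textprotos parses into the `ExperimentOptions` message declared in config.proto above. A sketch of loading one after `compile.sh` has generated the `config_pb2` module and the `${...}` placeholders have been filled in (paths are illustrative):

```python
from google.protobuf import text_format

from deeplab2 import config_pb2

_CONFIG = 'configs/cityscapes/axial_deeplab/axial_swidernet_1_1_1_os16.textproto'

with open(_CONFIG) as f:
  options = text_format.Parse(f.read(), config_pb2.ExperimentOptions())

# Fields mirror the blocks in the textproto.
print(options.model_options.backbone.name)        # axial_swidernet
print(options.trainer_options.solver_options.base_learning_rate)
print(list(options.train_dataset_options.crop_size))  # [1025, 2049]
```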
+ increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_3_os16.textproto b/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_3_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..51c35608431a9a01adaa15851e052711b84497ad --- /dev/null +++ b/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_3_os16.textproto @@ -0,0 +1,162 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with Axial-SWideRNet-(1, 1, 3) and output stride 16. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# Axial-SWideRNet-(1, 1, 3) applies the axial attention blocks (instead of +# convolutional blocks) to the last two stages of SWideRNet-(1, 1, 3). +# +# For axial attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For SWideRNet, see +# - Liang-Chieh Chen, et al. "Scaling Wide Residual Networks for Panoptic +# Segmentation." arXiv: 2011.11675. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). 
+ initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "axial_swidernet" + output_stride: 16 + stem_width_multiplier: 1 + backbone_width_multiplier: 1 + backbone_layer_multiplier: 3 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + aspp_use_only_1x1_proj_conv: true + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + aspp_use_only_1x1_proj_conv: true + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0001 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_4.5_os16.textproto b/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_4.5_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..cb035f2e01ca28995affbf1b9ae57b888e7fe4e9 --- /dev/null +++ b/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_4.5_os16.textproto @@ -0,0 +1,162 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with Axial-SWideRNet-(1, 1, 4.5) and output stride 16. 
+# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# Axial-SWideRNet-(1, 1, 4.5) applies the axial attention blocks (instead of +# convolutional blocks) to the last two stages of SWideRNet-(1, 1, 4.5). +# +# For axial attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For SWideRNet, see +# - Liang-Chieh Chen, et al. "Scaling Wide Residual Networks for Panoptic +# Segmentation." arXiv: 2011.11675. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "axial_swidernet" + output_stride: 16 + stem_width_multiplier: 1 + backbone_width_multiplier: 1 + backbone_layer_multiplier: 4.5 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + aspp_use_only_1x1_proj_conv: true + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + aspp_use_only_1x1_proj_conv: true + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.000075 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. 
+ min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/axial_deeplab/max_deeplab_l_backbone_os16.textproto b/configs/cityscapes/axial_deeplab/max_deeplab_l_backbone_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..42ce9f5074f2568b78eac5ee98c8f1a9abebaa55 --- /dev/null +++ b/configs/cityscapes/axial_deeplab/max_deeplab_l_backbone_os16.textproto @@ -0,0 +1,156 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with MaX-DeepLab-L backbone and output stride 16. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# This script employs the MaX-DeepLab-L backbone (i.e., without the memory +# path in the dual-path transformer blocks) as the network backbone. +# +# For MaX-DeepLab-L, see +# - Huiyu Wang, et al. "MaX-DeepLab: End-to-End Panoptic Segmentation with +# Mask Transformers." In CVPR, 2021. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). 
+ initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "max_deeplab_l_backbone" + output_stride: 16 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + aspp_use_only_1x1_proj_conv: true + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + aspp_use_only_1x1_proj_conv: true + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.000075 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/axial_deeplab/max_deeplab_s_backbone_os16.textproto b/configs/cityscapes/axial_deeplab/max_deeplab_s_backbone_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..f4f75f21ac5c348d26d9a7a85c78c253c43656a2 --- /dev/null +++ b/configs/cityscapes/axial_deeplab/max_deeplab_s_backbone_os16.textproto @@ -0,0 +1,156 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with MaX-DeepLab-S backbone and output stride 16. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. 
+# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# This script employs the MaX-DeepLab-S backbone (i.e., without the memory +# path in the dual-path transformer blocks) as the network backbone. +# +# For MaX-DeepLab-S, see +# - Huiyu Wang, et al. "MaX-DeepLab: End-to-End Panoptic Segmentation with +# Mask Transformers." In CVPR, 2021. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "max_deeplab_s_backbone" + output_stride: 16 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + aspp_use_only_1x1_proj_conv: true + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + aspp_use_only_1x1_proj_conv: true + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0001 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. 
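The `center_score_threshold: 0.1` and `nms_kernel: 13` fields repeated in every evaluator_options block drive Panoptic-DeepLab's keypoint-style center selection. A minimal sketch of the idea, not the library's actual implementation (the function and argument names are made up for illustration):

```python
import tensorflow as tf


def select_instance_centers(center_heatmap, nms_kernel=13, threshold=0.1,
                            top_k=200):
  """Keeps local maxima of the predicted center heatmap.

  Args:
    center_heatmap: A [height, width] float32 tensor from the center head.
    nms_kernel: Window size; a pixel survives only if it is the maximum
      within its nms_kernel x nms_kernel neighborhood.
    threshold: Minimum center score, as in center_score_threshold above.
    top_k: Maximum number of instance centers to keep (an assumed default).
  """
  heatmap = center_heatmap[tf.newaxis, :, :, tf.newaxis]
  # Max-pool NMS: compare each pixel against the max of its window.
  pooled = tf.nn.max_pool2d(heatmap, ksize=nms_kernel, strides=1,
                            padding='SAME')
  scores = tf.where(tf.equal(heatmap, pooled), heatmap,
                    tf.zeros_like(heatmap))
  flat_scores = tf.reshape(scores, [-1])
  top_scores, top_indices = tf.math.top_k(flat_scores, k=top_k)
  keep = top_scores > threshold
  return tf.boolean_mask(top_indices, keep), tf.boolean_mask(top_scores, keep)
```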
+ increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/panoptic_deeplab/mobilenet_v3_large_os32.textproto b/configs/cityscapes/panoptic_deeplab/mobilenet_v3_large_os32.textproto new file mode 100644 index 0000000000000000000000000000000000000000..c9bc507dce44a8399f78823410804b5134cfdf59 --- /dev/null +++ b/configs/cityscapes/panoptic_deeplab/mobilenet_v3_large_os32.textproto @@ -0,0 +1,156 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with MobilenetV3-Large model and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# References: +# +# For Mobilenet V3, see +# - Andrew Howard, et al. "Searching for MobileNetV3" In ICCV, 2019. +# +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). 
+ initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "mobilenet_v3_large" + output_stride: 32 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0004 + training_number_of_steps: 30000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/panoptic_deeplab/mobilenet_v3_small_os32.textproto b/configs/cityscapes/panoptic_deeplab/mobilenet_v3_small_os32.textproto new file mode 100644 index 0000000000000000000000000000000000000000..c6c797ee94e4a4b4b2f7aa642c4d0cf87fdf810c --- /dev/null +++ b/configs/cityscapes/panoptic_deeplab/mobilenet_v3_small_os32.textproto @@ -0,0 +1,156 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with MobilenetV3-Small model and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. 
+# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# References: +# +# For Mobilenet V3, see +# - Andrew Howard, et al. "Searching for MobileNetV3" In ICCV, 2019. +# +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "mobilenet_v3_small" + output_stride: 32 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0004 + training_number_of_steps: 30000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. 
For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/panoptic_deeplab/resnet50_beta_os32.textproto b/configs/cityscapes/panoptic_deeplab/resnet50_beta_os32.textproto new file mode 100644 index 0000000000000000000000000000000000000000..431a71e21702edcc953788e65ae3af25e1acd63b --- /dev/null +++ b/configs/cityscapes/panoptic_deeplab/resnet50_beta_os32.textproto @@ -0,0 +1,158 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with ResNet-50-beta model variant and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# The `resnet50_beta` model variant replaces the first 7x7 convolutions in the +# original `resnet50` with three 3x3 convolutions, which is useful for dense +# prediction tasks. +# +# References: +# For resnet-50-beta, see +# https://github.com/tensorflow/models/blob/master/research/deeplab/core/resnet_v1_beta.py +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). 
+ initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "resnet50_beta" + output_stride: 32 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.00025 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/panoptic_deeplab/resnet50_os32_merge_with_pure_tf_func.textproto b/configs/cityscapes/panoptic_deeplab/resnet50_os32_merge_with_pure_tf_func.textproto new file mode 100644 index 0000000000000000000000000000000000000000..49a0f495856554aa623cf9a6711ef50296677355 --- /dev/null +++ b/configs/cityscapes/panoptic_deeplab/resnet50_os32_merge_with_pure_tf_func.textproto @@ -0,0 +1,161 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with ResNet-50 and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. 
+# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# This config provides an example to launch GPU training with +# `merge_semantic_and_instance_with_tf_op` = false, which will NOT invoke +# our efficient merging operation. For faster inference speed, please +# compile the provided `tensorflow_ops` and then set +# `merge_semantic_and_instance_with_tf_op` to true. +# +# References: +# For ResNet, see +# - Kaiming He, et al. "Deep Residual Learning for Image Recognition." +# In CVPR, 2016. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "resnet50" + output_stride: 32 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.00025 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 8 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. 
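For reviewers unfamiliar with the merge step these configs keep referencing: whether performed by the custom op or by pure TF functions, the merged panoptic map encodes each pixel as semantic_label * label_divisor + instance_id. A simplified sketch of just that encoding; the real merge additionally handles thing-vs-stuff classes, stuff_area_limit, and void pixels:

```python
import tensorflow as tf


def encode_panoptic(semantic, instance, label_divisor=1000):
  """Simplified panoptic id encoding used across DeepLab2.

  Args:
    semantic: [height, width] int32 semantic prediction.
    instance: [height, width] int32 instance ids (0 for stuff pixels).
    label_divisor: Dataset-dependent constant; 1000 here is an assumption
      matching common panoptic setups, not a value read from these configs.
  """
  return semantic * label_divisor + instance
```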
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+evaluator_options {
+ continuous_eval_timeout: 43200
+ stuff_area_limit: 2048
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ save_raw_predictions: false
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
diff --git a/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_1_os16.textproto b/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_1_os16.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..944b6650a8128b90eb3382ca2147aca62bc2429c
--- /dev/null
+++ b/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_1_os16.textproto
@@ -0,0 +1,166 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# Panoptic-DeepLab with SWideRNet-SAC-(1, 1, 1) and output stride 16.
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# SWideRNet-SAC-(1, 1, 1) employs the Switchable Atrous Convolution (SAC)
+# in the last stage of the network backbone.
+#
+# References:
+# For SAC, see
+# - Siyuan Qiao, et al. "DetectoRS: Detecting Objects with Recursive
+# Feature Pyramid and Switchable Atrous Convolution." In CVPR, 2021.
+# For SWideRNet, see
+# - Liang-Chieh Chen, et al. "Scaling Wide Residual Networks for
+# Panoptic Segmentation." arXiv: 2011.11675.
+# For Panoptic-DeepLab, see
+# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast
+# Baseline for Bottom-Up Panoptic Segmentation." In CVPR, 2020.
+
+# Use a unique experiment_name for each experiment.
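+# Note: the "(1, 1, 1)" suffix corresponds to the three backbone scaling
+# factors configured below, in order: stem_width_multiplier,
+# backbone_width_multiplier, and backbone_layer_multiplier.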
+experiment_name: "${EXPERIMENT_NAME}" +model_options { + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "swidernet" + output_stride: 16 + stem_width_multiplier: 1 + backbone_width_multiplier: 1 + backbone_layer_multiplier: 1 + use_sac_beyond_stride: 32 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0001 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_3_os16.textproto b/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_3_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..8eec2ad45fb69717c1c216e5f07f86a236c7493d --- /dev/null +++ b/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_3_os16.textproto @@ -0,0 +1,167 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with SWideRNet-SAC-(1, 1, 3) and output stride 16. 
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# SWideRNet-SAC-(1, 1, 3) employs the Switchable Atrous Convolution (SAC)
+# in the last stage of the network backbone.
+#
+# References:
+# For SAC, see
+# - Siyuan Qiao, et al. "DetectoRS: Detecting Objects with Recursive
+# Feature Pyramid and Switchable Atrous Convolution." In CVPR, 2021.
+# For SWideRNet, see
+# - Liang-Chieh Chen, et al. "Scaling Wide Residual Networks for
+# Panoptic Segmentation." arXiv: 2011.11675.
+# For Panoptic-DeepLab, see
+# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast
+# Baseline for Bottom-Up Panoptic Segmentation." In CVPR, 2020.
+
+# Use a unique experiment_name for each experiment.
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "swidernet"
+ output_stride: 16
+ stem_width_multiplier: 1
+ backbone_width_multiplier: 1
+ backbone_layer_multiplier: 3
+ use_sac_beyond_stride: 32
+ drop_path_keep_prob: 0.8
+ drop_path_schedule: "linear"
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 6
+ atrous_rates: 12
+ atrous_rates: 18
+ }
+ panoptic_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 6
+ atrous_rates: 12
+ atrous_rates: 18
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 19
+ head_channels: 256
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 1000
+ save_summaries_steps: 100
+ steps_per_loop: 100
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.0001
+ training_number_of_steps: 60000
+ }
+}
+train_dataset_options {
+ dataset: "cityscapes_panoptic"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 1025
+ crop_size: 2049
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2"
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+eval_dataset_options {
+ dataset: "cityscapes_panoptic"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}"
+ batch_size: 1
+ crop_size: 1025
+ crop_size: 2049
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ # Add options to make the evaluation loss comparable to the training loss.
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+evaluator_options {
+ continuous_eval_timeout: 43200
+ stuff_area_limit: 2048
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ save_raw_predictions: false
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
+
diff --git a/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_4.5_os16.textproto b/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_4.5_os16.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..fcda36d90977edd7164ccc0989c62ed796955d56
--- /dev/null
+++ b/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_4.5_os16.textproto
@@ -0,0 +1,166 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# Panoptic-DeepLab with SWideRNet-SAC-(1, 1, 4.5) and output stride 16.
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# SWideRNet-SAC-(1, 1, 4.5) employs the Switchable Atrous Convolution (SAC)
+# in the last stage of the network backbone.
+#
+# References:
+# For SAC, see
+# - Siyuan Qiao, et al. "DetectoRS: Detecting Objects with Recursive
+# Feature Pyramid and Switchable Atrous Convolution." In CVPR, 2021.
+# For SWideRNet, see
+# - Liang-Chieh Chen, et al. "Scaling Wide Residual Networks for
+# Panoptic Segmentation." arXiv: 2011.11675.
+# For Panoptic-DeepLab, see
+# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast
+# Baseline for Bottom-Up Panoptic Segmentation." In CVPR, 2020.
+
+# Use a unique experiment_name for each experiment.
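+# A minimal launch sketch, assuming the repository's standard trainer entry
+# point and flags (see the g3doc documentation for the authoritative command;
+# ${MODEL_DIR} is a placeholder for your experiment directory):
+#
+#   python trainer/train.py \
+#     --config_file=configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_4.5_os16.textproto \
+#     --mode=train --model_dir=${MODEL_DIR} --num_gpus=1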
+experiment_name: "${EXPERIMENT_NAME}" +model_options { + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "swidernet" + output_stride: 16 + stem_width_multiplier: 1 + backbone_width_multiplier: 1 + backbone_layer_multiplier: 4.5 + use_sac_beyond_stride: 32 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.00025 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/panoptic_deeplab/wide_resnet41_os16.textproto b/configs/cityscapes/panoptic_deeplab/wide_resnet41_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..f04b18b78c2c1aa56182cc0e5d2950389be2d15b --- /dev/null +++ b/configs/cityscapes/panoptic_deeplab/wide_resnet41_os16.textproto @@ -0,0 +1,162 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with Wide ResNet-41 and output stride 16. 
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# Wide ResNet-41 improves over Wide ResNet-38 by (1) removing the last residual
+# block, and (2) repeating the second-to-last residual block two more times.
+#
+# References:
+# For Wide ResNet-38, see
+# - Zifeng Wu, et al. "Wider or deeper: Revisiting the ResNet model for
+# visual recognition." Pattern Recognition, 2019.
+# For Wide ResNet-41, see
+# - Liang-Chieh Chen, et al. "Naive-Student: Leveraging Semi-Supervised
+# Learning in Video Sequences for Urban Scene Segmentation." In ECCV, 2020.
+# For Panoptic-DeepLab, see
+# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast
+# Baseline for Bottom-Up Panoptic Segmentation." In CVPR, 2020.
+
+# Use a unique experiment_name for each experiment.
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "wide_resnet41"
+ output_stride: 16
+ drop_path_keep_prob: 0.8
+ drop_path_schedule: "linear"
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 6
+ atrous_rates: 12
+ atrous_rates: 18
+ }
+ panoptic_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 6
+ atrous_rates: 12
+ atrous_rates: 18
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 19
+ head_channels: 256
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 1000
+ save_summaries_steps: 100
+ steps_per_loop: 100
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.0001
+ training_number_of_steps: 60000
+ }
+}
+train_dataset_options {
+ dataset: "cityscapes_panoptic"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 1025
+ crop_size: 2049
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2"
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+eval_dataset_options {
+ dataset: "cityscapes_panoptic"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}"
+ batch_size: 1
+ crop_size: 1025
+ crop_size: 2049
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ # Add options to make the evaluation loss comparable to the training loss.
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+evaluator_options {
+ continuous_eval_timeout: 43200
+ stuff_area_limit: 2048
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ save_raw_predictions: false
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
diff --git a/configs/cityscapes_dvps/vip_deeplab/resnet50_beta_os32.textproto b/configs/cityscapes_dvps/vip_deeplab/resnet50_beta_os32.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..825671ab92b9d1fcb4abdc132d1f9f8f1e1cba05
--- /dev/null
+++ b/configs/cityscapes_dvps/vip_deeplab/resnet50_beta_os32.textproto
@@ -0,0 +1,168 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# ViP-DeepLab with ResNet-50-beta model variant and output stride 32.
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# The `resnet50_beta` model variant replaces the first 7x7 convolutions in the
+# original `resnet50` with three 3x3 convolutions, which is useful for dense
+# prediction tasks.
+#
+# References:
+# For resnet-50-beta, see
+# https://github.com/tensorflow/models/blob/master/research/deeplab/core/resnet_v1_beta.py
+# For ViP-DeepLab, see
+# - Siyuan Qiao, et al. "ViP-DeepLab: Learning Visual Perception with
+# Depth-aware Video Panoptic Segmentation." In CVPR, 2021.
+
+# Use a unique experiment_name for each experiment.
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ # Update the path to the initial checkpoint (e.g., ImageNet
+ # pretrained checkpoint).
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "resnet50_beta"
+ output_stride: 32
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ vip_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ next_regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 19
+ head_channels: 256
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 1000
+ save_summaries_steps: 100
+ steps_per_loop: 100
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ next_regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.00003125
+ training_number_of_steps: 60000
+ }
+}
+train_dataset_options {
+ dataset: "cityscapes_dvps"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 4
+ crop_size: 513
+ crop_size: 1025
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+ use_next_frame: true
+}
+eval_dataset_options {
+ dataset: "cityscapes_dvps"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}"
+ batch_size: 1
+ crop_size: 1025
+ crop_size: 2049
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ # Add options to make the evaluation loss comparable to the training loss.
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+ use_next_frame: true
+}
+evaluator_options {
+ continuous_eval_timeout: 43200
+ stuff_area_limit: 2048
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ save_raw_predictions: false
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
diff --git a/configs/coco/max_deeplab/max_deeplab_s_os16_res1025_100k.textproto b/configs/coco/max_deeplab/max_deeplab_s_os16_res1025_100k.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..aa9059adb8101283312cb39535837258e810c411
--- /dev/null
+++ b/configs/coco/max_deeplab/max_deeplab_s_os16_res1025_100k.textproto
@@ -0,0 +1,137 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# MaX-DeepLab-S with resolution 1025x1025 and 100k training steps.
+# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +######################################################################### +# +# MaX-DeepLab-S replaces the last two stages of ResNet-50-beta with axial- +# attention blocks and applies a small dual-path transformer. +# +# For axial-attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For MaX-DeepLab, see +# - Huiyu Wang, et al. "MaX-DeepLab: End-to-End Panoptic Segmentation with Mask +# Transformers." In CVPR, 2021. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "max_deeplab_s" + output_stride: 16 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "feature_semantic" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + max_deeplab { + pixel_space_head { + output_channels: 128 + head_channels: 256 + } + auxiliary_low_level { + feature_key: "res3" + channels_project: 64 + } + auxiliary_low_level { + feature_key: "res2" + channels_project: 32 + } + auxiliary_semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + } + pq_style_loss { + weight: 3.0 + } + mask_id_cross_entropy_loss { + weight: 0.3 + } + instance_discrimination_loss { + weight: 1.0 + } + } + solver_options { + base_learning_rate: 0.001 + training_number_of_steps: 100000 + warmup_steps: 5000 + backbone_learning_rate_multiplier: 0.1 + } +} +train_dataset_options { + dataset: "coco_panoptic" + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 1025 + crop_size: 1025 + min_resize_value: 1025 + max_resize_value: 1025 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 1025 + min_resize_value: 1025 + max_resize_value: 1025 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. 
+ thing_id_mask_annotations: true +} +evaluator_options { + continuous_eval_timeout: 43200 + thing_area_limit: 256 + stuff_area_limit: 4096 + transformer_class_confidence_threshold: 0.7 + pixel_confidence_threshold: 0.4 + save_predictions: true + save_raw_predictions: false + # Some options are inapplicable to MaX-DeepLab, including nms_kernel, + # merge_semantic_and_instance_with_tf_op, center_score_threshold, + # keep_k_centers, add_flipped_images, and eval_scales. +} diff --git a/configs/coco/max_deeplab/max_deeplab_s_os16_res1025_200k.textproto b/configs/coco/max_deeplab/max_deeplab_s_os16_res1025_200k.textproto new file mode 100644 index 0000000000000000000000000000000000000000..a15a5b6dbd139895277a6c515e87f518853415b1 --- /dev/null +++ b/configs/coco/max_deeplab/max_deeplab_s_os16_res1025_200k.textproto @@ -0,0 +1,137 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# MaX-DeepLab-S with resolution 1025x1025 and 200k training steps. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +######################################################################### +# +# MaX-DeepLab-S replaces the last two stages of ResNet-50-beta with axial- +# attention blocks and applies a small dual-path transformer. +# +# For axial-attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For MaX-DeepLab, see +# - Huiyu Wang, et al. "MaX-DeepLab: End-to-End Panoptic Segmentation with Mask +# Transformers." In CVPR, 2021. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "max_deeplab_s" + output_stride: 16 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "feature_semantic" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + max_deeplab { + pixel_space_head { + output_channels: 128 + head_channels: 256 + } + auxiliary_low_level { + feature_key: "res3" + channels_project: 64 + } + auxiliary_low_level { + feature_key: "res2" + channels_project: 32 + } + auxiliary_semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + } + pq_style_loss { + weight: 3.0 + } + mask_id_cross_entropy_loss { + weight: 0.3 + } + instance_discrimination_loss { + weight: 1.0 + } + } + solver_options { + base_learning_rate: 0.001 + training_number_of_steps: 200000 + warmup_steps: 5000 + backbone_learning_rate_multiplier: 0.1 + } +} +train_dataset_options { + dataset: "coco_panoptic" + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. 
+ batch_size: 64 + crop_size: 1025 + crop_size: 1025 + min_resize_value: 1025 + max_resize_value: 1025 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 1025 + min_resize_value: 1025 + max_resize_value: 1025 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +evaluator_options { + continuous_eval_timeout: 43200 + thing_area_limit: 256 + stuff_area_limit: 4096 + transformer_class_confidence_threshold: 0.7 + pixel_confidence_threshold: 0.4 + save_predictions: true + save_raw_predictions: false + # Some options are inapplicable to MaX-DeepLab, including nms_kernel, + # merge_semantic_and_instance_with_tf_op, center_score_threshold, + # keep_k_centers, add_flipped_images, and eval_scales. +} diff --git a/configs/coco/max_deeplab/max_deeplab_s_os16_res641_100k.textproto b/configs/coco/max_deeplab/max_deeplab_s_os16_res641_100k.textproto new file mode 100644 index 0000000000000000000000000000000000000000..c6c385757b16a2e0801f03bf56dcdd2ad78b187d --- /dev/null +++ b/configs/coco/max_deeplab/max_deeplab_s_os16_res641_100k.textproto @@ -0,0 +1,137 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# MaX-DeepLab-S with resolution 641x641 and 100k training steps. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +######################################################################### +# +# MaX-DeepLab-S replaces the last two stages of ResNet-50-beta with axial- +# attention blocks and applies a small dual-path transformer. +# +# For axial-attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For MaX-DeepLab, see +# - Huiyu Wang, et al. "MaX-DeepLab: End-to-End Panoptic Segmentation with Mask +# Transformers." In CVPR, 2021. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). 
+ initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "max_deeplab_s" + output_stride: 16 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "feature_semantic" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + max_deeplab { + pixel_space_head { + output_channels: 128 + head_channels: 256 + } + auxiliary_low_level { + feature_key: "res3" + channels_project: 64 + } + auxiliary_low_level { + feature_key: "res2" + channels_project: 32 + } + auxiliary_semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + } + pq_style_loss { + weight: 3.0 + } + mask_id_cross_entropy_loss { + weight: 0.3 + } + instance_discrimination_loss { + weight: 1.0 + } + } + solver_options { + base_learning_rate: 0.001 + training_number_of_steps: 100000 + warmup_steps: 5000 + backbone_learning_rate_multiplier: 0.1 + } +} +train_dataset_options { + dataset: "coco_panoptic" + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +evaluator_options { + continuous_eval_timeout: 43200 + thing_area_limit: 100 + stuff_area_limit: 1600 + transformer_class_confidence_threshold: 0.7 + pixel_confidence_threshold: 0.4 + save_predictions: true + save_raw_predictions: false + # Some options are inapplicable to MaX-DeepLab, including nms_kernel, + # merge_semantic_and_instance_with_tf_op, center_score_threshold, + # keep_k_centers, add_flipped_images, and eval_scales. +} diff --git a/configs/coco/max_deeplab/max_deeplab_s_os16_res641_200k.textproto b/configs/coco/max_deeplab/max_deeplab_s_os16_res641_200k.textproto new file mode 100644 index 0000000000000000000000000000000000000000..3261da40abb2be7c980760c70be183dc63e7255b --- /dev/null +++ b/configs/coco/max_deeplab/max_deeplab_s_os16_res641_200k.textproto @@ -0,0 +1,137 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# MaX-DeepLab-S with resolution 641x641 and 200k training steps. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. 
e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +######################################################################### +# +# MaX-DeepLab-S replaces the last two stages of ResNet-50-beta with axial- +# attention blocks and applies a small dual-path transformer. +# +# For axial-attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For MaX-DeepLab, see +# - Huiyu Wang, et al. "MaX-DeepLab: End-to-End Panoptic Segmentation with Mask +# Transformers." In CVPR, 2021. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "max_deeplab_s" + output_stride: 16 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "feature_semantic" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + max_deeplab { + pixel_space_head { + output_channels: 128 + head_channels: 256 + } + auxiliary_low_level { + feature_key: "res3" + channels_project: 64 + } + auxiliary_low_level { + feature_key: "res2" + channels_project: 32 + } + auxiliary_semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + } + pq_style_loss { + weight: 3.0 + } + mask_id_cross_entropy_loss { + weight: 0.3 + } + instance_discrimination_loss { + weight: 1.0 + } + } + solver_options { + base_learning_rate: 0.001 + training_number_of_steps: 200000 + warmup_steps: 5000 + backbone_learning_rate_multiplier: 0.1 + } +} +train_dataset_options { + dataset: "coco_panoptic" + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +evaluator_options { + continuous_eval_timeout: 43200 + thing_area_limit: 100 + stuff_area_limit: 1600 + transformer_class_confidence_threshold: 0.7 + pixel_confidence_threshold: 0.4 + save_predictions: true + save_raw_predictions: false + # Some options are inapplicable to MaX-DeepLab, including nms_kernel, + # merge_semantic_and_instance_with_tf_op, center_score_threshold, + # keep_k_centers, add_flipped_images, and eval_scales. 
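+ # For scale: these area limits track the evaluation resolution. The res1025
+ # configs above use thing_area_limit: 256 and stuff_area_limit: 4096, i.e.
+ # roughly (1025/641)^2 ~= 2.56x the 100 and 1600 used here at res641.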
+} diff --git a/configs/coco/max_deeplab/max_deeplab_s_os16_res641_400k.textproto b/configs/coco/max_deeplab/max_deeplab_s_os16_res641_400k.textproto new file mode 100644 index 0000000000000000000000000000000000000000..6e5cb1a9adb6587a38041f085073751910d52508 --- /dev/null +++ b/configs/coco/max_deeplab/max_deeplab_s_os16_res641_400k.textproto @@ -0,0 +1,137 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# MaX-DeepLab-S with resolution 641x641 and 400k training steps. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +######################################################################### +# +# MaX-DeepLab-S replaces the last two stages of ResNet-50-beta with axial- +# attention blocks and applies a small dual-path transformer. +# +# For axial-attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For MaX-DeepLab, see +# - Huiyu Wang, et al. "MaX-DeepLab: End-to-End Panoptic Segmentation with Mask +# Transformers." In CVPR, 2021. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "max_deeplab_s" + output_stride: 16 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "feature_semantic" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + max_deeplab { + pixel_space_head { + output_channels: 128 + head_channels: 256 + } + auxiliary_low_level { + feature_key: "res3" + channels_project: 64 + } + auxiliary_low_level { + feature_key: "res2" + channels_project: 32 + } + auxiliary_semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + } + pq_style_loss { + weight: 3.0 + } + mask_id_cross_entropy_loss { + weight: 0.3 + } + instance_discrimination_loss { + weight: 1.0 + } + } + solver_options { + base_learning_rate: 0.001 + training_number_of_steps: 400000 + warmup_steps: 5000 + backbone_learning_rate_multiplier: 0.1 + } +} +train_dataset_options { + dataset: "coco_panoptic" + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. 
+ file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +evaluator_options { + continuous_eval_timeout: 43200 + thing_area_limit: 100 + stuff_area_limit: 1600 + transformer_class_confidence_threshold: 0.7 + pixel_confidence_threshold: 0.4 + save_predictions: true + save_raw_predictions: false + # Some options are inapplicable to MaX-DeepLab, including nms_kernel, + # merge_semantic_and_instance_with_tf_op, center_score_threshold, + # keep_k_centers, add_flipped_images, and eval_scales. +} diff --git a/configs/coco/panoptic_deeplab/resnet50_beta_os16.textproto b/configs/coco/panoptic_deeplab/resnet50_beta_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..08c575651e3975b7f84a5b18d49b9ff3e6f11711 --- /dev/null +++ b/configs/coco/panoptic_deeplab/resnet50_beta_os16.textproto @@ -0,0 +1,159 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with ResNet-50-beta model variant and output stride 16. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# The `resnet50_beta` model variant replaces the first 7x7 convolutions in the +# original `resnet50` with three 3x3 convolutions, which is useful for dense +# prediction tasks. +# +# References: +# For resnet-50-beta, see +# https://github.com/tensorflow/models/blob/master/research/deeplab/core/resnet_v1_beta.py +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). 
+ initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "resnet50_beta" + output_stride: 16 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0005 + training_number_of_steps: 200000 + warmup_steps: 2000 + } +} +train_dataset_options { + dataset: "coco_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 4096 + center_score_threshold: 0.1 + nms_kernel: 41 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} + diff --git a/configs/coco/panoptic_deeplab/resnet50_beta_os32.textproto b/configs/coco/panoptic_deeplab/resnet50_beta_os32.textproto new file mode 100644 index 0000000000000000000000000000000000000000..f4ad475800f8ef8ddc42123bd50cc8689f244dae --- /dev/null +++ b/configs/coco/panoptic_deeplab/resnet50_beta_os32.textproto @@ -0,0 +1,158 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with ResNet-50-beta model variant and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. 
+# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# The `resnet50_beta` model variant replaces the first 7x7 convolutions in the +# original `resnet50` with three 3x3 convolutions, which is useful for dense +# prediction tasks. +# +# References: +# For resnet-50-beta, see +# https://github.com/tensorflow/models/blob/master/research/deeplab/core/resnet_v1_beta.py +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "resnet50_beta" + output_stride: 32 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0005 + training_number_of_steps: 200000 + warmup_steps: 2000 + } +} +train_dataset_options { + dataset: "coco_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + # Add options to make the evaluation loss comparable to the training loss. 
+ increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 4096 + center_score_threshold: 0.1 + nms_kernel: 41 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/coco/panoptic_deeplab/resnet50_os16.textproto b/configs/coco/panoptic_deeplab/resnet50_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..c8749fbcd795a4346cfc4c893682535ff4bd1454 --- /dev/null +++ b/configs/coco/panoptic_deeplab/resnet50_os16.textproto @@ -0,0 +1,155 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with ResNet-50 and output stride 16. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# References: +# For ResNet, see +# - Kaiming He, et al. "Deep Residual Learning for Image Recognition." +# In CVPR, 2016. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). 
+ initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "resnet50" + output_stride: 16 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0005 + training_number_of_steps: 200000 + warmup_steps: 2000 + } +} +train_dataset_options { + dataset: "coco_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 4096 + center_score_threshold: 0.1 + nms_kernel: 41 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/coco/panoptic_deeplab/resnet50_os32.textproto b/configs/coco/panoptic_deeplab/resnet50_os32.textproto new file mode 100644 index 0000000000000000000000000000000000000000..5ebab1b352c1574c9ed5410a617905598f409c88 --- /dev/null +++ b/configs/coco/panoptic_deeplab/resnet50_os32.textproto @@ -0,0 +1,157 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with ResNet-50 and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. 
+# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# References: +# For ResNet, see +# - Kaiming He, et al. "Deep Residual Learning for Image Recognition." +# In CVPR, 2016. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "resnet50" + output_stride: 32 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0005 + training_number_of_steps: 200000 + warmup_steps: 2000 + } +} +train_dataset_options { + dataset: "coco_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 4096 + center_score_threshold: 0.1 + nms_kernel: 41 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. 
For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} + diff --git a/configs/example/example_cityscapes_deeplabv3.textproto b/configs/example/example_cityscapes_deeplabv3.textproto new file mode 100644 index 0000000000000000000000000000000000000000..4e29b9a745240114bf1baf85fb513424e14c10fe --- /dev/null +++ b/configs/example/example_cityscapes_deeplabv3.textproto @@ -0,0 +1,25 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions + +model_options { + decoder { + feature_key: "res5" + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + + backbone { + name: "resnet50" + } + + # Example for cityscapes. + deeplab_v3 { + num_classes: 19 + } +} + +train_dataset_options { + crop_size: 1025 + crop_size: 2049 +} diff --git a/configs/example/example_cityscapes_deeplabv3_mv3l.textproto b/configs/example/example_cityscapes_deeplabv3_mv3l.textproto new file mode 100644 index 0000000000000000000000000000000000000000..f190564ddc683bdbe7c62ddf5df556075b2b5a15 --- /dev/null +++ b/configs/example/example_cityscapes_deeplabv3_mv3l.textproto @@ -0,0 +1,26 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions + +model_options { + decoder { + feature_key: "res5" + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + + backbone { + name: "mobilenet_v3_large" + use_squeeze_and_excite: true + } + + # Example for cityscapes. + deeplab_v3 { + num_classes: 19 + } +} + +train_dataset_options { + crop_size: 1025 + crop_size: 2049 +} diff --git a/configs/example/example_cityscapes_deeplabv3plus.textproto b/configs/example/example_cityscapes_deeplabv3plus.textproto new file mode 100644 index 0000000000000000000000000000000000000000..eb79993563237f5de4f58014a1bfd928eb7c6c83 --- /dev/null +++ b/configs/example/example_cityscapes_deeplabv3plus.textproto @@ -0,0 +1,29 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions + +model_options { + decoder { + feature_key: "res5" + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + + backbone { + name: "resnet50" + } + + deeplab_v3_plus { + low_level { + feature_key: "res2" + channels_project: 48 + } + # Example for cityscapes. + num_classes: 19 + } +} + +train_dataset_options { + crop_size: 1025 + crop_size: 2049 +} diff --git a/configs/example/example_cityscapes_panoptic_deeplab.textproto b/configs/example/example_cityscapes_panoptic_deeplab.textproto new file mode 100644 index 0000000000000000000000000000000000000000..a06b9b696e8c30487b184a9b7b1a3c05c634992a --- /dev/null +++ b/configs/example/example_cityscapes_panoptic_deeplab.textproto @@ -0,0 +1,61 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions + +model_options { + decoder { + feature_key: "res5" + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + + backbone { + name: "resnet50" + } + + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + semantic_head { + # Example for cityscapes. 
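+ # Cityscapes evaluates on 19 trainId semantic classes, hence 19 channels.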
+ output_channels: 19 + head_channels: 256 + } + instance { + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + } +} + +train_dataset_options { + crop_size: 1025 + crop_size: 2049 +} diff --git a/configs/example/example_cityscapes_panoptic_deeplab_mv3l.textproto b/configs/example/example_cityscapes_panoptic_deeplab_mv3l.textproto new file mode 100644 index 0000000000000000000000000000000000000000..7ea7cae2e44d4edb20c3c5c685e6193350458cbe --- /dev/null +++ b/configs/example/example_cityscapes_panoptic_deeplab_mv3l.textproto @@ -0,0 +1,62 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions + +model_options { + decoder { + feature_key: "res5" + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + + backbone { + name: "mobilenet_v3_large" + use_squeeze_and_excite: true + } + + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + semantic_head { + # Example for cityscapes. + output_channels: 19 + head_channels: 256 + } + instance { + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + } +} + +train_dataset_options { + crop_size: 1025 + crop_size: 2049 +} diff --git a/configs/example/example_coco_max_deeplab.textproto b/configs/example/example_coco_max_deeplab.textproto new file mode 100644 index 0000000000000000000000000000000000000000..c53549d77dc60725f5bd1960ebd9e89931b316fc --- /dev/null +++ b/configs/example/example_coco_max_deeplab.textproto @@ -0,0 +1,41 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions + +model_options { + decoder { + feature_key: "feature_semantic" + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + + backbone { + name: "max_deeplab_s" + output_stride: 16 + } + + max_deeplab { + pixel_space_head { + output_channels: 128 + head_channels: 256 + } + auxiliary_low_level { + feature_key: "res3" + channels_project: 64 + } + auxiliary_low_level { + feature_key: "res2" + channels_project: 32 + } + auxiliary_semantic_head { + # Example for COCO. 
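+ # COCO panoptic defines 133 categories; the extra channel covers the
+ # void label, giving 134 output channels.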
+ output_channels: 134 + head_channels: 256 + } + } +} + +train_dataset_options { + crop_size: 65 + crop_size: 65 +} diff --git a/configs/example/example_kitti-step_motion_deeplab.textproto b/configs/example/example_kitti-step_motion_deeplab.textproto new file mode 100644 index 0000000000000000000000000000000000000000..383f8eaac3ba8d538e4578d6aca8f6129cead73f --- /dev/null +++ b/configs/example/example_kitti-step_motion_deeplab.textproto @@ -0,0 +1,60 @@ +# proto-file: deeplab2/model.proto +# proto-message: ModelOptions + +decoder { + feature_key: "res5" + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 +} + +backbone { + name: "resnet50" +} + +# Motion-Deeplab adopts Panoptic-Deeplab for the task of Video Panoptic +# Segmentation or Segmenting and Tracking Every Pixel (STEP). +motion_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + semantic_head { + # Example for KITTI-STEP. + output_channels: 19 + head_channels: 256 + } + instance { + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + motion_head { + output_channels: 2 + head_channels: 32 + } +} diff --git a/configs/kitti/motion_deeplab/resnet50_os32.textproto b/configs/kitti/motion_deeplab/resnet50_os32.textproto new file mode 100644 index 0000000000000000000000000000000000000000..534e6c8b1cc07b200d7d0752f767c17a89fb5ada --- /dev/null +++ b/configs/kitti/motion_deeplab/resnet50_os32.textproto @@ -0,0 +1,168 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Motion-DeepLab with ResNet-50 and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# This config uses the Cityscapes pretrained checkpoint where crowd label is +# kept to pretrain the semantic segmentation branch. Additionally, we perform +# net surgery on the first 3x3 convolution to take two-frame inputs. +# +# References: +# For ResNet, see +# - Kaiming He, et al. "Deep Residual Learning for Image Recognition." +# In CVPR, 2016. +# For Motion-DeepLab, see +# - Mark Weber, et al. "STEP: Segmenting and Tracking Every Pixel." +# arXiv: 2102.11859. + +# Use a unique experiment_name for each experiment. 
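The header above notes that Motion-DeepLab reuses an RGB-pretrained backbone by performing net surgery on its first convolution so it can consume two concatenated frames. A minimal sketch of one common way to do this, assuming a kernel laid out as [height, width, in_channels, out_channels]; the repo's own surgery utilities may differ:

```python
import numpy as np

def expand_first_conv_kernel(kernel: np.ndarray, num_frames: int = 2) -> np.ndarray:
  """Tiles an RGB-pretrained kernel over the input-channel axis.

  Rescaling by 1 / num_frames keeps the expected activation magnitude
  roughly unchanged when the input becomes a channel-wise concatenation
  of `num_frames` RGB frames.
  """
  tiled = np.concatenate([kernel] * num_frames, axis=2)
  return tiled / num_frames

# A 3x3 stem kernel over RGB becomes a kernel over two stacked RGB frames.
rgb_kernel = np.random.normal(size=(3, 3, 3, 64)).astype(np.float32)
two_frame_kernel = expand_first_conv_kernel(rgb_kernel, num_frames=2)
assert two_frame_kernel.shape == (3, 3, 6, 64)
```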
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ # Update the path to the initial checkpoint (e.g., ImageNet
+ # pretrained checkpoint).
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "resnet50"
+ output_stride: 32
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ motion_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 19
+ head_channels: 256
+ }
+ motion_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 500
+ save_summaries_steps: 100
+ steps_per_loop: 100
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ motion_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.0001
+ training_number_of_steps: 50000
+ }
+}
+train_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 385
+ crop_size: 1249
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+ use_two_frames: true
+}
+eval_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}"
+ batch_size: 1
+ crop_size: 385
+ crop_size: 1249
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ # Add options to make the evaluation loss comparable to the training loss.
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+ use_two_frames: true
+}
+evaluator_options {
+ continuous_eval_timeout: 21600
+ stuff_area_limit: 0
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
diff --git a/configs/kitti/motion_deeplab/resnet50_os32_trainval.textproto b/configs/kitti/motion_deeplab/resnet50_os32_trainval.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..2fb6c6a998d2af9cac805f31c1e9c3c1b4a049e6
--- /dev/null
+++ b/configs/kitti/motion_deeplab/resnet50_os32_trainval.textproto
@@ -0,0 +1,169 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# Motion-DeepLab with ResNet-50 and output stride 32.
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# This config uses the Cityscapes pretrained checkpoint where crowd label is
+# kept to pretrain the semantic segmentation branch. Additionally, we perform
+# net surgery on the first 3x3 convolution to take two-frame inputs.
+#
+# References:
+# For ResNet, see
+# - Kaiming He, et al. "Deep Residual Learning for Image Recognition."
+# In CVPR, 2016.
+# For Motion-DeepLab, see
+# - Mark Weber, et al. "STEP: Segmenting and Tracking Every Pixel."
+# arXiv: 2102.11859.
+
+# Use a unique experiment_name for each experiment.
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ # Update the path to the initial checkpoint (e.g., ImageNet
+ # pretrained checkpoint).
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "resnet50"
+ output_stride: 32
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ motion_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 19
+ head_channels: 256
+ }
+ motion_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 500
+ save_summaries_steps: 100
+ steps_per_loop: 100
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ motion_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.00001
+ training_number_of_steps: 50000
+ }
+}
+train_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ file_pattern: "${VAL_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 385
+ crop_size: 1249
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+ use_two_frames: true
+}
+eval_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 385 + crop_size: 1249 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 + use_two_frames: true +} +evaluator_options { + continuous_eval_timeout: 21600 + stuff_area_limit: 0 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/kitti/panoptic_deeplab/resnet50_os32.textproto b/configs/kitti/panoptic_deeplab/resnet50_os32.textproto new file mode 100644 index 0000000000000000000000000000000000000000..7fcf81a82bc09ce5deab4534349ac8156b0197ca --- /dev/null +++ b/configs/kitti/panoptic_deeplab/resnet50_os32.textproto @@ -0,0 +1,159 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with ResNet-50 and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# This config uses the Cityscapes pretrained checkpoint where crowd label is +# kept to pretrain the semantic segmentation branch. +# +# References: +# For ResNet, see +# - Kaiming He, et al. "Deep Residual Learning for Image Recognition." +# In CVPR, 2016. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. 
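The ${EXPERIMENT_NAME}-style fields in these configs are placeholders that must be filled in before use. Since the syntax happens to match Python's string.Template, one convenient way to materialize a config is sketched below; the helper and all paths are illustrative, not part of the repo:

```python
import string

def fill_config_template(template_path: str, output_path: str, **values) -> None:
  """Substitutes ${...} placeholders in a config template."""
  with open(template_path) as f:
    template = string.Template(f.read())
  with open(output_path, 'w') as f:
    f.write(template.substitute(values))

# Illustrative paths and values; adjust to your setup.
fill_config_template(
    'configs/kitti/panoptic_deeplab/resnet50_os32.textproto',
    '/tmp/kitti_pd_r50_os32.textproto',
    EXPERIMENT_NAME='kitti_pd_r50_os32',
    INIT_CHECKPOINT='/path/to/cityscapes_pretrained_checkpoint',
    TRAIN_SET='/data/kitti_step/train*.tfrecord',
    VAL_SET='/data/kitti_step/val*.tfrecord')
```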
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ # Update the path to the initial checkpoint (e.g., ImageNet
+ # pretrained checkpoint).
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "resnet50"
+ output_stride: 32
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ panoptic_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 19
+ head_channels: 256
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 1000
+ save_summaries_steps: 500
+ steps_per_loop: 500
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.00001
+ training_number_of_steps: 30000
+ }
+}
+train_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 385
+ crop_size: 1249
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+eval_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}"
+ batch_size: 1
+ crop_size: 385
+ crop_size: 1249
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ # Add options to make the evaluation loss comparable to the training loss.
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+evaluator_options {
+ continuous_eval_timeout: 10000
+ stuff_area_limit: 0
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ save_raw_predictions: false
+ convert_raw_to_eval_ids: false
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
diff --git a/configs/kitti/panoptic_deeplab/resnet50_os32_trainval.textproto b/configs/kitti/panoptic_deeplab/resnet50_os32_trainval.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..549eea5064b8346ee96022270dd0133d9cc15351
--- /dev/null
+++ b/configs/kitti/panoptic_deeplab/resnet50_os32_trainval.textproto
@@ -0,0 +1,160 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# Panoptic-DeepLab with ResNet-50 and output stride 32.
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# This config uses the Cityscapes pretrained checkpoint where crowd label is
+# kept to pretrain the semantic segmentation branch.
+#
+# References:
+# For ResNet, see
+# - Kaiming He, et al. "Deep Residual Learning for Image Recognition."
+# In CVPR, 2016.
+# For Panoptic-DeepLab, see
+# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline
+# for Bottom-Up Panoptic Segmentation." In CVPR, 2020.
+
+# Use a unique experiment_name for each experiment.
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ # Update the path to the initial checkpoint (e.g., ImageNet
+ # pretrained checkpoint).
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "resnet50"
+ output_stride: 32
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ panoptic_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 19
+ head_channels: 256
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 1000
+ save_summaries_steps: 500
+ steps_per_loop: 500
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.000001
+ training_number_of_steps: 30000
+ }
+}
+train_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ file_pattern: "${VAL_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 385
+ crop_size: 1249
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+eval_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}"
+ batch_size: 1
+ crop_size: 385
+ crop_size: 1249
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ # Add options to make the evaluation loss comparable to the training loss.
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+evaluator_options {
+ continuous_eval_timeout: 10000
+ stuff_area_limit: 0
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ save_raw_predictions: false
+ convert_raw_to_eval_ids: false
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
diff --git a/configs/motchallenge/motion_deeplab/resnet50_os32.textproto b/configs/motchallenge/motion_deeplab/resnet50_os32.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..3b6ca505b02723119620db23f17f2d73280906fa
--- /dev/null
+++ b/configs/motchallenge/motion_deeplab/resnet50_os32.textproto
@@ -0,0 +1,172 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# Motion-DeepLab with ResNet-50 and output stride 32.
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# This config uses the Cityscapes pretrained checkpoint where crowd label is
+# kept to pretrain the semantic segmentation branch. Note that we additionally
+# perform net surgery on the first convolution and the last prediction layer
+# since (1) Motion-DeepLab takes two frames as input, and (2) MOTChallenge-STEP
+# contains a subset of the semantic classes of Cityscapes. For net surgery
+# details, see utils/net_surgery_convert_last_layer.py.
+#
+# References:
+# For ResNet, see
+# - Kaiming He, et al. "Deep Residual Learning for Image Recognition."
+# In CVPR, 2016.
+# For Motion-DeepLab, see
+# - Mark Weber, et al. "STEP: Segmenting and Tracking Every Pixel."
+# arXiv: 2102.11859.
+
+# Use a unique experiment_name for each experiment.
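The header above describes net surgery on the last prediction layer because MOTChallenge-STEP keeps only a subset of the Cityscapes classes. The repo's actual conversion lives in utils/net_surgery_convert_last_layer.py; the snippet below only illustrates the core idea, and the class mapping in it is hypothetical:

```python
import numpy as np

# Hypothetical mapping from the 7 retained classes to Cityscapes trainIds;
# the authoritative mapping is in utils/net_surgery_convert_last_layer.py.
RETAINED_CITYSCAPES_TRAIN_IDS = [2, 8, 10, 11, 12, 13, 17]

def subset_last_layer(kernel, bias, retained_ids):
  """Keeps only the output channels of the retained semantic classes."""
  return kernel[..., retained_ids], bias[retained_ids]

# A [h, w, in_channels, 19] Cityscapes head becomes a 7-class head.
kernel = np.zeros((1, 1, 256, 19), np.float32)
bias = np.zeros((19,), np.float32)
new_kernel, new_bias = subset_last_layer(kernel, bias,
                                         RETAINED_CITYSCAPES_TRAIN_IDS)
assert new_kernel.shape[-1] == 7 and new_bias.shape == (7,)
```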
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ # Update the path to the initial checkpoint (e.g., ImageNet
+ # pretrained checkpoint).
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "resnet50"
+ output_stride: 32
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ motion_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 7
+ head_channels: 256
+ }
+ motion_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 100
+ save_summaries_steps: 50
+ steps_per_loop: 50
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ motion_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.00001
+ training_number_of_steps: 10000
+ }
+}
+train_dataset_options {
+ dataset: "motchallenge_step"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 1089
+ crop_size: 1921
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+ use_two_frames: true
+}
+eval_dataset_options {
+ dataset: "motchallenge_step"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}"
+ batch_size: 1
+ crop_size: 1089
+ crop_size: 1921
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ # Add options to make the evaluation loss comparable to the training loss.
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+ use_two_frames: true
+}
+evaluator_options {
+ continuous_eval_timeout: 10000
+ stuff_area_limit: 0
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ save_raw_predictions: false
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
diff --git a/configs/motchallenge/panoptic_deeplab/resnet50_os32.textproto b/configs/motchallenge/panoptic_deeplab/resnet50_os32.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..6d33cbcd210a9d25c298571b44d8ee30b3f7dcd2
--- /dev/null
+++ b/configs/motchallenge/panoptic_deeplab/resnet50_os32.textproto
@@ -0,0 +1,161 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# Panoptic-DeepLab with ResNet-50 and output stride 32.
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# This config uses the Cityscapes pretrained checkpoint where crowd label is
+# kept to pretrain the semantic segmentation branch. Note that we additionally
+# perform net surgery on the last prediction layer since MOTChallenge-STEP
+# contains a subset of the semantic classes of Cityscapes. For net surgery
+# details, see utils/net_surgery_convert_last_layer.py.
+#
+# References:
+# For ResNet, see
+# - Kaiming He, et al. "Deep Residual Learning for Image Recognition."
+# In CVPR, 2016.
+# For Panoptic-DeepLab, see
+# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline
+# for Bottom-Up Panoptic Segmentation." In CVPR, 2020.
+
+# Use a unique experiment_name for each experiment.
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ # Update the path to the initial checkpoint (e.g., ImageNet
+ # pretrained checkpoint).
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "resnet50"
+ output_stride: 32
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ panoptic_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 7
+ head_channels: 256
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 200
+ save_summaries_steps: 50
+ steps_per_loop: 50
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.00001
+ training_number_of_steps: 10000
+ }
+}
+train_dataset_options {
+ dataset: "motchallenge_step"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 1089
+ crop_size: 1921
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+eval_dataset_options {
+ dataset: "motchallenge_step"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1089 + crop_size: 1921 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 10000 + stuff_area_limit: 0 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/data/__init__.py b/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03 --- /dev/null +++ b/data/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/data/build_cityscapes_data.py b/data/build_cityscapes_data.py new file mode 100644 index 0000000000000000000000000000000000000000..a3001d3a5e52619b7e68f77e48d25440763e883f --- /dev/null +++ b/data/build_cityscapes_data.py @@ -0,0 +1,321 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Converts Cityscapes data to sharded TFRecord file format with Example protos. + +Please check ../g3doc/setup/cityscapes.md for instructions. 
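+
+Example usage (flag values are illustrative):
+
+  python deeplab2/data/build_cityscapes_data.py \
+    --cityscapes_root=${CITYSCAPES_ROOT} \
+    --output_dir=${OUTPUT_DIR}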
+""" + +import collections +import json +import math +import os + +from absl import app +from absl import flags +from absl import logging +import numpy as np +import tensorflow as tf + +from deeplab2.data import data_utils +from deeplab2.data import dataset + +FLAGS = flags.FLAGS + +flags.DEFINE_string('cityscapes_root', None, 'Cityscapes dataset root folder.') + +flags.DEFINE_string('output_dir', None, + 'Path to save converted TFRecord of TensorFlow examples.') + +flags.DEFINE_boolean('create_panoptic_data', True, + 'Whether to create semantic or panoptic dataset.') + +flags.DEFINE_boolean('treat_crowd_as_ignore', True, + 'Whether to apply ignore labels to crowd pixels in ' + 'panoptic label.') + +_NUM_SHARDS = 10 +_SPLITS_TO_SIZES = dataset.CITYSCAPES_INFORMATION.splits_to_sizes +_IGNORE_LABEL = dataset.CITYSCAPES_PANOPTIC_INFORMATION.ignore_label +_CLASS_HAS_INSTANCE_LIST = dataset.CITYSCAPES_PANOPTIC_INFORMATION.class_has_instances_list +_PANOPTIC_LABEL_DIVISOR = dataset.CITYSCAPES_PANOPTIC_INFORMATION.panoptic_label_divisor + +# A map from data type to folder name that saves the data. +_FOLDERS_MAP = { + 'image': 'leftImg8bit', + 'label': 'gtFine', +} + +# A map from data type to filename postfix. +_POSTFIX_MAP = { + 'image': '_leftImg8bit', + 'label': '_gtFine_labelTrainIds', +} + +# A map from data type to data format. +_DATA_FORMAT_MAP = { + 'image': 'png', + 'label': 'png', +} +_PANOPTIC_LABEL_FORMAT = 'raw' + + +def _get_images(cityscapes_root, dataset_split): + """Gets files for the specified data type and dataset split. + + Args: + cityscapes_root: String, path to Cityscapes dataset root folder. + dataset_split: String, dataset split ('train', 'val', 'test') + + Returns: + A list of sorted file names or None when getting label for + test set. + """ + pattern = '*%s.%s' % (_POSTFIX_MAP['image'], _DATA_FORMAT_MAP['image']) + search_files = os.path.join( + cityscapes_root, _FOLDERS_MAP['image'], dataset_split, '*', pattern) + filenames = tf.io.gfile.glob(search_files) + return sorted(filenames) + + +def _split_image_path(image_path): + """Helper method to extract split paths from input image path. + + Args: + image_path: String, path to the image file. + + Returns: + A tuple of (cityscape root, dataset split, cityname and shared filename + prefix). + """ + image_path = os.path.normpath(image_path) + path_list = image_path.split(os.sep) + image_folder, dataset_split, city_name, file_name = path_list[-4:] + if image_folder != _FOLDERS_MAP['image']: + raise ValueError('Expects image path %s containing image folder.' 
+ % image_path) + + pattern = '%s.%s' % (_POSTFIX_MAP['image'], _DATA_FORMAT_MAP['image']) + if not file_name.endswith(pattern): + raise ValueError('Image file name %s should end with %s' % + (file_name, pattern)) + + file_prefix = file_name[:-len(pattern)] + return os.sep.join(path_list[:-4]), dataset_split, city_name, file_prefix + + +def _get_semantic_annotation(image_path): + cityscapes_root, dataset_split, city_name, file_prefix = _split_image_path( + image_path) + semantic_annotation = '%s%s.%s' % (file_prefix, _POSTFIX_MAP['label'], + _DATA_FORMAT_MAP['label']) + return os.path.join(cityscapes_root, _FOLDERS_MAP['label'], dataset_split, + city_name, semantic_annotation) + + +def _get_panoptic_annotation(cityscapes_root, dataset_split, + annotation_file_name): + panoptic_folder = 'cityscapes_panoptic_%s_trainId' % dataset_split + return os.path.join(cityscapes_root, _FOLDERS_MAP['label'], panoptic_folder, + annotation_file_name) + + +def _read_segments(cityscapes_root, dataset_split): + """Reads segments information from json file. + + Args: + cityscapes_root: String, path to Cityscapes dataset root folder. + dataset_split: String, dataset split. + + Returns: + segments_dict: A dictionary that maps `image_id` (common file prefix) to + a tuple of (panoptic annotation file name, segments). Please refer to + _generate_panoptic_label() method on the detail structure of `segments`. + """ + json_filename = os.path.join( + cityscapes_root, _FOLDERS_MAP['label'], + 'cityscapes_panoptic_%s_trainId.json' % dataset_split) + with tf.io.gfile.GFile(json_filename) as f: + panoptic_dataset = json.load(f) + + segments_dict = {} + for annotation in panoptic_dataset['annotations']: + image_id = annotation['image_id'] + if image_id in segments_dict: + raise ValueError('Image ID %s already exists' % image_id) + annotation_file_name = annotation['file_name'] + segments = annotation['segments_info'] + + segments_dict[image_id] = (annotation_file_name, segments) + return segments_dict + + +def _generate_panoptic_label(panoptic_annotation_file, segments): + """Creates panoptic label map from annotations. + + Args: + panoptic_annotation_file: String, path to panoptic annotation (populated + with `trainId`). + segments: A list of dictionaries containing information of every segment. + Read from panoptic_${DATASET_SPLIT}_trainId.json. This method consumes + the following fields in each dictionary: + - id: panoptic id + - category_id: semantic class id + - area: pixel area of this segment + - iscrowd: if this segment is crowd region + + Returns: + A 2D numpy int32 array with the same height / width with panoptic + annotation. Each pixel value represents its panoptic ID. Please refer to + ../g3doc/setup/cityscapes.md for more details about how panoptic ID is + assigned. + """ + with tf.io.gfile.GFile(panoptic_annotation_file, 'rb') as f: + panoptic_label = data_utils.read_image(f.read()) + + if panoptic_label.mode != 'RGB': + raise ValueError('Expect RGB image for panoptic label, gets %s' % + panoptic_label.mode) + + panoptic_label = np.array(panoptic_label, dtype=np.int32) + # Cityscapes panoptic map is created by: + # color = [segmentId % 256, segmentId // 256, segmentId // 256 // 256] + panoptic_label = np.dot(panoptic_label, [1, 256, 256 * 256]) + + semantic_label = np.ones_like(panoptic_label) * _IGNORE_LABEL + instance_label = np.zeros_like(panoptic_label) + # Running count of instances per semantic category. 
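+ # For intuition: with a panoptic_label_divisor of 1000 (the Cityscapes
+ # setting), the second 'car' (trainId 13) instance in an image is encoded
+ # below as 13 * 1000 + 2 = 13002, while a crowd region of cars (when not
+ # treated as ignore) keeps instance ID 0 and encodes as 13 * 1000 = 13000.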
+ instance_count = collections.defaultdict(int) + for segment in segments: + selected_pixels = panoptic_label == segment['id'] + pixel_area = np.sum(selected_pixels) + if pixel_area != segment['area']: + raise ValueError('Expect %d pixels for segment %s, gets %d.' % + (segment['area'], segment, pixel_area)) + + category_id = segment['category_id'] + semantic_label[selected_pixels] = category_id + + if category_id in _CLASS_HAS_INSTANCE_LIST: + if segment['iscrowd']: + # Cityscapes crowd pixels will have instance ID of 0. + if FLAGS.treat_crowd_as_ignore: + semantic_label[selected_pixels] = _IGNORE_LABEL + continue + # Non-crowd pixels will have instance ID starting from 1. + instance_count[category_id] += 1 + if instance_count[category_id] >= _PANOPTIC_LABEL_DIVISOR: + raise ValueError('Too many instances for category %d in this image.' % + category_id) + instance_label[selected_pixels] = instance_count[category_id] + elif segment['iscrowd']: + raise ValueError('Stuff class should not have `iscrowd` label.') + + panoptic_label = semantic_label * _PANOPTIC_LABEL_DIVISOR + instance_label + return panoptic_label.astype(np.int32) + + +def _convert_split_name(dataset_split): + return dataset_split + '_fine' + + +def _create_semantic_label(image_path): + """Creates labels for semantic segmentation.""" + with tf.io.gfile.GFile(_get_semantic_annotation(image_path), 'rb') as f: + label_data = f.read() + + return label_data, _DATA_FORMAT_MAP['label'] + + +def _create_panoptic_label(image_path, segments_dict): + """Creates labels for panoptic segmentation.""" + cityscapes_root, dataset_split, _, file_prefix = _split_image_path(image_path) + + annotation_file_name, segments = segments_dict[file_prefix] + panoptic_annotation_file = _get_panoptic_annotation(cityscapes_root, + dataset_split, + annotation_file_name) + + panoptic_label = _generate_panoptic_label(panoptic_annotation_file, segments) + return panoptic_label.tostring(), _PANOPTIC_LABEL_FORMAT + + +def _convert_dataset(cityscapes_root, dataset_split, output_dir): + """Converts the specified dataset split to TFRecord format. + + Args: + cityscapes_root: String, path to Cityscapes dataset root folder. + dataset_split: String, the dataset split (one of `train`, `val` and `test`). + output_dir: String, directory to write output TFRecords to. + + Raises: + RuntimeError: If loaded image and label have different shape, or if the + image file with specified postfix could not be found. + """ + image_files = _get_images(cityscapes_root, dataset_split) + + num_images = len(image_files) + expected_dataset_size = _SPLITS_TO_SIZES[_convert_split_name(dataset_split)] + if num_images != expected_dataset_size: + raise ValueError('Expects %d images, gets %d' % + (expected_dataset_size, num_images)) + + segments_dict = None + if FLAGS.create_panoptic_data: + segments_dict = _read_segments(FLAGS.cityscapes_root, dataset_split) + + num_per_shard = int(math.ceil(len(image_files) / _NUM_SHARDS)) + + for shard_id in range(_NUM_SHARDS): + shard_filename = '%s-%05d-of-%05d.tfrecord' % ( + dataset_split, shard_id, _NUM_SHARDS) + output_filename = os.path.join(output_dir, shard_filename) + with tf.io.TFRecordWriter(output_filename) as tfrecord_writer: + start_idx = shard_id * num_per_shard + end_idx = min((shard_id + 1) * num_per_shard, num_images) + for i in range(start_idx, end_idx): + # Read the image. 
+ with tf.io.gfile.GFile(image_files[i], 'rb') as f:
+ image_data = f.read()
+
+ if dataset_split == 'test':
+ label_data, label_format = None, None
+ elif FLAGS.create_panoptic_data:
+ label_data, label_format = _create_panoptic_label(
+ image_files[i], segments_dict)
+ else:
+ label_data, label_format = _create_semantic_label(image_files[i])
+
+ # Convert to tf example.
+ _, _, _, file_prefix = _split_image_path(image_files[i])
+ example = data_utils.create_tfexample(image_data,
+ _DATA_FORMAT_MAP['image'],
+ file_prefix, label_data,
+ label_format)
+
+ tfrecord_writer.write(example.SerializeToString())
+
+
+def main(unused_argv):
+ tf.io.gfile.makedirs(FLAGS.output_dir)
+
+ for dataset_split in ('train', 'val', 'test'):
+ logging.info('Starts processing dataset split %s.', dataset_split)
+ _convert_dataset(FLAGS.cityscapes_root, dataset_split, FLAGS.output_dir)
+
+
+if __name__ == '__main__':
+ flags.mark_flags_as_required(['cityscapes_root', 'output_dir'])
+ app.run(main)
diff --git a/data/build_cityscapes_data_test.py b/data/build_cityscapes_data_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9707f89366f79753f93bfdd6e217f20456ffb978
--- /dev/null
+++ b/data/build_cityscapes_data_test.py
@@ -0,0 +1,67 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for build_cityscapes_data."""
+
+import os
+
+from absl import flags
+import numpy as np
+from PIL import Image
+import tensorflow as tf
+
+from deeplab2.data import build_cityscapes_data
+
+
+FLAGS = flags.FLAGS
+_TEST_DATA_DIR = 'deeplab2/data/testdata'
+_TEST_FILE_PREFIX = 'dummy_000000_000000'
+
+
+class BuildCityscapesDataTest(tf.test.TestCase):
+
+ def test_read_segments(self):
+ cityscapes_root = os.path.join(_TEST_DATA_DIR)
+ segments_dict = build_cityscapes_data._read_segments(
+ cityscapes_root, dataset_split='dummy')
+ self.assertIn(_TEST_FILE_PREFIX, segments_dict)
+ _, segments = segments_dict[_TEST_FILE_PREFIX]
+ self.assertLen(segments, 10)
+
+ def test_generate_panoptic_label(self):
+ FLAGS.treat_crowd_as_ignore = False # Test a more complicated setting.
+ cityscapes_root = os.path.join(_TEST_DATA_DIR)
+ segments_dict = build_cityscapes_data._read_segments(
+ cityscapes_root, dataset_split='dummy')
+ annotation_file_name, segments = segments_dict[_TEST_FILE_PREFIX]
+ panoptic_annotation_file = build_cityscapes_data._get_panoptic_annotation(
+ cityscapes_root, dataset_split='dummy',
+ annotation_file_name=annotation_file_name)
+ panoptic_label = build_cityscapes_data._generate_panoptic_label(
+ panoptic_annotation_file, segments)
+
+ # Check panoptic label matches golden file.
+ golden_file_path = os.path.join(_TEST_DATA_DIR, + 'dummy_gt_for_vps.png') + with tf.io.gfile.GFile(golden_file_path, 'rb') as f: + golden_label = Image.open(f) + # The PNG file is encoded by: + # color = [segmentId % 256, segmentId // 256, segmentId // 256 // 256] + golden_label = np.dot(np.asarray(golden_label), [1, 256, 256 * 256]) + + np.testing.assert_array_equal(panoptic_label, golden_label) + +if __name__ == '__main__': + tf.test.main() diff --git a/data/build_coco_data.py b/data/build_coco_data.py new file mode 100644 index 0000000000000000000000000000000000000000..6ae9176e78608f12e86000686baac209f903053e --- /dev/null +++ b/data/build_coco_data.py @@ -0,0 +1,309 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Converts COCO data to sharded TFRecord file format with Example protos. + +Please check + ../g3doc/setup/coco.md +for instructions. +""" + +import collections +import json +import math +import os + +from typing import Sequence, Tuple, Any + +from absl import app +from absl import flags +from absl import logging +import numpy as np +import tensorflow as tf + +from deeplab2.data import coco_constants +from deeplab2.data import data_utils +from deeplab2.data import dataset + +FLAGS = flags.FLAGS + +flags.DEFINE_string('coco_root', None, 'coco dataset root folder.') + +flags.DEFINE_string('output_dir', None, + 'Path to save converted TFRecord of TensorFlow examples.') + +flags.DEFINE_boolean('treat_crowd_as_ignore', True, + 'Whether to apply ignore labels to crowd pixels in ' + 'panoptic label.') + +_NUM_SHARDS = 1000 + + +_SPLITS_TO_SIZES = dataset.COCO_PANOPTIC_INFORMATION.splits_to_sizes +_IGNORE_LABEL = dataset.COCO_PANOPTIC_INFORMATION.ignore_label +_CLASS_HAS_INSTANCE_LIST = dataset.COCO_PANOPTIC_INFORMATION.class_has_instances_list +_PANOPTIC_LABEL_DIVISOR = dataset.COCO_PANOPTIC_INFORMATION.panoptic_label_divisor +_CLASS_MAPPING = coco_constants.get_id_mapping() + +# A map from data type to folder name that saves the data. +_FOLDERS_MAP = { + 'train': { + 'image': 'train2017', + 'label': 'annotations', + }, + 'val': { + 'image': 'val2017', + 'label': 'annotations', + }, + 'test': { + 'image': 'test2017', + 'label': '', + } +} + +# A map from data type to data format. +_DATA_FORMAT_MAP = { + 'image': 'jpg', + 'label': 'png', +} +_PANOPTIC_LABEL_FORMAT = 'raw' + + +def _get_images(coco_root: str, dataset_split: str) -> Sequence[str]: + """Gets files for the specified data type and dataset split. + + Args: + coco_root: String, path to coco dataset root folder. + dataset_split: String, dataset split ('train', 'val', 'test'). + + Returns: + A list of sorted file names. 
+ """ + pattern = '*.%s' % _DATA_FORMAT_MAP['image'] + search_files = os.path.join( + coco_root, _FOLDERS_MAP[dataset_split]['image'], pattern) + filenames = tf.io.gfile.glob(search_files) + return sorted(filenames) + + +def _get_panoptic_annotation(coco_root: str, dataset_split: str, + annotation_file_name: str) -> str: + panoptic_folder = 'panoptic_%s2017' % dataset_split + return os.path.join(coco_root, _FOLDERS_MAP[dataset_split]['label'], + panoptic_folder, annotation_file_name) + + +def _read_segments(coco_root: str, dataset_split: str): + """Reads segments information from json file. + + Args: + coco_root: String, path to coco dataset root folder. + dataset_split: String, dataset split. + + Returns: + segments_dict: A dictionary that maps file prefix of annotation_file_name to + a tuple of (panoptic annotation file name, segments). Please refer to + _generate_panoptic_label() method on the detail structure of `segments`. + + Raises: + ValueError: If found duplicated image id in annotations. + """ + json_filename = os.path.join( + coco_root, _FOLDERS_MAP[dataset_split]['label'], + 'panoptic_%s2017.json' % dataset_split) + with tf.io.gfile.GFile(json_filename) as f: + panoptic_dataset = json.load(f) + + segments_dict = {} + for annotation in panoptic_dataset['annotations']: + image_id = annotation['image_id'] + if image_id in segments_dict: + raise ValueError('Image ID %s already exists' % image_id) + annotation_file_name = annotation['file_name'] + segments = annotation['segments_info'] + + segments_dict[os.path.splitext(annotation_file_name)[-2]] = ( + annotation_file_name, segments) + + return segments_dict + + +def _generate_panoptic_label(panoptic_annotation_file: str, segments: + Any) -> np.ndarray: + """Creates panoptic label map from annotations. + + Args: + panoptic_annotation_file: String, path to panoptic annotation. + segments: A list of dictionaries containing information of every segment. + Read from panoptic_${DATASET_SPLIT}2017.json. This method consumes + the following fields in each dictionary: + - id: panoptic id + - category_id: semantic class id + - area: pixel area of this segment + - iscrowd: if this segment is crowd region + + Returns: + A 2D numpy int32 array with the same height / width with panoptic + annotation. Each pixel value represents its panoptic ID. Please refer to + g3doc/setup/coco.md for more details about how panoptic ID is assigned. + """ + with tf.io.gfile.GFile(panoptic_annotation_file, 'rb') as f: + panoptic_label = data_utils.read_image(f.read()) + + if panoptic_label.mode != 'RGB': + raise ValueError('Expect RGB image for panoptic label, gets %s' % + panoptic_label.mode) + + panoptic_label = np.array(panoptic_label, dtype=np.int32) + # COCO panoptic map is created by: + # color = [segmentId % 256, segmentId // 256, segmentId // 256 // 256] + panoptic_label = np.dot(panoptic_label, [1, 256, 256 * 256]) + + semantic_label = np.ones_like(panoptic_label) * _IGNORE_LABEL + instance_label = np.zeros_like(panoptic_label) + # Running count of instances per semantic category. + instance_count = collections.defaultdict(int) + + for segment in segments: + selected_pixels = panoptic_label == segment['id'] + pixel_area = np.sum(selected_pixels) + if pixel_area != segment['area']: + raise ValueError('Expect %d pixels for segment %s, gets %d.' 
% + (segment['area'], segment, pixel_area)) + + category_id = segment['category_id'] + + # Map the category_id to contiguous ids + category_id = _CLASS_MAPPING[category_id] + + semantic_label[selected_pixels] = category_id + + if category_id in _CLASS_HAS_INSTANCE_LIST: + if segment['iscrowd']: + # COCO crowd pixels will have instance ID of 0. + if FLAGS.treat_crowd_as_ignore: + semantic_label[selected_pixels] = _IGNORE_LABEL + continue + # Non-crowd pixels will have instance ID starting from 1. + instance_count[category_id] += 1 + if instance_count[category_id] >= _PANOPTIC_LABEL_DIVISOR: + raise ValueError('Too many instances for category %d in this image.' % + category_id) + instance_label[selected_pixels] = instance_count[category_id] + elif segment['iscrowd']: + raise ValueError('Stuff class should not have `iscrowd` label.') + + panoptic_label = semantic_label * _PANOPTIC_LABEL_DIVISOR + instance_label + return panoptic_label.astype(np.int32) + + +def _create_panoptic_label(coco_root: str, dataset_split: str, image_path: str, + segments_dict: Any + ) -> Tuple[str, str]: + """Creates labels for panoptic segmentation. + + Args: + coco_root: String, path to coco dataset root folder. + dataset_split: String, dataset split ('train', 'val', 'test'). + image_path: String, path to the image file. + segments_dict: + Read from panoptic_${DATASET_SPLIT}2017.json. This method consumes + the following fields in each dictionary: + - id: panoptic id + - category_id: semantic class id + - area: pixel area of this segment + - iscrowd: if this segment is crowd region + + Returns: + A panoptic label where each pixel value represents its panoptic ID. + Please refer to g3doc/setup/coco.md for more details about howpanoptic ID + is assigned. + A string indicating label format in TFRecord. + """ + + image_path = os.path.normpath(image_path) + path_list = image_path.split(os.sep) + file_name = path_list[-1] + + annotation_file_name, segments = segments_dict[ + os.path.splitext(file_name)[-2]] + panoptic_annotation_file = _get_panoptic_annotation(coco_root, + dataset_split, + annotation_file_name) + + panoptic_label = _generate_panoptic_label(panoptic_annotation_file, segments) + return panoptic_label.tostring(), _PANOPTIC_LABEL_FORMAT + + +def _convert_dataset(coco_root: str, dataset_split: str, + output_dir: str) -> None: + """Converts the specified dataset split to TFRecord format. + + Args: + coco_root: String, path to coco dataset root folder. + dataset_split: String, the dataset split (one of `train`, `val` and `test`). + output_dir: String, directory to write output TFRecords to. + """ + image_files = _get_images(coco_root, dataset_split) + + num_images = len(image_files) + + if dataset_split != 'test': + segments_dict = _read_segments(coco_root, dataset_split) + + num_per_shard = int(math.ceil(len(image_files) / _NUM_SHARDS)) + + for shard_id in range(_NUM_SHARDS): + shard_filename = '%s-%05d-of-%05d.tfrecord' % ( + dataset_split, shard_id, _NUM_SHARDS) + output_filename = os.path.join(output_dir, shard_filename) + with tf.io.TFRecordWriter(output_filename) as tfrecord_writer: + start_idx = shard_id * num_per_shard + end_idx = min((shard_id + 1) * num_per_shard, num_images) + for i in range(start_idx, end_idx): + # Read the image. 
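
A generated map can be sanity-checked by inverting the packing above. A minimal sketch; the divisor value 256 here is an assumption for illustration only (the script takes the real value from dataset.COCO_PANOPTIC_INFORMATION):

    import numpy as np

    def split_panoptic(panoptic_label, divisor):
      # Inverse of: panoptic = semantic * divisor + instance.
      return panoptic_label // divisor, panoptic_label % divisor

    panoptic = np.array([17 * 256 + 2, 133 * 256], dtype=np.int32)
    semantic, instance = split_panoptic(panoptic, 256)
    assert semantic.tolist() == [17, 133]
    assert instance.tolist() == [2, 0]

+        # Read the image.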
+ with tf.io.gfile.GFile(image_files[i], 'rb') as f: + image_data = f.read() + + if dataset_split == 'test': + label_data, label_format = None, None + else: + label_data, label_format = _create_panoptic_label( + coco_root, dataset_split, image_files[i], segments_dict) + + # Convert to tf example. + image_path = os.path.normpath(image_files[i]) + path_list = image_path.split(os.sep) + file_name = path_list[-1] + file_prefix = file_name.replace(_DATA_FORMAT_MAP['image'], '') + example = data_utils.create_tfexample(image_data, + 'jpeg', + file_prefix, label_data, + label_format) + + tfrecord_writer.write(example.SerializeToString()) + + +def main(unused_argv: Sequence[str]) -> None: + tf.io.gfile.makedirs(FLAGS.output_dir) + + for dataset_split in ('train', 'val', 'test'): + logging.info('Starts processing dataset split %s.', dataset_split) + _convert_dataset(FLAGS.coco_root, dataset_split, FLAGS.output_dir) + + +if __name__ == '__main__': + flags.mark_flags_as_required(['coco_root', 'output_dir']) + app.run(main) diff --git a/data/build_coco_data_test.py b/data/build_coco_data_test.py new file mode 100644 index 0000000000000000000000000000000000000000..63f835ec7cac5b7c087f86548f0766f5b0c677a3 --- /dev/null +++ b/data/build_coco_data_test.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for build_coco_data.""" + +import json +import os + +from absl import flags +import numpy as np +from PIL import Image +import tensorflow as tf + +from deeplab2.data import build_coco_data +from deeplab2.data import coco_constants + +FLAGS = flags.FLAGS +_TEST_FILE_NAME = '000000123456.png' + + +class BuildCOCODataTest(tf.test.TestCase): + + def setUp(self): + super().setUp() + self.data_dir = FLAGS.test_tmpdir + self.height = 100 + self.width = 100 + self.split = 'train' + image_path = os.path.join(self.data_dir, + build_coco_data._FOLDERS_MAP[self.split]['image']) + panoptic_map_path = os.path.join(self.data_dir, + build_coco_data._FOLDERS_MAP + [self.split]['label']) + tf.io.gfile.makedirs(panoptic_map_path) + panoptic_map_path = os.path.join(panoptic_map_path, + 'panoptic_%s2017' % self.split) + + tf.io.gfile.makedirs(image_path) + tf.io.gfile.makedirs(panoptic_map_path) + self.panoptic_maps = {} + image_id = int(_TEST_FILE_NAME[:-4]) + self.panoptic_maps[image_id] = self._create_image_and_panoptic_map( + image_path, panoptic_map_path, image_id) + + def _create_image_and_panoptic_map(self, image_path, panoptic_path, image_id): + def id2rgb(id_map): + id_map_copy = id_map.copy() + rgb_shape = tuple(list(id_map.shape) + [3]) + rgb_map = np.zeros(rgb_shape, dtype=np.uint8) + for i in range(3): + rgb_map[..., i] = id_map_copy % 256 + id_map_copy //= 256 + return rgb_map + + # Creates dummy images and panoptic maps. + # Dummy image. 
+ image = np.random.randint( + 0, 255, (self.height, self.width, 3), dtype=np.uint8) + with tf.io.gfile.GFile( + os.path.join(image_path, '%012d.jpg' % image_id), 'wb') as f: + Image.fromarray(image).save(f, format='JPEG') + + # Dummy panoptic map. + semantic = np.random.randint( + 0, 201, (self.height, self.width), dtype=np.int32) + instance_ = np.random.randint( + 0, 100, (self.height, self.width), dtype=np.int32) + id_mapping = coco_constants.get_id_mapping() + valid_semantic = id_mapping.keys() + for i in range(201): + if i not in valid_semantic: + mask = (semantic == i) + semantic[mask] = 0 + instance_[mask] = 0 + + instance = instance_.copy() + segments_info = [] + for sem in np.unique(semantic): + ins_id = 1 + if sem == 0: + continue + if id_mapping[sem] in build_coco_data._CLASS_HAS_INSTANCE_LIST: + for ins in np.unique(instance_[semantic == sem]): + instance[np.logical_and(semantic == sem, instance_ == ins)] = ins_id + area = np.logical_and(semantic == sem, instance_ == ins).sum() + idx = sem * 256 + ins_id + iscrowd = 0 + segments_info.append({ + 'id': idx.tolist(), + 'category_id': sem.tolist(), + 'area': area.tolist(), + 'iscrowd': iscrowd, + }) + ins_id += 1 + else: + instance[semantic == sem] = 0 + area = (semantic == sem).sum() + idx = sem * 256 + iscrowd = 0 + segments_info.append({ + 'id': idx.tolist(), + 'category_id': sem.tolist(), + 'area': area.tolist(), + 'iscrowd': iscrowd, + }) + + encoded_panoptic_map = semantic * 256 + instance + encoded_panoptic_map = id2rgb(encoded_panoptic_map) + with tf.io.gfile.GFile( + os.path.join(panoptic_path, '%012d.png' % image_id), 'wb') as f: + Image.fromarray(encoded_panoptic_map).save(f, format='PNG') + + for i in range(201): + if i in valid_semantic: + mask = (semantic == i) + semantic[mask] = id_mapping[i] + + decoded_panoptic_map = semantic * 256 + instance + + # Write json file + json_annotation = { + 'annotations': [ + { + 'file_name': _TEST_FILE_NAME, + 'image_id': int(_TEST_FILE_NAME[:-4]), + 'segments_info': segments_info + } + ] + } + json_annotation_path = os.path.join(self.data_dir, + build_coco_data._FOLDERS_MAP + [self.split]['label'], + 'panoptic_%s2017.json' % self.split) + with tf.io.gfile.GFile(json_annotation_path, 'w') as f: + json.dump(json_annotation, f, indent=2) + + return decoded_panoptic_map + + def test_build_coco_dataset_correct(self): + build_coco_data._convert_dataset( + coco_root=self.data_dir, + dataset_split=self.split, + output_dir=FLAGS.test_tmpdir) + output_record = os.path.join( + FLAGS.test_tmpdir, '%s-%05d-of-%05d.tfrecord' % + (self.split, 0, build_coco_data._NUM_SHARDS)) + self.assertTrue(tf.io.gfile.exists(output_record)) + + # Parses tf record. + image_ids = sorted(self.panoptic_maps) + for i, raw_record in enumerate( + tf.data.TFRecordDataset([output_record]).take(5)): + image_id = image_ids[i] + example = tf.train.Example.FromString(raw_record.numpy()) + panoptic_map = np.fromstring( + example.features.feature['image/segmentation/class/encoded'] + .bytes_list.value[0], + dtype=np.int32).reshape((self.height, self.width)) + np.testing.assert_array_equal(panoptic_map, self.panoptic_maps[image_id]) + +if __name__ == '__main__': + tf.test.main() diff --git a/data/build_dvps_data.py b/data/build_dvps_data.py new file mode 100644 index 0000000000000000000000000000000000000000..7057aae62cb23d8571e7c65f5bb3bf789a02b2f2 --- /dev/null +++ b/data/build_dvps_data.py @@ -0,0 +1,264 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Converts Depth-aware Video Panoptic Segmentation (DVPS) data to sharded TFRecord file format with tf.train.Example protos.
+
+The expected directory structure of the DVPS dataset should be as follows:
+
+  + DVPS_ROOT
+    + train | val
+      - ground-truth depth maps (*_depth.png)
+      - ground-truth panoptic maps (*_gtFine_instanceTrainIds.png)
+      - images (*_leftImg8bit.png)
+    + test
+      - images (*_leftImg8bit.png)
+
+The ground-truth panoptic map is encoded as follows in PNG format:
+
+  panoptic ID = semantic ID * panoptic divisor (1000) + instance ID
+
+
+The output Example proto contains the following fields:
+
+  image/encoded: encoded image content.
+  image/filename: image filename.
+  image/format: image file format.
+  image/height: image height.
+  image/width: image width.
+  image/channels: image channels.
+  image/segmentation/class/encoded: encoded panoptic segmentation content.
+  image/segmentation/class/format: segmentation encoding format.
+  image/depth/encoded: encoded depth content.
+  image/depth/format: depth encoding format.
+  video/sequence_id: sequence ID of the frame.
+  video/frame_id: ID of the frame of the video sequence.
+  next_image/encoded: encoded next-frame image content.
+  next_image/segmentation/class/encoded: encoded panoptic segmentation content
+    of the next frame.
+
+The output panoptic segmentation map stored in the Example will be the raw
+bytes of an int32 panoptic map, where each pixel is assigned to a panoptic ID:
+
+  panoptic ID = semantic ID * panoptic divisor (1000) + instance ID
+
+where the semantic ID is the same as `category_id` for each segment, and the
+ignore label for pixels that do not belong to any segment.
+
+The depth map will be the raw bytes of an int32 depth map, where each pixel is:
+
+  depth map = depth ground truth * 256
+
+Example command to run the script:
+
+   python deeplab2/data/build_dvps_data.py \
+     --dvps_root=${DVPS_ROOT} \
+     --output_dir=${OUTPUT_DIR}
+"""
+
+import math
+import os
+
+from typing import Sequence, Tuple, Optional
+
+from absl import app
+from absl import flags
+from absl import logging
+import numpy as np
+
+from PIL import Image
+
+import tensorflow as tf
+
+from deeplab2.data import data_utils
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('dvps_root', None, 'DVPS dataset root folder.')
+
+flags.DEFINE_string('output_dir', None,
+                    'Path to save converted TFRecord of TensorFlow examples.')
+
+_PANOPTIC_DEPTH_FORMAT = 'raw'
+_NUM_SHARDS = 1000
+_TF_RECORD_PATTERN = '%s-%05d-of-%05d.tfrecord'
+_IMAGE_SUFFIX = '_leftImg8bit.png'
+_LABEL_SUFFIX = '_gtFine_instanceTrainIds.png'
+_DEPTH_SUFFIX = '_depth.png'
+
+
+def _get_image_info_from_path(image_path: str) -> Tuple[str, str]:
+  """Gets image info including sequence id and image id.
+
+  Image path is in the format of '{sequence_id}_{image_id}_*.png',
+  where `sequence_id` refers to the id of the video sequence, and `image_id` is
+  the id of the image in the video sequence.
+
+  Args:
+    image_path: Absolute path of the image.
+ + Returns: + sequence_id, and image_id as strings. + """ + image_path = os.path.basename(image_path) + return tuple(image_path.split('_')[:2]) + + +def _get_images(dvps_root: str, dataset_split: str) -> Sequence[str]: + """Gets files for the specified data type and dataset split. + + Args: + dvps_root: String, path to DVPS dataset root folder. + dataset_split: String, dataset split ('train', 'val', 'test'). + + Returns: + A list of sorted file names under dvps_root and dataset_split. + """ + search_files = os.path.join(dvps_root, dataset_split, '*' + _IMAGE_SUFFIX) + filenames = tf.io.gfile.glob(search_files) + return sorted(filenames) + + +def _decode_panoptic_or_depth_map(map_path: str) -> Optional[str]: + """Decodes the panoptic or depth map from encoded image file. + + Args: + map_path: Path to the panoptic or depth map image file. + + Returns: + Panoptic or depth map as an encoded int32 numpy array bytes or None if not + existing. + """ + if not tf.io.gfile.exists(map_path): + return None + with tf.io.gfile.GFile(map_path, 'rb') as f: + decoded_map = np.array(Image.open(f)).astype(np.int32) + return decoded_map.tobytes() + + +def _get_next_frame_path(image_path: str) -> Optional[str]: + """Gets next frame path. + + If not exists, return None. + + The files are named {sequence_id}_{frame_id}*. To get the path of the next + frame, this function keeps sequence_id and increase the frame_id by 1. It + finds all the files matching this pattern, and returns the corresponding + file path matching the input type. + + Args: + image_path: String, path to the image. + + Returns: + A string for the path of the next frame of the given image path or None if + the given image path is the last frame of the sequence. + """ + sequence_id, image_id = _get_image_info_from_path(image_path) + next_image_id = '{:06d}'.format(int(image_id) + 1) + next_image_name = sequence_id + '_' + next_image_id + next_image_path = None + for suffix in (_IMAGE_SUFFIX, _LABEL_SUFFIX): + if image_path.endswith(suffix): + next_image_path = os.path.join( + os.path.dirname(image_path), next_image_name + suffix) + if not tf.io.gfile.exists(next_image_path): + return None + return next_image_path + + +def _create_tfexample(image_path: str, panoptic_map_path: str, + depth_map_path: str) -> Optional[tf.train.Example]: + """Creates a TF example for each image. + + Args: + image_path: Path to the image. + panoptic_map_path: Path to the panoptic map (as an image file). + depth_map_path: Path to the depth map (as an image file). + + Returns: + TF example proto. + """ + with tf.io.gfile.GFile(image_path, 'rb') as f: + image_data = f.read() + label_data = _decode_panoptic_or_depth_map(panoptic_map_path) + depth_data = _decode_panoptic_or_depth_map(depth_map_path) + image_name = os.path.basename(image_path) + image_format = image_name.split('.')[1].lower() + sequence_id, frame_id = _get_image_info_from_path(image_path) + next_image_data = None + next_label_data = None + # Next image. + next_image_path = _get_next_frame_path(image_path) + # If there is no next image, no examples will be created. + if next_image_path is None: + return None + with tf.io.gfile.GFile(next_image_path, 'rb') as f: + next_image_data = f.read() + # Next panoptic map. 
+  next_panoptic_map_path = _get_next_frame_path(panoptic_map_path)
+  next_label_data = _decode_panoptic_or_depth_map(next_panoptic_map_path)
+  return data_utils.create_video_and_depth_tfexample(
+      image_data,
+      image_format,
+      image_name,
+      label_format=_PANOPTIC_DEPTH_FORMAT,
+      sequence_id=sequence_id,
+      image_id=frame_id,
+      label_data=label_data,
+      next_image_data=next_image_data,
+      next_label_data=next_label_data,
+      depth_data=depth_data,
+      depth_format=_PANOPTIC_DEPTH_FORMAT)
+
+
+def _convert_dataset(dvps_root: str, dataset_split: str, output_dir: str):
+  """Converts the specified dataset split to TFRecord format.
+
+  Args:
+    dvps_root: String, path to DVPS dataset root folder.
+    dataset_split: String, the dataset split (e.g., train, val, test).
+    output_dir: String, directory to write output TFRecords to.
+  """
+  image_files = _get_images(dvps_root, dataset_split)
+  num_images = len(image_files)
+
+  num_per_shard = int(math.ceil(len(image_files) / _NUM_SHARDS))
+
+  for shard_id in range(_NUM_SHARDS):
+    shard_filename = _TF_RECORD_PATTERN % (dataset_split, shard_id, _NUM_SHARDS)
+    output_filename = os.path.join(output_dir, shard_filename)
+    with tf.io.TFRecordWriter(output_filename) as tfrecord_writer:
+      start_idx = shard_id * num_per_shard
+      end_idx = min((shard_id + 1) * num_per_shard, num_images)
+      for i in range(start_idx, end_idx):
+        image_path = image_files[i]
+        panoptic_map_path = image_path.replace(_IMAGE_SUFFIX, _LABEL_SUFFIX)
+        depth_map_path = image_path.replace(_IMAGE_SUFFIX, _DEPTH_SUFFIX)
+        example = _create_tfexample(image_path, panoptic_map_path,
+                                    depth_map_path)
+        if example is not None:
+          tfrecord_writer.write(example.SerializeToString())
+
+
+def main(argv: Sequence[str]) -> None:
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+  tf.io.gfile.makedirs(FLAGS.output_dir)
+  for dataset_split in ('train', 'val', 'test'):
+    logging.info('Starts processing DVPS dataset split %s.', dataset_split)
+    _convert_dataset(FLAGS.dvps_root, dataset_split, FLAGS.output_dir)
+
+
+if __name__ == '__main__':
+  app.run(main)
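
Given the depth convention in the module docstring above (the depth PNG stores 256x the metric depth as integers), recovering metric depth is a single division. A minimal sketch; treating 0 as missing depth is an assumption for illustration, not something the script enforces:

    import numpy as np

    def decode_depth(encoded_depth):
      # Inverse of: encoded depth = ground-truth depth * 256.
      depth = encoded_depth.astype(np.float32) / 256.0
      # Assumed convention: 0 marks pixels without a depth measurement.
      return np.where(encoded_depth > 0, depth, np.nan)

    encoded = np.array([[0, 256, 4864]], dtype=np.int32)
    print(decode_depth(encoded))  # [[nan  1. 19.]]
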
diff --git a/data/build_step_data.py b/data/build_step_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..d08653cbb28661f93763f2af54525c541381879f
--- /dev/null
+++ b/data/build_step_data.py
@@ -0,0 +1,298 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Converts STEP (KITTI-STEP or MOTChallenge-STEP) data to sharded TFRecord file format with tf.train.Example protos.
+
+The expected directory structure of the STEP dataset should be as follows:
+
+  + {KITTI | MOTChallenge}-STEP
+    + images
+      + train
+        + sequence_id
+          - *.{png|jpg}
+          ...
+      + val
+      + test
+    + panoptic_maps
+      + train
+        + sequence_id
+          - *.png
+          ...
+      + val
+
+The ground-truth panoptic map is encoded as the following in PNG format:
+
+  R: semantic_id
+  G: instance_id // 256
+  B: instance_id % 256
+
+See ./utils/create_step_panoptic_maps.py for more details of how we create the
+panoptic map by merging semantic and instance maps.
+
+The output Example proto contains the following fields:
+
+  image/encoded: encoded image content.
+  image/filename: image filename.
+  image/format: image file format.
+  image/height: image height.
+  image/width: image width.
+  image/channels: image channels.
+  image/segmentation/class/encoded: encoded panoptic segmentation content.
+  image/segmentation/class/format: segmentation encoding format.
+  video/sequence_id: sequence ID of the frame.
+  video/frame_id: ID of the frame of the video sequence.
+
+The output panoptic segmentation map stored in the Example will be the raw
+bytes of an int32 panoptic map, where each pixel is assigned to a panoptic ID:
+
+  panoptic ID = semantic ID * label divisor (1000) + instance ID
+
+where the semantic ID is the same as `category_id` (using TrainId) for each
+segment, and the ignore label for pixels that do not belong to any segment.
+
+The instance ID will be 0 for pixels belonging to
+  1) `stuff` class
+  2) `thing` class with `iscrowd` label
+  3) pixels with ignore label
+and [1, label divisor) otherwise.
+
+Example command to run the script:
+
+   python deeplab2/data/build_step_data.py \
+     --step_root=${STEP_ROOT} \
+     --output_dir=${OUTPUT_DIR}
+"""
+
+import math
+import os
+
+from typing import Iterator, Sequence, Tuple, Optional
+
+from absl import app
+from absl import flags
+from absl import logging
+import numpy as np
+
+from PIL import Image
+
+import tensorflow as tf
+
+from deeplab2.data import data_utils
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('step_root', None, 'STEP dataset root folder.')
+
+flags.DEFINE_string('output_dir', None,
+                    'Path to save converted TFRecord of TensorFlow examples.')
+flags.DEFINE_bool(
+    'use_two_frames', False, 'Whether to encode 1 frame or 2 consecutive '
+    'frames in each TFExample.')
+
+_PANOPTIC_LABEL_FORMAT = 'raw'
+_NUM_SHARDS = 10
+_IMAGE_FOLDER_NAME = 'images'
+_PANOPTIC_MAP_FOLDER_NAME = 'panoptic_maps'
+_LABEL_MAP_FORMAT = 'png'
+_INSTANCE_LABEL_DIVISOR = 1000
+_ENCODED_INSTANCE_LABEL_DIVISOR = 256
+_TF_RECORD_PATTERN = '%s-%05d-of-%05d.tfrecord'
+_FRAME_ID_PATTERN = '%06d'
+
+
+def _get_image_info_from_path(image_path: str) -> Tuple[str, str]:
+  """Gets image info including sequence id and image id.
+
+  Image path is in the format of '.../split/sequence_id/image_id.png',
+  where `sequence_id` refers to the id of the video sequence, and `image_id` is
+  the id of the image in the video sequence.
+
+  Args:
+    image_path: Absolute path of the image.
+
+  Returns:
+    sequence_id, and image_id as strings.
+  """
+  sequence_id = image_path.split('/')[-2]
+  image_id = os.path.splitext(os.path.basename(image_path))[0]
+  return sequence_id, image_id
+
+
+def _get_images_per_shard(step_root: str, dataset_split: str,
+                          sharded_by_sequence: bool) -> Iterator[Sequence[str]]:
+  """Gets files for the specified data type and dataset split.
+
+  Args:
+    step_root: String, path to STEP dataset root folder.
+    dataset_split: String, dataset split ('train', 'val', 'test').
+    sharded_by_sequence: Whether the images should be sharded by sequence or
+      split evenly.
+
+  Yields:
+    Sorted lists of file names, one list per shard; each list contains the
+    files for that shard.
+ """ + search_files = os.path.join(step_root, _IMAGE_FOLDER_NAME, dataset_split, '*', + '*') + filenames = sorted(tf.io.gfile.glob(search_files)) + num_per_even_shard = int(math.ceil(len(filenames) / _NUM_SHARDS)) + + sequence_ids = [os.path.basename(os.path.dirname(name)) for name in filenames] + images_per_shard = [] + for i, name in enumerate(filenames): + images_per_shard.append(name) + shard_data = (i == len(filenames) - 1) + # Sharded by sequence id. + shard_data = shard_data or (sharded_by_sequence and + sequence_ids[i + 1] != sequence_ids[i]) + # Sharded evenly. + shard_data = shard_data or (not sharded_by_sequence and + len(images_per_shard) == num_per_even_shard) + if shard_data: + yield images_per_shard + images_per_shard = [] + + +def _decode_panoptic_map(panoptic_map_path: str) -> Optional[str]: + """Decodes the panoptic map from encoded image file. + + Args: + panoptic_map_path: Path to the panoptic map image file. + + Returns: + Panoptic map as an encoded int32 numpy array bytes or None if not existing. + """ + if not tf.io.gfile.exists(panoptic_map_path): + return None + with tf.io.gfile.GFile(panoptic_map_path, 'rb') as f: + panoptic_map = np.array(Image.open(f)).astype(np.int32) + semantic_map = panoptic_map[:, :, 0] + instance_map = ( + panoptic_map[:, :, 1] * _ENCODED_INSTANCE_LABEL_DIVISOR + + panoptic_map[:, :, 2]) + panoptic_map = semantic_map * _INSTANCE_LABEL_DIVISOR + instance_map + return panoptic_map.tobytes() + + +def _get_previous_frame_path(image_path: str) -> str: + """Gets previous frame path. If not exists, duplicate it with image_path.""" + frame_id, frame_ext = os.path.splitext(os.path.basename(image_path)) + folder_dir = os.path.dirname(image_path) + prev_frame_id = _FRAME_ID_PATTERN % (int(frame_id) - 1) + prev_image_path = os.path.join(folder_dir, prev_frame_id + frame_ext) + # If first frame, duplicates it. + if not tf.io.gfile.exists(prev_image_path): + tf.compat.v1.logging.warn( + 'Could not find previous frame %s of frame %d, duplicate the previous ' + 'frame with the current frame.', prev_image_path, int(frame_id)) + prev_image_path = image_path + return prev_image_path + + +def _create_panoptic_tfexample(image_path: str, + panoptic_map_path: str, + use_two_frames: bool, + is_testing: bool = False) -> tf.train.Example: + """Creates a TF example for each image. + + Args: + image_path: Path to the image. + panoptic_map_path: Path to the panoptic map (as an image file). + use_two_frames: Whether to encode consecutive two frames in the Example. + is_testing: Whether it is testing data. If so, skip adding label data. + + Returns: + TF example proto. + """ + with tf.io.gfile.GFile(image_path, 'rb') as f: + image_data = f.read() + label_data = None + if not is_testing: + label_data = _decode_panoptic_map(panoptic_map_path) + image_name = os.path.basename(image_path) + image_format = image_name.split('.')[1].lower() + sequence_id, frame_id = _get_image_info_from_path(image_path) + prev_image_data = None + prev_label_data = None + if use_two_frames: + # Previous image. + prev_image_path = _get_previous_frame_path(image_path) + with tf.io.gfile.GFile(prev_image_path, 'rb') as f: + prev_image_data = f.read() + # Previous panoptic map. 
+    if not is_testing:
+      prev_panoptic_map_path = _get_previous_frame_path(panoptic_map_path)
+      prev_label_data = _decode_panoptic_map(prev_panoptic_map_path)
+  return data_utils.create_video_tfexample(
+      image_data,
+      image_format,
+      image_name,
+      label_format=_PANOPTIC_LABEL_FORMAT,
+      sequence_id=sequence_id,
+      image_id=frame_id,
+      label_data=label_data,
+      prev_image_data=prev_image_data,
+      prev_label_data=prev_label_data)
+
+
+def _convert_dataset(step_root: str,
+                     dataset_split: str,
+                     output_dir: str,
+                     use_two_frames: bool = False):
+  """Converts the specified dataset split to TFRecord format.
+
+  Args:
+    step_root: String, path to STEP dataset root folder.
+    dataset_split: String, the dataset split (e.g., train, val).
+    output_dir: String, directory to write output TFRecords to.
+    use_two_frames: Whether to encode two consecutive frames in the Example.
+  """
+  # For the val and test sets, if we run with use_two_frames, we should create
+  # one sorted tfrecord per sequence.
+  create_tfrecord_per_sequence = ('train'
+                                  not in dataset_split) and use_two_frames
+  is_testing = 'test' in dataset_split
+
+  image_files_per_shard = list(
+      _get_images_per_shard(step_root, dataset_split,
+                            sharded_by_sequence=create_tfrecord_per_sequence))
+  num_shards = len(image_files_per_shard)
+
+  for shard_id, image_list in enumerate(image_files_per_shard):
+    shard_filename = _TF_RECORD_PATTERN % (dataset_split, shard_id, num_shards)
+    output_filename = os.path.join(output_dir, shard_filename)
+    with tf.io.TFRecordWriter(output_filename) as tfrecord_writer:
+      for image_path in image_list:
+        sequence_id, image_id = _get_image_info_from_path(image_path)
+        panoptic_map_path = os.path.join(
+            step_root, _PANOPTIC_MAP_FOLDER_NAME, dataset_split, sequence_id,
+            '%s.%s' % (image_id, _LABEL_MAP_FORMAT))
+        example = _create_panoptic_tfexample(image_path, panoptic_map_path,
+                                             use_two_frames, is_testing)
+        tfrecord_writer.write(example.SerializeToString())
+
+
+def main(argv: Sequence[str]) -> None:
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+  tf.io.gfile.makedirs(FLAGS.output_dir)
+  for dataset_split in ('train', 'val', 'test'):
+    logging.info('Starts processing STEP dataset split %s.', dataset_split)
+    _convert_dataset(FLAGS.step_root, dataset_split, FLAGS.output_dir,
+                     FLAGS.use_two_frames)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/data/build_step_data_test.py b/data/build_step_data_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b430b928829f997dc0d093fd9507b8c89550f6bc
--- /dev/null
+++ b/data/build_step_data_test.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
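
The sequence-based sharding that _get_images_per_shard implements amounts to run-grouping sorted paths by their parent directory; a standalone sketch with hypothetical file names:

    import itertools
    import os

    filenames = sorted(
        ['0001/000000.png', '0001/000001.png', '0002/000000.png'])
    shards = [
        list(files) for _, files in itertools.groupby(
            filenames, key=os.path.dirname)
    ]
    print([len(shard) for shard in shards])  # [2, 1]
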
+ +"""Tests for build_step_data.""" + +import os + +from absl import flags +import numpy as np +from PIL import Image +import tensorflow as tf + +from deeplab2.data import build_step_data + +FLAGS = flags.FLAGS + + +class BuildStepDataTest(tf.test.TestCase): + + def setUp(self): + super().setUp() + self.data_dir = FLAGS.test_tmpdir + self.height = 100 + self.width = 100 + self.sequence_id = '010' + + def _create_images(self, split): + image_path = os.path.join(self.data_dir, build_step_data._IMAGE_FOLDER_NAME, + split, self.sequence_id) + panoptic_map_path = os.path.join(self.data_dir, + build_step_data._PANOPTIC_MAP_FOLDER_NAME, + split, self.sequence_id) + + tf.io.gfile.makedirs(image_path) + tf.io.gfile.makedirs(panoptic_map_path) + self.panoptic_maps = {} + for image_id in [101, 100]: + self.panoptic_maps[image_id] = self._create_image_and_panoptic_map( + image_path, panoptic_map_path, image_id) + + def _create_image_and_panoptic_map(self, image_path, panoptic_path, image_id): + """Creates dummy images and panoptic maps.""" + # Dummy image. + image = np.random.randint( + 0, 255, (self.height, self.width, 3), dtype=np.uint8) + with tf.io.gfile.GFile( + os.path.join(image_path, '%06d.png' % image_id), 'wb') as f: + Image.fromarray(image).save(f, format='PNG') + + # Dummy panoptic map. + semantic = np.random.randint( + 0, 20, (self.height, self.width), dtype=np.int32) + instance = np.random.randint( + 0, 1000, (self.height, self.width), dtype=np.int32) + encoded_panoptic_map = np.dstack( + (semantic, instance // 256, instance % 256)).astype(np.uint8) + with tf.io.gfile.GFile( + os.path.join(panoptic_path, '%06d.png' % image_id), 'wb') as f: + Image.fromarray(encoded_panoptic_map).save(f, format='PNG') + decoded_panoptic_map = semantic * 1000 + instance + return decoded_panoptic_map + + def test_build_step_dataset_correct(self): + split = 'train' + self._create_images(split) + build_step_data._convert_dataset( + step_root=self.data_dir, + dataset_split=split, + output_dir=FLAGS.test_tmpdir) + # We will have 2 shards with each shard containing 1 image. + num_shards = 2 + output_record = os.path.join( + FLAGS.test_tmpdir, build_step_data._TF_RECORD_PATTERN % + (split, 0, num_shards)) + self.assertTrue(tf.io.gfile.exists(output_record)) + + # Parses tf record. + image_ids = sorted(self.panoptic_maps) + for i, raw_record in enumerate( + tf.data.TFRecordDataset([output_record]).take(5)): + image_id = image_ids[i] + example = tf.train.Example.FromString(raw_record.numpy()) + panoptic_map = np.fromstring( + example.features.feature['image/segmentation/class/encoded'] + .bytes_list.value[0], + dtype=np.int32).reshape((self.height, self.width)) + np.testing.assert_array_equal(panoptic_map, self.panoptic_maps[image_id]) + self.assertEqual( + example.features.feature['video/sequence_id'].bytes_list.value[0], + b'010') + self.assertEqual( + example.features.feature['video/frame_id'].bytes_list.value[0], + b'%06d' % image_id) + + def test_build_step_dataset_correct_with_two_frames(self): + split = 'train' + self._create_images(split) + build_step_data._convert_dataset( + step_root=self.data_dir, + dataset_split=split, + output_dir=FLAGS.test_tmpdir, use_two_frames=True) + num_shards = 2 + output_record = os.path.join( + FLAGS.test_tmpdir, build_step_data._TF_RECORD_PATTERN % + (split, 0, num_shards)) + self.assertTrue(tf.io.gfile.exists(output_record)) + + # Parses tf record. 
+ image_ids = sorted(self.panoptic_maps) + for i, raw_record in enumerate( + tf.data.TFRecordDataset([output_record]).take(5)): + image_id = image_ids[i] + example = tf.train.Example.FromString(raw_record.numpy()) + panoptic_map = np.fromstring( + example.features.feature['image/segmentation/class/encoded'] + .bytes_list.value[0], + dtype=np.int32).reshape((self.height, self.width)) + np.testing.assert_array_equal(panoptic_map, self.panoptic_maps[image_id]) + prev_panoptic_map = np.fromstring( + example.features.feature['prev_image/segmentation/class/encoded'] + .bytes_list.value[0], + dtype=np.int32).reshape((self.height, self.width)) + if i == 0: + # First frame. + np.testing.assert_array_equal(panoptic_map, prev_panoptic_map) + else: + # Not a first frame. + np.testing.assert_array_equal(prev_panoptic_map, self.panoptic_maps[0]) + self.assertEqual( + example.features.feature['video/sequence_id'].bytes_list.value[0], + b'010') + self.assertEqual( + example.features.feature['video/frame_id'].bytes_list.value[0], + b'%06d' % image_id) + + def test_build_step_dataset_with_two_frames_shared_by_sequence(self): + split = 'val' + self._create_images(split) + build_step_data._convert_dataset( + step_root=self.data_dir, + dataset_split=split, + output_dir=FLAGS.test_tmpdir, use_two_frames=True) + # Only one shard since there is only one sequence for the val set. + num_shards = 1 + output_record = os.path.join( + FLAGS.test_tmpdir, build_step_data._TF_RECORD_PATTERN % + (split, 0, num_shards)) + self.assertTrue(tf.io.gfile.exists(output_record)) + + +if __name__ == '__main__': + tf.test.main() diff --git a/data/coco_constants.py b/data/coco_constants.py new file mode 100644 index 0000000000000000000000000000000000000000..ac0a5ef03db71fa91e93bc9103319c71c2001941 --- /dev/null +++ b/data/coco_constants.py @@ -0,0 +1,865 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""File containing the meta info of COCO dataset. 
+""" + +import copy +from typing import Sequence, Mapping, Any + +_COCO_META = [ + { + 'color': [220, 20, 60], + 'isthing': 1, + 'id': 1, + 'name': 'person' + }, + { + 'color': [119, 11, 32], + 'isthing': 1, + 'id': 2, + 'name': 'bicycle' + }, + { + 'color': [0, 0, 142], + 'isthing': 1, + 'id': 3, + 'name': 'car' + }, + { + 'color': [0, 0, 230], + 'isthing': 1, + 'id': 4, + 'name': 'motorcycle' + }, + { + 'color': [106, 0, 228], + 'isthing': 1, + 'id': 5, + 'name': 'airplane' + }, + { + 'color': [0, 60, 100], + 'isthing': 1, + 'id': 6, + 'name': 'bus' + }, + { + 'color': [0, 80, 100], + 'isthing': 1, + 'id': 7, + 'name': 'train' + }, + { + 'color': [0, 0, 70], + 'isthing': 1, + 'id': 8, + 'name': 'truck' + }, + { + 'color': [0, 0, 192], + 'isthing': 1, + 'id': 9, + 'name': 'boat' + }, + { + 'color': [250, 170, 30], + 'isthing': 1, + 'id': 10, + 'name': 'traffic light' + }, + { + 'color': [100, 170, 30], + 'isthing': 1, + 'id': 11, + 'name': 'fire hydrant' + }, + { + 'color': [220, 220, 0], + 'isthing': 1, + 'id': 13, + 'name': 'stop sign' + }, + { + 'color': [175, 116, 175], + 'isthing': 1, + 'id': 14, + 'name': 'parking meter' + }, + { + 'color': [250, 0, 30], + 'isthing': 1, + 'id': 15, + 'name': 'bench' + }, + { + 'color': [165, 42, 42], + 'isthing': 1, + 'id': 16, + 'name': 'bird' + }, + { + 'color': [255, 77, 255], + 'isthing': 1, + 'id': 17, + 'name': 'cat' + }, + { + 'color': [0, 226, 252], + 'isthing': 1, + 'id': 18, + 'name': 'dog' + }, + { + 'color': [182, 182, 255], + 'isthing': 1, + 'id': 19, + 'name': 'horse' + }, + { + 'color': [0, 82, 0], + 'isthing': 1, + 'id': 20, + 'name': 'sheep' + }, + { + 'color': [120, 166, 157], + 'isthing': 1, + 'id': 21, + 'name': 'cow' + }, + { + 'color': [110, 76, 0], + 'isthing': 1, + 'id': 22, + 'name': 'elephant' + }, + { + 'color': [174, 57, 255], + 'isthing': 1, + 'id': 23, + 'name': 'bear' + }, + { + 'color': [199, 100, 0], + 'isthing': 1, + 'id': 24, + 'name': 'zebra' + }, + { + 'color': [72, 0, 118], + 'isthing': 1, + 'id': 25, + 'name': 'giraffe' + }, + { + 'color': [255, 179, 240], + 'isthing': 1, + 'id': 27, + 'name': 'backpack' + }, + { + 'color': [0, 125, 92], + 'isthing': 1, + 'id': 28, + 'name': 'umbrella' + }, + { + 'color': [209, 0, 151], + 'isthing': 1, + 'id': 31, + 'name': 'handbag' + }, + { + 'color': [188, 208, 182], + 'isthing': 1, + 'id': 32, + 'name': 'tie' + }, + { + 'color': [0, 220, 176], + 'isthing': 1, + 'id': 33, + 'name': 'suitcase' + }, + { + 'color': [255, 99, 164], + 'isthing': 1, + 'id': 34, + 'name': 'frisbee' + }, + { + 'color': [92, 0, 73], + 'isthing': 1, + 'id': 35, + 'name': 'skis' + }, + { + 'color': [133, 129, 255], + 'isthing': 1, + 'id': 36, + 'name': 'snowboard' + }, + { + 'color': [78, 180, 255], + 'isthing': 1, + 'id': 37, + 'name': 'sports ball' + }, + { + 'color': [0, 228, 0], + 'isthing': 1, + 'id': 38, + 'name': 'kite' + }, + { + 'color': [174, 255, 243], + 'isthing': 1, + 'id': 39, + 'name': 'baseball bat' + }, + { + 'color': [45, 89, 255], + 'isthing': 1, + 'id': 40, + 'name': 'baseball glove' + }, + { + 'color': [134, 134, 103], + 'isthing': 1, + 'id': 41, + 'name': 'skateboard' + }, + { + 'color': [145, 148, 174], + 'isthing': 1, + 'id': 42, + 'name': 'surfboard' + }, + { + 'color': [255, 208, 186], + 'isthing': 1, + 'id': 43, + 'name': 'tennis racket' + }, + { + 'color': [197, 226, 255], + 'isthing': 1, + 'id': 44, + 'name': 'bottle' + }, + { + 'color': [171, 134, 1], + 'isthing': 1, + 'id': 46, + 'name': 'wine glass' + }, + { + 'color': [109, 63, 54], + 'isthing': 1, + 'id': 47, + 'name': 
'cup' + }, + { + 'color': [207, 138, 255], + 'isthing': 1, + 'id': 48, + 'name': 'fork' + }, + { + 'color': [151, 0, 95], + 'isthing': 1, + 'id': 49, + 'name': 'knife' + }, + { + 'color': [9, 80, 61], + 'isthing': 1, + 'id': 50, + 'name': 'spoon' + }, + { + 'color': [84, 105, 51], + 'isthing': 1, + 'id': 51, + 'name': 'bowl' + }, + { + 'color': [74, 65, 105], + 'isthing': 1, + 'id': 52, + 'name': 'banana' + }, + { + 'color': [166, 196, 102], + 'isthing': 1, + 'id': 53, + 'name': 'apple' + }, + { + 'color': [208, 195, 210], + 'isthing': 1, + 'id': 54, + 'name': 'sandwich' + }, + { + 'color': [255, 109, 65], + 'isthing': 1, + 'id': 55, + 'name': 'orange' + }, + { + 'color': [0, 143, 149], + 'isthing': 1, + 'id': 56, + 'name': 'broccoli' + }, + { + 'color': [179, 0, 194], + 'isthing': 1, + 'id': 57, + 'name': 'carrot' + }, + { + 'color': [209, 99, 106], + 'isthing': 1, + 'id': 58, + 'name': 'hot dog' + }, + { + 'color': [5, 121, 0], + 'isthing': 1, + 'id': 59, + 'name': 'pizza' + }, + { + 'color': [227, 255, 205], + 'isthing': 1, + 'id': 60, + 'name': 'donut' + }, + { + 'color': [147, 186, 208], + 'isthing': 1, + 'id': 61, + 'name': 'cake' + }, + { + 'color': [153, 69, 1], + 'isthing': 1, + 'id': 62, + 'name': 'chair' + }, + { + 'color': [3, 95, 161], + 'isthing': 1, + 'id': 63, + 'name': 'couch' + }, + { + 'color': [163, 255, 0], + 'isthing': 1, + 'id': 64, + 'name': 'potted plant' + }, + { + 'color': [119, 0, 170], + 'isthing': 1, + 'id': 65, + 'name': 'bed' + }, + { + 'color': [0, 182, 199], + 'isthing': 1, + 'id': 67, + 'name': 'dining table' + }, + { + 'color': [0, 165, 120], + 'isthing': 1, + 'id': 70, + 'name': 'toilet' + }, + { + 'color': [183, 130, 88], + 'isthing': 1, + 'id': 72, + 'name': 'tv' + }, + { + 'color': [95, 32, 0], + 'isthing': 1, + 'id': 73, + 'name': 'laptop' + }, + { + 'color': [130, 114, 135], + 'isthing': 1, + 'id': 74, + 'name': 'mouse' + }, + { + 'color': [110, 129, 133], + 'isthing': 1, + 'id': 75, + 'name': 'remote' + }, + { + 'color': [166, 74, 118], + 'isthing': 1, + 'id': 76, + 'name': 'keyboard' + }, + { + 'color': [219, 142, 185], + 'isthing': 1, + 'id': 77, + 'name': 'cell phone' + }, + { + 'color': [79, 210, 114], + 'isthing': 1, + 'id': 78, + 'name': 'microwave' + }, + { + 'color': [178, 90, 62], + 'isthing': 1, + 'id': 79, + 'name': 'oven' + }, + { + 'color': [65, 70, 15], + 'isthing': 1, + 'id': 80, + 'name': 'toaster' + }, + { + 'color': [127, 167, 115], + 'isthing': 1, + 'id': 81, + 'name': 'sink' + }, + { + 'color': [59, 105, 106], + 'isthing': 1, + 'id': 82, + 'name': 'refrigerator' + }, + { + 'color': [142, 108, 45], + 'isthing': 1, + 'id': 84, + 'name': 'book' + }, + { + 'color': [196, 172, 0], + 'isthing': 1, + 'id': 85, + 'name': 'clock' + }, + { + 'color': [95, 54, 80], + 'isthing': 1, + 'id': 86, + 'name': 'vase' + }, + { + 'color': [128, 76, 255], + 'isthing': 1, + 'id': 87, + 'name': 'scissors' + }, + { + 'color': [201, 57, 1], + 'isthing': 1, + 'id': 88, + 'name': 'teddy bear' + }, + { + 'color': [246, 0, 122], + 'isthing': 1, + 'id': 89, + 'name': 'hair drier' + }, + { + 'color': [191, 162, 208], + 'isthing': 1, + 'id': 90, + 'name': 'toothbrush' + }, + { + 'color': [255, 255, 128], + 'isthing': 0, + 'id': 92, + 'name': 'banner' + }, + { + 'color': [147, 211, 203], + 'isthing': 0, + 'id': 93, + 'name': 'blanket' + }, + { + 'color': [150, 100, 100], + 'isthing': 0, + 'id': 95, + 'name': 'bridge' + }, + { + 'color': [168, 171, 172], + 'isthing': 0, + 'id': 100, + 'name': 'cardboard' + }, + { + 'color': [146, 112, 198], + 'isthing': 0, + 
'id': 107, + 'name': 'counter' + }, + { + 'color': [210, 170, 100], + 'isthing': 0, + 'id': 109, + 'name': 'curtain' + }, + { + 'color': [92, 136, 89], + 'isthing': 0, + 'id': 112, + 'name': 'door-stuff' + }, + { + 'color': [218, 88, 184], + 'isthing': 0, + 'id': 118, + 'name': 'floor-wood' + }, + { + 'color': [241, 129, 0], + 'isthing': 0, + 'id': 119, + 'name': 'flower' + }, + { + 'color': [217, 17, 255], + 'isthing': 0, + 'id': 122, + 'name': 'fruit' + }, + { + 'color': [124, 74, 181], + 'isthing': 0, + 'id': 125, + 'name': 'gravel' + }, + { + 'color': [70, 70, 70], + 'isthing': 0, + 'id': 128, + 'name': 'house' + }, + { + 'color': [255, 228, 255], + 'isthing': 0, + 'id': 130, + 'name': 'light' + }, + { + 'color': [154, 208, 0], + 'isthing': 0, + 'id': 133, + 'name': 'mirror-stuff' + }, + { + 'color': [193, 0, 92], + 'isthing': 0, + 'id': 138, + 'name': 'net' + }, + { + 'color': [76, 91, 113], + 'isthing': 0, + 'id': 141, + 'name': 'pillow' + }, + { + 'color': [255, 180, 195], + 'isthing': 0, + 'id': 144, + 'name': 'platform' + }, + { + 'color': [106, 154, 176], + 'isthing': 0, + 'id': 145, + 'name': 'playingfield' + }, + { + 'color': [230, 150, 140], + 'isthing': 0, + 'id': 147, + 'name': 'railroad' + }, + { + 'color': [60, 143, 255], + 'isthing': 0, + 'id': 148, + 'name': 'river' + }, + { + 'color': [128, 64, 128], + 'isthing': 0, + 'id': 149, + 'name': 'road' + }, + { + 'color': [92, 82, 55], + 'isthing': 0, + 'id': 151, + 'name': 'roof' + }, + { + 'color': [254, 212, 124], + 'isthing': 0, + 'id': 154, + 'name': 'sand' + }, + { + 'color': [73, 77, 174], + 'isthing': 0, + 'id': 155, + 'name': 'sea' + }, + { + 'color': [255, 160, 98], + 'isthing': 0, + 'id': 156, + 'name': 'shelf' + }, + { + 'color': [255, 255, 255], + 'isthing': 0, + 'id': 159, + 'name': 'snow' + }, + { + 'color': [104, 84, 109], + 'isthing': 0, + 'id': 161, + 'name': 'stairs' + }, + { + 'color': [169, 164, 131], + 'isthing': 0, + 'id': 166, + 'name': 'tent' + }, + { + 'color': [225, 199, 255], + 'isthing': 0, + 'id': 168, + 'name': 'towel' + }, + { + 'color': [137, 54, 74], + 'isthing': 0, + 'id': 171, + 'name': 'wall-brick' + }, + { + 'color': [135, 158, 223], + 'isthing': 0, + 'id': 175, + 'name': 'wall-stone' + }, + { + 'color': [7, 246, 231], + 'isthing': 0, + 'id': 176, + 'name': 'wall-tile' + }, + { + 'color': [107, 255, 200], + 'isthing': 0, + 'id': 177, + 'name': 'wall-wood' + }, + { + 'color': [58, 41, 149], + 'isthing': 0, + 'id': 178, + 'name': 'water-other' + }, + { + 'color': [183, 121, 142], + 'isthing': 0, + 'id': 180, + 'name': 'window-blind' + }, + { + 'color': [255, 73, 97], + 'isthing': 0, + 'id': 181, + 'name': 'window-other' + }, + { + 'color': [107, 142, 35], + 'isthing': 0, + 'id': 184, + 'name': 'tree-merged' + }, + { + 'color': [190, 153, 153], + 'isthing': 0, + 'id': 185, + 'name': 'fence-merged' + }, + { + 'color': [146, 139, 141], + 'isthing': 0, + 'id': 186, + 'name': 'ceiling-merged' + }, + { + 'color': [70, 130, 180], + 'isthing': 0, + 'id': 187, + 'name': 'sky-other-merged' + }, + { + 'color': [134, 199, 156], + 'isthing': 0, + 'id': 188, + 'name': 'cabinet-merged' + }, + { + 'color': [209, 226, 140], + 'isthing': 0, + 'id': 189, + 'name': 'table-merged' + }, + { + 'color': [96, 36, 108], + 'isthing': 0, + 'id': 190, + 'name': 'floor-other-merged' + }, + { + 'color': [96, 96, 96], + 'isthing': 0, + 'id': 191, + 'name': 'pavement-merged' + }, + { + 'color': [64, 170, 64], + 'isthing': 0, + 'id': 192, + 'name': 'mountain-merged' + }, + { + 'color': [152, 251, 152], + 'isthing': 0, + 
+        'id': 193,
+        'name': 'grass-merged'
+    },
+    {
+        'color': [208, 229, 228],
+        'isthing': 0,
+        'id': 194,
+        'name': 'dirt-merged'
+    },
+    {
+        'color': [206, 186, 171],
+        'isthing': 0,
+        'id': 195,
+        'name': 'paper-merged'
+    },
+    {
+        'color': [152, 161, 64],
+        'isthing': 0,
+        'id': 196,
+        'name': 'food-other-merged'
+    },
+    {
+        'color': [116, 112, 0],
+        'isthing': 0,
+        'id': 197,
+        'name': 'building-other-merged'
+    },
+    {
+        'color': [0, 114, 143],
+        'isthing': 0,
+        'id': 198,
+        'name': 'rock-merged'
+    },
+    {
+        'color': [102, 102, 156],
+        'isthing': 0,
+        'id': 199,
+        'name': 'wall-other-merged'
+    },
+    {
+        'color': [250, 141, 255],
+        'isthing': 0,
+        'id': 200,
+        'name': 'rug-merged'
+    },
+]
+
+
+def get_coco_meta() -> Sequence[Any]:
+  return copy.deepcopy(_COCO_META)
+
+
+def get_id_mapping() -> Mapping[int, int]:
+  """Creates a dictionary mapping the original category_id into continuous ones.
+
+  Specifically, in coco annotations, category_id ranges from 1 to 200. Since
+  not every id between 1 and 200 is used, we map them to contiguous ids
+  (1 to 133), which saves memory and computation to some degree.
+
+  Returns:
+    A dictionary mapping original category ids to contiguous category ids.
+  """
+  id_mapping = {}
+  for i in range(len(_COCO_META)):
+    id_mapping[_COCO_META[i]['id']] = i + 1
+  return id_mapping
+
+
+def get_id_mapping_inverse() -> Sequence[int]:
+  """Creates a tuple mapping the continuous ids back to original ones.
+
+  Specifically, in coco annotations, category_id ranges from 1 to 200. Since
+  not every id between 1 and 200 is used, we map them to contiguous ids
+  (1 to 133) via the function get_id_mapping, which saves memory and
+  computation to some degree. This function supports remapping back from the
+  contiguous ids to the original ones, which is required for COCO official
+  evaluation.
+
+  Returns:
+    A tuple mapping contiguous category ids to the original COCO category ids.
+  """
+  id_mapping_inverse = (0,) + tuple([ori_cat['id'] for ori_cat in _COCO_META])
+  return id_mapping_inverse
+
+
+def get_coco_reduced_meta() -> Sequence[Any]:
+  coco_reduced_meta = get_coco_meta()
+  id_mapping = get_id_mapping()
+  for i in range(len(coco_reduced_meta)):
+    coco_reduced_meta[i].update({'id': id_mapping[coco_reduced_meta[i]['id']]})
+  return coco_reduced_meta
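
A short usage sketch of the two mappings above; the leading 0 in the inverse tuple exists so that indexing it with a contiguous id recovers the original COCO id directly:

    from deeplab2.data import coco_constants

    id_mapping = coco_constants.get_id_mapping()
    id_mapping_inverse = coco_constants.get_id_mapping_inverse()

    # Round trip: original COCO id -> contiguous id -> original COCO id.
    for coco_id, contiguous_id in id_mapping.items():
      assert id_mapping_inverse[contiguous_id] == coco_id
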
diff --git a/data/data_utils.py b/data/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..74260d974565402ca50a7726c280a721358e6502
--- /dev/null
+++ b/data/data_utils.py
@@ -0,0 +1,391 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains common utility functions and classes for building datasets."""
+
+import collections.abc
+import io
+
+import numpy as np
+from PIL import Image
+from PIL import ImageOps
+import tensorflow as tf
+
+from deeplab2 import common
+
+_PANOPTIC_LABEL_FORMAT = 'raw'
+
+
+def read_image(image_data):
+  """Decodes image from in-memory data.
+
+  Args:
+    image_data: Bytes data representing encoded image.
+
+  Returns:
+    Decoded PIL.Image object.
+  """
+  image = Image.open(io.BytesIO(image_data))
+
+  try:
+    image = ImageOps.exif_transpose(image)
+  except TypeError:
+    # capture and ignore this bug:
+    # https://github.com/python-pillow/Pillow/issues/3973
+    pass
+
+  return image
+
+
+def get_image_dims(image_data, check_is_rgb=False):
+  """Decodes image and return its height and width.
+
+  Args:
+    image_data: Bytes data representing encoded image.
+    check_is_rgb: Whether to check encoded image is RGB.
+
+  Returns:
+    Decoded image size as a tuple of (height, width).
+
+  Raises:
+    ValueError: If check_is_rgb is set and input image has other format.
+  """
+  image = read_image(image_data)
+
+  if check_is_rgb and image.mode != 'RGB':
+    raise ValueError('Expects RGB image data, gets mode: %s' % image.mode)
+
+  width, height = image.size
+  return height, width
+
+
+def _int64_list_feature(values):
+  """Returns a TF-Feature of int64_list.
+
+  Args:
+    values: A scalar or an iterable of integer values.
+
+  Returns:
+    A TF-Feature.
+  """
+  # collections.abc.Iterable is used here because the unqualified
+  # collections.Iterable alias was removed in Python 3.10.
+  if not isinstance(values, collections.abc.Iterable):
+    values = [values]
+
+  return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
+
+
+def _bytes_list_feature(values):
+  """Returns a TF-Feature of bytes.
+
+  Args:
+    values: A string.
+
+  Returns:
+    A TF-Feature.
+  """
+  if isinstance(values, str):
+    values = values.encode()
+
+  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))
+
+
+def create_features(image_data,
+                    image_format,
+                    filename,
+                    label_data=None,
+                    label_format=None):
+  """Creates image/segmentation features.
+
+  Args:
+    image_data: String or byte stream of encoded image data.
+    image_format: String, image data format, should be either 'jpeg' or 'png'.
+    filename: String, image filename.
+    label_data: String or byte stream of (potentially) encoded label data. If
+      None, we skip to write it to tf.train.Example.
+    label_format: String, label data format, should be either 'png' or 'raw'.
+      If None, we skip to write it to tf.train.Example.
+
+  Returns:
+    A dictionary mapping feature names to tf.train.Feature protos.
+  """
+  if image_format not in ('jpeg', 'png'):
+    raise ValueError('Unsupported image format: %s' % image_format)
+
+  # Check color mode, and convert grey image to rgb image.
+  image = read_image(image_data)
+  if image.mode != 'RGB':
+    image = image.convert('RGB')
+    image_data = io.BytesIO()
+    image.save(image_data, format=image_format)
+    image_data = image_data.getvalue()
+
+  height, width = get_image_dims(image_data, check_is_rgb=True)
+
+  feature_dict = {
+      common.KEY_ENCODED_IMAGE: _bytes_list_feature(image_data),
+      common.KEY_IMAGE_FILENAME: _bytes_list_feature(filename),
+      common.KEY_IMAGE_FORMAT: _bytes_list_feature(image_format),
+      common.KEY_IMAGE_HEIGHT: _int64_list_feature(height),
+      common.KEY_IMAGE_WIDTH: _int64_list_feature(width),
+      common.KEY_IMAGE_CHANNELS: _int64_list_feature(3),
+  }
+
+  if label_data is None:
+    return feature_dict
+
+  if label_format == 'png':
+    label_height, label_width = get_image_dims(label_data)
+    if (label_height, label_width) != (height, width):
+      raise ValueError('Image (%s) and label (%s) shape mismatch' %
+                       ((height, width), (label_height, label_width)))
+  elif label_format == 'raw':
+    # Raw label encodes int32 array.
+ expected_label_size = height * width * np.dtype(np.int32).itemsize
+ if len(label_data) != expected_label_size:
+ raise ValueError('Expects raw label data length %d, gets %d' %
+ (expected_label_size, len(label_data)))
+ else:
+ raise ValueError('Unsupported label format: %s' % label_format)
+
+ feature_dict.update({
+ common.KEY_ENCODED_LABEL: _bytes_list_feature(label_data),
+ common.KEY_LABEL_FORMAT: _bytes_list_feature(label_format)
+ })
+
+ return feature_dict
+
+
+def create_tfexample(image_data,
+ image_format,
+ filename,
+ label_data=None,
+ label_format=None):
+ """Converts one image/segmentation pair to a TF example.
+
+ Args:
+ image_data: String or byte stream of encoded image data.
+ image_format: String, image data format, should be either 'jpeg' or 'png'.
+ filename: String, image filename.
+ label_data: String or byte stream of (potentially) encoded label data. If
+ None, we skip writing it to the tf.train.Example.
+ label_format: String, label data format, should be either 'png' or 'raw'. If
+ None, we skip writing it to the tf.train.Example.
+
+ Returns:
+ TF example proto.
+ """
+ feature_dict = create_features(image_data, image_format, filename, label_data,
+ label_format)
+ return tf.train.Example(features=tf.train.Features(feature=feature_dict))
+
+
+def create_video_tfexample(image_data,
+ image_format,
+ filename,
+ sequence_id,
+ image_id,
+ label_data=None,
+ label_format=None,
+ prev_image_data=None,
+ prev_label_data=None):
+ """Converts one video frame/panoptic segmentation pair to a TF example.
+
+ Args:
+ image_data: String or byte stream of encoded image data.
+ image_format: String, image data format, should be either 'jpeg' or 'png'.
+ filename: String, image filename.
+ sequence_id: ID of the video sequence as a string.
+ image_id: ID of the image as a string.
+ label_data: String or byte stream of (potentially) encoded label data. If
+ None, we skip writing it to the tf.train.Example.
+ label_format: String, label data format, should be either 'png' or 'raw'. If
+ None, we skip writing it to the tf.train.Example.
+ prev_image_data: An optional string or byte stream of encoded previous image
+ data.
+ prev_label_data: An optional string or byte stream of (potentially) encoded
+ previous label data.
+
+ Returns:
+ TF example proto.
+ """
+ feature_dict = create_features(image_data, image_format, filename, label_data,
+ label_format)
+ feature_dict.update({
+ common.KEY_SEQUENCE_ID: _bytes_list_feature(sequence_id),
+ common.KEY_FRAME_ID: _bytes_list_feature(image_id)
+ })
+ if prev_image_data is not None:
+ feature_dict[common.KEY_ENCODED_PREV_IMAGE] = _bytes_list_feature(
+ prev_image_data)
+ if prev_label_data is not None:
+ feature_dict[common.KEY_ENCODED_PREV_LABEL] = _bytes_list_feature(
+ prev_label_data)
+ return tf.train.Example(features=tf.train.Features(feature=feature_dict))
+
+
+def create_video_and_depth_tfexample(image_data,
+ image_format,
+ filename,
+ sequence_id,
+ image_id,
+ label_data=None,
+ label_format=None,
+ next_image_data=None,
+ next_label_data=None,
+ depth_data=None,
+ depth_format=None):
+ """Converts an image/segmentation pair and first-frame depth to a TF example.
+
+ The image pair contains the current frame and the next frame, and the
+ current frame additionally carries a depth label.
+
+ Args:
+ image_data: String or byte stream of encoded image data.
+ image_format: String, image data format, should be either 'jpeg' or 'png'.
+ filename: String, image filename.
+ sequence_id: ID of the video sequence as a string.
+ image_id: ID of the image as a string.
+ label_data: String or byte stream of (potentially) encoded label data. If
+ None, we skip writing it to the tf.train.Example.
+ label_format: String, label data format, should be either 'png' or 'raw'. If
+ None, we skip writing it to the tf.train.Example.
+ next_image_data: An optional string or byte stream of encoded next image
+ data.
+ next_label_data: An optional string or byte stream of (potentially) encoded
+ next label data.
+ depth_data: An optional string or byte stream of encoded depth data.
+ depth_format: String, depth data format, should be either 'png' or 'raw'.
+
+ Returns:
+ TF example proto.
+ """
+ feature_dict = create_features(image_data, image_format, filename, label_data,
+ label_format)
+ feature_dict.update({
+ common.KEY_SEQUENCE_ID: _bytes_list_feature(sequence_id),
+ common.KEY_FRAME_ID: _bytes_list_feature(image_id)
+ })
+ if next_image_data is not None:
+ feature_dict[common.KEY_ENCODED_NEXT_IMAGE] = _bytes_list_feature(
+ next_image_data)
+ if next_label_data is not None:
+ feature_dict[common.KEY_ENCODED_NEXT_LABEL] = _bytes_list_feature(
+ next_label_data)
+ if depth_data is not None:
+ feature_dict[common.KEY_ENCODED_DEPTH] = _bytes_list_feature(
+ depth_data)
+ feature_dict[common.KEY_DEPTH_FORMAT] = _bytes_list_feature(
+ depth_format)
+ return tf.train.Example(features=tf.train.Features(feature=feature_dict))
+
+
+class SegmentationDecoder(object):
+ """Basic parser to decode serialized tf.Example."""
+
+ def __init__(self,
+ is_panoptic_dataset=True,
+ is_video_dataset=False,
+ use_two_frames=False,
+ use_next_frame=False,
+ decode_groundtruth_label=True):
+ self._is_panoptic_dataset = is_panoptic_dataset
+ self._is_video_dataset = is_video_dataset
+ self._use_two_frames = use_two_frames
+ self._use_next_frame = use_next_frame
+ self._decode_groundtruth_label = decode_groundtruth_label
+ string_feature = tf.io.FixedLenFeature((), tf.string)
+ int_feature = tf.io.FixedLenFeature((), tf.int64)
+ self._keys_to_features = {
+ common.KEY_ENCODED_IMAGE: string_feature,
+ common.KEY_IMAGE_FILENAME: string_feature,
+ common.KEY_IMAGE_FORMAT: string_feature,
+ common.KEY_IMAGE_HEIGHT: int_feature,
+ common.KEY_IMAGE_WIDTH: int_feature,
+ common.KEY_IMAGE_CHANNELS: int_feature,
+ }
+ if decode_groundtruth_label:
+ self._keys_to_features[common.KEY_ENCODED_LABEL] = string_feature
+ if self._is_video_dataset:
+ self._keys_to_features[common.KEY_SEQUENCE_ID] = string_feature
+ self._keys_to_features[common.KEY_FRAME_ID] = string_feature
+ # Two-frame specific processing.
+ if self._use_two_frames:
+ self._keys_to_features[common.KEY_ENCODED_PREV_IMAGE] = string_feature
+ if decode_groundtruth_label:
+ self._keys_to_features[common.KEY_ENCODED_PREV_LABEL] = string_feature
+ # Next-frame specific processing.
+ if self._use_next_frame:
+ self._keys_to_features[common.KEY_ENCODED_NEXT_IMAGE] = string_feature
+ if decode_groundtruth_label:
+ self._keys_to_features[common.KEY_ENCODED_NEXT_LABEL] = string_feature
+
+ def _decode_image(self, parsed_tensors, key):
+ """Decodes the image under key from parsed tensors."""
+ image = tf.io.decode_image(
+ parsed_tensors[key],
+ channels=3,
+ dtype=tf.dtypes.uint8,
+ expand_animations=False)
+ image.set_shape([None, None, 3])
+ return image
+
+ def _decode_label(self, parsed_tensors, label_key):
+ """Decodes the segmentation label under label_key from parsed tensors."""
+ if self._is_panoptic_dataset:
+ flattened_label = tf.io.decode_raw(
+ parsed_tensors[label_key], out_type=tf.int32)
+ label_shape = tf.stack([
+ parsed_tensors[common.KEY_IMAGE_HEIGHT],
+ parsed_tensors[common.KEY_IMAGE_WIDTH], 1
+ ])
+ label = tf.reshape(flattened_label, label_shape)
+ return label
+
+ label = tf.io.decode_image(parsed_tensors[label_key], channels=1)
+ label.set_shape([None, None, 1])
+ return label
+
+ def __call__(self, serialized_example):
+ parsed_tensors = tf.io.parse_single_example(
+ serialized_example, features=self._keys_to_features)
+ return_dict = {
+ 'image':
+ self._decode_image(parsed_tensors, common.KEY_ENCODED_IMAGE),
+ 'image_name':
+ parsed_tensors[common.KEY_IMAGE_FILENAME],
+ 'height':
+ tf.cast(parsed_tensors[common.KEY_IMAGE_HEIGHT], dtype=tf.int32),
+ 'width':
+ tf.cast(parsed_tensors[common.KEY_IMAGE_WIDTH], dtype=tf.int32),
+ }
+ return_dict['label'] = None
+ if self._decode_groundtruth_label:
+ return_dict['label'] = self._decode_label(parsed_tensors,
+ common.KEY_ENCODED_LABEL)
+ if self._is_video_dataset:
+ return_dict['sequence'] = parsed_tensors[common.KEY_SEQUENCE_ID]
+ if self._use_two_frames:
+ return_dict['prev_image'] = self._decode_image(
+ parsed_tensors, common.KEY_ENCODED_PREV_IMAGE)
+ if self._decode_groundtruth_label:
+ return_dict['prev_label'] = self._decode_label(
+ parsed_tensors, common.KEY_ENCODED_PREV_LABEL)
+ if self._use_next_frame:
+ return_dict['next_image'] = self._decode_image(
+ parsed_tensors, common.KEY_ENCODED_NEXT_IMAGE)
+ if self._decode_groundtruth_label:
+ return_dict['next_label'] = self._decode_label(
+ parsed_tensors, common.KEY_ENCODED_NEXT_LABEL)
+ return return_dict
diff --git a/data/data_utils_test.py b/data/data_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e87ba80eaa2f7099bff65f84d725dbbdcd99f161
--- /dev/null
+++ b/data/data_utils_test.py
@@ -0,0 +1,94 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
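+
+# A minimal decoding sketch (illustrative only) of the round trip these tests
+# exercise:
+#
+#   decoder = data_utils.SegmentationDecoder(is_panoptic_dataset=True)
+#   parsed = decoder(example.SerializeToString())
+#   image, label = parsed['image'], parsed['label']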
+
+"""Tests for data_utils."""
+
+import io
+import numpy as np
+from PIL import Image
+import tensorflow as tf
+
+from deeplab2.data import data_utils
+
+
+def _encode_png_image(image):
+ """Helper method to encode input image in PNG format."""
+ buffer = io.BytesIO()
+ Image.fromarray(image).save(buffer, format='png')
+ return buffer.getvalue()
+
+
+class DataUtilsTest(tf.test.TestCase):
+
+ def _create_test_image(self, height, width):
+ rng = np.random.RandomState(319281498)
+ return rng.randint(0, 255, size=(height, width, 3), dtype=np.uint8)
+
+ def test_encode_and_decode(self):
+ """Checks decoding of a created tf.Example for semantic segmentation."""
+ test_image_height = 20
+ test_image_width = 15
+ filename = 'dummy'
+
+ image = self._create_test_image(test_image_height, test_image_width)
+ # Take the first channel as a dummy label.
+ label = image[..., 0]
+
+ example = data_utils.create_tfexample(
+ image_data=_encode_png_image(image),
+ image_format='png', filename=filename,
+ label_data=_encode_png_image(label), label_format='png')
+
+ # Parse the created example, expecting identical results.
+ parser = data_utils.SegmentationDecoder(is_panoptic_dataset=False)
+ parsed_tensors = parser(example.SerializeToString())
+
+ self.assertIn('image', parsed_tensors)
+ self.assertIn('image_name', parsed_tensors)
+ self.assertIn('label', parsed_tensors)
+ self.assertEqual(filename, parsed_tensors['image_name'])
+ np.testing.assert_array_equal(image, parsed_tensors['image'].numpy())
+ # The decoded label is a 3-D array with a last dimension of 1.
+ decoded_label = parsed_tensors['label'].numpy()
+ np.testing.assert_array_equal(label, decoded_label[..., 0])
+
+ def test_encode_and_decode_panoptic(self):
+ test_image_height = 31
+ test_image_width = 17
+ filename = 'dummy'
+
+ image = self._create_test_image(test_image_height, test_image_width)
+ # Create a dummy panoptic label in np.int32 dtype.
+ label = np.dot(image.astype(np.int32), [1, 256, 256 * 256]).astype(np.int32)
+ example = data_utils.create_tfexample(
+ image_data=_encode_png_image(image),
+ image_format='png', filename=filename,
+ label_data=label.tobytes(), label_format='raw')
+
+ parser = data_utils.SegmentationDecoder(is_panoptic_dataset=True)
+ parsed_tensors = parser(example.SerializeToString())
+
+ self.assertIn('image', parsed_tensors)
+ self.assertIn('image_name', parsed_tensors)
+ self.assertIn('label', parsed_tensors)
+ self.assertEqual(filename, parsed_tensors['image_name'])
+ np.testing.assert_array_equal(image, parsed_tensors['image'].numpy())
+ # The decoded label is a 3-D array with a last dimension of 1.
+ decoded_label = parsed_tensors['label'].numpy()
+ np.testing.assert_array_equal(label, decoded_label[..., 0])
+
+
+if __name__ == '__main__':
+ tf.test.main()
diff --git a/data/dataloader/__init__.py b/data/dataloader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03
--- /dev/null
+++ b/data/dataloader/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/data/dataloader/input_reader.py b/data/dataloader/input_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbf384e6f7ff8e3188bf2f98af9d8a14bee15a59
--- /dev/null
+++ b/data/dataloader/input_reader.py
@@ -0,0 +1,91 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Input reader to load segmentation datasets."""
+
+import tensorflow as tf
+
+_NUM_INPUTS_PROCESSED_CONCURRENTLY = 32
+_SHUFFLE_BUFFER_SIZE = 1000
+
+
+class InputReader(object):
+ """Input function that creates a dataset from files."""
+
+ def __init__(self,
+ file_pattern,
+ decoder_fn,
+ generator_fn=None,
+ is_training=False):
+ """Initializes the input reader.
+
+ Args:
+ file_pattern: The file pattern for the data examples, in TFRecord format.
+ decoder_fn: A callable that takes a serialized tf.Example and produces
+ parsed (and potentially processed / augmented) tensors.
+ generator_fn: An optional `callable` that takes the decoded raw tensors
+ dict and generates a ground-truth dictionary that can be consumed by
+ the model. It will be executed after decoder_fn (default: None).
+ is_training: Whether this dataset is used for training (default: False).
+ """
+ self._file_pattern = file_pattern
+ self._is_training = is_training
+ self._decoder_fn = decoder_fn
+ self._generator_fn = generator_fn
+
+ def __call__(self, batch_size=1, max_num_examples=-1):
+ """Provides a tf.data.Dataset object.
+
+ Args:
+ batch_size: Expected batch size of the input data.
+ max_num_examples: Positive integer or -1. If positive, the returned
+ dataset will only take (at most) this number of examples and raise
+ tf.errors.OutOfRangeError after that (default: -1).
+
+ Returns:
+ A tf.data.Dataset object.
+ """
+ dataset = tf.data.Dataset.list_files(self._file_pattern)
+
+ if self._is_training:
+ # File-level shuffle.
+ dataset = dataset.shuffle(dataset.cardinality(),
+ reshuffle_each_iteration=True)
+ dataset = dataset.repeat()
+
+ # During training, interleave TFRecord conversion for maximum efficiency.
+ # During evaluation, read input in consecutive order for tasks requiring
+ # such behavior.
+ dataset = dataset.interleave(
+ map_func=tf.data.TFRecordDataset,
+ cycle_length=(_NUM_INPUTS_PROCESSED_CONCURRENTLY
+ if self._is_training else 1),
+ num_parallel_calls=tf.data.experimental.AUTOTUNE,
+ deterministic=not self._is_training)
+
+ if self._is_training:
+ dataset = dataset.shuffle(_SHUFFLE_BUFFER_SIZE)
+ if max_num_examples > 0:
+ dataset = dataset.take(max_num_examples)
+
+ # Parses the fetched records to input tensors for the model function.
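+ # (In this codebase, decoder_fn is typically a SegmentationDecoder instance
+ # and generator_fn an optional sample generator layered on top of it; both
+ # are applied element-wise by the tf.data map calls below.)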
+ dataset = dataset.map(
+ self._decoder_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+ if self._generator_fn is not None:
+ dataset = dataset.map(
+ self._generator_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+ dataset = dataset.batch(batch_size, drop_remainder=True)
+ dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+ return dataset
diff --git a/data/dataset.py b/data/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6257830d5c676c89f1a0f6fbb1066e12cf7c8ad
--- /dev/null
+++ b/data/dataset.py
@@ -0,0 +1,228 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Provides data from segmentation datasets.
+
+Currently, we support the following datasets:
+
+1. Cityscapes dataset (https://www.cityscapes-dataset.com).
+
+The Cityscapes dataset contains 19 semantic labels (such as road, person, car,
+and so on) for urban street scenes.
+
+
+2. KITTI-STEP (http://www.cvlibs.net/datasets/kitti/).
+
+The KITTI-STEP enriches the KITTI-MOTS data with additional `stuff'
+annotations.
+
+3. MOTChallenge-STEP (https://motchallenge.net/).
+
+The MOTChallenge-STEP enriches the MOTSChallenge data with additional `stuff'
+annotations.
+
+4. MSCOCO panoptic segmentation (http://cocodataset.org/#panoptic-2018).
+
+Panoptic segmentation annotations for the MSCOCO dataset. Note that we convert
+the provided MSCOCO panoptic segmentation format to the following one:
+panoptic label = semantic label * 256 + instance id.
+
+5. Cityscapes-DVPS (https://github.com/joe-siyuan-qiao/ViP-DeepLab).
+
+The Cityscapes-DVPS dataset augments Cityscapes-VPS
+(https://github.com/mcahny/vps) with depth annotations.
+
+
+References:
+
+- Marius Cordts, Mohamed Omran, Sebastian Ramos, Timo Rehfeld, Markus
+ Enzweiler, Rodrigo Benenson, Uwe Franke, Stefan Roth, and Bernt Schiele, "The
+ Cityscapes Dataset for Semantic Urban Scene Understanding." In CVPR, 2016.
+
+- Andreas Geiger, Philip Lenz, and Raquel Urtasun, "Are we ready for
+ Autonomous Driving? The KITTI Vision Benchmark Suite." In CVPR, 2012.
+
+- Alexander Kirillov, Kaiming He, Ross Girshick, Carsten Rother, and Piotr
+ Dollar, "Panoptic Segmentation." In CVPR, 2019.
+
+- Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B.
+ Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollar, and C.
+ Lawrence Zitnick, "Microsoft COCO: common objects in context." In ECCV, 2014.
+
+- Anton Milan, Laura Leal-Taixe, Ian Reid, Stefan Roth, and Konrad Schindler,
+ "MOT16: A benchmark for multi-object tracking." arXiv:1603.00831, 2016.
+
+- Paul Voigtlaender, Michael Krause, Aljosa Osep, Jonathon Luiten, Berin
+ Balachandar Gnana Sekar, Andreas Geiger, and Bastian Leibe. "MOTS:
+ Multi-object tracking and segmentation."
In CVPR, 2019.
+
+- Mark Weber, Jun Xie, Maxwell Collins, Yukun Zhu, Paul Voigtlaender, Hartwig
+ Adam, Bradley Green, Andreas Geiger, Bastian Leibe, Daniel Cremers, Aljosa
+ Osep, Laura Leal-Taixe, and Liang-Chieh Chen, "STEP: Segmenting and Tracking
+ Every Pixel." arXiv:2102.11859, 2021.
+
+- Dahun Kim, Sanghyun Woo, Joon-Young Lee, and In So Kweon. "Video panoptic
+ segmentation." In CVPR, 2020.
+
+- Siyuan Qiao, Yukun Zhu, Hartwig Adam, Alan Yuille, and Liang-Chieh Chen.
+ "ViP-DeepLab: Learning Visual Perception with Depth-aware Video Panoptic
+ Segmentation." In CVPR, 2021.
+"""
+
+import collections
+
+
+# Dataset names.
+_CITYSCAPES = 'cityscapes'
+_CITYSCAPES_PANOPTIC = 'cityscapes_panoptic'
+_KITTI_STEP = 'kitti_step'
+_MOTCHALLENGE_STEP = 'motchallenge_step'
+_CITYSCAPES_DVPS = 'cityscapes_dvps'
+_COCO_PANOPTIC = 'coco_panoptic'
+
+# Colormap names.
+_CITYSCAPES_COLORMAP = 'cityscapes'
+_MOTCHALLENGE_COLORMAP = 'motchallenge'
+_COCO_COLORMAP = 'coco'
+
+
+# Named tuple to describe dataset properties.
+DatasetDescriptor = collections.namedtuple(
+ 'DatasetDescriptor', [
+ 'dataset_name', # Dataset name.
+ 'splits_to_sizes', # Splits of the dataset into training, val and test.
+ 'num_classes', # Number of semantic classes.
+ 'ignore_label', # Ignore label value used for semantic segmentation.
+
+ # Fields below are used for panoptic segmentation and will be None for
+ # semantic segmentation datasets.
+ # Label divisor is only used in panoptic segmentation annotations to
+ # infer the semantic label and instance id.
+ 'panoptic_label_divisor',
+ # A tuple of classes that contain instance annotations. For example, the
+ # 'person' class has instance annotations while 'sky' does not.
+ 'class_has_instances_list',
+ # A flag indicating whether the dataset is a video dataset that contains
+ # sequence IDs and frame IDs.
+ 'is_video_dataset',
+ # A string specifying the colormap that should be used for
+ # visualization. E.g. 'cityscapes'.
+ 'colormap',
+ # A flag indicating whether the dataset contains depth annotations.
+ 'is_depth_dataset',
+ ]
+)
+
+CITYSCAPES_INFORMATION = DatasetDescriptor(
+ dataset_name=_CITYSCAPES,
+ splits_to_sizes={'train_fine': 2975,
+ 'train_coarse': 22973,
+ 'trainval_fine': 3475,
+ 'trainval_coarse': 23473,
+ 'val_fine': 500,
+ 'test_fine': 1525},
+ num_classes=19,
+ ignore_label=255,
+ panoptic_label_divisor=None,
+ class_has_instances_list=None,
+ is_video_dataset=False,
+ colormap=_CITYSCAPES_COLORMAP,
+ is_depth_dataset=False,
+)
+
+CITYSCAPES_PANOPTIC_INFORMATION = DatasetDescriptor(
+ dataset_name=_CITYSCAPES_PANOPTIC,
+ splits_to_sizes={'train_fine': 2975,
+ 'val_fine': 500,
+ 'trainval_fine': 3475,
+ 'test_fine': 1525},
+ num_classes=19,
+ ignore_label=255,
+ panoptic_label_divisor=1000,
+ class_has_instances_list=tuple(range(11, 19)),
+ is_video_dataset=False,
+ colormap=_CITYSCAPES_COLORMAP,
+ is_depth_dataset=False,
+)
+
+KITTI_STEP_INFORMATION = DatasetDescriptor(
+ dataset_name=_KITTI_STEP,
+ splits_to_sizes={'train': 5027,
+ 'val': 2981,
+ 'test': 11095},
+ num_classes=19,
+ ignore_label=255,
+ panoptic_label_divisor=1000,
+ class_has_instances_list=(11, 13),
+ is_video_dataset=True,
+ colormap=_CITYSCAPES_COLORMAP,
+ is_depth_dataset=False,
+)
+
+MOTCHALLENGE_STEP_INFORMATION = DatasetDescriptor(
+ dataset_name=_MOTCHALLENGE_STEP,
+ splits_to_sizes={'train': 525, # Sequence 9.
+ 'val': 600, # Sequence 2.
+ 'test': 0},
+ num_classes=7,
+ ignore_label=255,
+ panoptic_label_divisor=1000,
+ class_has_instances_list=(4,),
+ is_video_dataset=True,
+ colormap=_MOTCHALLENGE_COLORMAP,
+ is_depth_dataset=False,
+)
+
+CITYSCAPES_DVPS_INFORMATION = DatasetDescriptor(
+ dataset_name=_CITYSCAPES_DVPS,
+ # The numbers of images are 2400/300/300 for train/val/test. Here, the
+ # sizes are the number of consecutive frame pairs. As each sequence has 6
+ # frames, the number of pairs for the train split is 2400 / 6 * 5 = 2000.
+ # Similarly, we get 250 pairs for the val split and the test split.
+ splits_to_sizes={'train': 2000,
+ 'val': 250,
+ 'test': 250},
+ num_classes=19,
+ ignore_label=255,
+ panoptic_label_divisor=1000,
+ class_has_instances_list=tuple(range(11, 19)),
+ is_video_dataset=True,
+ colormap=_CITYSCAPES_COLORMAP,
+ is_depth_dataset=True,
+)
+
+COCO_PANOPTIC_INFORMATION = DatasetDescriptor(
+ dataset_name=_COCO_PANOPTIC,
+ splits_to_sizes={'train': 118287,
+ 'val': 5000,
+ 'test': 40670},
+ num_classes=134,
+ ignore_label=0,
+ panoptic_label_divisor=256,
+ class_has_instances_list=tuple(range(1, 81)),
+ is_video_dataset=False,
+ colormap=_COCO_COLORMAP,
+ is_depth_dataset=False,
+)
+
+MAP_NAME_TO_DATASET_INFO = {
+ _CITYSCAPES: CITYSCAPES_INFORMATION,
+ _CITYSCAPES_PANOPTIC: CITYSCAPES_PANOPTIC_INFORMATION,
+ _KITTI_STEP: KITTI_STEP_INFORMATION,
+ _MOTCHALLENGE_STEP: MOTCHALLENGE_STEP_INFORMATION,
+ _CITYSCAPES_DVPS: CITYSCAPES_DVPS_INFORMATION,
+ _COCO_PANOPTIC: COCO_PANOPTIC_INFORMATION,
+}
+
+MAP_NAMES = list(MAP_NAME_TO_DATASET_INFO.keys())
diff --git a/data/dataset_utils.py b/data/dataset_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..167b30a6182cd49ee35f9b3245bf5f0cd9c810a6
--- /dev/null
+++ b/data/dataset_utils.py
@@ -0,0 +1,71 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains utility functions for handling the dataset."""
+
+import tensorflow as tf
+
+
+def get_semantic_and_panoptic_label(dataset_info, label, ignore_label):
+ """Helper function to get semantic and panoptic labels from a panoptic label.
+
+ This function gets the semantic and panoptic label from a panoptic label for
+ different datasets. The labels must be encoded with semantic_label *
+ label_divisor + instance_id. For thing classes, the instance ID 0 is reserved
+ for crowd regions. Please note that in the returned panoptic label, crowd
+ regions are replaced with ignore regions, while the semantic label still
+ makes use of these regions.
+
+ Args:
+ dataset_info: A dictionary storing dataset information.
+ label: A Tensor of panoptic label.
+ ignore_label: An integer specifying the ignore_label.
+
+ Returns:
+ semantic_label: A Tensor of semantic segmentation label.
+ panoptic_label: A Tensor of panoptic segmentation label, which follows the
+ Cityscapes annotation where
+ panoptic_label = semantic_label * panoptic_label_divisor + instance_id.
+ thing_mask: A boolean Tensor specifying the thing regions. Zero if no thing.
+ crowd_region: A boolean Tensor specifying the crowd region. Zero if no crowd
+ annotation.
+
+ Raises:
+ ValueError: An error occurs when the ignore_label is not in the range
+ [0, panoptic_label_divisor).
+ """
+ panoptic_label_divisor = dataset_info['panoptic_label_divisor']
+ if ignore_label >= panoptic_label_divisor or ignore_label < 0:
+ raise ValueError('The ignore_label must be in [0, panoptic_label_divisor).')
+
+ semantic_label = label // panoptic_label_divisor
+ # Find the iscrowd region, if any, and set it to ignore for panoptic labels.
+ # 1. Find the thing mask.
+ thing_mask = tf.zeros_like(semantic_label, tf.bool)
+ for thing_id in dataset_info['class_has_instances_list']:
+ thing_mask = tf.logical_or(
+ thing_mask,
+ tf.equal(semantic_label, thing_id))
+ # 2. Find the crowd region (thing labels that have instance_id == 0).
+ crowd_region = tf.logical_and(
+ thing_mask,
+ tf.equal(label % panoptic_label_divisor, 0))
+ # 3. Set the crowd region to the ignore label.
+ panoptic_label = tf.where(
+ crowd_region,
+ tf.ones_like(label) * ignore_label * panoptic_label_divisor,
+ label)
+
+ return semantic_label, panoptic_label, thing_mask, crowd_region
diff --git a/data/dataset_utils_test.py b/data/dataset_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7538bf0cc6f7199ecbffee8be1e2d70a97b1524
--- /dev/null
+++ b/data/dataset_utils_test.py
@@ -0,0 +1,90 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for dataset_utils."""
+
+import numpy as np
+import tensorflow as tf
+
+from deeplab2.data import dataset_utils
+
+
+class DatasetUtilsTest(tf.test.TestCase):
+
+ def _get_test_labels(self, num_classes, shape, label_divisor):
+ num_ids_per_class = 35
+ semantic_labels = np.random.randint(num_classes, size=shape)
+ panoptic_labels = np.random.randint(
+ num_ids_per_class, size=shape) + semantic_labels * label_divisor
+
+ semantic_labels = tf.convert_to_tensor(semantic_labels, dtype=tf.int32)
+ panoptic_labels = tf.convert_to_tensor(panoptic_labels, dtype=tf.int32)
+
+ return panoptic_labels, semantic_labels
+
+ def setUp(self):
+ super().setUp()
+ self._first_thing_class = 9
+ self._num_classes = 19
+ self._dataset_info = {
+ 'panoptic_label_divisor': 1000,
+ 'class_has_instances_list': tf.range(self._first_thing_class,
+ self._num_classes)
+ }
+ self._num_ids = 37
+ self._labels, self._semantic_classes = self._get_test_labels(
+ self._num_classes, [2, 33, 33],
+ self._dataset_info['panoptic_label_divisor'])
+
+ def test_get_panoptic_and_semantic_label(self):
+ # Note: self._labels contains one crowd instance per class.
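+ # (For intuition: a crowd pixel labeled 12 * 1000 + 0 keeps semantic class
+ # 12, but its returned panoptic label becomes 255 * 1000, i.e. the ignore
+ # label times the label divisor.)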
+ (returned_sem_labels, returned_pan_labels, returned_thing_mask,
+ returned_crowd_region) = (
+ dataset_utils.get_semantic_and_panoptic_label(
+ self._dataset_info, self._labels, ignore_label=255))
+
+ expected_semantic_labels = self._semantic_classes
+ condition = self._labels % self._dataset_info['panoptic_label_divisor'] == 0
+ condition = tf.logical_and(
+ condition,
+ tf.math.greater_equal(expected_semantic_labels,
+ self._first_thing_class))
+ expected_crowd_labels = tf.where(condition, 1.0, 0.0)
+ expected_pan_labels = tf.where(
+ condition, 255 * self._dataset_info['panoptic_label_divisor'],
+ self._labels)
+ expected_thing_mask = tf.where(
+ tf.math.greater_equal(expected_semantic_labels,
+ self._first_thing_class), 1.0, 0.0)
+
+ self.assertListEqual(returned_sem_labels.shape.as_list(),
+ expected_semantic_labels.shape.as_list())
+ self.assertListEqual(returned_pan_labels.shape.as_list(),
+ expected_pan_labels.shape.as_list())
+ self.assertListEqual(returned_crowd_region.shape.as_list(),
+ expected_crowd_labels.shape.as_list())
+ self.assertListEqual(returned_thing_mask.shape.as_list(),
+ expected_thing_mask.shape.as_list())
+ np.testing.assert_equal(returned_sem_labels.numpy(),
+ expected_semantic_labels.numpy())
+ np.testing.assert_equal(returned_pan_labels.numpy(),
+ expected_pan_labels.numpy())
+ np.testing.assert_equal(returned_crowd_region.numpy(),
+ expected_crowd_labels.numpy())
+ np.testing.assert_equal(returned_thing_mask.numpy(),
+ expected_thing_mask.numpy())
+
+
+if __name__ == '__main__':
+ tf.test.main()
diff --git a/data/preprocessing/__init__.py b/data/preprocessing/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03
--- /dev/null
+++ b/data/preprocessing/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/data/preprocessing/autoaugment_policy.py b/data/preprocessing/autoaugment_policy.py
new file mode 100644
index 0000000000000000000000000000000000000000..cec895228580f1ea0f4f3b9e96ccb6d5bf288113
--- /dev/null
+++ b/data/preprocessing/autoaugment_policy.py
@@ -0,0 +1,75 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""AutoAugment policy file.
+
+This file contains the AutoAugment policies found by the search.
+
+Please cite or refer to the following papers for details:
+- Ekin D Cubuk, Barret Zoph, Dandelion Mane, Vijay Vasudevan, and Quoc V Le.
+"Autoaugment: Learning augmentation policies from data." In CVPR, 2019. + +- Ekin D Cubuk, Barret Zoph, Jonathon Shlens, and Quoc V Le. +"Randaugment: Practical automated data augmentation with a reduced search +space." In CVPR, 2020. +""" + +# Reduced augmentation operation space. +augmentation_reduced_operations = ( + 'AutoContrast', 'Equalize', 'Invert', 'Posterize', + 'Solarize', 'Color', 'Contrast', 'Brightness', 'Sharpness') + +augmentation_probabilities = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + + +def convert_policy(policy, + search_space=augmentation_reduced_operations, + probability_scale=1.0, + magnitude_scale=1): + """Converts policy from a list of numbers.""" + if len(policy) % 6: + raise ValueError('Policy length must be a multiple of 6.') + num_policies = len(policy) // 6 + policy_list = [[] for _ in range(num_policies)] + for n in range(num_policies): + for i in range(2): + operation_id, prob_id, magnitude = ( + policy[6 * n + i * 3 : 6 * n + (i + 1) * 3]) + policy_name = search_space[operation_id] + policy_prob = ( + augmentation_probabilities[prob_id] * probability_scale) + policy_list[n].append((policy_name, + policy_prob, + magnitude * magnitude_scale)) + return policy_list + + +simple_classification_policy = [8, 2, 7, 7, 1, 10, + 1, 0, 9, 6, 1, 10, + 8, 1, 9, 5, 1, 9, + 4, 1, 7, 1, 3, 9, + 8, 1, 1, 1, 1, 7] + +# All available policies. +available_policies = { + 'simple_classification_policy_magnitude_scale_0.2': convert_policy( + simple_classification_policy, + augmentation_reduced_operations, + magnitude_scale=0.2), + 'simple_classification_policy': convert_policy( + simple_classification_policy, + augmentation_reduced_operations, + magnitude_scale=1), +} diff --git a/data/preprocessing/autoaugment_policy_test.py b/data/preprocessing/autoaugment_policy_test.py new file mode 100644 index 0000000000000000000000000000000000000000..e9b02a38cb3def5a7277f2142ca5a27d0552cdcb --- /dev/null +++ b/data/preprocessing/autoaugment_policy_test.py @@ -0,0 +1,43 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for autoaugment_policy.py.""" + +import tensorflow as tf + +from deeplab2.data.preprocessing import autoaugment_policy + + +class AutoaugmentPolicyTest(tf.test.TestCase): + + def testConvertPolicy(self): + policy = [5, 1, 10, 5, 3, 4, + 6, 3, 7, 3, 3, 9, + 2, 2, 8, 8, 2, 8, + 1, 4, 9, 4, 5, 7, + 6, 4, 1, 1, 3, 4] + expected = [ + [('Color', 0.2, 10), ('Color', 0.6, 4)], + [('Contrast', 0.6, 7), ('Posterize', 0.6, 9)], + [('Invert', 0.4, 8), ('Sharpness', 0.4, 8)], + [('Equalize', 0.8, 9), ('Solarize', 1.0, 7)], + [('Contrast', 0.8, 1), ('Equalize', 0.6, 4)], + ] + policy_list = autoaugment_policy.convert_policy(policy) + self.assertAllEqual(policy_list, expected) + + +if __name__ == '__main__': + tf.test.main() diff --git a/data/preprocessing/autoaugment_utils.py b/data/preprocessing/autoaugment_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3600b51b1018fc399158d0c4ebe1e772975a5c6a --- /dev/null +++ b/data/preprocessing/autoaugment_utils.py @@ -0,0 +1,422 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AutoAugment utility file. + +Please cite or refer to the following papers: +- Ekin D Cubuk, Barret Zoph, Dandelion Mane, Vijay Vasudevan, and Quoc V Le. +"Autoaugment: Learning augmentation policies from data." In CVPR, 2019. + +- Ekin D Cubuk, Barret Zoph, Jonathon Shlens, and Quoc V Le. +"Randaugment: Practical automated data augmentation with a reduced search +space." In CVPR, 2020. +""" + +import inspect + +import tensorflow as tf + +from deeplab2.data.preprocessing import autoaugment_policy + + +# This signifies the max integer that the controller RNN could predict for the +# augmentation scheme. +_MAX_LEVEL = 10. + + +def blend(image1, image2, factor): + """Blends image1 and image2 using 'factor'. + + Factor can be above 0.0. A value of 0.0 means only image1 is used. + A value of 1.0 means only image2 is used. A value between 0.0 and + 1.0 means we linearly interpolate the pixel values between the two + images. A value greater than 1.0 "extrapolates" the difference + between the two pixel values, and we clip the results to values + between 0 and 255. + + Args: + image1: An image Tensor of type uint8. + image2: An image Tensor of type uint8. + factor: A floating point value above 0.0. + + Returns: + A blended image Tensor of type uint8. + """ + if factor == 0.0: + return tf.convert_to_tensor(image1) + if factor == 1.0: + return tf.convert_to_tensor(image2) + + image1 = tf.cast(image1, tf.float32) + image2 = tf.cast(image2, tf.float32) + + difference = image2 - image1 + scaled = factor * difference + + # Do addition in float. + temp = tf.cast(image1, tf.float32) + scaled + + # Interpolate + if factor > 0.0 and factor < 1.0: + # Interpolation means we always stay within 0 and 255. + return tf.cast(temp, tf.uint8) + + # Extrapolate: + # + # We need to clip and then cast. 
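+ # (Worked example: factor=1.5 with image1=100 and image2=200 extrapolates to
+ # 100 + 1.5 * 100 = 250, which survives the clip; factor=2.0 would give 300
+ # and be clipped to 255.)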
+ return tf.cast(tf.clip_by_value(temp, 0.0, 255.0), tf.uint8) + + +def solarize(image, threshold=128): + # For each pixel in the image, select the pixel + # if the value is less than the threshold. + # Otherwise, subtract 255 from the pixel. + return tf.where(image < threshold, image, 255 - image) + + +def invert(image): + """Inverts the image pixels.""" + image = tf.convert_to_tensor(image) + return 255 - image + + +def color(image, factor): + """Equivalent of PIL Color.""" + degenerate = tf.image.grayscale_to_rgb(tf.image.rgb_to_grayscale(image)) + return blend(degenerate, image, factor) + + +def contrast(image, factor): + """Equivalent of PIL Contrast.""" + degenerate = tf.image.rgb_to_grayscale(image) + # Cast before calling tf.histogram. + degenerate = tf.cast(degenerate, tf.int32) + + # Compute the grayscale histogram, then compute the mean pixel value, + # and create a constant image size of that value. Use that as the + # blending degenerate target of the original image. + hist = tf.histogram_fixed_width(degenerate, [0, 255], nbins=256) + mean = tf.reduce_sum(tf.cast(hist, tf.float32)) / 256.0 + degenerate = tf.ones_like(degenerate, dtype=tf.float32) * mean + degenerate = tf.clip_by_value(degenerate, 0.0, 255.0) + degenerate = tf.image.grayscale_to_rgb(tf.cast(degenerate, tf.uint8)) + return blend(degenerate, image, factor) + + +def brightness(image, factor): + """Equivalent of PIL Brightness.""" + degenerate = tf.zeros_like(image) + return blend(degenerate, image, factor) + + +def posterize(image, bits): + """Equivalent of PIL Posterize.""" + shift = 8 - bits + return tf.bitwise.left_shift(tf.bitwise.right_shift(image, shift), shift) + + +def autocontrast(image): + """Implements Autocontrast function from PIL using TF ops. + + Args: + image: A 3D uint8 tensor. + + Returns: + The image after it has had autocontrast applied to it and will be of type + uint8. + """ + + def scale_channel(image): + """Scale the 2D image using the autocontrast rule.""" + # A possibly cheaper version can be done using cumsum/unique_with_counts + # over the histogram values, rather than iterating over the entire image. + # to compute mins and maxes. + lo = tf.cast(tf.reduce_min(image), tf.float32) + hi = tf.cast(tf.reduce_max(image), tf.float32) + + # Scale the image, making the lowest value 0 and the highest value 255. + def scale_values(im): + scale = 255.0 / (hi - lo) + offset = -lo * scale + im = tf.cast(im, tf.float32) * scale + offset + im = tf.clip_by_value(im, 0.0, 255.0) + return tf.cast(im, tf.uint8) + + result = tf.cond(hi > lo, lambda: scale_values(image), lambda: image) + return result + + # Assumes RGB for now. Scales each channel independently + # and then stacks the result. + s1 = scale_channel(image[:, :, 0]) + s2 = scale_channel(image[:, :, 1]) + s3 = scale_channel(image[:, :, 2]) + image = tf.stack([s1, s2, s3], 2) + return image + + +def sharpness(image, factor): + """Implements Sharpness function from PIL using TF ops.""" + orig_image = image + image = tf.cast(image, tf.float32) + # Make image 4D for conv operation. + image = tf.expand_dims(image, 0) + # SMOOTH PIL Kernel. + kernel = tf.constant( + [[1, 1, 1], [1, 5, 1], [1, 1, 1]], dtype=tf.float32, + shape=[3, 3, 1, 1]) / 13. + # Tile across channel dimension. 
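+ # (After tiling, the kernel has shape [3, 3, 3, 1]: one 3x3 smoothing filter
+ # per RGB channel, matching the [height, width, in_channels, multiplier]
+ # filter layout expected by tf.nn.depthwise_conv2d.)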
+ kernel = tf.tile(kernel, [1, 1, 3, 1]) + strides = [1, 1, 1, 1] + degenerate = tf.nn.depthwise_conv2d( + image, kernel, strides, padding='VALID', dilations=[1, 1]) + degenerate = tf.clip_by_value(degenerate, 0.0, 255.0) + degenerate = tf.squeeze(tf.cast(degenerate, tf.uint8), [0]) + + # For the borders of the resulting image, fill in the values of the + # original image. + mask = tf.ones_like(degenerate) + padded_mask = tf.pad(mask, [[1, 1], [1, 1], [0, 0]]) + padded_degenerate = tf.pad(degenerate, [[1, 1], [1, 1], [0, 0]]) + result = tf.where(tf.equal(padded_mask, 1), padded_degenerate, orig_image) + + # Blend the final result. + return blend(result, orig_image, factor) + + +def equalize(image): + """Implements Equalize function from PIL using TF ops.""" + def scale_channel(im, c): + """Scale the data in the channel to implement equalize.""" + im = tf.cast(im[:, :, c], tf.int32) + # Compute the histogram of the image channel. + histo = tf.histogram_fixed_width(im, [0, 255], nbins=256) + + # For the purposes of computing the step, filter out the nonzeros. + nonzero = tf.where(tf.not_equal(histo, 0)) + nonzero_histo = tf.reshape(tf.gather(histo, nonzero), [-1]) + step = (tf.reduce_sum(nonzero_histo) - nonzero_histo[-1]) // 255 + + def build_lut(histo, step): + # Compute the cumulative sum, shifting by step // 2 + # and then normalization by step. + lut = (tf.cumsum(histo) + (step // 2)) // step + # Shift lut, prepending with 0. + lut = tf.concat([[0], lut[:-1]], 0) + # Clip the counts to be in range. This is done + # in the C code for image.point. + return tf.clip_by_value(lut, 0, 255) + + # If step is zero, return the original image. Otherwise, build + # lut from the full histogram and step and then index from it. + result = tf.cond(tf.equal(step, 0), + lambda: im, + lambda: tf.gather(build_lut(histo, step), im)) + + return tf.cast(result, tf.uint8) + + # Assumes RGB for now. Scales each channel independently + # and then stacks the result. + s1 = scale_channel(image, 0) + s2 = scale_channel(image, 1) + s3 = scale_channel(image, 2) + image = tf.stack([s1, s2, s3], 2) + return image + + +NAME_TO_FUNC = { + 'AutoContrast': autocontrast, + 'Equalize': equalize, + 'Invert': invert, + 'Posterize': posterize, + 'Solarize': solarize, + 'Color': color, + 'Contrast': contrast, + 'Brightness': brightness, + 'Sharpness': sharpness, +} + + +def _enhance_level_to_arg(level): + return ((level/_MAX_LEVEL) * 1.8 + 0.1,) + + +def level_to_arg(): + return { + 'AutoContrast': + lambda level: (), + 'Equalize': + lambda level: (), + 'Invert': + lambda level: (), + 'Posterize': lambda level: (int((level/_MAX_LEVEL) * 4),), + 'Solarize': lambda level: (int((level/_MAX_LEVEL) * 256),), + 'Color': + _enhance_level_to_arg, + 'Contrast': + _enhance_level_to_arg, + 'Brightness': + _enhance_level_to_arg, + 'Sharpness': + _enhance_level_to_arg, + } + + +def label_wrapper(func): + """Adds a label function argument to func and returns unchanged label.""" + def wrapper(images, label, *args, **kwargs): + return func(images, *args, **kwargs), label + return wrapper + + +def _parse_policy_info(name, prob, level, replace_value, ignore_label): + """Returns the function corresponding to `name` and update `level` param.""" + func = NAME_TO_FUNC[name] + args = level_to_arg()[name](level) + + if 'prob' in inspect.getfullargspec(func)[0]: + args = tuple([prob] + list(args)) + + # Add in replace arg if it is required for the function that is being called. 
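+ # (None of the color operations registered in NAME_TO_FUNC above declare
+ # 'replace' or 'ignore_label' parameters; this branch is presumably retained
+ # for parity with the fuller AutoAugment operation set, whose geometric ops
+ # need a fill value.)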
+ if 'replace' in inspect.getfullargspec(func)[0]:
+ # Make sure ignore_label is also in the arguments.
+ assert 'ignore_label' in inspect.getfullargspec(func)[0]
+ # Make sure replace is the second-from-last argument.
+ assert 'replace' == inspect.getfullargspec(func)[0][-2]
+ # Make sure ignore_label is the final argument.
+ assert 'ignore_label' == inspect.getfullargspec(func)[0][-1]
+ args = tuple(list(args) + [replace_value, ignore_label])
+
+ # Add label as the second positional argument for the function if it does
+ # not already exist.
+ if 'label' not in inspect.getfullargspec(func)[0]:
+ func = label_wrapper(func)
+ return (func, prob, args)
+
+
+def _apply_func_with_prob(func, image, args, prob, label):
+ """Apply `func` to image w/ `args` as input with probability `prob`."""
+ assert isinstance(args, tuple)
+ assert 'label' == inspect.getfullargspec(func)[0][1]
+
+ # If prob is a function argument, then this randomness is being handled
+ # inside the function, so make sure it is always called.
+ if 'prob' in inspect.getfullargspec(func)[0]:
+ prob = 1.0
+
+ # Apply the function with probability `prob`.
+ should_apply_op = tf.cast(
+ tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool)
+ augmented_image, augmented_label = tf.cond(
+ should_apply_op,
+ lambda: func(image, label, *args),
+ lambda: (image, label))
+ return augmented_image, augmented_label
+
+
+def select_and_apply_random_policy(policies, image, label):
+ """Selects a random policy from `policies` and applies it to `image`."""
+ policy_to_select = tf.random.uniform([], maxval=len(policies), dtype=tf.int32)
+ # Note that using tf.case instead of tf.conds would result in significantly
+ # larger graphs and would even break export for some larger policies.
+ for (i, policy) in enumerate(policies):
+ image, label = tf.cond(
+ tf.equal(i, policy_to_select),
+ lambda selected_policy=policy: selected_policy(image, label),
+ lambda: (image, label))
+ return (image, label)
+
+
+def build_and_apply_autoaugment_policy(policies, image, label, ignore_label):
+ """Builds a policy from the given policies and applies it to the image.
+
+ Args:
+ policies: list of lists of tuples in the form `(func, prob, level)`, `func`
+ is a string name of the augmentation function, `prob` is the probability
+ of applying the `func` operation, `level` is the input argument for
+ `func`.
+ image: tf.Tensor that the resulting policy will be applied to.
+ label: tf.Tensor that the resulting policy will be applied to.
+ ignore_label: The label value which will be ignored for training and
+ evaluation.
+
+ Returns:
+ A version of image that now has data augmentation applied to it based on
+ the `policies` passed into the function, together with the correspondingly
+ augmented label.
+ """
+ replace_value = [128, 128, 128]
+
+ # func is the string name of the augmentation function, prob is the
+ # probability of applying the operation and level is the parameter associated
+ # with the tf op.
+
+ # tf_policies are functions that take in an image and return an augmented
+ # image.
+ tf_policies = []
+ for policy in policies:
+ tf_policy = []
+ # Link string name to the correct python function and make sure the correct
+ # argument is passed into that function.
+ for policy_info in policy:
+ policy_info = (
+ list(policy_info) + [replace_value, ignore_label])
+
+ tf_policy.append(_parse_policy_info(*policy_info))
+ # Now build the tf policy that will apply the augmentation procedure
+ # on image.
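+ # (make_final_policy exists to bind tf_policy by value; a bare lambda here
+ # would close over the loop variable, and every entry of tf_policies would
+ # then apply the last policy.)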
+ def make_final_policy(tf_policy_):
+ def final_policy(image_, label_):
+ for func, prob, args in tf_policy_:
+ image_, label_ = _apply_func_with_prob(
+ func, image_, args, prob, label_)
+ return image_, label_
+ return final_policy
+ tf_policies.append(make_final_policy(tf_policy))
+
+ augmented_images, augmented_label = select_and_apply_random_policy(
+ tf_policies, image, label)
+ # Return the augmented image and label.
+ return (augmented_images, augmented_label)
+
+
+def distort_image_with_autoaugment(image,
+ label,
+ ignore_label,
+ augmentation_name=None):
+ """Applies the AutoAugment policy to `image` and `label`.
+
+ Args:
+ image: `Tensor` of shape [height, width, 3] representing an image.
+ label: `Tensor` of shape [height, width, 1] representing a label.
+ ignore_label: The label value which will be ignored for training and
+ evaluation.
+ augmentation_name: The name of the AutoAugment policy to use. See
+ autoaugment_policy.py for available_policies.
+
+ Returns:
+ A tuple containing the augmented versions of `image` and `label`.
+
+ Raises:
+ ValueError: If the augmentation_name is not in available_policies.
+ """
+ if augmentation_name:
+ available_policies = autoaugment_policy.available_policies
+ if augmentation_name not in available_policies:
+ raise ValueError(
+ 'Invalid augmentation_name: {}'.format(augmentation_name))
+ policy = available_policies[augmentation_name]
+ return build_and_apply_autoaugment_policy(
+ policy, image, label, ignore_label)
+ return image, label
diff --git a/data/preprocessing/autoaugment_utils_test.py b/data/preprocessing/autoaugment_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5347198dd2cf21a4068c9df242497f63fa503f1b
--- /dev/null
+++ b/data/preprocessing/autoaugment_utils_test.py
@@ -0,0 +1,40 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for autoaugment_utils.py."""
+
+import numpy as np
+import tensorflow as tf
+
+from deeplab2.data.preprocessing import autoaugment_utils
+
+
+class AutoaugmentUtilsTest(tf.test.TestCase):
+
+ def testAugmentWithNamedPolicy(self):
+ num_classes = 3
+ np_image = np.random.randint(256, size=(13, 13, 3))
+ image = tf.constant(np_image, dtype=tf.uint8)
+ np_label = np.random.randint(num_classes, size=(13, 13, 1))
+ label = tf.constant(np_label, dtype=tf.int32)
+ image, label = autoaugment_utils.distort_image_with_autoaugment(
+ image, label, ignore_label=255,
+ augmentation_name='simple_classification_policy')
+ self.assertTrue(image.numpy().any())
+ self.assertTrue(label.numpy().any())
+
+
+if __name__ == '__main__':
+ tf.test.main()
diff --git a/data/preprocessing/input_preprocessing.py b/data/preprocessing/input_preprocessing.py
new file mode 100644
index 0000000000000000000000000000000000000000..e44b68b4aee7c0a87c27e9dc4e0db7d84ad1d731
--- /dev/null
+++ b/data/preprocessing/input_preprocessing.py
@@ -0,0 +1,307 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains functions to preprocess images and labels.""" + +import tensorflow as tf + +from deeplab2.data.preprocessing import autoaugment_utils +from deeplab2.data.preprocessing import preprocess_utils + +# The probability of flipping the images and labels +# left-right during training +_PROB_OF_FLIP = 0.5 + +_MEAN_PIXEL = [127.5, 127.5, 127.5] + + +def _pad_image_and_label(image, label, offset_height, offset_width, + target_height, target_width, ignore_label=None): + """Pads the image and the label to the given size. + + Args: + image: A tf.Tensor of shape [height, width, channels]. + label: A tf.Tensor of shape [height, width, 1] or None. + offset_height: The number of rows of zeros to add on top of the image and + label. + offset_width: The number of columns of zeros to add on the left of the image + and label. + target_height: The total height after padding. + target_width: The total width after padding. + ignore_label: The ignore_label for the label. Must only be set when label is + given. + + Returns: + The padded image and label as a tuple (padded_image, padded_label). + + Raises: + tf.errors.InvalidArgumentError: An error occurs if the padding configuration + is invalid. + ValueError: An error occurs if label is given without an ignore_label. + """ + height = tf.shape(image)[0] + width = tf.shape(image)[1] + original_dtype = image.dtype + if original_dtype not in (tf.float32, tf.float64): + image = tf.cast(image, tf.float32) + + bottom_padding = target_height - offset_height - height + right_padding = target_width - offset_width - width + + assert_bottom_padding = tf.assert_greater( + bottom_padding, -1, + 'The padding configuration is not valid. Please either increase the ' + 'target size or reduce the padding offset.') + assert_right_padding = tf.assert_greater( + right_padding, -1, 'The padding configuration is not valid. Please either' + ' increase the target size or reduce the padding offset.') + with tf.control_dependencies([assert_bottom_padding, assert_right_padding]): + paddings = [[offset_height, bottom_padding], [offset_width, right_padding], + [0, 0]] + + image = image - _MEAN_PIXEL + image = tf.pad(image, paddings) + image = image + _MEAN_PIXEL + image = tf.cast(image, original_dtype) + + if label is not None: + if ignore_label is None: + raise ValueError( + 'If a label is given, the ignore label must be set too.') + label = tf.pad(label, paddings, constant_values=ignore_label) + + return image, label + + +def _update_max_resize_value(max_resize_value, crop_size, is_inference=False): + """Checks and may update max_resize_value. + + Args: + max_resize_value: A 2-tuple of (height, width), maximum allowed value + after resize. If a single element is given, then height and width + share the same value. None, empty or having 0 indicates no maximum value + will be used. + crop_size: A 2-tuple of (height, width), crop size used. + is_inference: Boolean, whether the model is performing inference or not. 
+ + Returns: + Updated max_resize_value. + """ + max_resize_value = preprocess_utils.process_resize_value(max_resize_value) + if max_resize_value is None and is_inference: + # During inference, default max_resize_value to crop size to allow + # model taking input images with larger sizes. + max_resize_value = crop_size + + if max_resize_value is None: + return None + + if max_resize_value[0] > crop_size[0] or max_resize_value[1] > crop_size[1]: + raise ValueError( + 'Maximum resize value provided (%s) exceeds model crop size (%s)' % + (max_resize_value, crop_size)) + return max_resize_value + + +def preprocess_image_and_label(image, + label, + crop_height, + crop_width, + prev_image=None, + prev_label=None, + min_resize_value=None, + max_resize_value=None, + resize_factor=None, + min_scale_factor=1., + max_scale_factor=1., + scale_factor_step_size=0, + ignore_label=None, + is_training=True, + autoaugment_policy_name=None): + """Preprocesses the image and label. + + Args: + image: A tf.Tensor containing the image with shape [height, width, 3]. + label: A tf.Tensor containing the label with shape [height, width, 1] or + None. + crop_height: The height value used to crop the image and label. + crop_width: The width value used to crop the image and label. + prev_image: An optional tensor of shape [image_height, image_width, 3]. + prev_label: An optional tensor of shape [label_height, label_width, 1]. + min_resize_value: A 2-tuple of (height, width), desired minimum value + after resize. If a single element is given, then height and width share + the same value. None, empty or having 0 indicates no minimum value will + be used. + max_resize_value: A 2-tuple of (height, width), maximum allowed value + after resize. If a single element is given, then height and width + share the same value. None, empty or having 0 indicates no maximum value + will be used. + resize_factor: Resized dimensions are multiple of factor plus one. + min_scale_factor: Minimum scale factor for random scale augmentation. + max_scale_factor: Maximum scale factor for random scale augmentation. + scale_factor_step_size: The step size from min scale factor to max scale + factor. The input is randomly scaled based on the value of + (min_scale_factor, max_scale_factor, scale_factor_step_size). + ignore_label: The label value which will be ignored for training and + evaluation. + is_training: If the preprocessing is used for training or not. + autoaugment_policy_name: String, autoaugment policy name. See + autoaugment_policy.py for available policies. + + Returns: + resized_image: The resized input image without other augmentations as a + tf.Tensor. + processed_image: The preprocessed image as a tf.Tensor. + label: The preprocessed groundtruth segmentation label as a tf.Tensor. + + Raises: + ValueError: Ground truth label not provided during training. + """ + if is_training and label is None: + raise ValueError('During training, label must be provided.') + + image.get_shape().assert_is_compatible_with(tf.TensorShape([None, None, 3])) + + # Keep reference to original image. + resized_image = image + if prev_image is not None: + image = tf.concat([image, prev_image], axis=2) + processed_image = tf.cast(image, tf.float32) + processed_prev_image = None + + if label is not None: + label.get_shape().assert_is_compatible_with(tf.TensorShape([None, None, 1])) + if prev_label is not None: + label = tf.concat([label, prev_label], axis=2) + label = tf.cast(label, tf.int32) + + # Resize image and label to the desired range. 
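+ # (Worked example: resize_factor=32 keeps resized dimensions of the form
+ # 32 * k + 1, e.g. 513 or 1025, presumably to pair with the
+ # align_corners=True resizing below.)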
+ if any([min_resize_value, max_resize_value, not is_training]): + max_resize_value = _update_max_resize_value( + max_resize_value, + crop_size=(crop_height, crop_width), + is_inference=not is_training) + + processed_image, label = ( + preprocess_utils.resize_to_range( + image=processed_image, + label=label, + min_size=min_resize_value, + max_size=max_resize_value, + factor=resize_factor, + align_corners=True)) + if prev_image is None: + resized_image = tf.identity(processed_image) + else: + resized_image, _ = tf.split(processed_image, 2, axis=2) + + if prev_image is not None: + processed_image, processed_prev_image = tf.split(processed_image, 2, axis=2) + + if prev_label is not None: + label, prev_label = tf.split(label, 2, axis=2) + + if not is_training: + image_height = tf.shape(processed_image)[0] + image_width = tf.shape(processed_image)[1] + + offset_height = 0 + offset_width = 0 + processed_image, label = _pad_image_and_label(processed_image, label, + offset_height, offset_width, + crop_height, crop_width, + ignore_label) + processed_image.set_shape([crop_height, crop_width, 3]) + if label is not None: + label.set_shape([crop_height, crop_width, 1]) + if prev_image is not None: + processed_prev_image, prev_label = _pad_image_and_label( + processed_prev_image, prev_label, offset_height, offset_width, + crop_height, crop_width, ignore_label) + processed_prev_image.set_shape([crop_height, crop_width, 3]) + if prev_label is not None: + prev_label.set_shape([crop_height, crop_width, 1]) + return (resized_image, processed_image, label, processed_prev_image, + prev_label) + + # Data augmentation by randomly scaling the inputs. + scale = preprocess_utils.get_random_scale( + min_scale_factor, max_scale_factor, scale_factor_step_size) + processed_image, label = preprocess_utils.randomly_scale_image_and_label( + processed_image, label, scale) + if processed_prev_image is not None: + (processed_prev_image, + prev_label) = preprocess_utils.randomly_scale_image_and_label( + processed_prev_image, prev_label, scale) + + # Apply autoaugment if any. + if autoaugment_policy_name: + processed_image, label = _autoaugment_helper( + processed_image, label, ignore_label, autoaugment_policy_name) + if processed_prev_image is not None: + processed_prev_image, prev_label = _autoaugment_helper( + processed_prev_image, prev_label, ignore_label, + autoaugment_policy_name) + + # Pad image and label to have dimensions >= [crop_height, crop_width]. + image_height = tf.shape(processed_image)[0] + image_width = tf.shape(processed_image)[1] + target_height = image_height + tf.maximum(crop_height - image_height, 0) + target_width = image_width + tf.maximum(crop_width - image_width, 0) + + # Randomly crop the image and label. 
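+  # For intuition (hypothetical sizes): with a 65x65 crop, a scaled 60x40
+  # image is first padded up to 65x65, making the subsequent crop a no-op,
+  # while a 130x80 image is left unpadded and a random 65x65 window is cut
+  # out of it. The random offsets below only matter in the padding case.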
+ def _uniform_offset(margin): + return tf.random.uniform( + [], minval=0, maxval=tf.maximum(margin, 1), dtype=tf.int32) + + offset_height = _uniform_offset(crop_height - image_height) + offset_width = _uniform_offset(crop_width - image_width) + processed_image, label = _pad_image_and_label(processed_image, label, + offset_height, offset_width, + target_height, target_width, + ignore_label) + if processed_prev_image is not None: + processed_prev_image, prev_label = _pad_image_and_label( + processed_prev_image, prev_label, offset_height, offset_width, + target_height, target_width, ignore_label) + + if processed_prev_image is not None: + (processed_image, label, processed_prev_image, + prev_label) = preprocess_utils.random_crop( + [processed_image, label, processed_prev_image, prev_label], + crop_height, crop_width) + # Randomly left-right flip the image and label. + (processed_image, label, processed_prev_image, prev_label, + _) = preprocess_utils.flip_dim( + [processed_image, label, processed_prev_image, prev_label], + _PROB_OF_FLIP, + dim=1) + else: + processed_image, label = preprocess_utils.random_crop( + [processed_image, label], crop_height, crop_width) + # Randomly left-right flip the image and label. + processed_image, label, _ = preprocess_utils.flip_dim( + [processed_image, label], _PROB_OF_FLIP, dim=1) + + return resized_image, processed_image, label, processed_prev_image, prev_label + + +def _autoaugment_helper(image, label, ignore_label, policy_name): + image = tf.cast(image, tf.uint8) + label = tf.cast(label, tf.int32) + image, label = autoaugment_utils.distort_image_with_autoaugment( + image, label, ignore_label, policy_name) + image = tf.cast(image, tf.float32) + return image, label diff --git a/data/preprocessing/input_preprocessing_test.py b/data/preprocessing/input_preprocessing_test.py new file mode 100644 index 0000000000000000000000000000000000000000..26a31b87d711e74c48e50c02ff5076ae5917279a --- /dev/null +++ b/data/preprocessing/input_preprocessing_test.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for input_preprocessing.""" + +import numpy as np +import tensorflow as tf + +from deeplab2.data.preprocessing import input_preprocessing + + +class InputPreprocessingTest(tf.test.TestCase): + + def setUp(self): + super().setUp() + self._image = tf.convert_to_tensor(np.random.randint(256, size=[33, 33, 3])) + self._label = tf.convert_to_tensor(np.random.randint(19, size=[33, 33, 1])) + + def test_cropping(self): + crop_height = np.random.randint(33) + crop_width = np.random.randint(33) + + original_image, processed_image, processed_label, prev_image, prev_label = ( + input_preprocessing.preprocess_image_and_label( + image=self._image, + label=self._label, + prev_image=tf.identity(self._image), + prev_label=tf.identity(self._label), + crop_height=crop_height, + crop_width=crop_width, + ignore_label=255)) + + self.assertListEqual(original_image.shape.as_list(), + [33, 33, 3]) + self.assertListEqual(processed_image.shape.as_list(), + [crop_height, crop_width, 3]) + self.assertListEqual(processed_label.shape.as_list(), + [crop_height, crop_width, 1]) + np.testing.assert_equal(processed_image.numpy(), prev_image.numpy()) + np.testing.assert_equal(processed_label.numpy(), prev_label.numpy()) + + def test_resizing(self): + height, width = 65, 65 + + original_image, processed_image, processed_label, prev_image, prev_label = ( + input_preprocessing.preprocess_image_and_label( + image=self._image, + label=self._label, + prev_image=tf.identity(self._image), + prev_label=tf.identity(self._label), + crop_height=height, + crop_width=width, + min_resize_value=65, + max_resize_value=65, + resize_factor=32, + ignore_label=255)) + + self.assertListEqual(original_image.shape.as_list(), + [height, width, 3]) + self.assertListEqual(processed_image.shape.as_list(), + [height, width, 3]) + self.assertListEqual(processed_label.shape.as_list(), + [height, width, 1]) + np.testing.assert_equal(processed_image.numpy(), prev_image.numpy()) + np.testing.assert_equal(processed_label.numpy(), prev_label.numpy()) + + def test_scaling(self): + height, width = 65, 65 + + original_image, processed_image, processed_label, prev_image, prev_label = ( + input_preprocessing.preprocess_image_and_label( + image=self._image, + label=self._label, + prev_image=tf.identity(self._image), + prev_label=tf.identity(self._label), + crop_height=height, + crop_width=width, + min_scale_factor=0.5, + max_scale_factor=2.0, + ignore_label=255)) + + self.assertListEqual(original_image.shape.as_list(), + [33, 33, 3]) + self.assertListEqual(processed_image.shape.as_list(), + [height, width, 3]) + self.assertListEqual(processed_label.shape.as_list(), + [height, width, 1]) + np.testing.assert_equal(processed_image.numpy(), prev_image.numpy()) + np.testing.assert_equal(processed_label.numpy(), prev_label.numpy()) + + def test_return_padded_image_and_label(self): + image = np.dstack([[[5, 6], [9, 0]], [[4, 3], [3, 5]], [[7, 8], [1, 2]]]) + image = tf.convert_to_tensor(image, dtype=tf.float32) + label = np.array([[[1], [2]], [[3], [4]]]) + expected_image = np.dstack([[[127.5, 127.5, 127.5, 127.5, 127.5], + [127.5, 127.5, 127.5, 127.5, 127.5], + [127.5, 5, 6, 127.5, 127.5], + [127.5, 9, 0, 127.5, 127.5], + [127.5, 127.5, 127.5, 127.5, 127.5]], + [[127.5, 127.5, 127.5, 127.5, 127.5], + [127.5, 127.5, 127.5, 127.5, 127.5], + [127.5, 4, 3, 127.5, 127.5], + [127.5, 3, 5, 127.5, 127.5], + [127.5, 127.5, 127.5, 127.5, 127.5]], + [[127.5, 127.5, 127.5, 127.5, 127.5], + [127.5, 127.5, 127.5, 127.5, 127.5], + [127.5, 7, 8, 127.5, 127.5], + [127.5, 1, 
2, 127.5, 127.5],
+                                 [127.5, 127.5, 127.5, 127.5, 127.5]]])
+    expected_label = np.array([[[255], [255], [255], [255], [255]],
+                               [[255], [255], [255], [255], [255]],
+                               [[255], [1], [2], [255], [255]],
+                               [[255], [3], [4], [255], [255]],
+                               [[255], [255], [255], [255], [255]]])
+
+    padded_image, padded_label = input_preprocessing._pad_image_and_label(
+        image, label, 2, 1, 5, 5, 255)
+    np.testing.assert_allclose(padded_image.numpy(), expected_image)
+    np.testing.assert_allclose(padded_label.numpy(), expected_label)
+
+  def test_return_original_image_when_target_size_is_equal_to_image_size(self):
+    height, width, _ = tf.shape(self._image)
+    padded_image, _ = input_preprocessing._pad_image_and_label(
+        self._image, None, 0, 0, height, width)
+    np.testing.assert_allclose(padded_image.numpy(), self._image)
+
+  def test_die_on_target_size_greater_than_image_size(self):
+    height, width, _ = tf.shape(self._image)
+    with self.assertRaises(tf.errors.InvalidArgumentError):
+      _ = input_preprocessing._pad_image_and_label(self._image, None, 0, 0,
+                                                   height, width - 1)
+
+    with self.assertRaises(tf.errors.InvalidArgumentError):
+      _ = input_preprocessing._pad_image_and_label(self._image, None, 0, 0,
+                                                   height - 1, width)
+
+  def test_die_if_target_size_not_possible_with_given_offset(self):
+    height, width, _ = tf.shape(self._image)
+    with self.assertRaises(tf.errors.InvalidArgumentError):
+      _ = input_preprocessing._pad_image_and_label(self._image, None, 3, 3,
+                                                   height + 2, width + 2)
+
+  def test_set_min_resize_value_only_during_training(self):
+    crop_height = np.random.randint(33)
+    crop_width = np.random.randint(33)
+
+    _, processed_image, _, _, _ = (
+        input_preprocessing.preprocess_image_and_label(
+            image=self._image,
+            label=self._label,
+            crop_height=crop_height,
+            crop_width=crop_width,
+            min_resize_value=[10],
+            max_resize_value=None,
+            ignore_label=255))
+
+    self.assertListEqual(processed_image.shape.as_list(),
+                         [crop_height, crop_width, 3])
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/data/preprocessing/preprocess_utils.py b/data/preprocessing/preprocess_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3773001558eb4cb1ced3adaf7e73c47d3dc7f0d
--- /dev/null
+++ b/data/preprocessing/preprocess_utils.py
@@ -0,0 +1,516 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utility functions related to preprocessing inputs."""
+
+import numpy as np
+import tensorflow as tf
+
+
+def flip_dim(tensor_list, prob=0.5, dim=1):
+  """Randomly flips a dimension of the given tensor.
+
+  The decision to randomly flip the `Tensors` is made together. In other words,
+  all or none of the images passed in are flipped.
+
+  Note that tf.random_flip_left_right and tf.random_flip_up_down aren't used so
+  that we can control for the probability as well as ensure the same decision
+  is applied across the images.
+
+  Args:
+    tensor_list: A list of `Tensors` with the same number of dimensions.
+    prob: The probability of a left-right flip.
+    dim: The dimension to flip, 0, 1, ...
+
+  Returns:
+    outputs: A list of the possibly flipped `Tensors` as well as an indicator
+      `Tensor` at the end whose value is `True` if the inputs were flipped and
+      `False` otherwise.
+
+  Raises:
+    ValueError: If dim is negative or greater than the dimension of a `Tensor`.
+  """
+  random_value = tf.random.uniform([])
+
+  def flip():
+    flipped = []
+    for tensor in tensor_list:
+      if dim < 0 or dim >= len(tensor.get_shape().as_list()):
+        raise ValueError('dim must represent a valid dimension.')
+      flipped.append(tf.reverse(tensor, [dim]))
+    return flipped
+
+  is_flipped = tf.less_equal(random_value, prob)
+  outputs = tf.cond(is_flipped, flip, lambda: tensor_list)
+  if not isinstance(outputs, (list, tuple)):
+    outputs = [outputs]
+  outputs.append(is_flipped)
+
+  return outputs
+
+
+def get_label_resize_method(label):
+  """Returns the resize method of labels depending on label dtype.
+
+  Args:
+    label: Groundtruth label tensor.
+
+  Returns:
+    tf.image.ResizeMethod.BILINEAR, if label dtype is floating.
+    tf.image.ResizeMethod.NEAREST_NEIGHBOR, if label dtype is integer.
+
+  Raises:
+    ValueError: If label is neither floating nor integer.
+  """
+  if label.dtype.is_floating:
+    return tf.image.ResizeMethod.BILINEAR
+  elif label.dtype.is_integer:
+    return tf.image.ResizeMethod.NEAREST_NEIGHBOR
+  else:
+    raise ValueError('Label type must be either floating or integer.')
+
+
+def _crop(image, offset_height, offset_width, crop_height, crop_width):
+  """Crops the given image using the provided offsets and sizes.
+
+  Note that the method doesn't assume we know the input image size but it does
+  assume we know the input image rank.
+
+  Args:
+    image: an image of shape [height, width, channels].
+    offset_height: a scalar tensor indicating the height offset.
+    offset_width: a scalar tensor indicating the width offset.
+    crop_height: the height of the cropped image.
+    crop_width: the width of the cropped image.
+
+  Returns:
+    The cropped image.
+
+  Raises:
+    ValueError: if `image` doesn't have rank of 3.
+    InvalidArgumentError: if the rank is not 3 or if the image dimensions are
+      less than the crop size.
+  """
+  original_shape = tf.shape(image)
+
+  if len(image.get_shape().as_list()) != 3:
+    raise ValueError('input must have rank of 3')
+  original_channels = image.get_shape().as_list()[2]
+
+  rank_assertion = tf.Assert(
+      tf.equal(tf.rank(image), 3),
+      ['Rank of image must be equal to 3.'])
+  with tf.control_dependencies([rank_assertion]):
+    cropped_shape = tf.stack([crop_height, crop_width, original_shape[2]])
+
+  size_assertion = tf.Assert(
+      tf.logical_and(
+          tf.greater_equal(original_shape[0], crop_height),
+          tf.greater_equal(original_shape[1], crop_width)),
+      ['Crop size greater than the image size.'])
+
+  offsets = tf.cast(tf.stack([offset_height, offset_width, 0]), tf.int32)
+
+  # Use tf.slice instead of crop_to_bounding_box as it accepts tensors to
+  # define the crop size.
+  with tf.control_dependencies([size_assertion]):
+    image = tf.slice(image, offsets, cropped_shape)
+  image = tf.reshape(image, cropped_shape)
+  image.set_shape([crop_height, crop_width, original_channels])
+  return image
+
+
+def random_crop(image_list, crop_height, crop_width):
+  """Crops the given list of images.
+
+  The function applies the same crop to each image in the list. This can be
+  effectively applied when there are multiple image inputs of the same
+  dimension such as:
+
+    image, depths, normals = random_crop([image, depths, normals], 120, 150)
+
+  Args:
+    image_list: a list of image tensors of the same dimension but possibly
+      varying channels.
+    crop_height: the new height.
+    crop_width: the new width.
+
+  Returns:
+    the image_list with cropped images.
+
+  Raises:
+    ValueError: if there are multiple image inputs provided with different
+      sizes or the images are smaller than the crop dimensions.
+  """
+  if not image_list:
+    raise ValueError('Empty image_list.')
+
+  # Compute the rank assertions.
+  rank_assertions = []
+  for i in range(len(image_list)):
+    image_rank = tf.rank(image_list[i])
+    rank_assert = tf.Assert(
+        tf.equal(image_rank, 3), [
+            'Wrong rank for tensor %d in image_list [expected] [actual]', i, 3,
+            image_rank
+        ])
+    rank_assertions.append(rank_assert)
+
+  with tf.control_dependencies([rank_assertions[0]]):
+    image_shape = tf.shape(image_list[0])
+  image_height = image_shape[0]
+  image_width = image_shape[1]
+  crop_size_assert = tf.Assert(
+      tf.logical_and(
+          tf.greater_equal(image_height, crop_height),
+          tf.greater_equal(image_width, crop_width)),
+      ['Crop size greater than the image size.'])
+
+  asserts = [rank_assertions[0], crop_size_assert]
+
+  for i in range(1, len(image_list)):
+    image = image_list[i]
+    asserts.append(rank_assertions[i])
+    with tf.control_dependencies([rank_assertions[i]]):
+      shape = tf.shape(image)
+    height = shape[0]
+    width = shape[1]
+
+    height_assert = tf.Assert(
+        tf.equal(height, image_height), [
+            'Wrong height for tensor %d in image_list [expected][actual]', i,
+            height, image_height
+        ])
+    width_assert = tf.Assert(
+        tf.equal(width, image_width), [
+            'Wrong width for tensor %d in image_list [expected][actual]', i,
+            width, image_width
+        ])
+    asserts.extend([height_assert, width_assert])
+
+  # Create a random bounding box.
+  #
+  # Use tf.random.uniform and not numpy.random.rand as doing the former would
+  # generate random numbers at graph eval time, unlike the latter which
+  # generates random numbers at graph definition time.
+  with tf.control_dependencies(asserts):
+    max_offset_height = tf.reshape(image_height - crop_height + 1, [])
+    max_offset_width = tf.reshape(image_width - crop_width + 1, [])
+  offset_height = tf.random.uniform(
+      [], maxval=max_offset_height, dtype=tf.int32)
+  offset_width = tf.random.uniform(
+      [], maxval=max_offset_width, dtype=tf.int32)
+
+  return [_crop(image, offset_height, offset_width,
+                crop_height, crop_width) for image in image_list]
+
+
+def get_random_scale(min_scale_factor, max_scale_factor, step_size):
+  """Gets a random scale value.
+
+  Args:
+    min_scale_factor: Minimum scale value.
+    max_scale_factor: Maximum scale value.
+    step_size: The step size from minimum to maximum value.
+
+  Returns:
+    A tensor with a random scale value selected between minimum and maximum
+    value. If `min_scale_factor` and `max_scale_factor` are the same, a number
+    is returned instead.
+
+  Raises:
+    ValueError: min_scale_factor has unexpected value.
+  """
+  if min_scale_factor < 0 or min_scale_factor > max_scale_factor:
+    raise ValueError('Unexpected value of min_scale_factor.')
+
+  if min_scale_factor == max_scale_factor:
+    return np.float32(min_scale_factor)
+
+  # When step_size = 0, we sample the value uniformly from [min, max).
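+  # For example (illustrative values): get_random_scale(0.5, 2.0, 0.) may
+  # return any float in [0.5, 2.0), whereas get_random_scale(0.5, 2.0, 0.25)
+  # picks one of the 7 discrete values {0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0}.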
+  if step_size == 0:
+    return tf.random.uniform([1],
+                             minval=min_scale_factor,
+                             maxval=max_scale_factor)
+
+  # When step_size != 0, we randomly select one discrete value from [min, max].
+  num_steps = int((max_scale_factor - min_scale_factor) / step_size + 1)
+  scale_factors = tf.linspace(min_scale_factor, max_scale_factor, num_steps)
+  shuffled_scale_factors = tf.random.shuffle(scale_factors)
+  return shuffled_scale_factors[0]
+
+
+def randomly_scale_image_and_label(image, label=None, scale=1.0):
+  """Randomly scales image and label.
+
+  Args:
+    image: Image with shape [height, width, 3].
+    label: Label with shape [height, width, 1].
+    scale: The value to scale image and label.
+
+  Returns:
+    Scaled image and label.
+  """
+  # No random scaling if scale == 1.
+  if scale == 1.0:
+    return image, label
+  image_shape = tf.shape(image)
+  new_dim = tf.cast(
+      tf.cast([image_shape[0], image_shape[1]], tf.float32) * scale,
+      tf.int32)
+
+  # Need squeeze and expand_dims because image interpolation takes
+  # 4D tensors as input.
+  image = tf.squeeze(
+      tf.compat.v1.image.resize_bilinear(
+          tf.expand_dims(image, 0), new_dim, align_corners=True), [0])
+  if label is not None:
+    label = tf.compat.v1.image.resize(
+        label,
+        new_dim,
+        method=get_label_resize_method(label),
+        align_corners=True)
+
+  return image, label
+
+
+def resolve_shape(tensor, rank=None):
+  """Fully resolves the shape of a Tensor.
+
+  Uses the shape components already known during graph creation as much as
+  possible, and resolves the remaining ones at runtime.
+
+  Args:
+    tensor: Input tensor whose shape we query.
+    rank: The rank of the tensor, provided that we know it.
+
+  Returns:
+    shape: The full shape of the tensor.
+  """
+  if rank is not None:
+    shape = tensor.get_shape().with_rank(rank).as_list()
+  else:
+    shape = tensor.get_shape().as_list()
+
+  if None in shape:
+    dynamic_shape = tf.shape(tensor)
+    for i in range(len(shape)):
+      if shape[i] is None:
+        shape[i] = dynamic_shape[i]
+
+  return shape
+
+
+def _scale_dim(original_size, factor):
+  """Helper method to scale one input dimension by the given factor."""
+  original_size = tf.cast(original_size, tf.float32)
+  factor = tf.cast(factor, tf.float32)
+  return tf.cast(tf.floor(original_size * factor), tf.int32)
+
+
+def process_resize_value(resize_spec):
+  """Helper method to process input resize spec.
+
+  Args:
+    resize_spec: Either None, a python scalar, or a sequence with length <=2.
+      Each value in the sequence should be a python integer.
+
+  Returns:
+    None if input size is not valid, or 2-tuple of (height, width), derived
+    from input resize_spec.
+
+  Raises:
+    ValueError: If the input resize_spec has more than two elements.
+  """
+  if not resize_spec:
+    return None
+
+  if isinstance(resize_spec, int):
+    # For convenience and also backward compatibility.
+    resize_spec = (resize_spec,)
+
+  resize_spec = tuple(resize_spec)
+
+  if len(resize_spec) == 1:
+    resize_spec = (resize_spec[0], resize_spec[0])
+
+  if len(resize_spec) != 2:
+    raise ValueError('Unable to process input resize_spec: %s' % resize_spec)
+
+  if resize_spec[0] <= 0 or resize_spec[1] <= 0:
+    return None
+
+  return resize_spec
+
+
+def _resize_to_match_min_size(input_shape, min_size):
+  """Returns the resized shape so that both sides match minimum size.
+
+  Note: the input image will still be scaled if input height and width
+  are already greater than minimum size.
+
+  Args:
+    input_shape: A 2-tuple, (height, width) of the input image. Each value can
+      be either a python integer or an integer scalar tensor.
+    min_size: A tuple of (minimum height, minimum width) to specify the
+      minimum shape after resize. The input shape would be scaled so that both
+      height and width will be greater than or equal to their minimum value.
+
+  Returns:
+    A 2-tuple, (height, width), resized input shape which preserves input
+      aspect ratio.
+  """
+  input_height, input_width = input_shape
+  min_height, min_width = min_size
+
+  scale_factor = tf.maximum(min_height / input_height, min_width / input_width)
+  return (_scale_dim(input_height, scale_factor),
+          _scale_dim(input_width, scale_factor))
+
+
+def _resize_to_fit_max_size(input_shape, max_size):
+  """Returns the resized shape so that both sides fit within max size.
+
+  Note: if the input shape is already smaller than or equal to the maximum
+  size, no resize operation would be performed.
+
+  Args:
+    input_shape: A 2-tuple, (height, width) of the input image. Each value can
+      be either a python integer or an integer scalar tensor.
+    max_size: A tuple of (maximum height, maximum width) to specify
+      the maximum allowed shape after resize.
+
+  Returns:
+    A 2-tuple, (height, width), resized input shape which preserves input
+      aspect ratio.
+  """
+  input_height, input_width = input_shape
+  max_height, max_width = max_size
+  scale_factor = tf.minimum(max_height / input_height, max_width / input_width)
+
+  scale_factor = tf.minimum(tf.cast(scale_factor, tf.float32),
+                            tf.cast(1.0, tf.float32))
+  return (_scale_dim(input_height, scale_factor),
+          _scale_dim(input_width, scale_factor))
+
+
+def resize_to_range_helper(input_shape, min_size, max_size=None, factor=None):
+  """Determines output size in specified range.
+
+  The output size (height and/or width) can be described by two cases:
+  1. If current side can be rescaled so its minimum size is equal to min_size
+     without the other side exceeding its max_size, then do so.
+  2. Otherwise, resize so at least one side is reaching its max_size.
+
+  An integer in `range(factor)` is added to the computed sides so that the
+  final dimensions are multiples of `factor` plus one.
+
+  Args:
+    input_shape: A 2-tuple, (height, width) of the input image. Each value can
+      be either a python integer or an integer scalar tensor.
+    min_size: A 2-tuple of (height, width), desired minimum value after resize.
+      If a single element is given, then height and width share the same
+      min_size. None, empty or having 0 indicates no minimum value will be
+      used.
+    max_size: A 2-tuple of (height, width), maximum allowed value after resize.
+      If a single element is given, then height and width share the same
+      max_size. None, empty or having 0 indicates no maximum value will be
+      used. Note that the output dimension is no larger than max_size and may
+      be slightly smaller than max_size when factor is not None.
+    factor: None or integer, make output size multiple of factor plus one.
+
+  Returns:
+    A 1-D tensor containing the [new_height, new_width].
+  """
+  output_shape = input_shape
+
+  min_size = process_resize_value(min_size)
+  if min_size:
+    output_shape = _resize_to_match_min_size(input_shape, min_size)
+
+  max_size = process_resize_value(max_size)
+  if max_size:
+    if factor:
+      # Update max_size to be a multiple of factor plus 1 and make sure the
+      # max dimension after resizing is no larger than max_size.
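+      # For example (illustrative values): with factor=8, max_size=(98, 98)
+      # is tightened to (97, 97), since 97 is the largest value <= 98 that is
+      # a multiple of 8 plus one (97 = 8 * 12 + 1).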
+ max_size = (max_size[0] - (max_size[0] - 1) % factor, + max_size[1] - (max_size[1] - 1) % factor) + + output_shape = _resize_to_fit_max_size(output_shape, max_size) + + output_shape = tf.stack(output_shape) + # Ensure that both output sides are multiples of factor plus one. + if factor: + output_shape += (factor - (output_shape - 1) % factor) % factor + + return output_shape + + +def resize_to_range(image, + label=None, + min_size=None, + max_size=None, + factor=None, + align_corners=True, + method=tf.image.ResizeMethod.BILINEAR): + """Resizes image or label so their sides are within the provided range. + + The output size (height and/or width) can be described by two cases: + 1. If current side can be rescaled so its minimum size is equal to min_size + without the other side exceeding its max_size, then do so. + 2. Otherwise, resize so at least one side is reaching its max_size. + + An integer in `range(factor)` is added to the computed sides so that the + final dimensions are multiples of `factor` plus one. + + Args: + image: A 3D tensor of shape [height, width, channels]. + label: (optional) A 3D tensor of shape [height, width, channels]. + min_size: A 2-tuple of (height, width), desired minimum value after resize. + If a single element is given, then height and width share the same + min_size. None, empty or having 0 indicates no minimum value will be used. + max_size: A 2-tuple of (height, width), maximum allowed value after resize. + If a single element is given, then height and width share the same + max_size. None, empty or having 0 indicates no maximum value will be used. + Note that the output dimension is no larger than max_size and may be + slightly smaller than max_size when factor is not None. + factor: Make output size multiple of factor plus one. + align_corners: If True, exactly align all 4 corners of input and output. + method: Image resize method. Defaults to tf.image.ResizeMethod.BILINEAR. + + Returns: + resized_image: A 3-D tensor of shape [new_height, new_width, channels], + where the image has been resized with the specified method. + resized_label: Either None (if input label is None) or a 3-D tensor, + where the input label has been resized accordingly. + + Raises: + ValueError: If the image is not a 3D tensor. + """ + orig_height, orig_width, _ = resolve_shape(image, rank=3) + new_size = resize_to_range_helper(input_shape=(orig_height, orig_width), + min_size=min_size, + max_size=max_size, + factor=factor) + + resized_image = tf.compat.v1.image.resize( + image, new_size, method=method, align_corners=align_corners) + + if label is None: + return resized_image, None + + resized_label = tf.compat.v1.image.resize( + label, + new_size, + method=get_label_resize_method(label), + align_corners=align_corners) + + return resized_image, resized_label diff --git a/data/preprocessing/preprocess_utils_test.py b/data/preprocessing/preprocess_utils_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7bb0aa5a70c550148ad9c9cc4d69e225d4522dbc --- /dev/null +++ b/data/preprocessing/preprocess_utils_test.py @@ -0,0 +1,349 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for preprocess_utils.""" +import numpy as np +import tensorflow as tf + +from deeplab2.data.preprocessing import preprocess_utils + + +class PreprocessUtilsTest(tf.test.TestCase): + + def testNoFlipWhenProbIsZero(self): + numpy_image = np.dstack([[[5., 6.], + [9., 0.]], + [[4., 3.], + [3., 5.]]]) + image = tf.convert_to_tensor(numpy_image) + + actual, is_flipped = preprocess_utils.flip_dim([image], prob=0, dim=0) + self.assertAllEqual(numpy_image, actual) + self.assertFalse(is_flipped) + actual, is_flipped = preprocess_utils.flip_dim([image], prob=0, dim=1) + self.assertAllEqual(numpy_image, actual) + self.assertFalse(is_flipped) + actual, is_flipped = preprocess_utils.flip_dim([image], prob=0, dim=2) + self.assertAllEqual(numpy_image, actual) + self.assertFalse(is_flipped) + + def testFlipWhenProbIsOne(self): + numpy_image = np.dstack([[[5., 6.], + [9., 0.]], + [[4., 3.], + [3., 5.]]]) + dim0_flipped = np.dstack([[[9., 0.], + [5., 6.]], + [[3., 5.], + [4., 3.]]]) + dim1_flipped = np.dstack([[[6., 5.], + [0., 9.]], + [[3., 4.], + [5., 3.]]]) + dim2_flipped = np.dstack([[[4., 3.], + [3., 5.]], + [[5., 6.], + [9., 0.]]]) + image = tf.convert_to_tensor(numpy_image) + + actual, is_flipped = preprocess_utils.flip_dim([image], prob=1, dim=0) + self.assertAllEqual(dim0_flipped, actual) + self.assertTrue(is_flipped) + actual, is_flipped = preprocess_utils.flip_dim([image], prob=1, dim=1) + self.assertAllEqual(dim1_flipped, actual) + self.assertTrue(is_flipped) + actual, is_flipped = preprocess_utils.flip_dim([image], prob=1, dim=2) + self.assertAllEqual(dim2_flipped, actual) + self.assertTrue(is_flipped) + + def testFlipMultipleImagesConsistentlyWhenProbIsOne(self): + numpy_image = np.dstack([[[5., 6.], + [9., 0.]], + [[4., 3.], + [3., 5.]]]) + numpy_label = np.dstack([[[0., 1.], + [2., 3.]]]) + image_dim1_flipped = np.dstack([[[6., 5.], + [0., 9.]], + [[3., 4.], + [5., 3.]]]) + label_dim1_flipped = np.dstack([[[1., 0.], + [3., 2.]]]) + image = tf.convert_to_tensor(numpy_image) + label = tf.convert_to_tensor(numpy_label) + + image, label, is_flipped = preprocess_utils.flip_dim( + [image, label], prob=1, dim=1) + self.assertAllEqual(image_dim1_flipped, image) + self.assertAllEqual(label_dim1_flipped, label) + self.assertTrue(is_flipped) + + def testReturnRandomFlipsOnMultipleEvals(self): + numpy_image = np.dstack([[[5., 6.], + [9., 0.]], + [[4., 3.], + [3., 5.]]]) + dim1_flipped = np.dstack([[[6., 5.], + [0., 9.]], + [[3., 4.], + [5., 3.]]]) + image = tf.convert_to_tensor(numpy_image) + original_image, not_flipped = preprocess_utils.flip_dim( + [image], prob=0, dim=1) + flip_image, is_flipped = preprocess_utils.flip_dim( + [image], prob=1.0, dim=1) + self.assertAllEqual(numpy_image, original_image) + self.assertFalse(not_flipped) + self.assertAllEqual(dim1_flipped, flip_image) + self.assertTrue(is_flipped) + + def testReturnCorrectCropOfSingleImage(self): + np.random.seed(0) + + height, width = 10, 20 + image = np.random.randint(0, 256, size=(height, width, 3)) + + crop_height, crop_width = 2, 4 + + [cropped] = preprocess_utils.random_crop([tf.convert_to_tensor(image)], + 
crop_height,
+                                             crop_width)
+
+    # Ensure we can find the cropped image in the original:
+    is_found = False
+    for x in range(0, width - crop_width + 1):
+      for y in range(0, height - crop_height + 1):
+        if np.isclose(image[y:y+crop_height, x:x+crop_width, :],
+                      cropped).all():
+          is_found = True
+          break
+
+    self.assertTrue(is_found)
+
+  def testRandomCropMaintainsNumberOfChannels(self):
+    np.random.seed(0)
+
+    crop_height, crop_width = 10, 20
+    image = np.random.randint(0, 256, size=(100, 200, 3))
+
+    tf.random.set_seed(37)
+    [cropped] = preprocess_utils.random_crop(
+        [tf.convert_to_tensor(image)], crop_height, crop_width)
+
+    self.assertListEqual(cropped.shape.as_list(), [crop_height, crop_width, 3])
+
+  def testReturnDifferentCropAreasOnTwoEvals(self):
+    tf.random.set_seed(0)
+
+    crop_height, crop_width = 2, 3
+    image = np.random.randint(0, 256, size=(100, 200, 3))
+    [cropped0] = preprocess_utils.random_crop(
+        [tf.convert_to_tensor(image)], crop_height, crop_width)
+    [cropped1] = preprocess_utils.random_crop(
+        [tf.convert_to_tensor(image)], crop_height, crop_width)
+
+    self.assertFalse(np.isclose(cropped0.numpy(), cropped1.numpy()).all())
+
+  def testReturnConsistentCropsOfImagesInTheList(self):
+    tf.random.set_seed(0)
+
+    height, width = 10, 20
+    crop_height, crop_width = 2, 3
+    labels = np.linspace(0, height * width - 1, height * width)
+    labels = labels.reshape((height, width, 1))
+    image = np.tile(labels, (1, 1, 3))
+
+    [cropped_image, cropped_label] = preprocess_utils.random_crop(
+        [tf.convert_to_tensor(image), tf.convert_to_tensor(labels)],
+        crop_height, crop_width)
+
+    for i in range(3):
+      self.assertAllEqual(cropped_image[:, :, i], tf.squeeze(cropped_label))
+
+  def testDieOnRandomCropWhenImagesWithDifferentWidth(self):
+    crop_height, crop_width = 2, 3
+    image1 = tf.convert_to_tensor(np.random.rand(4, 5, 3))
+    image2 = tf.convert_to_tensor(np.random.rand(4, 6, 1))
+
+    with self.assertRaises(tf.errors.InvalidArgumentError):
+      _ = preprocess_utils.random_crop([image1, image2], crop_height,
+                                       crop_width)
+
+  def testDieOnRandomCropWhenImagesWithDifferentHeight(self):
+    crop_height, crop_width = 2, 3
+    image1 = tf.convert_to_tensor(np.random.rand(4, 5, 3))
+    image2 = tf.convert_to_tensor(np.random.rand(5, 5, 1))
+
+    with self.assertRaises(tf.errors.InvalidArgumentError):
+      _ = preprocess_utils.random_crop([image1, image2], crop_height,
+                                       crop_width)
+
+  def testDieOnRandomCropWhenCropSizeIsGreaterThanImage(self):
+    crop_height, crop_width = 5, 9
+    image1 = tf.convert_to_tensor(np.random.rand(4, 5, 3))
+    image2 = tf.convert_to_tensor(np.random.rand(4, 5, 1))
+
+    with self.assertRaises(tf.errors.InvalidArgumentError):
+      _ = preprocess_utils.random_crop([image1, image2], crop_height,
+                                       crop_width)
+
+  def testRandomScaleFitsInRange(self):
+    scale_value = preprocess_utils.get_random_scale(1., 2., 0.)
+    self.assertGreaterEqual(scale_value, 1.)
+    self.assertLessEqual(scale_value, 2.)
+
+  def testDeterminedRandomScaleReturnsNumber(self):
+    scale = preprocess_utils.get_random_scale(1., 1., 0.)
+    self.assertEqual(scale, 1.)
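+
+  def testRandomScaleWithStepSizePicksDiscreteValue(self):
+    # A minimal illustrative sketch: with step_size=0.25, the returned scale
+    # must be one of the 7 evenly spaced values of tf.linspace(0.5, 2.0, 7).
+    scale = preprocess_utils.get_random_scale(0.5, 2., 0.25)
+    self.assertIn(float(scale), [0.5, 0.75, 1., 1.25, 1.5, 1.75, 2.])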
+
+  def testResizeTensorsToRange(self):
+    test_shapes = [[60, 40],
+                   [15, 30],
+                   [15, 50]]
+    min_size = 50
+    max_size = 100
+    factor = None
+    expected_shape_list = [(75, 50, 3),
+                           (50, 100, 3),
+                           (30, 100, 3)]
+    for i, test_shape in enumerate(test_shapes):
+      image = tf.random.normal([test_shape[0], test_shape[1], 3])
+      new_tensor_list = preprocess_utils.resize_to_range(
+          image=image,
+          label=None,
+          min_size=min_size,
+          max_size=max_size,
+          factor=factor,
+          align_corners=True)
+      self.assertEqual(new_tensor_list[0].shape, expected_shape_list[i])
+
+  def testResizeTensorsToRangeWithFactor(self):
+    test_shapes = [[60, 40],
+                   [15, 30],
+                   [15, 50]]
+    min_size = 50
+    max_size = 98
+    factor = 8
+    expected_image_shape_list = [(81, 57, 3),
+                                 (49, 97, 3),
+                                 (33, 97, 3)]
+    expected_label_shape_list = [(81, 57, 1),
+                                 (49, 97, 1),
+                                 (33, 97, 1)]
+    for i, test_shape in enumerate(test_shapes):
+      image = tf.random.normal([test_shape[0], test_shape[1], 3])
+      label = tf.random.normal([test_shape[0], test_shape[1], 1])
+      new_tensor_list = preprocess_utils.resize_to_range(
+          image=image,
+          label=label,
+          min_size=min_size,
+          max_size=max_size,
+          factor=factor,
+          align_corners=True)
+      self.assertEqual(new_tensor_list[0].shape, expected_image_shape_list[i])
+      self.assertEqual(new_tensor_list[1].shape, expected_label_shape_list[i])
+
+  def testResizeTensorsToRangeWithSimilarMinMaxSizes(self):
+    test_shapes = [[60, 40],
+                   [15, 30],
+                   [15, 50]]
+    # Values set so that one of the sides = 97.
+    min_size = 96
+    max_size = 98
+    factor = 8
+    expected_image_shape_list = [(97, 65, 3),
+                                 (49, 97, 3),
+                                 (33, 97, 3)]
+    expected_label_shape_list = [(97, 65, 1),
+                                 (49, 97, 1),
+                                 (33, 97, 1)]
+    for i, test_shape in enumerate(test_shapes):
+      image = tf.random.normal([test_shape[0], test_shape[1], 3])
+      label = tf.random.normal([test_shape[0], test_shape[1], 1])
+      new_tensor_list = preprocess_utils.resize_to_range(
+          image=image,
+          label=label,
+          min_size=min_size,
+          max_size=max_size,
+          factor=factor,
+          align_corners=True)
+      self.assertEqual(new_tensor_list[0].shape, expected_image_shape_list[i])
+      self.assertEqual(new_tensor_list[1].shape, expected_label_shape_list[i])
+
+  def testResizeTensorsToRangeWithEqualMaxSize(self):
+    test_shapes = [[97, 38],
+                   [96, 97]]
+    # Make max_size equal to the larger value of test_shapes.
+    min_size = 97
+    max_size = 97
+    factor = 8
+    expected_image_shape_list = [(97, 41, 3),
+                                 (97, 97, 3)]
+    expected_label_shape_list = [(97, 41, 1),
+                                 (97, 97, 1)]
+    for i, test_shape in enumerate(test_shapes):
+      image = tf.random.normal([test_shape[0], test_shape[1], 3])
+      label = tf.random.normal([test_shape[0], test_shape[1], 1])
+      new_tensor_list = preprocess_utils.resize_to_range(
+          image=image,
+          label=label,
+          min_size=min_size,
+          max_size=max_size,
+          factor=factor,
+          align_corners=True)
+      self.assertEqual(new_tensor_list[0].shape, expected_image_shape_list[i])
+      self.assertEqual(new_tensor_list[1].shape, expected_label_shape_list[i])
+
+  def testResizeTensorsToRangeWithPotentialErrorInTFCeil(self):
+    test_shape = [3936, 5248]
+    # Values chosen so that the rescaled width should land exactly on max_size.
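+    # Presumably (per the test name) the risk is that, in floating point,
+    # 5248 * (1441 / 5248) can land just below 1441 before flooring; the
+    # factor correction in resize_to_range_helper, which adds
+    # (16 - (size - 1) % 16) % 16, still recovers a width of exactly 1441.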
+    min_size = 1441
+    max_size = 1441
+    factor = 16
+    expected_image_shape = (1089, 1441, 3)
+    expected_label_shape = (1089, 1441, 1)
+    image = tf.random.normal([test_shape[0], test_shape[1], 3])
+    label = tf.random.normal([test_shape[0], test_shape[1], 1])
+    new_tensor_list = preprocess_utils.resize_to_range(
+        image=image,
+        label=label,
+        min_size=min_size,
+        max_size=max_size,
+        factor=factor,
+        align_corners=True)
+    self.assertEqual(new_tensor_list[0].shape, expected_image_shape)
+    self.assertEqual(new_tensor_list[1].shape, expected_label_shape)
+
+  def testResizeTensorWithOnlyMaxSize(self):
+    test_shapes = [[97, 38],
+                   [96, 18]]
+
+    max_size = (97, 28)
+    # Since the second test shape already fits max size, do nothing.
+    expected_image_shape_list = [(71, 28, 3),
+                                 (96, 18, 3)]
+    for i, test_shape in enumerate(test_shapes):
+      image = tf.random.normal([test_shape[0], test_shape[1], 3])
+      new_tensor_list = preprocess_utils.resize_to_range(
+          image=image,
+          label=None,
+          min_size=None,
+          max_size=max_size,
+          align_corners=True)
+      self.assertEqual(new_tensor_list[0].shape, expected_image_shape_list[i])
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/data/sample_generator.py b/data/sample_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc08f6f69057c8da060060596b0b06ccac67a4c6
--- /dev/null
+++ b/data/sample_generator.py
@@ -0,0 +1,651 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains code to get a sample from a dataset."""
+
+import functools
+
+import numpy as np
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2.data import dataset_utils
+from deeplab2.data.preprocessing import input_preprocessing as preprocessing
+
+
+def _compute_gaussian_from_std(sigma):
+  """Computes the Gaussian and its size from a given standard deviation."""
+  size = int(6 * sigma + 3)
+  x = np.arange(size, dtype=np.float64)
+  y = x[:, np.newaxis]
+  x0, y0 = 3 * sigma + 1, 3 * sigma + 1
+  return np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)), size
+
+
+class PanopticSampleGenerator:
+  """This class generates samples from images and labels."""
+
+  def __init__(self,
+               dataset_info,
+               is_training,
+               crop_size,
+               min_resize_value=None,
+               max_resize_value=None,
+               resize_factor=None,
+               min_scale_factor=1.,
+               max_scale_factor=1.,
+               scale_factor_step_size=0,
+               autoaugment_policy_name=None,
+               only_semantic_annotations=False,
+               thing_id_mask_annotations=False,
+               max_thing_id=128,
+               sigma=8,
+               focus_small_instances=None):
+    """Initializes the panoptic segmentation generator.
+
+    Args:
+      dataset_info: A dictionary with the following keys.
+        - `name`: String, dataset name.
+        - `ignore_label`: Integer, ignore label.
+        - `class_has_instances_list`: A list of integers indicating which
+          class has instance annotations.
+        - `panoptic_label_divisor`: Integer, panoptic label divisor.
+        - `num_classes`: Integer, number of classes.
+        - `is_video_dataset`: Boolean, is video dataset or not.
+      is_training: Boolean, is training mode or not.
+      crop_size: Image crop size [height, width].
+      min_resize_value: A 2-tuple of (height, width), desired minimum value
+        after resize. If a single element is given, then height and width share
+        the same value. None, empty or having 0 indicates no minimum value will
+        be used.
+      max_resize_value: A 2-tuple of (height, width), maximum allowed value
+        after resize. If a single element is given, then height and width
+        share the same value. None, empty or having 0 indicates no maximum
+        value will be used.
+      resize_factor: Resized dimensions are multiple of factor plus one.
+      min_scale_factor: Minimum scale factor for random scale augmentation.
+      max_scale_factor: Maximum scale factor for random scale augmentation.
+      scale_factor_step_size: The step size from min scale factor to max scale
+        factor. The input is randomly scaled based on the value of
+        (min_scale_factor, max_scale_factor, scale_factor_step_size).
+      autoaugment_policy_name: String, autoaugment policy name. See
+        autoaugment_policy.py for available policies.
+      only_semantic_annotations: An optional flag indicating whether the model
+        needs only semantic annotations (default: False).
+      thing_id_mask_annotations: An optional flag indicating whether the model
+        needs thing_id_mask annotations. When `thing_id_mask_annotations` is
+        True, we will additionally return mask annotation for each `thing`
+        instance, encoded with a unique thing_id. This ground-truth annotation
+        could be used to learn a better segmentation mask for each instance.
+        `thing_id` is a unique ID assigned to each thing instance in an image,
+        counting from 0 (default: False).
+      max_thing_id: The maximum number of possible thing instances per image.
+        It is used together with thing_id_mask_annotations = True, representing
+        the maximum thing ID encoded in the thing_id_mask (default: 128).
+      sigma: The standard deviation of the Gaussian used to encode the center
+        keypoint (default: 8).
+      focus_small_instances: An optional dict that defines how to deal with
+        small instances (default: None):
+        - `threshold`: An integer defining the threshold pixel number for an
+          instance to be considered small.
+        - `weight`: A number that defines the loss weight for small instances.
+ """ + self._dataset_info = dataset_info + self._ignore_label = self._dataset_info['ignore_label'] + self._only_semantic_annotations = only_semantic_annotations + self._sigma = sigma + self._instance_area_threshold = 0 + self._small_instance_weight = 1.0 + self._thing_id_mask_annotations = thing_id_mask_annotations + self._max_thing_id = max_thing_id + self._is_training = is_training + self._preprocessing_fn = functools.partial( + preprocessing.preprocess_image_and_label, + crop_height=crop_size[0], + crop_width=crop_size[1], + min_resize_value=min_resize_value, + max_resize_value=max_resize_value, + resize_factor=resize_factor, + min_scale_factor=min_scale_factor, + max_scale_factor=max_scale_factor, + scale_factor_step_size=scale_factor_step_size, + autoaugment_policy_name=autoaugment_policy_name, + ignore_label=self._ignore_label * + self._dataset_info['panoptic_label_divisor'], + is_training=self._is_training) + + if focus_small_instances is not None: + self._instance_area_threshold = focus_small_instances['threshold'] + self._small_instance_weight = focus_small_instances['weight'] + + self._gaussian, self._gaussian_size = _compute_gaussian_from_std( + self._sigma) + self._gaussian = tf.cast(tf.reshape(self._gaussian, [-1]), tf.float32) + + def __call__(self, sample_dict): + """Gets a sample. + + Args: + sample_dict: A dictionary with the following keys and values: + - `image`: A tensor of shape [image_height, image_width, 3]. + - `image_name`: String, image name. + - `label`: A tensor of shape [label_height, label_width, 1] or None. + - `height`: An integer specifying the height of the image. + - `width`: An integer specifying the width of the image. + - `sequence`: An optional string specifying the sequence name. + - `prev_image`: An optional tensor of the same shape as `image`. + - `prev_label`: An optional tensor of the same shape as `label`. + - `next_image`: An optional next-frame tensor of the shape of `image`. + - `next_label`: An optional next-frame tensor of the shape of `label`. + + Returns: + sample: A dictionary storing required data for panoptic segmentation. + """ + return self.call(**sample_dict) + + def call(self, + image, + image_name, + label, + height, + width, + sequence='', + prev_image=None, + prev_label=None, + next_image=None, + next_label=None): + """Gets a sample. + + Args: + image: A tensor of shape [image_height, image_width, 3]. + image_name: String, image name. + label: A tensor of shape [label_height, label_width, 1] or None. + height: An integer specifying the height of the image. + width: An integer specifying the width of the image. + sequence: An optional string specifying the sequence name. + prev_image: An optional tensor of shape [image_height, image_width, 3]. + prev_label: An optional tensor of shape [label_height, label_width, 1]. + next_image: An optional tensor of shape [image_height, image_width, 3]. + next_label: An optional tensor of shape [label_height, label_width, 1]. + + Returns: + sample: A dictionary storing required data for panoptic segmentation. + + Raises: + ValueError: An error occurs when the label shape is invalid. + NotImplementedError: An error occurs when thing_id_mask_annotations comes + together with prev_image or prev_label, not currently implemented. 
+ """ + if label is not None: + label.get_shape().assert_is_compatible_with( + tf.TensorShape([None, None, 1])) + original_label = tf.cast(label, dtype=tf.int32, name='original_label') + if next_label is not None: + original_next_label = tf.cast( + next_label, dtype=tf.int32, name='original_next_label') + # Reusing the preprocessing function for both next and prev samples. + if next_image is not None: + resized_image, image, label, next_image, next_label = ( + self._preprocessing_fn( + image, label, prev_image=next_image, prev_label=next_label)) + else: + resized_image, image, label, prev_image, prev_label = ( + self._preprocessing_fn( + image, label, prev_image=prev_image, prev_label=prev_label)) + sample = { + common.IMAGE: image + } + if prev_image is not None: + sample[common.IMAGE] = tf.concat([image, prev_image], axis=2) + if next_image is not None: + sample[common.NEXT_IMAGE] = next_image + sample[common.IMAGE] = tf.concat([image, next_image], axis=2) + if label is not None: + # Panoptic label for crowd regions will be ignore_label. + semantic_label, panoptic_label, thing_mask, crowd_region = ( + dataset_utils.get_semantic_and_panoptic_label( + self._dataset_info, label, self._ignore_label)) + sample[common.GT_SEMANTIC_KEY] = tf.squeeze(semantic_label, axis=2) + semantic_weights = tf.ones_like(semantic_label, dtype=tf.float32) + sample[common.SEMANTIC_LOSS_WEIGHT_KEY] = tf.squeeze( + semantic_weights, axis=2) + sample[common.GT_IS_CROWD] = tf.squeeze(crowd_region, axis=2) + + if not self._only_semantic_annotations: + # The sample will have the original label including crowd regions. + sample[common.GT_PANOPTIC_KEY] = tf.squeeze(label, axis=2) + # Compute center loss for all non-crowd and non-ignore pixels. + non_crowd_and_non_ignore_regions = tf.logical_and( + tf.logical_not(crowd_region), + tf.not_equal(semantic_label, self._ignore_label)) + sample[common.CENTER_LOSS_WEIGHT_KEY] = tf.squeeze(tf.cast( + non_crowd_and_non_ignore_regions, tf.float32), axis=2) + # Compute regression loss only for thing pixels that are not crowd. + non_crowd_things = tf.logical_and( + tf.logical_not(crowd_region), thing_mask) + sample[common.REGRESSION_LOSS_WEIGHT_KEY] = tf.squeeze(tf.cast( + non_crowd_things, tf.float32), axis=2) + + prev_panoptic_label = None + next_panoptic_label = None + if prev_label is not None: + _, prev_panoptic_label, _, _ = ( + dataset_utils.get_semantic_and_panoptic_label( + self._dataset_info, prev_label, self._ignore_label)) + if next_label is not None: + _, next_panoptic_label, _, _ = ( + dataset_utils.get_semantic_and_panoptic_label( + self._dataset_info, next_label, self._ignore_label)) + (sample[common.GT_INSTANCE_CENTER_KEY], + sample[common.GT_INSTANCE_REGRESSION_KEY], + sample[common.SEMANTIC_LOSS_WEIGHT_KEY], + prev_center_map, + frame_center_offsets, + next_offset) = self._generate_gt_center_and_offset( + panoptic_label, semantic_weights, prev_panoptic_label, + next_panoptic_label) + + sample[common.GT_INSTANCE_REGRESSION_KEY] = tf.cast( + sample[common.GT_INSTANCE_REGRESSION_KEY], tf.float32) + + if next_label is not None: + sample[common.GT_NEXT_INSTANCE_REGRESSION_KEY] = tf.cast( + next_offset, tf.float32) + sample[common.NEXT_REGRESSION_LOSS_WEIGHT_KEY] = tf.cast( + tf.greater(tf.reduce_sum(tf.abs(next_offset), axis=2), 0), + tf.float32) + + # Only squeeze center map and semantic loss weights, as regression map + # has two channels (x and y offsets). 
+        sample[common.GT_INSTANCE_CENTER_KEY] = tf.squeeze(
+            sample[common.GT_INSTANCE_CENTER_KEY], axis=2)
+        sample[common.SEMANTIC_LOSS_WEIGHT_KEY] = tf.squeeze(
+            sample[common.SEMANTIC_LOSS_WEIGHT_KEY], axis=2)
+
+        if prev_label is not None:
+          sample[common.GT_FRAME_OFFSET_KEY] = frame_center_offsets
+          sample[common.GT_FRAME_OFFSET_KEY] = tf.cast(
+              sample[common.GT_FRAME_OFFSET_KEY], tf.float32)
+          frame_offsets_present = tf.logical_or(
+              tf.not_equal(frame_center_offsets[..., 0], 0),
+              tf.not_equal(frame_center_offsets[..., 1], 0))
+          sample[common.FRAME_REGRESSION_LOSS_WEIGHT_KEY] = tf.cast(
+              frame_offsets_present, tf.float32)
+          if self._is_training:
+            sample[common.IMAGE] = tf.concat(
+                [sample[common.IMAGE], prev_center_map], axis=2)
+
+        if self._thing_id_mask_annotations:
+          if any([prev_image is not None,
+                  prev_label is not None,
+                  next_image is not None,
+                  next_label is not None]):
+            raise NotImplementedError(
+                'Current implementation of Max-DeepLab does not support ' +
+                'prev_image, prev_label, next_image, or next_label.')
+          thing_id_mask, thing_id_class = (
+              self._generate_thing_id_mask_and_class(
+                  panoptic_label, non_crowd_things))
+          sample[common.GT_THING_ID_MASK_KEY] = tf.squeeze(
+              thing_id_mask, axis=2)
+          sample[common.GT_THING_ID_CLASS_KEY] = thing_id_class
+
+    if not self._is_training:
+      # Resized image is only used during visualization.
+      sample[common.RESIZED_IMAGE] = resized_image
+      sample[common.IMAGE_NAME] = image_name
+      sample[common.GT_SIZE_RAW] = tf.stack([height, width], axis=0)
+      if self._dataset_info['is_video_dataset']:
+        sample[common.SEQUENCE_ID] = sequence
+      # Keep original labels for evaluation.
+      if label is not None:
+        orig_semantic_label, _, _, orig_crowd_region = (
+            dataset_utils.get_semantic_and_panoptic_label(
+                self._dataset_info, original_label, self._ignore_label))
+        sample[common.GT_SEMANTIC_RAW] = tf.squeeze(orig_semantic_label, axis=2)
+        if not self._only_semantic_annotations:
+          sample[common.GT_PANOPTIC_RAW] = tf.squeeze(original_label, axis=2)
+          sample[common.GT_IS_CROWD_RAW] = tf.squeeze(orig_crowd_region)
+          if next_label is not None:
+            sample[common.GT_NEXT_PANOPTIC_RAW] = tf.squeeze(
+                original_next_label, axis=2)
+    return sample
+
+  def _generate_thing_id_mask_and_class(self,
+                                        panoptic_label,
+                                        non_crowd_things):
+    """Generates the ground-truth thing-ID masks and their class labels.
+
+    It computes the thing-ID mask and class with a unique ID for each thing
+    instance. `thing_id` is a unique ID assigned to each thing instance in an
+    image, counting from 0. Each pixel in thing_id_mask is labeled with the
+    corresponding thing-ID.
+
+    Args:
+      panoptic_label: A tf.Tensor of shape [height, width, 1].
+      non_crowd_things: A tf.Tensor of shape [height, width, 1], indicating
+        non-crowd and thing-class regions.
+
+    Returns:
+      thing_id_mask: A tf.Tensor of shape [height, width, 1]. It assigns each
+        non-crowd thing instance a unique mask-ID label, starting from 0.
+        Unassigned pixels are set to -1.
+      thing_id_class: A tf.Tensor of shape [max_thing_id]. It contains the
+        semantic ID of each instance assigned to thing_id_mask. The remaining
+        (max_thing_id - num_things) elements are set to -1.
+
+    Raises:
+      ValueError: An error occurs when the thing-ID mask contains stuff or
+        crowd region.
+      ValueError: An error occurs when thing_count is greater than or equal to
+        self._max_thing_id.
+
+    """
+    unique_ids, _ = tf.unique(tf.reshape(panoptic_label, [-1]))
+    thing_id_mask = -tf.ones_like(panoptic_label)
+    thing_id_class = -tf.ones(self._max_thing_id)
+    thing_count = 0
+    for panoptic_id in unique_ids:
+      semantic_id = panoptic_id // self._dataset_info['panoptic_label_divisor']
+      # Filter out IDs that are not thing instances (i.e., IDs for
+      # ignore_label, stuff classes or crowd). Stuff classes and crowd regions
+      # both have IDs of the form panoptic_id = semantic_id * label_divisor
+      # (i.e., instance id = 0).
+      if (semantic_id == self._dataset_info['ignore_label'] or
+          panoptic_id % self._dataset_info['panoptic_label_divisor'] == 0):
+        continue
+
+      assert_stuff_crowd = tf.debugging.Assert(
+          tf.reduce_all(non_crowd_things[panoptic_label == panoptic_id]),
+          ['thing-ID mask here must not contain stuff or crowd region.'])
+      with tf.control_dependencies([assert_stuff_crowd]):
+        panoptic_id = tf.identity(panoptic_id)
+
+      thing_id_mask = tf.where(panoptic_label == panoptic_id,
+                               thing_count, thing_id_mask)
+
+      assert_thing_count = tf.debugging.Assert(
+          thing_count < self._max_thing_id,
+          ['thing_count must be smaller than self._max_thing_id.'])
+      with tf.control_dependencies([assert_thing_count]):
+        thing_count = tf.identity(thing_count)
+
+      thing_id_class = tf.tensor_scatter_nd_update(
+          thing_id_class, [[thing_count]], [semantic_id])
+      thing_count += 1
+    return thing_id_mask, thing_id_class
+
+  def _generate_prev_centers_with_noise(self,
+                                        panoptic_label,
+                                        offset_noise_factor=0.05,
+                                        false_positive_rate=0.2,
+                                        false_positive_noise_factor=0.05):
+    """Generates noisy center predictions for the previous frame.
+
+    Args:
+      panoptic_label: A tf.Tensor of shape [height, width, 1].
+      offset_noise_factor: An optional float defining the maximum fraction of
+        the object size that is used to displace the previous center.
+      false_positive_rate: An optional float indicating at which probability
+        false positives should be added.
+      false_positive_noise_factor: An optional float defining the maximum
+        fraction of the object size that is used to displace the false positive
+        center.
+
+    Returns:
+      A tuple of (center, unique_ids, ids_to_center_x, ids_to_center_y), where
+      center is a tf.Tensor of shape [height, width, 1] holding the noisy
+      center heatmap, unique_ids is a 1-D tf.Tensor with the N unique panoptic
+      IDs, and ids_to_center_x and ids_to_center_y are 1-D tf.Tensors of
+      length N holding the corresponding center coordinates.
+    """
+    height = tf.shape(panoptic_label)[0]
+    width = tf.shape(panoptic_label)[1]
+
+    # Pad center to make boundary handling easier.
+    center_pad_begin = int(round(3 * self._sigma + 1))
+    center_pad_end = int(round(3 * self._sigma + 2))
+    center_pad = center_pad_begin + center_pad_end
+
+    center = tf.zeros((height + center_pad, width + center_pad))
+    unique_ids, _ = tf.unique(tf.reshape(panoptic_label, [-1]))
+    ids_to_center_x = tf.zeros_like(unique_ids, dtype=tf.int32)
+    ids_to_center_y = tf.zeros_like(unique_ids, dtype=tf.int32)
+
+    for panoptic_id in unique_ids:
+      semantic_id = panoptic_id // self._dataset_info['panoptic_label_divisor']
+      # Filter out IDs that should be ignored, are stuff classes or crowd.
+      # Stuff classes and crowd regions both have IDs of the form panoptic_id =
+      # semantic_id * label_divisor.
+      if (semantic_id == self._dataset_info['ignore_label'] or
+          panoptic_id % self._dataset_info['panoptic_label_divisor'] == 0):
+        continue
+
+      # Convert [[y0, x0, 0], ...] to [[y0, ...], [x0, ...], [0, ...]].
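+      # Worked example (hypothetical coordinates): for a two-pixel instance
+      # at (y, x) = (4, 10) and (6, 14), tf.where yields
+      # [[4, 10, 0], [6, 14, 0]], so mask_index becomes
+      # [[4., 6.], [10., 14.], [0., 0.]], centers [5., 12., 0.] and bbox_size
+      # [2., 4., 0.].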
+      mask_index = tf.cast(
+          tf.transpose(tf.where(panoptic_label == panoptic_id)), tf.float32)
+      centers = tf.reduce_mean(mask_index, axis=1)
+      bbox_size = (
+          tf.reduce_max(mask_index, axis=1) - tf.reduce_min(mask_index, axis=1))
+
+      # Add noise.
+      center_y = (
+          centers[0] + tf.random.normal([], dtype=tf.float32) *
+          offset_noise_factor * bbox_size[0])
+      center_x = (
+          centers[1] + tf.random.normal([], dtype=tf.float32) *
+          offset_noise_factor * bbox_size[1])
+
+      center_x = tf.minimum(
+          tf.maximum(tf.cast(tf.round(center_x), tf.int32), 0), width - 1)
+      center_y = tf.minimum(
+          tf.maximum(tf.cast(tf.round(center_y), tf.int32), 0), height - 1)
+
+      id_index = tf.where(tf.equal(panoptic_id, unique_ids))
+      ids_to_center_x = tf.tensor_scatter_nd_update(
+          ids_to_center_x, id_index, tf.expand_dims(center_x, axis=0))
+      ids_to_center_y = tf.tensor_scatter_nd_update(
+          ids_to_center_y, id_index, tf.expand_dims(center_y, axis=0))
+
+      def add_center_gaussian(center_x_coord, center_y_coord, center):
+        # Due to the padding with center_pad_begin in center, the computed
+        # center becomes the upper left corner in the center tensor.
+        upper_left = center_x_coord, center_y_coord
+        bottom_right = (upper_left[0] + self._gaussian_size,
+                        upper_left[1] + self._gaussian_size)
+
+        indices_x, indices_y = tf.meshgrid(
+            tf.range(upper_left[0], bottom_right[0]),
+            tf.range(upper_left[1], bottom_right[1]))
+        indices = tf.transpose(
+            tf.stack([tf.reshape(indices_y, [-1]),
+                      tf.reshape(indices_x, [-1])]))
+
+        return tf.tensor_scatter_nd_max(
+            center, indices, self._gaussian, name='center_scatter')
+
+      center = add_center_gaussian(center_x, center_y, center)
+      # Generate false positives.
+      center_y = (
+          tf.cast(center_y, dtype=tf.float32) +
+          tf.random.normal([], dtype=tf.float32) * false_positive_noise_factor *
+          bbox_size[0])
+      center_x = (
+          tf.cast(center_x, dtype=tf.float32) +
+          tf.random.normal([], dtype=tf.float32) * false_positive_noise_factor *
+          bbox_size[1])
+
+      center_x = tf.minimum(
+          tf.maximum(tf.cast(tf.round(center_x), tf.int32), 0), width - 1)
+      center_y = tf.minimum(
+          tf.maximum(tf.cast(tf.round(center_y), tf.int32), 0), height - 1)
+      # Draw a sample to decide whether to add a false positive or not.
+      center = center + tf.cast(
+          tf.random.uniform([], dtype=tf.float32) < false_positive_rate,
+          tf.float32) * (
+              add_center_gaussian(center_x, center_y, center) - center)
+
+    center = center[center_pad_begin:(center_pad_begin + height),
+                    center_pad_begin:(center_pad_begin + width)]
+    center = tf.expand_dims(center, -1)
+    return center, unique_ids, ids_to_center_x, ids_to_center_y
+
+  def _generate_gt_center_and_offset(self,
+                                     panoptic_label,
+                                     semantic_weights,
+                                     prev_panoptic_label=None,
+                                     next_panoptic_label=None):
+    """Generates the ground-truth center and offset from the panoptic labels.
+
+    Additionally, the per-pixel weights for the semantic branch are increased
+    for small instances. If prev_panoptic_label is passed, it also computes
+    the previous center heatmap with random noise and the offsets between
+    center maps.
+
+    Args:
+      panoptic_label: A tf.Tensor of shape [height, width, 1].
+      semantic_weights: A tf.Tensor of shape [height, width, 1].
+      prev_panoptic_label: An optional tf.Tensor of shape [height, width, 1].
+      next_panoptic_label: An optional tf.Tensor of shape [height, width, 1].
+
+    Returns:
+      A tuple (center, offsets, weights, prev_center, frame_offsets,
+      next_offset). Each entry is a tf.Tensor of shape [height, width, 1],
+      except offsets, frame_offsets, and next_offset, which have shape
+      [height, width, 2].
+      If prev_panoptic_label is None, prev_center and frame_offsets are None.
+      If next_panoptic_label is None, next_offset is None.
+    """
+    height = tf.shape(panoptic_label)[0]
+    width = tf.shape(panoptic_label)[1]
+
+    # Pad center to make boundary handling easier.
+    center_pad_begin = int(round(3 * self._sigma + 1))
+    center_pad_end = int(round(3 * self._sigma + 2))
+    center_pad = center_pad_begin + center_pad_end
+
+    center = tf.zeros((height + center_pad, width + center_pad))
+    offset_x = tf.zeros((height, width, 1), dtype=tf.int32)
+    offset_y = tf.zeros((height, width, 1), dtype=tf.int32)
+    unique_ids, _ = tf.unique(tf.reshape(panoptic_label, [-1]))
+
+    prev_center = None
+    frame_offsets = None
+    # TensorFlow's loop tracing requires these variables to be defined in all
+    # cases, even when prev_panoptic_label or next_panoptic_label is None.
+    frame_offset_x = tf.zeros((height, width, 1), dtype=tf.int32)
+    frame_offset_y = tf.zeros((height, width, 1), dtype=tf.int32)
+
+    # Next-frame instance offsets.
+    next_offset = None
+    next_offset_y = tf.zeros((height, width, 1), dtype=tf.int32)
+    next_offset_x = tf.zeros((height, width, 1), dtype=tf.int32)
+
+    if prev_panoptic_label is not None:
+      (prev_center, prev_unique_ids, prev_centers_x, prev_centers_y
+      ) = self._generate_prev_centers_with_noise(prev_panoptic_label)
+
+    for panoptic_id in unique_ids:
+      semantic_id = panoptic_id // self._dataset_info['panoptic_label_divisor']
+      # Filter out IDs that should be ignored, are stuff classes or crowd.
+      # Stuff classes and crowd regions both have IDs of the form
+      # panoptic_id = semantic_id * label_divisor.
+      if (semantic_id == self._dataset_info['ignore_label'] or
+          panoptic_id % self._dataset_info['panoptic_label_divisor'] == 0):
+        continue
+
+      # Convert [[y0, x0, 0], ...] to [[y0, ...], [x0, ...], [0, ...]].
+      mask_index = tf.transpose(tf.where(panoptic_label == panoptic_id))
+      mask_y_index = mask_index[0]
+      mask_x_index = mask_index[1]
+
+      next_mask_index = None
+      next_mask_y_index = None
+      next_mask_x_index = None
+      if next_panoptic_label is not None:
+        next_mask_index = tf.transpose(
+            tf.where(next_panoptic_label == panoptic_id))
+        next_mask_y_index = next_mask_index[0]
+        next_mask_x_index = next_mask_index[1]
+
+      instance_area = tf.shape(mask_x_index)
+      if instance_area < self._instance_area_threshold:
+        semantic_weights = tf.where(panoptic_label == panoptic_id,
+                                    self._small_instance_weight,
+                                    semantic_weights)
+
+      centers = tf.reduce_mean(tf.cast(mask_index, tf.float32), axis=1)
+
+      center_x = tf.cast(tf.round(centers[1]), tf.int32)
+      center_y = tf.cast(tf.round(centers[0]), tf.int32)
+
+      # Due to the padding with center_pad_begin in center, the computed center
+      # becomes the upper left corner in the center tensor.
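+      # (The crop by center_pad_begin at the end of this method removes the
+      # padding again; assuming the Gaussian window size matches the padding,
+      # its peak then falls on the instance center in the unpadded heatmap.)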
+ upper_left = center_x, center_y + bottom_right = (upper_left[0] + self._gaussian_size, + upper_left[1] + self._gaussian_size) + + indices_x, indices_y = tf.meshgrid( + tf.range(upper_left[0], bottom_right[0]), + tf.range(upper_left[1], bottom_right[1])) + indices = tf.transpose( + tf.stack([tf.reshape(indices_y, [-1]), + tf.reshape(indices_x, [-1])])) + + center = tf.tensor_scatter_nd_max( + center, indices, self._gaussian, name='center_scatter') + offset_y = tf.tensor_scatter_nd_update( + offset_y, + tf.transpose(mask_index), + center_y - tf.cast(mask_y_index, tf.int32), + name='offset_y_scatter') + offset_x = tf.tensor_scatter_nd_update( + offset_x, + tf.transpose(mask_index), + center_x - tf.cast(mask_x_index, tf.int32), + name='offset_x_scatter') + if prev_panoptic_label is not None: + mask = tf.equal(prev_unique_ids, panoptic_id) + if tf.math.count_nonzero(mask) > 0: + prev_center_x = prev_centers_x[mask] + prev_center_y = prev_centers_y[mask] + + frame_offset_y = tf.tensor_scatter_nd_update( + frame_offset_y, + tf.transpose(mask_index), + prev_center_y - tf.cast(mask_y_index, tf.int32), + name='frame_offset_y_scatter') + frame_offset_x = tf.tensor_scatter_nd_update( + frame_offset_x, + tf.transpose(mask_index), + prev_center_x - tf.cast(mask_x_index, tf.int32), + name='frame_offset_x_scatter') + if next_panoptic_label is not None: + next_offset_y = tf.tensor_scatter_nd_update( + next_offset_y, + tf.transpose(next_mask_index), + center_y - tf.cast(next_mask_y_index, tf.int32), + name='next_offset_y_scatter') + next_offset_x = tf.tensor_scatter_nd_update( + next_offset_x, + tf.transpose(next_mask_index), + center_x - tf.cast(next_mask_x_index, tf.int32), + name='next_offset_x_scatter') + + offset = tf.concat([offset_y, offset_x], axis=2) + center = center[center_pad_begin:(center_pad_begin + height), + center_pad_begin:(center_pad_begin + width)] + center = tf.expand_dims(center, -1) + if prev_panoptic_label is not None: + frame_offsets = tf.concat([frame_offset_y, frame_offset_x], axis=2) + if next_panoptic_label is not None: + next_offset = tf.concat([next_offset_y, next_offset_x], axis=2) + return (center, offset, semantic_weights, prev_center, frame_offsets, + next_offset) diff --git a/data/sample_generator_test.py b/data/sample_generator_test.py new file mode 100644 index 0000000000000000000000000000000000000000..8fa3cb3cbd1a3104aca5ad6fa0e909956a914f8b --- /dev/null +++ b/data/sample_generator_test.py @@ -0,0 +1,274 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for sample_generator.""" + +import os + +from absl import flags +import numpy as np +from PIL import Image +import tensorflow as tf + +from deeplab2 import common +from deeplab2.data import data_utils +from deeplab2.data import dataset +from deeplab2.data import sample_generator + +image_utils = tf.keras.preprocessing.image + +flags.DEFINE_string( + 'panoptic_annotation_data', + 'deeplab2/data/testdata/', + 'Path to annotated test image.') +flags.DEFINE_bool('update_golden_data', False, + 'Whether or not to update the golden data for testing.') + +FLAGS = flags.FLAGS + +_FILENAME_PREFIX = 'dummy_000000_000000' +_IMAGE_FOLDER = 'leftImg8bit/' +_TARGET_FOLDER = 'targets/' + + +def _get_groundtruth_image(computed_image_array, groundtruth_image_filename): + if FLAGS.update_golden_data: + image = Image.fromarray(tf.squeeze(computed_image_array).numpy()) + with tf.io.gfile.GFile(groundtruth_image_filename, mode='wb') as fp: + image.save(fp) + return computed_image_array + + with tf.io.gfile.GFile(groundtruth_image_filename, mode='rb') as fp: + image = data_utils.read_image(fp.read()) + # If loaded image has 3 channels, the returned shape is [height, width, 3]. + # If loaded image has 1 channel, the returned shape is [height, width]. + image = np.squeeze(image_utils.img_to_array(image)) + return image + + +def _get_groundtruth_array(computed_image_array, groundtruth_image_filename): + if FLAGS.update_golden_data: + with tf.io.gfile.GFile(groundtruth_image_filename, mode='wb') as fp: + np.save(fp, computed_image_array) + return computed_image_array + with tf.io.gfile.GFile(groundtruth_image_filename, mode='rb') as fp: + # If loaded data has C>1 channels, the returned shape is [height, width, C]. + # If loaded data has 1 channel, the returned shape is [height, width]. 
+ array = np.squeeze(np.load(fp)) + return array + + +class PanopticSampleGeneratorTest(tf.test.TestCase): + + def setUp(self): + super().setUp() + self._test_img_data_dir = os.path.join( + FLAGS.test_srcdir, + FLAGS.panoptic_annotation_data, + _IMAGE_FOLDER) + self._test_gt_data_dir = os.path.join( + FLAGS.test_srcdir, + FLAGS.panoptic_annotation_data) + self._test_target_data_dir = os.path.join( + FLAGS.test_srcdir, + FLAGS.panoptic_annotation_data, + _TARGET_FOLDER) + image_path = self._test_img_data_dir + _FILENAME_PREFIX + '_leftImg8bit.png' + with tf.io.gfile.GFile(image_path, 'rb') as image_file: + rgb_image = data_utils.read_image(image_file.read()) + self._rgb_image = tf.convert_to_tensor(np.array(rgb_image)) + label_path = self._test_gt_data_dir + 'dummy_gt_for_vps.png' + with tf.io.gfile.GFile(label_path, 'rb') as label_file: + label = data_utils.read_image(label_file.read()) + self._label = tf.expand_dims(tf.convert_to_tensor( + np.dot(np.array(label), [1, 256, 256 * 256])), -1) + + def test_input_generator(self): + tf.random.set_seed(0) + np.random.seed(0) + small_instances = {'threshold': 4096, 'weight': 3.0} + generator = sample_generator.PanopticSampleGenerator( + dataset.CITYSCAPES_PANOPTIC_INFORMATION._asdict(), + focus_small_instances=small_instances, + is_training=True, + crop_size=[769, 769], + thing_id_mask_annotations=True) + input_sample = { + 'image': self._rgb_image, + 'image_name': 'test_image', + 'label': self._label, + 'height': 800, + 'width': 800 + } + sample = generator(input_sample) + + self.assertIn(common.IMAGE, sample) + self.assertIn(common.GT_SEMANTIC_KEY, sample) + self.assertIn(common.GT_PANOPTIC_KEY, sample) + self.assertIn(common.GT_INSTANCE_CENTER_KEY, sample) + self.assertIn(common.GT_INSTANCE_REGRESSION_KEY, sample) + self.assertIn(common.GT_IS_CROWD, sample) + self.assertIn(common.GT_THING_ID_MASK_KEY, sample) + self.assertIn(common.GT_THING_ID_CLASS_KEY, sample) + self.assertIn(common.SEMANTIC_LOSS_WEIGHT_KEY, sample) + self.assertIn(common.CENTER_LOSS_WEIGHT_KEY, sample) + self.assertIn(common.REGRESSION_LOSS_WEIGHT_KEY, sample) + + self.assertListEqual(sample[common.IMAGE].shape.as_list(), [769, 769, 3]) + self.assertListEqual(sample[common.GT_SEMANTIC_KEY].shape.as_list(), + [769, 769]) + self.assertListEqual(sample[common.GT_PANOPTIC_KEY].shape.as_list(), + [769, 769]) + self.assertListEqual(sample[common.GT_INSTANCE_CENTER_KEY].shape.as_list(), + [769, 769]) + self.assertListEqual( + sample[common.GT_INSTANCE_REGRESSION_KEY].shape.as_list(), + [769, 769, 2]) + self.assertListEqual(sample[common.GT_IS_CROWD].shape.as_list(), [769, 769]) + self.assertListEqual(sample[common.GT_THING_ID_MASK_KEY].shape.as_list(), + [769, 769]) + self.assertListEqual(sample[common.GT_THING_ID_CLASS_KEY].shape.as_list(), + [128]) + self.assertListEqual( + sample[common.SEMANTIC_LOSS_WEIGHT_KEY].shape.as_list(), [769, 769]) + self.assertListEqual(sample[common.CENTER_LOSS_WEIGHT_KEY].shape.as_list(), + [769, 769]) + self.assertListEqual( + sample[common.REGRESSION_LOSS_WEIGHT_KEY].shape.as_list(), + [769, 769]) + + gt_sem = sample[common.GT_SEMANTIC_KEY] + gt_pan = sample[common.GT_PANOPTIC_KEY] + gt_center = tf.cast(sample[common.GT_INSTANCE_CENTER_KEY] * 255, tf.uint8) + gt_is_crowd = sample[common.GT_IS_CROWD] + gt_thing_id_mask = sample[common.GT_THING_ID_MASK_KEY] + gt_thing_id_class = sample[common.GT_THING_ID_CLASS_KEY] + image = tf.cast(sample[common.IMAGE], tf.uint8) + + # semantic weights can be in range of [0, 3] in this example. 
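+    # The factor 85 maps the maximum semantic weight 3.0 to 255
+    # (3 * 85 = 255), so the weights can be stored and compared as an
+    # 8-bit PNG.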
+ semantic_weights = tf.cast(sample[common.SEMANTIC_LOSS_WEIGHT_KEY] * 85, + tf.uint8) + center_weights = tf.cast(sample[common.CENTER_LOSS_WEIGHT_KEY] * 255, + tf.uint8) + offset_weights = tf.cast(sample[common.REGRESSION_LOSS_WEIGHT_KEY] * 255, + tf.uint8) + + np.testing.assert_almost_equal( + image.numpy(), + _get_groundtruth_image( + image, + self._test_target_data_dir + 'rgb_target.png')) + np.testing.assert_almost_equal( + gt_sem.numpy(), + _get_groundtruth_image( + gt_sem, + self._test_target_data_dir + 'semantic_target.png')) + # Save gt as png. Pillow is currently unable to correctly save the image as + # 32bit, but uses 16bit which overflows. + _ = _get_groundtruth_image( + gt_pan, self._test_target_data_dir + 'panoptic_target.png') + np.testing.assert_almost_equal( + gt_pan.numpy(), + _get_groundtruth_array( + gt_pan, + self._test_target_data_dir + 'panoptic_target.npy')) + np.testing.assert_almost_equal( + gt_thing_id_mask.numpy(), + _get_groundtruth_array( + gt_thing_id_mask, + self._test_target_data_dir + 'thing_id_mask_target.npy')) + np.testing.assert_almost_equal( + gt_thing_id_class.numpy(), + _get_groundtruth_array( + gt_thing_id_class, + self._test_target_data_dir + 'thing_id_class_target.npy')) + np.testing.assert_almost_equal( + gt_center.numpy(), + _get_groundtruth_image( + gt_center, + self._test_target_data_dir + 'center_target.png')) + np.testing.assert_almost_equal( + sample[common.GT_INSTANCE_REGRESSION_KEY].numpy(), + _get_groundtruth_array( + sample[common.GT_INSTANCE_REGRESSION_KEY].numpy(), + self._test_target_data_dir + 'offset_target.npy')) + np.testing.assert_array_equal( + gt_is_crowd.numpy(), + _get_groundtruth_array(gt_is_crowd.numpy(), + self._test_target_data_dir + 'is_crowd.npy')) + np.testing.assert_almost_equal( + semantic_weights.numpy(), + _get_groundtruth_image( + semantic_weights, + self._test_target_data_dir + 'semantic_weights.png')) + np.testing.assert_almost_equal( + center_weights.numpy(), + _get_groundtruth_image( + center_weights, + self._test_target_data_dir + 'center_weights.png')) + np.testing.assert_almost_equal( + offset_weights.numpy(), + _get_groundtruth_image( + offset_weights, + self._test_target_data_dir + 'offset_weights.png')) + + def test_input_generator_eval(self): + tf.random.set_seed(0) + np.random.seed(0) + small_instances = {'threshold': 4096, 'weight': 3.0} + generator = sample_generator.PanopticSampleGenerator( + dataset.CITYSCAPES_PANOPTIC_INFORMATION._asdict(), + focus_small_instances=small_instances, + is_training=False, + crop_size=[800, 800]) + input_sample = { + 'image': self._rgb_image, + 'image_name': 'test_image', + 'label': self._label, + 'height': 800, + 'width': 800 + } + sample = generator(input_sample) + + self.assertIn(common.GT_SEMANTIC_RAW, sample) + self.assertIn(common.GT_PANOPTIC_RAW, sample) + self.assertIn(common.GT_IS_CROWD_RAW, sample) + + gt_sem_raw = sample[common.GT_SEMANTIC_RAW] + gt_pan_raw = sample[common.GT_PANOPTIC_RAW] + gt_is_crowd_raw = sample[common.GT_IS_CROWD_RAW] + + self.assertListEqual(gt_sem_raw.shape.as_list(), [800, 800]) + self.assertListEqual(gt_pan_raw.shape.as_list(), [800, 800]) + self.assertListEqual(gt_is_crowd_raw.shape.as_list(), [800, 800]) + + np.testing.assert_almost_equal( + gt_sem_raw.numpy(), + _get_groundtruth_image( + gt_sem_raw, + self._test_target_data_dir + 'eval_semantic_target.png')) + np.testing.assert_almost_equal( + gt_pan_raw.numpy(), + _get_groundtruth_array( + gt_pan_raw, + self._test_target_data_dir + 'eval_panoptic_target.npy')) + 
np.testing.assert_almost_equal( + gt_is_crowd_raw.numpy(), + _get_groundtruth_array(gt_is_crowd_raw, self._test_target_data_dir + + 'eval_is_crowd.npy')) + + +if __name__ == '__main__': + tf.test.main() diff --git a/data/testdata/create_test_data.py b/data/testdata/create_test_data.py new file mode 100644 index 0000000000000000000000000000000000000000..3e4e06d5b2e3a87943c1cb3f54d490d7588551cb --- /dev/null +++ b/data/testdata/create_test_data.py @@ -0,0 +1,205 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Script to generate test data for cityscapes.""" + +import collections +import json +import os + +from absl import app +from absl import flags +from absl import logging +import numpy as np +from PIL import Image +import tensorflow as tf + +# resources dependency + +from deeplab2.data import data_utils +from deeplab2.data import dataset + +flags.DEFINE_string( + 'panoptic_annotation_path', + 'deeplab2/data/testdata/' + 'dummy_prediction.png', + 'Path to annotated test image with cityscapes encoding.') +flags.DEFINE_string( + 'panoptic_gt_output_path', + 'deeplab2/data/testdata/' + 'dummy_gt_for_vps.png', + 'Path to annotated test image with Video Panoptic Segmentation encoding.') +flags.DEFINE_string( + 'output_cityscapes_root', + 'deeplab2/data/testdata/', + 'Path to output root directory.') + +FLAGS = flags.FLAGS + +# Cityscapes label, using `TrainId`. +_CITYSCAPES_IGNORE = 255 +# Each valid (not ignored) label below is a tuple of (TrainId, EvalId) +_CITYSCAPES_CAR = (13, 26) +_CITYSCAPES_TREE = (8, 21) +_CITYSCAPES_SKY = (10, 23) +_CITYSCAPES_BUILDING = (2, 11) +_CITYSCAPES_ROAD = (0, 7) + +_IS_CROWD = 'is_crowd' +_NOT_CROWD = 'not_crowd' + +_CLASS_HAS_INSTANCES_LIST = dataset.CITYSCAPES_PANOPTIC_INFORMATION.class_has_instances_list +_PANOPTIC_LABEL_DIVISOR = dataset.CITYSCAPES_PANOPTIC_INFORMATION.panoptic_label_divisor +_FILENAME_PREFIX = 'dummy_000000_000000' + + +def create_test_data(annotation_path): + """Creates cityscapes panoptic annotation, vps annotation and segment info. + + Our Video Panoptic Segmentation (VPS) encoding uses ID == semantic trainID * + 1000 + instance ID (starting at 1) with instance ID == 0 marking + crowd regions. + + Args: + annotation_path: The path to the annotation to be loaded. + + Returns: + A tuple of cityscape annotation, vps annotation and segment infos. + """ + # Convert panoptic labels to cityscapes label format. + + # Dictionary mapping converted panoptic annotation to its corresponding + # Cityscapes label. Here the key is encoded by converting each RGB pixel + # value to 1 * R + 256 * G + 256 * 256 * B. 
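+  # For example, the key 31110 below corresponds to the RGB value
+  # (134, 121, 0), since 134 + 256 * 121 = 31110.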
+ panoptic_label_to_cityscapes_label = { + 0: (_CITYSCAPES_IGNORE, _NOT_CROWD), + 31110: (_CITYSCAPES_CAR, _NOT_CROWD), + 31354: (_CITYSCAPES_CAR, _IS_CROWD), + 35173: (_CITYSCAPES_CAR, _NOT_CROWD), + 488314: (_CITYSCAPES_CAR, _IS_CROWD), + 549788: (_CITYSCAPES_CAR, _IS_CROWD), + 1079689: (_CITYSCAPES_CAR, _IS_CROWD), + 1341301: (_CITYSCAPES_CAR, _NOT_CROWD), + 1544590: (_CITYSCAPES_CAR, _NOT_CROWD), + 1926498: (_CITYSCAPES_CAR, _NOT_CROWD), + 4218944: (_CITYSCAPES_TREE, _NOT_CROWD), + 4251840: (_CITYSCAPES_SKY, _NOT_CROWD), + 6959003: (_CITYSCAPES_BUILDING, _NOT_CROWD), + # To be merged with the building segment above. + 8396960: (_CITYSCAPES_BUILDING, _NOT_CROWD), + 8413312: (_CITYSCAPES_ROAD, _NOT_CROWD), + } + with tf.io.gfile.GFile(annotation_path, 'rb') as f: + panoptic = data_utils.read_image(f.read()) + + # Input panoptic annotation is RGB color coded, here we convert each pixel + # to a unique number to avoid comparing 3-tuples. + panoptic = np.dot(panoptic, [1, 256, 256 * 256]) + # Creates cityscapes panoptic map. Cityscapes use ID == semantic EvalId for + # `stuff` segments and `thing` segments with `iscrowd` label, and + # ID == semantic EvalId * 1000 + instance ID (starting from 0) for other + # `thing` segments. + cityscapes_panoptic = np.zeros_like(panoptic, dtype=np.int32) + # Creates Video Panoptic Segmentation (VPS) map. We use ID == semantic + # trainID * 1000 + instance ID (starting at 1) with instance ID == 0 marking + # crowd regions. + vps_panoptic = np.zeros_like(panoptic, dtype=np.int32) + num_instances_per_class = collections.defaultdict(int) + unique_labels = np.unique(panoptic) + + # Dictionary that maps segment id to segment info. + segments_info = {} + for label in unique_labels: + cityscapes_label, is_crowd = panoptic_label_to_cityscapes_label[label] + selected_pixels = panoptic == label + + if cityscapes_label == _CITYSCAPES_IGNORE: + vps_panoptic[selected_pixels] = ( + _CITYSCAPES_IGNORE * _PANOPTIC_LABEL_DIVISOR) + continue + + train_id, eval_id = tuple(cityscapes_label) + cityscapes_id = eval_id + vps_id = train_id * _PANOPTIC_LABEL_DIVISOR + if train_id in _CLASS_HAS_INSTANCES_LIST: + # `thing` class. + if is_crowd != _IS_CROWD: + cityscapes_id = ( + eval_id * _PANOPTIC_LABEL_DIVISOR + + num_instances_per_class[train_id]) + # First instance should have ID 1. + vps_id += num_instances_per_class[train_id] + 1 + num_instances_per_class[train_id] += 1 + + cityscapes_panoptic[selected_pixels] = cityscapes_id + vps_panoptic[selected_pixels] = vps_id + pixel_area = int(np.sum(selected_pixels)) + if cityscapes_id in segments_info: + logging.info('Merging segments with label %d into segment %d', label, + cityscapes_id) + segments_info[cityscapes_id]['area'] += pixel_area + else: + segments_info[cityscapes_id] = { + 'area': pixel_area, + 'category_id': train_id, + 'id': cityscapes_id, + 'iscrowd': 1 if is_crowd == _IS_CROWD else 0, + } + + cityscapes_panoptic = np.dstack([ + cityscapes_panoptic % 256, cityscapes_panoptic // 256, + cityscapes_panoptic // 256 // 256 + ]) + vps_panoptic = np.dstack( + [vps_panoptic % 256, vps_panoptic // 256, vps_panoptic // 256 // 256]) + return (cityscapes_panoptic.astype(np.uint8), vps_panoptic.astype(np.uint8), + list(segments_info.values())) + + +def main(argv): + if len(argv) > 1: + raise app.UsageError('Too many command-line arguments.') + + data_path = FLAGS.panoptic_annotation_path # OSS: removed internal filename loading. 
+ panoptic_map, vps_map, segments_info = create_test_data(data_path) + panoptic_map_filename = _FILENAME_PREFIX + '_gtFine_panoptic.png' + panoptic_map_path = os.path.join(FLAGS.output_cityscapes_root, 'gtFine', + 'cityscapes_panoptic_dummy_trainId', + panoptic_map_filename) + + gt_output_path = FLAGS.panoptic_gt_output_path # OSS: removed internal filename loading. + with tf.io.gfile.GFile(gt_output_path, 'wb') as f: + Image.fromarray(vps_map).save(f, format='png') + + panoptic_map_path = panoptic_map_path # OSS: removed internal filename loading. + with tf.io.gfile.GFile(panoptic_map_path, 'wb') as f: + Image.fromarray(panoptic_map).save(f, format='png') + + json_annotation = { + 'annotations': [{ + 'file_name': _FILENAME_PREFIX + '_gtFine_panoptic.png', + 'image_id': _FILENAME_PREFIX, + 'segments_info': segments_info + }] + } + json_annotation_path = os.path.join(FLAGS.output_cityscapes_root, 'gtFine', + 'cityscapes_panoptic_dummy_trainId.json') + json_annotation_path = json_annotation_path # OSS: removed internal filename loading. + with tf.io.gfile.GFile(json_annotation_path, 'w') as f: + json.dump(json_annotation, f, indent=2) + + +if __name__ == '__main__': + app.run(main) diff --git a/data/testdata/dummy_gt_for_vps.png b/data/testdata/dummy_gt_for_vps.png new file mode 100644 index 0000000000000000000000000000000000000000..e943d1f5704d7d9db8ad0a6c402b6d2eca61ab3a Binary files /dev/null and b/data/testdata/dummy_gt_for_vps.png differ diff --git a/data/testdata/dummy_prediction.png b/data/testdata/dummy_prediction.png new file mode 100644 index 0000000000000000000000000000000000000000..f0b979eb87d103f5b11e548cdbeab9fa11e57d34 Binary files /dev/null and b/data/testdata/dummy_prediction.png differ diff --git a/data/testdata/gtFine/cityscapes_panoptic_dummy_trainId.json b/data/testdata/gtFine/cityscapes_panoptic_dummy_trainId.json new file mode 100644 index 0000000000000000000000000000000000000000..8465f987d8d75a152d5aa85b12514eeb68362448 --- /dev/null +++ b/data/testdata/gtFine/cityscapes_panoptic_dummy_trainId.json @@ -0,0 +1,70 @@ +{ + "annotations": [ + { + "file_name": "dummy_000000_000000_gtFine_panoptic.png", + "image_id": "dummy_000000_000000", + "segments_info": [ + { + "area": 958, + "category_id": 13, + "id": 26000, + "iscrowd": 0 + }, + { + "area": 6178, + "category_id": 13, + "id": 26, + "iscrowd": 1 + }, + { + "area": 10496, + "category_id": 13, + "id": 26001, + "iscrowd": 0 + }, + { + "area": 5534, + "category_id": 13, + "id": 26002, + "iscrowd": 0 + }, + { + "area": 32768, + "category_id": 13, + "id": 26003, + "iscrowd": 0 + }, + { + "area": 19906, + "category_id": 13, + "id": 26004, + "iscrowd": 0 + }, + { + "area": 15940, + "category_id": 8, + "id": 21, + "iscrowd": 0 + }, + { + "area": 278754, + "category_id": 10, + "id": 23, + "iscrowd": 0 + }, + { + "area": 222420, + "category_id": 2, + "id": 11, + "iscrowd": 0 + }, + { + "area": 46475, + "category_id": 0, + "id": 7, + "iscrowd": 0 + } + ] + } + ] +} \ No newline at end of file diff --git a/data/testdata/gtFine/cityscapes_panoptic_dummy_trainId/dummy_000000_000000_gtFine_panoptic.png b/data/testdata/gtFine/cityscapes_panoptic_dummy_trainId/dummy_000000_000000_gtFine_panoptic.png new file mode 100644 index 0000000000000000000000000000000000000000..61fe7ba373f44768e652d9b48386e4299172e755 Binary files /dev/null and b/data/testdata/gtFine/cityscapes_panoptic_dummy_trainId/dummy_000000_000000_gtFine_panoptic.png differ diff --git a/data/testdata/leftImg8bit/dummy_000000_000000_leftImg8bit.png 
b/data/testdata/leftImg8bit/dummy_000000_000000_leftImg8bit.png new file mode 100644 index 0000000000000000000000000000000000000000..a1d4a6eedb8d36bfa265627563107eec8c1cda8c Binary files /dev/null and b/data/testdata/leftImg8bit/dummy_000000_000000_leftImg8bit.png differ diff --git a/data/testdata/targets/center_target.png b/data/testdata/targets/center_target.png new file mode 100644 index 0000000000000000000000000000000000000000..8310b59d0ae5526ee39d54fb63416a436b760038 Binary files /dev/null and b/data/testdata/targets/center_target.png differ diff --git a/data/testdata/targets/center_weights.png b/data/testdata/targets/center_weights.png new file mode 100644 index 0000000000000000000000000000000000000000..2c985f4f1677dce8e2ef26e6c468bd499fff4ae7 Binary files /dev/null and b/data/testdata/targets/center_weights.png differ diff --git a/data/testdata/targets/eval_is_crowd.npy b/data/testdata/targets/eval_is_crowd.npy new file mode 100644 index 0000000000000000000000000000000000000000..b0b544bd0bbe20ae7632b92f6e8ee75e6093eb76 Binary files /dev/null and b/data/testdata/targets/eval_is_crowd.npy differ diff --git a/data/testdata/targets/eval_panoptic_target.npy b/data/testdata/targets/eval_panoptic_target.npy new file mode 100644 index 0000000000000000000000000000000000000000..1dce1427996fb8ad7bac8d3013481548b951284a Binary files /dev/null and b/data/testdata/targets/eval_panoptic_target.npy differ diff --git a/data/testdata/targets/eval_semantic_target.png b/data/testdata/targets/eval_semantic_target.png new file mode 100644 index 0000000000000000000000000000000000000000..60214bbe0e7696852adccf56cb9edef098d2eb40 Binary files /dev/null and b/data/testdata/targets/eval_semantic_target.png differ diff --git a/data/testdata/targets/is_crowd.npy b/data/testdata/targets/is_crowd.npy new file mode 100644 index 0000000000000000000000000000000000000000..24130fc708dccedff42626f8b51908ffc54bc00c Binary files /dev/null and b/data/testdata/targets/is_crowd.npy differ diff --git a/data/testdata/targets/offset_target.npy b/data/testdata/targets/offset_target.npy new file mode 100644 index 0000000000000000000000000000000000000000..c993faf4c15382ff9f2ac2c40165d8fcdeb65f35 Binary files /dev/null and b/data/testdata/targets/offset_target.npy differ diff --git a/data/testdata/targets/offset_weights.png b/data/testdata/targets/offset_weights.png new file mode 100644 index 0000000000000000000000000000000000000000..7918ce04969500719d628e0151321ecda4ff4d8f Binary files /dev/null and b/data/testdata/targets/offset_weights.png differ diff --git a/data/testdata/targets/panoptic_target.npy b/data/testdata/targets/panoptic_target.npy new file mode 100644 index 0000000000000000000000000000000000000000..5e8831e96bf685fb1a83474e8e8810c551f56cbf Binary files /dev/null and b/data/testdata/targets/panoptic_target.npy differ diff --git a/data/testdata/targets/panoptic_target.png b/data/testdata/targets/panoptic_target.png new file mode 100644 index 0000000000000000000000000000000000000000..248d57de058c2c756a9464fdebbae1e6fd7fd630 Binary files /dev/null and b/data/testdata/targets/panoptic_target.png differ diff --git a/data/testdata/targets/rgb_target.png b/data/testdata/targets/rgb_target.png new file mode 100644 index 0000000000000000000000000000000000000000..3da0a683ba406cd16a6b4c3f8fd5e21f4a9d8e11 Binary files /dev/null and b/data/testdata/targets/rgb_target.png differ diff --git a/data/testdata/targets/semantic_target.png b/data/testdata/targets/semantic_target.png new file mode 100644 index 
0000000000000000000000000000000000000000..1100d7764ceb200c413cd8b42a6fdd18692c0371
Binary files /dev/null and b/data/testdata/targets/semantic_target.png differ
diff --git a/data/testdata/targets/semantic_weights.png b/data/testdata/targets/semantic_weights.png
new file mode 100644
index 0000000000000000000000000000000000000000..29b970f31b0a4bb253209225d44cc618532ab261
Binary files /dev/null and b/data/testdata/targets/semantic_weights.png differ
diff --git a/data/testdata/targets/thing_id_class_target.npy b/data/testdata/targets/thing_id_class_target.npy
new file mode 100644
index 0000000000000000000000000000000000000000..6e50e7ebeab996fcd0194798e7efa1cb63a6e062
Binary files /dev/null and b/data/testdata/targets/thing_id_class_target.npy differ
diff --git a/data/testdata/targets/thing_id_mask_target.npy b/data/testdata/targets/thing_id_mask_target.npy
new file mode 100644
index 0000000000000000000000000000000000000000..28c058c8ef020aeb574e649cc8a99afb8c06d867
Binary files /dev/null and b/data/testdata/targets/thing_id_mask_target.npy differ
diff --git a/data/utils/__init__.py b/data/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03
--- /dev/null
+++ b/data/utils/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/data/utils/create_step_panoptic_maps.py b/data/utils/create_step_panoptic_maps.py
new file mode 100644
index 0000000000000000000000000000000000000000..77dd710a6861c858fd4b4ad3dc5f9eba5f912678
--- /dev/null
+++ b/data/utils/create_step_panoptic_maps.py
@@ -0,0 +1,305 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Creates STEP panoptic maps from semantic and instance maps.
+
+This script implements the process of merging semantic maps (from our extra
+annotations [1]) and instance maps (collected from MOTS [2]) to obtain the
+STEP panoptic map.
+
+[1] Mark Weber, et al. STEP: Segmenting and Tracking Every Pixel,
+    arXiv:2102.11859
+[2] Paul Voigtlaender, et al. Multi-object tracking and segmentation. CVPR,
+    2019
+
+To run this script, you need to install opencv-python (>=4.4.0).
+E.g., in Linux, run
+$ pip install opencv-python
+
+The input directory structure should be as follows:
+
++ INPUT_SEMANTIC_MAP_ROOT_DIR
+  + train
+    + sequence_id
+      - *.png
+    ...
+  + val
+
++ INPUT_INSTANCE_MAP_ROOT_DIR
+  + train
+    + sequence_id
+      - *.png
+    ...
+  + val
+
++ OUTPUT_PANOPTIC_MAP_ROOT_DIR (generated)
+  + train
+    + sequence_id
+      - *.png
+    ...
+  + val
+
+The ground-truth panoptic map is generated and encoded in PNG format as
+follows:
+  R: semantic_id
+  G: instance_id // 256
+  B: instance_id % 256
+
+The generated panoptic maps will be used by ../build_step_data.py to create
+tfrecords for training and evaluation.
+
+Example to run the script:
+
+```bash
+  python deeplab2/data/utils/create_step_panoptic_maps.py \
+    --input_semantic_map_root_dir=...
+    ...
+```
+"""
+
+import os
+from typing import Any, Sequence, Union
+
+from absl import app
+from absl import flags
+from absl import logging
+import cv2
+import numpy as np
+from PIL import Image
+import tensorflow as tf
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string('input_semantic_map_root_dir', None,
+                    'Path to a directory containing the semantic map.')
+flags.DEFINE_string('input_instance_root_dir', None,
+                    'Path to a directory containing the instance map.')
+flags.DEFINE_string('output_panoptic_map_root_dir', None,
+                    'Path to a directory where we write the panoptic map.')
+flags.DEFINE_integer(
+    'kernel_size', 15, 'Kernel size to extend instance object boundary when '
+    'merging it with semantic map.')
+flags.DEFINE_enum('dataset_name', 'kitti-step',
+                  ['kitti-step', 'motchallenge-step'], 'Name of the dataset.')
+
+# The label definition below follows the Cityscapes label definition in
+# https://www.cityscapes-dataset.com/.
+MOTCHALLENGE_MERGED_CLASSES = (0, 3, 4, 5, 6, 7, 9, 13, 14, 15, 16, 17)
+NUM_VALID_CLASSES = 19
+SEMANTIC_CAR = 13
+SEMANTIC_PERSON = 11
+SEMANTIC_VOID = 255
+INSTANCE_CAR = 1
+INSTANCE_PERSON = 2
+INSTANCE_LABEL_DIVISOR = 1000
+
+
+def encode_panoptic_map(panoptic_map: np.ndarray) -> np.ndarray:
+  """Encodes the panoptic map in three-channel image format."""
+  # Encoding format: R: semantic | G: instance // 256 | B: instance % 256.
+  semantic_id = panoptic_map // INSTANCE_LABEL_DIVISOR
+  instance_id = panoptic_map % INSTANCE_LABEL_DIVISOR
+  return np.dstack(
+      (semantic_id, instance_id // 256, instance_id % 256)).astype(np.uint8)
+
+
+def load_image(image_path: str) -> np.ndarray:
+  """Loads an image as a numpy array."""
+  with tf.io.gfile.GFile(image_path, 'rb') as f:
+    return np.array(Image.open(f))
+
+
+def _update_motchallege_label_map(semantic_map: np.ndarray) -> np.ndarray:
+  """Updates the semantic map by merging some classes."""
+  # For the MOTChallenge dataset, we merge some classes since they are less
+  # representative:
+  #----------------------------------------------------------------
+  # Original index | Updated index | Note
+  #----------------+---------------+------------------------------
+  #       0        |       1       | map road to sidewalk
+  #       1        |       1       | keep sidewalk
+  #       2        |       2       | keep building
+  #       3        |      255      | not present anyway
+  #       4        |      255      | remove fence
+  #       5        |      255      | remove pole
+  #       6        |      255      | remove traffic light
+  #       7        |      255      | not present anyway
+  #       8        |       8       | keep vegetation
+  #       9        |       8       | map terrain to vegetation
+  #      10        |      10       | keep sky
+  #      11        |      11       | keep pedestrian
+  #      12        |      12       | keep rider
+  #      13        |      255      | remove car
+  #      14        |      255      | not present anyway
+  #      15        |      255      | not present anyway
+  #      16        |      255      | not present anyway
+  #      17        |      255      | remove motorcycle
+  #      18        |      18       | keep bicycle
+  #      255       |      255      | keep void
+  #----------------------------------------------------------------
+  for label in MOTCHALLENGE_MERGED_CLASSES:
+    if label == 0:
+      semantic_map[semantic_map == label] = 1
+    elif label == 9:
+      semantic_map[semantic_map == label] = 8
+    else:
+      semantic_map[semantic_map == label] = 255
+  return semantic_map
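+
+
+# As a quick illustration of the merging above (hypothetical 1x4 input):
+# _update_motchallege_label_map(np.array([0, 9, 5, 11])) returns
+# [1, 8, 255, 11], i.e., road -> sidewalk, terrain -> vegetation,
+# pole -> void, and pedestrian is kept.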
+
+
+def _compute_panoptic_id(semantic_id: Union[int, np.ndarray],
+                         instance_id: Union[int, np.ndarray]) -> Any:
+  """Gets the panoptic id by combining semantic and instance ids."""
+  return semantic_id * INSTANCE_LABEL_DIVISOR + instance_id
+
+
+def _remap_motchallege_semantic_indices(panoptic_id: np.ndarray) -> np.ndarray:
+  """Updates the MOTChallenge semantic map by re-mapping label indices."""
+  semantic_id = panoptic_id // INSTANCE_LABEL_DIVISOR
+  instance_id = panoptic_id % INSTANCE_LABEL_DIVISOR
+  # Re-mapping index:
+  #    1 ->   0: sidewalk
+  #    2 ->   1: building
+  #    8 ->   2: vegetation
+  #   10 ->   3: sky
+  #   11 ->   4: pedestrian
+  #   12 ->   5: rider
+  #   18 ->   6: bicycle
+  #  255 -> 255: void
+  all_labels = set(range(NUM_VALID_CLASSES))
+  for i, label in enumerate(
+      sorted(all_labels - set(MOTCHALLENGE_MERGED_CLASSES))):
+    semantic_id[semantic_id == label] = i
+  return _compute_panoptic_id(semantic_id, instance_id)
+
+
+def _get_semantic_maps(semantic_map_root: str, dataset_split: str,
+                       sequence_id: str) -> Sequence[str]:
+  """Gets files for the specified data type and dataset split."""
+  search_files = os.path.join(semantic_map_root, dataset_split, sequence_id,
+                              '*')
+  filenames = tf.io.gfile.glob(search_files)
+  return sorted(filenames)
+
+
+class StepPanopticMapGenerator(object):
+  """Generates and writes panoptic maps from semantic and instance maps."""
+
+  def __init__(self, kernel_size: int, dataset_name: str):
+    self.kernel_size = kernel_size
+    self.is_mots_challenge = (dataset_name == 'motchallenge-step')
+
+  def _update_semantic_label_map(self, instance_map: np.ndarray,
+                                 semantic_map: np.ndarray) -> np.ndarray:
+    """Updates the semantic map by leveraging the instance map."""
+    kernel = np.ones((self.kernel_size, self.kernel_size), np.uint8)
+    updated_semantic_map = semantic_map.astype(np.int32)
+    if self.is_mots_challenge:
+      updated_semantic_map = _update_motchallege_label_map(updated_semantic_map)
+    for label in (SEMANTIC_CAR, SEMANTIC_PERSON):
+      semantic_mask = (semantic_map == label)
+      if label == SEMANTIC_PERSON:
+        # The instance ids are encoded according to
+        # https://www.vision.rwth-aachen.de/page/mots
+        instance_mask = (
+            instance_map // INSTANCE_LABEL_DIVISOR == INSTANCE_PERSON)
+      elif label == SEMANTIC_CAR:
+        instance_mask = instance_map // INSTANCE_LABEL_DIVISOR == INSTANCE_CAR
+      # Run dilation on the instance map to merge it with the semantic map.
+      instance_mask = instance_mask.astype(np.uint8)
+      dilated_instance_mask = cv2.dilate(instance_mask, kernel)
+      void_boundary = np.logical_and(dilated_instance_mask - instance_mask,
+                                     semantic_mask)
+      updated_semantic_map[void_boundary] = SEMANTIC_VOID
+    return updated_semantic_map
+
+  def merge_panoptic_map(self, semantic_map: np.ndarray,
+                         instance_map: np.ndarray) -> np.ndarray:
+    """Merges semantic labels with the given instance map."""
+    # Use semantic_map as the base map.
+    updated_semantic_map = self._update_semantic_label_map(
+        instance_map, semantic_map)
+    panoptic_map = _compute_panoptic_id(updated_semantic_map, 0)
+    # Merge instances.
+    mask_car = instance_map // INSTANCE_LABEL_DIVISOR == INSTANCE_CAR
+    # The instance map's raw indices start from 0, but the panoptic map's
+    # instance indices start from 1.
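+    # For example, an instance_map value of 1003 (car, raw instance id 3)
+    # becomes panoptic id 13 * 1000 + 4 = 13004 below.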
+    instance_id = (instance_map[mask_car] % INSTANCE_LABEL_DIVISOR) + 1
+    panoptic_map[mask_car] = _compute_panoptic_id(SEMANTIC_CAR,
+                                                  instance_id.astype(np.int32))
+    mask_person = instance_map // INSTANCE_LABEL_DIVISOR == INSTANCE_PERSON
+    instance_id = (instance_map[mask_person] % INSTANCE_LABEL_DIVISOR) + 1
+    panoptic_map[mask_person] = _compute_panoptic_id(
+        SEMANTIC_PERSON, instance_id.astype(np.int32))
+
+    # Remap label indices.
+    if self.is_mots_challenge:
+      panoptic_map = _remap_motchallege_semantic_indices(panoptic_map)
+    return panoptic_map
+
+  def build_panoptic_maps(self, semantic_map_root: str, instance_map_root: str,
+                          dataset_split: str, sequence_id: str,
+                          panoptic_map_root: str):
+    """Creates panoptic maps and saves them in PNG format.
+
+    Args:
+      semantic_map_root: Semantic map root folder.
+      instance_map_root: Instance map root folder.
+      dataset_split: Train/Val/Test split of the data.
+      sequence_id: Sequence id of the data.
+      panoptic_map_root: Panoptic map root folder where the encoded panoptic
+        maps will be saved.
+    """
+    semantic_maps = _get_semantic_maps(semantic_map_root, dataset_split,
+                                       sequence_id)
+    for semantic_map_path in semantic_maps:
+      image_name = os.path.basename(semantic_map_path)
+      instance_map_path = os.path.join(instance_map_root, dataset_split,
+                                       sequence_id, image_name)
+      if not tf.io.gfile.exists(instance_map_path):
+        logging.warning('Could not find instance map for %s',
+                        semantic_map_path)
+        continue
+      semantic_map = load_image(semantic_map_path)
+      instance_map = load_image(instance_map_path)
+      panoptic_map = self.merge_panoptic_map(semantic_map, instance_map)
+      encoded_panoptic_map = Image.fromarray(
+          encode_panoptic_map(panoptic_map)).convert('RGB')
+      panoptic_map_path = os.path.join(panoptic_map_root, dataset_split,
+                                       sequence_id, image_name)
+      with tf.io.gfile.GFile(panoptic_map_path, 'wb') as f:
+        encoded_panoptic_map.save(f, format='PNG')
+
+
+def main(argv: Sequence[str]) -> None:
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+
+  panoptic_map_generator = StepPanopticMapGenerator(FLAGS.kernel_size,
+                                                    FLAGS.dataset_name)
+  for dataset_split in ('train', 'val', 'test'):
+    sem_dir = os.path.join(FLAGS.input_semantic_map_root_dir, dataset_split)
+    if not tf.io.gfile.exists(sem_dir):
+      logging.info('Split %s not found.', dataset_split)
+      continue
+    for set_dir in tf.io.gfile.listdir(sem_dir):
+      tf.io.gfile.makedirs(
+          os.path.join(FLAGS.output_panoptic_map_root_dir, dataset_split,
+                       set_dir))
+      logging.info('Starting to create panoptic maps for split %s, '
+                   'sequence %s.', dataset_split, set_dir)
+      panoptic_map_generator.build_panoptic_maps(
+          FLAGS.input_semantic_map_root_dir, FLAGS.input_instance_root_dir,
+          dataset_split, set_dir, FLAGS.output_panoptic_map_root_dir)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/dataset.proto b/dataset.proto
new file mode 100644
index 0000000000000000000000000000000000000000..b33263aba4e3c5d06f7699e75c41ed8b52d263c3
--- /dev/null
+++ b/dataset.proto
@@ -0,0 +1,88 @@
+// Copyright 2021 The Deeplab2 Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package deeplab2;
+
+// Configure the dataset options.
+message DatasetOptions {
+  // Set the dataset. See dataset.py for supported datasets.
+  optional string dataset = 1;
+  // Set the dataset file pattern to be used with glob.
+  repeated string file_pattern = 2;
+  // Set the number of samples per batch. This must be a multiple of replicas.
+  // E.g., batch_size = 8 on 4 GPUs equals a batch size of 2 on each GPU.
+  optional int32 batch_size = 3 [default = 32];
+  // Set the crop size as a list of [crop_height, crop_width].
+  repeated int32 crop_size = 4;
+  // Minimum value for resize. Can be 1) empty; or 2) an integer, indicating
+  // the desired size of the shorter image side (either height or width); or
+  // 3) a 2-tuple of (height, width), indicating the desired minimum value for
+  // height and width after resize. Setting values to non-positive indicates
+  // that no minimum value will be used.
+  repeated int32 min_resize_value = 5;
+  // Maximum value for resize. Can be 1) empty; or 2) an integer, indicating
+  // the maximum allowed size of the longer image side (either height or
+  // width); or 3) a 2-tuple of (height, width), indicating the maximum allowed
+  // size after resize. Setting values to non-positive indicates that no
+  // maximum value will be used.
+  repeated int32 max_resize_value = 6;
+  // Set the resizing factor.
+  optional int32 resize_factor = 7;
+
+  /* Augmentation options. */
+  message AugmentationOptions {
+    // Set the minimum scale factor for augmentation. Default not to use.
+    optional float min_scale_factor = 1 [default = 1.0];
+    // Set the maximum scale factor for augmentation. Default not to use.
+    optional float max_scale_factor = 2 [default = 1.0];
+    // Set the scale factor step size for data augmentation.
+    optional float scale_factor_step_size = 3 [default = 0.25];
+    // The name of the AutoAugment policy to use.
+    optional string autoaugment_policy_name = 4;
+  }
+  optional AugmentationOptions augmentations = 8;
+  // Set the standard deviation used to generate the Gaussian center
+  // ground-truth.
+  optional float sigma = 9 [default = 8.0];
+  // Set whether to use increased weights on small instances.
+  optional bool increase_small_instance_weights = 10 [default = false];
+  // Set the pixel threshold for small instances.
+  optional int32 small_instance_threshold = 11 [default = 4096];
+  // Set the small instance weight.
+  optional float small_instance_weight = 12 [default = 3.0];
+  // Set whether to use two frames together (current frame + previous frame) as
+  // input for video panoptic segmentation.
+  optional bool use_two_frames = 13 [default = false];
+  // Whether to decode the groundtruth label. Some dataset splits (e.g., the
+  // test set) may not contain any groundtruth label. In that case, set this
+  // field to false to avoid decoding a non-existing groundtruth label.
+  optional bool decode_groundtruth_label = 14 [default = true];
+  // Whether the model needs thing_id_mask annotations. When True, we will
+  // additionally return a mask annotation for each `thing` instance, encoded
+  // with a unique thing_id. This ground-truth annotation could be used to
+  // learn a better segmentation mask for each instance. `thing_id` assigns a
+  // unique ID to each thing instance in an image, counting from 0
+  // (default: False).
+  optional bool thing_id_mask_annotations = 15 [default = false];
+  // Set the maximum number of possible thing instances per image. It is used
+  // together with thing_id_mask_annotations (= True), and gives the maximum
+  // thing ID that can be encoded in the thing_id_mask.
+  optional int32 max_thing_id = 16 [default = 128];
+  // Set whether to use the next frame together with the current frame for
+  // video panoptic segmentation (VPS). Like `use_two_frames`, this field
+  // results in two-frame input for VPS. Note that `use_two_frames` is adopted
+  // in Motion-DeepLab, while `use_next_frame` is used in ViP-DeepLab.
+  optional bool use_next_frame = 17 [default = false];
+}
diff --git a/evaluation/__init__.py b/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03
--- /dev/null
+++ b/evaluation/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/evaluation/coco_instance_ap.py b/evaluation/coco_instance_ap.py
new file mode 100644
index 0000000000000000000000000000000000000000..c97d8c02c2e2c683d4df9f47c5510de8bee7347c
--- /dev/null
+++ b/evaluation/coco_instance_ap.py
@@ -0,0 +1,337 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""COCO-style instance segmentation evaluation metrics.
+
+Implements a Keras interface to the COCO API.
+COCO API: github.com/cocodataset/cocoapi/
+"""
+from typing import Any, Collection, Mapping, Optional
+
+from absl import logging
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+import tensorflow as tf
+
+from deeplab2.utils import coco_tools
+from deeplab2.utils import panoptic_instances
+
+
+def _unwrap_segmentation(seg):
+  return {
+      'size': list(seg['size']),
+      'counts': seg['counts'],
+  }
+
+
+_ANNOTATION_CONVERSION = {
+    'bbox': list,
+    'segmentation': _unwrap_segmentation,
+}
+
+
+def _unwrap_annotation(ann: Mapping[str, Any]) -> Mapping[str, Any]:
+  """Unwraps the objects in a COCO-style annotation dictionary.
+
+  Logic within the Keras metric class wraps the objects within the ground-truth
+  and detection annotations in ListWrapper and DictWrapper classes. On the
+  other hand, the COCO API does strict type checking as part of determining
+  which branch to use in comparing detections and segmentations. We therefore
+  have to coerce the types from the wrapper to the built-in types that COCO is
+  expecting.
+
+  Args:
+    ann: A COCO-style annotation dictionary that may contain ListWrapper and
+      DictWrapper objects.
+
+  Returns:
+    The same annotation information, but with wrappers reduced to built-in
+    types.
+  """
+  unwrapped_ann = {}
+  for k in ann:
+    if k in _ANNOTATION_CONVERSION:
+      unwrapped_ann[k] = _ANNOTATION_CONVERSION[k](ann[k])
+    else:
+      unwrapped_ann[k] = ann[k]
+  return unwrapped_ann
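+
+
+# A minimal usage sketch of the metric below (tensor contents hypothetical;
+# see update_state for the expected shapes and dtypes):
+#   metric = InstanceAveragePrecision()
+#   metric.update_state(gt_boxes, gt_classes, gt_masks, gt_is_crowd,
+#                       det_masks, det_scores, det_classes)
+#   ap = metric.result()  # 12-element vector; ap[0] is AP@[IoU=0.50:0.95].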
+
+
+class InstanceAveragePrecision(tf.keras.metrics.Metric):
+  """COCO evaluation metric class."""
+
+  def __init__(self, name: str = 'instance_ap', **kwargs):
+    """Constructs COCO evaluation class."""
+    super(InstanceAveragePrecision, self).__init__(name=name, **kwargs)
+    self.reset_states()
+
+  def reset_states(self) -> None:
+    """Resets the COCO API object."""
+    self.detections = []
+    self.dataset = {
+        'images': [],
+        'annotations': [],
+        'categories': []
+    }
+    self.image_id = 1
+    self.next_groundtruth_annotation_id = 1
+    self.category_ids = set()
+    self.metric_values = None
+
+  def evaluate(self) -> np.ndarray:
+    """Evaluates with detections from all images with the COCO API.
+
+    Returns:
+      coco_metric: float numpy array with shape [12] representing the
+        COCO-style evaluation metrics.
+    """
+    self.dataset['categories'] = [{
+        'id': int(category_id)
+    } for category_id in self.category_ids]
+
+    # Creates "unwrapped" copies of COCO json-style objects.
+    dataset = {
+        'images': self.dataset['images'],
+        'categories': self.dataset['categories']
+    }
+    dataset['annotations'] = [
+        _unwrap_annotation(ann) for ann in self.dataset['annotations']
+    ]
+    detections = [_unwrap_annotation(ann) for ann in self.detections]
+
+    logging.info('Creating COCO objects for AP eval...')
+    coco_gt = COCO()
+    coco_gt.dataset = dataset
+    coco_gt.createIndex()
+
+    coco_dt = coco_gt.loadRes(detections)
+
+    logging.info('Running COCO evaluation...')
+    coco_eval = COCOeval(coco_gt, coco_dt, iouType='segm')
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+    coco_metrics = coco_eval.stats
+    return np.array(coco_metrics, dtype=np.float32)
+
+  def result(self) -> np.ndarray:
+    """Returns the instance segmentation metric values, computing them if needed.
+
+    Returns:
+      A float vector of 12 elements. The meaning of each element is (in order):
+
+       0. AP @[ IoU=0.50:0.95 | area=   all | maxDets=100 ]
+       1. AP @[ IoU=0.50      | area=   all | maxDets=100 ]
+       2. AP @[ IoU=0.75      | area=   all | maxDets=100 ]
+       3. AP @[ IoU=0.50:0.95 | area= small | maxDets=100 ]
+       4. AP @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]
+       5. AP @[ IoU=0.50:0.95 | area= large | maxDets=100 ]
+       6. AR @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ]
+       7. AR @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ]
+       8. AR @[ IoU=0.50:0.95 | area=   all | maxDets=100 ]
+       9. AR @[ IoU=0.50:0.95 | area= small | maxDets=100 ]
+      10. AR @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]
+      11. AR @[ IoU=0.50:0.95 | area= large | maxDets=100 ]
+
+      Where: AP = Average Precision
+             AR = Average Recall
+             IoU = Intersection over Union. IoU=0.50:0.95 is the average of the
+               metric over thresholds of 0.5 to 0.95 with increments of 0.05.
+
+      The area thresholds mean that, for those entries, ground truth
+      annotations with areas outside the range are ignored:
+        small:  [0**2, 32**2],
+        medium: [32**2, 96**2],
+        large:  [96**2, 1e5**2].
+    """
+    if not self.metric_values:
+      self.metric_values = self.evaluate()
+    return self.metric_values
+
+  def update_state(self, groundtruth_boxes: tf.Tensor,
+                   groundtruth_classes: tf.Tensor, groundtruth_masks: tf.Tensor,
+                   groundtruth_is_crowd: tf.Tensor, detection_masks: tf.Tensor,
+                   detection_scores: tf.Tensor,
+                   detection_classes: tf.Tensor) -> None:
+    """Updates detection results and groundtruth data.
+
+    Appends detection results to self.detections so that results are
+    aggregated over the whole validation set. The groundtruth data is parsed
+    and added into a dictionary with the same format as the COCO dataset,
+    which can be used for evaluation.
+
+    Args:
+      groundtruth_boxes: tensor (float32) with shape [num_gt_annos, 4]
+      groundtruth_classes: tensor (int) with shape [num_gt_annos]
+      groundtruth_masks: tensor (uint8) with shape [num_gt_annos, image_height,
+        image_width]
+      groundtruth_is_crowd: tensor (bool) with shape [num_gt_annos]
+      detection_masks: tensor (uint8) with shape [num_detections, image_height,
+        image_width]
+      detection_scores: tensor (float32) with shape [num_detections]
+      detection_classes: tensor (int) with shape [num_detections]
+    """
+    # Reset the caching of result values.
+    self.metric_values = None
+
+    # Update known category ids.
+    self.category_ids.update(groundtruth_classes.numpy())
+    self.category_ids.update(detection_classes.numpy())
+
+    # Add ground-truth annotations.
+    groundtruth_annotations = coco_tools.ExportSingleImageGroundtruthToCoco(
+        self.image_id,
+        self.next_groundtruth_annotation_id,
+        self.category_ids,
+        groundtruth_boxes.numpy(),
+        groundtruth_classes.numpy(),
+        groundtruth_masks=groundtruth_masks.numpy(),
+        groundtruth_is_crowd=groundtruth_is_crowd.numpy())
+    self.next_groundtruth_annotation_id += len(groundtruth_annotations)
+
+    # Add to the set of images for which there are ground truth & detections.
+    # Infers image size from groundtruth masks.
+    _, height, width = groundtruth_masks.shape
+    self.dataset['images'].append({
+        'id': self.image_id,
+        'height': height,
+        'width': width,
+    })
+    self.dataset['annotations'].extend(groundtruth_annotations)
+
+    # Add predictions/detections.
+    detection_annotations = coco_tools.ExportSingleImageDetectionMasksToCoco(
+        self.image_id, self.category_ids, detection_masks.numpy(),
+        detection_scores.numpy(), detection_classes.numpy())
+    self.detections.extend(detection_annotations)
+
+    self.image_id += 1
+
+
+def _instance_masks(panoptic_label_map: tf.Tensor,
+                    instance_panoptic_labels: tf.Tensor) -> tf.Tensor:
+  """Constructs an array of masks for each instance in a panoptic label map.
+
+  Args:
+    panoptic_label_map: An integer tensor of shape `[image_height, image_width]`
+      specifying the panoptic label at each pixel.
+    instance_panoptic_labels: An integer tensor of shape `[num_instances]` that
+      gives the label for each unique instance for which to compute masks.
+
+  Returns:
+    A boolean tensor of shape `[num_instances, image_height, image_width]` where
+    each slice in the first dimension gives the mask for a single instance over
+    the entire image.
+  """
+  return tf.math.equal(
+      tf.expand_dims(panoptic_label_map, 0),
+      tf.reshape(instance_panoptic_labels,
+                 [tf.size(instance_panoptic_labels), 1, 1]))
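+
+
+# For example (hypothetical 2x2 map), _instance_masks([[5, 5], [7, 0]], [5, 7])
+# broadcasts to shape [2, 2, 2]:
+#   [[[True, True], [False, False]],   # mask for label 5
+#    [[False, False], [True, False]]]  # mask for label 7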
+  This class ignores the "stuff" classes to report metrics on only the "thing"
+  classes that have discrete instances. It computes a series of AP-based
+  metrics using the COCO evaluation scripts.
+  """
+
+  def __init__(self,
+               num_classes: int,
+               things_list: Collection[int],
+               label_divisor: int,
+               ignored_label: int,
+               name: str = 'panoptic_instance_ap',
+               **kwargs):
+    """Constructs panoptic instance segmentation evaluation class."""
+    super(PanopticInstanceAveragePrecision, self).__init__(name=name, **kwargs)
+    self.num_classes = num_classes
+    self.stuff_list = set(range(num_classes)).difference(things_list)
+    self.label_divisor = label_divisor
+    self.ignored_label = ignored_label
+    self.detection_metric = InstanceAveragePrecision()
+    self.reset_states()
+
+  def reset_states(self) -> None:
+    self.detection_metric.reset_states()
+
+  def result(self) -> np.ndarray:
+    return self.detection_metric.result()
+
+  def update_state(self,
+                   groundtruth_panoptic: tf.Tensor,
+                   predicted_panoptic: tf.Tensor,
+                   semantic_probability: tf.Tensor,
+                   instance_score_map: tf.Tensor,
+                   is_crowd_map: Optional[tf.Tensor] = None) -> None:
+    """Adds the results from a new image to be computed by the metric.
+
+    Args:
+      groundtruth_panoptic: A 2D integer tensor, with the true panoptic label
+        at each pixel.
+      predicted_panoptic: A 2D integer tensor with the predicted panoptic
+        labels to be evaluated.
+      semantic_probability: A float tensor of shape `[image_height,
+        image_width, num_classes]`. Specifies at each pixel the estimated
+        probability distribution over semantic classes for that pixel.
+      instance_score_map: A 2D float tensor, where each pixel of an instance
+        carries the predicted probability that this segment is a valid
+        instance.
+      is_crowd_map: A 2D boolean tensor. Where it is True, the instance in
+        that region is a "crowd" instance. It is assumed that all pixels in an
+        instance will have the same value in this map. If set to None (the
+        default), it will be assumed that none of the ground truth instances
+        are crowds.
+    """
+    classes_to_ignore = tf.convert_to_tensor([self.ignored_label] +
+                                             list(self.stuff_list), tf.int32)
+    (gt_unique_labels,
+     gt_box_coords) = panoptic_instances.instance_boxes_from_masks(
+         groundtruth_panoptic, classes_to_ignore, self.label_divisor)
+    gt_classes = tf.math.floordiv(gt_unique_labels, self.label_divisor)
+
+    gt_masks = _instance_masks(groundtruth_panoptic, gt_unique_labels)
+
+    if is_crowd_map is None:
+      gt_is_crowd = tf.zeros(tf.shape(gt_classes), tf.bool)
+    else:
+      gt_is_crowd = panoptic_instances.per_instance_is_crowd(
+          is_crowd_map, groundtruth_panoptic, gt_unique_labels)
+
+    (pred_unique_labels,
+     pred_scores) = panoptic_instances.combined_instance_scores(
+         predicted_panoptic, semantic_probability, instance_score_map,
+         self.label_divisor, self.ignored_label)
+
+    # Filter out stuff and ignored label.
+    pred_classes = tf.math.floordiv(pred_unique_labels, self.label_divisor)
+    pred_class_is_ignored = tf.math.reduce_any(
+        tf.math.equal(
+            tf.expand_dims(pred_classes, 1),
+            tf.expand_dims(classes_to_ignore, 0)),
+        axis=1)
+    pred_class_is_kept = tf.math.logical_not(pred_class_is_ignored)
+    pred_unique_labels = tf.boolean_mask(pred_unique_labels,
+                                         pred_class_is_kept)
+    pred_scores = tf.boolean_mask(pred_scores, pred_class_is_kept)
+
+    # Recompute class labels after the filtering.
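+    # Note: pred_classes must be re-derived from the filtered labels; the
+    # pre-filter pred_classes tensor no longer lines up element-wise with
+    # pred_unique_labels after the boolean_mask above.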
+ pred_classes = tf.math.floordiv(pred_unique_labels, self.label_divisor) + pred_masks = _instance_masks(predicted_panoptic, pred_unique_labels) + + self.detection_metric.update_state(gt_box_coords, gt_classes, gt_masks, + gt_is_crowd, pred_masks, pred_scores, + pred_classes) diff --git a/evaluation/coco_instance_ap_test.py b/evaluation/coco_instance_ap_test.py new file mode 100644 index 0000000000000000000000000000000000000000..efc11d829e46e8888bde650ab44025c7ac98fda3 --- /dev/null +++ b/evaluation/coco_instance_ap_test.py @@ -0,0 +1,316 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for the COCO Instance AP metric.""" + +from absl import logging +import numpy as np +import tensorflow as tf + +from deeplab2.evaluation import coco_instance_ap +from deeplab2.evaluation import test_utils + +# See the definition of the color names at: +# https://en.wikipedia.org/wiki/Web_colors. +_CLASS_COLOR_MAP = { + (0, 0, 0): 0, + (0, 0, 255): 1, # Person (blue). + (255, 0, 0): 2, # Bear (red). + (0, 255, 0): 3, # Tree (lime). + (255, 0, 255): 4, # Bird (fuchsia). + (0, 255, 255): 5, # Sky (aqua). + (255, 255, 0): 6, # Cat (yellow). +} + + +def combine_maps(semantic_map, instance_map, label_divisor): + combined_map = instance_map + semantic_map * label_divisor + return tf.cast(combined_map, tf.int32) + + +class CocoInstanceApTest(tf.test.TestCase): + + def test_evaluates_single_image(self): + groundtruth_boxes = [ + [0.25, 0.4, 0.75, 1.0], + ] + groundtruth_classes = [8] + groundtruth_masks = [[ + [0, 0, 0, 0, 0], + [0, 0, 1, 1, 0], + [0, 0, 1, 1, 1], + [0, 0, 0, 0, 0], + ]] + groundtruth_is_crowd = [False] + + detection_masks = [[ + [0, 0, 0, 0, 0], + [0, 0, 1, 1, 0], + [0, 0, 1, 1, 0], + [0, 0, 0, 0, 0], + ]] + detection_scores = [0.8] + detection_classes = [8] + + groundtruth_boxes = tf.constant(groundtruth_boxes, dtype=tf.float32) + groundtruth_classes = tf.constant(groundtruth_classes, dtype=tf.int32) + groundtruth_masks = tf.constant(groundtruth_masks, dtype=tf.uint8) + groundtruth_is_crowd = tf.constant(groundtruth_is_crowd, dtype=tf.bool) + + detection_masks = tf.constant(detection_masks, dtype=tf.uint8) + detection_scores = tf.constant(detection_scores, dtype=tf.float32) + detection_classes = tf.constant(detection_classes, dtype=tf.int32) + + metric_obj = coco_instance_ap.InstanceAveragePrecision() + metric_obj.update_state(groundtruth_boxes, groundtruth_classes, + groundtruth_masks, groundtruth_is_crowd, + detection_masks, detection_scores, + detection_classes) + result = metric_obj.result().numpy() + + # The IoU for the foreground match is 0.8. So it is a TP for 7/10 of the IoU + # thresholds. 
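+    # The ten thresholds are 0.50, 0.55, ..., 0.95, so AP/AR over the sweep
+    # is 7/10 = 0.7, while AP@0.50 and AP@0.75 are 1. Entries for area
+    # ranges without groundtruth report -1.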
+ expected_result = [0.7, 1, 1, 0.7, -1, -1, 0.7, 0.7, 0.7, 0.7, -1, -1] + np.testing.assert_array_almost_equal(result, expected_result) + + +class PanopticInstanceApTest(tf.test.TestCase): + + def test_evaluates_single_image(self): + num_classes = 3 + things_list = [1, 2] + label_divisor = 256 + ignore_label = 0 + instance_class_map = { + 0: 0, + 47: 1, + 97: 1, + 133: 1, + 150: 1, + 174: 1, + 198: 2, + 215: 1, + 244: 1, + 255: 1, + } + gt_instances, gt_classes = test_utils.panoptic_segmentation_with_class_map( + 'team_gt_instance.png', instance_class_map) + gt_panoptic = combine_maps(gt_classes, gt_instances, label_divisor) + + pred_classes = test_utils.read_segmentation_with_rgb_color_map( + 'team_pred_class.png', _CLASS_COLOR_MAP) + pred_instances = test_utils.read_test_image( + 'team_pred_instance.png', image_format='L') + + pred_panoptic = combine_maps(pred_classes, pred_instances, label_divisor) + semantic_probability = tf.ones( + tf.concat([tf.shape(pred_panoptic), [num_classes]], 0)) + instance_score_map = tf.ones(tf.shape(pred_panoptic)) + + metric_obj = coco_instance_ap.PanopticInstanceAveragePrecision( + num_classes, things_list, label_divisor, ignore_label) + metric_obj.update_state(gt_panoptic, pred_panoptic, semantic_probability, + instance_score_map) + + result = metric_obj.result().numpy() + logging.info('result = %s', result) + + expected_result = [ + 0.2549, 0.9356, 0.1215, -1.0, 0.2399, 0.501, 0.0812, 0.2688, 0.2688, + -1.0, 0.2583, 0.5 + ] + np.testing.assert_almost_equal(result, expected_result, decimal=4) + + def test_evaluates_with_scores(self): + num_classes = 3 + things_list = list(range(num_classes)) + label_divisor = 256 + ignore_label = 0 + gt_classes = tf.constant([ + [1, 1, 2, 2], + [1, 1, 2, 2], + [0, 0, 2, 2], + [0, 0, 2, 2], + ], tf.int32) + pred_classes = tf.constant([ + [1, 1, 1, 1], + [1, 1, 1, 1], + [0, 0, 2, 2], + [0, 0, 2, 2], + ], tf.int32) + instances = tf.constant([ + [1, 1, 2, 2], + [1, 1, 2, 2], + [0, 0, 3, 3], + [0, 0, 3, 3], + ], tf.int32) + + gt_panoptic = combine_maps(gt_classes, instances, label_divisor) + pred_panoptic = combine_maps(pred_classes, instances, label_divisor) + + semantic_probability = tf.constant([ + [ + [0, 0, 0, 0], + [0, 0, 0, 0], + [1, 1, 0, 0], + [1, 1, 0, 0], + ], + [ + [1, 1, 1, 1], + [1, 1, 1, 1], + [0, 0, 0, 0], + [0, 0, 0, 0], + ], + [ + [0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 1, 1], + [0, 0, 1, 1], + ], + ], tf.float32) + semantic_probability = tf.transpose(semantic_probability, [1, 2, 0]) + + # This score map gives higher score to the incorrect instance. + bad_instance_scores = tf.constant([ + [0.4, 0.4, 0.9, 0.9], + [0.4, 0.4, 0.9, 0.9], + [0.0, 0.0, 0.8, 0.8], + [0.0, 0.0, 0.8, 0.8], + ], tf.float32) + metric_obj = coco_instance_ap.PanopticInstanceAveragePrecision( + num_classes, things_list, label_divisor, ignore_label) + metric_obj.update_state(gt_panoptic, pred_panoptic, semantic_probability, + bad_instance_scores) + + bad_result = metric_obj.result().numpy() + logging.info('bad_result = %s', bad_result) + expected_bad_result = [ + 0.5025, 0.5025, 0.5025, 0.5025, -1., -1., 0.25, 0.75, 0.75, 0.75, -1., + -1. + ] + np.testing.assert_almost_equal(bad_result, expected_bad_result, decimal=4) + + # This score map gives lower score to the incorrect instance. 
+ good_instance_scores = tf.constant([ + [0.9, 0.9, 0.4, 0.4], + [0.9, 0.9, 0.4, 0.4], + [0.0, 0.0, 0.8, 0.8], + [0.0, 0.0, 0.8, 0.8], + ], tf.float32) + metric_obj.reset_states() + metric_obj.update_state(gt_panoptic, pred_panoptic, semantic_probability, + good_instance_scores) + + good_result = metric_obj.result().numpy() + logging.info('good_result = %s', good_result) + + # Since the correct instance(s) have higher score, the "good" scores should + # give a result with higher AP. + expected_good_result = [ + 0.75248, 0.75248, 0.75248, 0.75248, -1, -1, 0.75, 0.75, 0.75, 0.75, -1, + -1 + ] + np.testing.assert_almost_equal(good_result, expected_good_result, decimal=4) + + def test_ignores_crowds(self): + num_classes = 3 + things_list = list(range(num_classes)) + label_divisor = 256 + ignore_label = 0 + gt_classes = tf.constant([ + [1, 1, 2, 2], + [1, 1, 2, 2], + [0, 0, 2, 2], + [0, 0, 2, 2], + ], tf.int32) + pred_classes = tf.constant([ + [1, 1, 1, 1], + [1, 1, 1, 1], + [0, 0, 2, 2], + [0, 0, 2, 2], + ], tf.int32) + instances = tf.constant([ + [1, 1, 2, 2], + [1, 1, 2, 2], + [0, 0, 3, 3], + [0, 0, 3, 3], + ], tf.int32) + is_crowd_map = tf.math.equal(instances, 2) + + gt_panoptic = combine_maps(gt_classes, instances, label_divisor) + pred_panoptic = combine_maps(pred_classes, instances, label_divisor) + + semantic_probability = tf.ones( + tf.concat([tf.shape(pred_panoptic), [num_classes]], 0)) + instance_score_map = tf.ones(tf.shape(pred_panoptic)) + + metric_obj = coco_instance_ap.PanopticInstanceAveragePrecision( + num_classes, things_list, label_divisor, ignore_label) + metric_obj.update_state(gt_panoptic, pred_panoptic, semantic_probability, + instance_score_map, is_crowd_map) + + result = metric_obj.result().numpy() + logging.info('result = %s', result) + # Expect perfect results (for the quantities that have an AP value), because + # the only mistake is a "crowd" instance. + expected_result = [1., 1., 1., 1., -1., -1., 1., 1., 1., 1., -1., -1.] + np.testing.assert_almost_equal(result, expected_result, decimal=4) + + def test_ignores_stuff(self): + num_classes = 4 + things_list = [3] + label_divisor = 256 + ignore_label = 0 + gt_classes = tf.constant([ + [3, 3, 2, 2], + [3, 3, 2, 2], + [0, 0, 2, 2], + [0, 0, 2, 2], + ], tf.int32) + pred_classes = tf.constant([ + [3, 3, 1, 1], + [3, 3, 1, 1], + [0, 0, 2, 2], + [0, 0, 2, 2], + ], tf.int32) + instances = tf.constant([ + [1, 1, 2, 2], + [1, 1, 2, 2], + [0, 0, 3, 3], + [0, 0, 3, 3], + ], tf.int32) + + gt_panoptic = combine_maps(gt_classes, instances, label_divisor) + pred_panoptic = combine_maps(pred_classes, instances, label_divisor) + + semantic_probability = tf.ones( + tf.concat([tf.shape(pred_panoptic), [num_classes]], 0)) + instance_score_map = tf.ones(tf.shape(pred_panoptic)) + + metric_obj = coco_instance_ap.PanopticInstanceAveragePrecision( + num_classes, things_list, label_divisor, ignore_label) + metric_obj.update_state(gt_panoptic, pred_panoptic, semantic_probability, + instance_score_map) + + result = metric_obj.result().numpy() + logging.info('result = %s', result) + # Expect perfect results (for the quantities that have an AP value), because + # the mistakes are all in "stuff" classes. + expected_result = [1., 1., 1., 1., -1., -1., 1., 1., 1., 1., -1., -1.] 
+    np.testing.assert_almost_equal(result, expected_result, decimal=4)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/evaluation/depth_aware_segmentation_and_tracking_quality.py b/evaluation/depth_aware_segmentation_and_tracking_quality.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bbbb637a48bc480d83dfb8cc0f70998a5729c64
--- /dev/null
+++ b/evaluation/depth_aware_segmentation_and_tracking_quality.py
@@ -0,0 +1,210 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+"""Implementation of Depth-aware Segmentation and Tracking Quality (DSTQ) metric."""
+
+import collections
+from typing import Sequence, List, Tuple
+import tensorflow as tf
+from deeplab2.evaluation import segmentation_and_tracking_quality as stq
+
+
+class DSTQuality(stq.STQuality):
+  """Metric class for Depth-aware Segmentation and Tracking Quality (DSTQ).
+
+  This metric computes STQ and the inlier depth metric (or depth quality (DQ))
+  under several thresholds. It then returns the geometric mean of the DQ
+  scores, AQ and IoU to get the final DSTQ, i.e.,
+
+  DSTQ@{threshold_1} = pow(STQ ** 2 * DQ@{threshold_1}, 1/3)
+  DSTQ@{threshold_2} = pow(STQ ** 2 * DQ@{threshold_2}, 1/3)
+  ...
+  DSTQ = pow(STQ ** 2 * DQ, 1/3)
+
+  where DQ = pow(prod_i^n(DQ@{threshold_i}), 1/n) for n depth thresholds.
+
+  The default choices for depth thresholds are 1.1 and 1.25, i.e.,
+  max(pred/gt, gt/pred) <= 1.1 and max(pred/gt, gt/pred) <= 1.25.
+  Commonly used thresholds for the inlier metrics are 1.25, 1.25**2, 1.25**3.
+  These thresholds are so loose that many methods achieve > 99%.
+  Therefore, we choose 1.25 and 1.1 to encourage high-precision predictions.
+
+  Example usage:
+
+  dstq_obj = depth_aware_segmentation_and_tracking_quality.DSTQuality(
+    num_classes, things_list, ignore_label, max_instances_per_category,
+    offset, depth_threshold)
+  dstq_obj.update_state(y_true_1, y_pred_1, d_true_1, d_pred_1)
+  dstq_obj.update_state(y_true_2, y_pred_2, d_true_2, d_pred_2)
+  ...
+  result = dstq_obj.result().numpy()
+  """
+
+  _depth_threshold: Tuple[float, float] = (1.25, 1.1)
+  _depth_total_counts: collections.OrderedDict
+  _depth_inlier_counts: List[collections.OrderedDict]
+
+  def __init__(self,
+               num_classes: int,
+               things_list: Sequence[int],
+               ignore_label: int,
+               max_instances_per_category: int,
+               offset: int,
+               depth_threshold: Tuple[float] = (1.25, 1.1),
+               name: str = 'dstq',):  # pytype: disable=annotation-type-mismatch
+    """Initialization of the DSTQ metric.
+
+    Args:
+      num_classes: Number of classes in the dataset as an integer.
+      things_list: A sequence of class ids that belong to `things`.
+      ignore_label: The class id to be ignored in evaluation as an integer or
+        integer tensor.
+      max_instances_per_category: The maximum number of instances for each
+        class as an integer or integer tensor.
+      offset: The maximum number of unique labels as an integer or integer
+        tensor.
+      depth_threshold: A sequence of depth thresholds for the depth quality.
+        (default: (1.25, 1.1))
+      name: An optional name. (default: 'dstq')
+    """
+    super().__init__(num_classes, things_list, ignore_label,
+                     max_instances_per_category, offset, name)
+    if not (isinstance(depth_threshold, tuple) or
+            isinstance(depth_threshold, list)):
+      raise TypeError('The type of depth_threshold must be tuple or list.')
+    if not depth_threshold:
+      raise ValueError('depth_threshold must be non-empty.')
+    self._depth_threshold = tuple(depth_threshold)
+    self._depth_total_counts = collections.OrderedDict()
+    self._depth_inlier_counts = []
+    for _ in range(len(self._depth_threshold)):
+      self._depth_inlier_counts.append(collections.OrderedDict())
+
+  def update_state(self,
+                   y_true: tf.Tensor,
+                   y_pred: tf.Tensor,
+                   d_true: tf.Tensor,
+                   d_pred: tf.Tensor,
+                   sequence_id: int = 0):
+    """Accumulates the depth-aware segmentation and tracking quality statistics.
+
+    Args:
+      y_true: The ground-truth panoptic label map for a particular video frame
+        (defined as semantic_map * max_instances_per_category + instance_map).
+      y_pred: The predicted panoptic label map for a particular video frame
+        (defined as semantic_map * max_instances_per_category + instance_map).
+      d_true: The ground-truth depth map for this video frame.
+      d_pred: The predicted depth map for this video frame.
+      sequence_id: The optional ID of the sequence the frames belong to. When
+        no sequence is given, all frames are considered to belong to the same
+        sequence (default: 0).
+    """
+    super().update_state(y_true, y_pred, sequence_id)
+    # Valid depth labels contain positive values.
+    d_valid_mask = d_true > 0
+    d_valid_total = tf.reduce_sum(tf.cast(d_valid_mask, tf.int32))
+    # Valid depth prediction is expected to contain positive values.
+    d_valid_mask = tf.logical_and(d_valid_mask, d_pred > 0)
+    d_valid_true = tf.boolean_mask(d_true, d_valid_mask)
+    d_valid_pred = tf.boolean_mask(d_pred, d_valid_mask)
+    inlier_error = tf.maximum(d_valid_pred / d_valid_true,
+                              d_valid_true / d_valid_pred)
+    # For each threshold, count the number of inliers.
+    for threshold_index, threshold in enumerate(self._depth_threshold):
+      num_inliers = tf.reduce_sum(tf.cast(inlier_error <= threshold, tf.int32))
+      inlier_counts = self._depth_inlier_counts[threshold_index]
+      inlier_counts[sequence_id] = (inlier_counts.get(sequence_id, 0) +
+                                    int(num_inliers.numpy()))
+    # Update the total counts of the depth labels.
+    self._depth_total_counts[sequence_id] = (
+        self._depth_total_counts.get(sequence_id, 0) +
+        int(d_valid_total.numpy()))
+
+  def result(self):
+    """Computes the depth-aware segmentation and tracking quality.
+
+    Returns:
+      A dictionary containing:
+        - 'STQ': The total STQ score.
+        - 'AQ': The total association quality (AQ) score.
+        - 'IoU': The total mean IoU.
+        - 'STQ_per_seq': A list of the STQ score per sequence.
+        - 'AQ_per_seq': A list of the AQ score per sequence.
+        - 'IoU_per_seq': A list of mean IoU per sequence.
+        - 'ID_per_seq': A list of sequence Ids to map list index to sequence.
+        - 'Length_per_seq': A list of the length of each sequence.
+        - 'DSTQ': The total DSTQ score.
+        - 'DSTQ@thres': The total DSTQ score for threshold thres.
+        - 'DSTQ_per_seq@thres': A list of DSTQ scores per sequence for thres.
+        - 'DQ': The total DQ score.
+        - 'DQ@thres': The total DQ score for threshold thres.
+        - 'DQ_per_seq@thres': A list of DQ scores per sequence for thres.
+    """
+    # Gather the results for STQ.
+    stq_results = super().result()
+    # Collect results for depth quality per sequence and threshold.
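+    # For a threshold t, the per-sequence depth quality is
+    # DQ@t(seq) = inlier_count(seq, t) / valid_label_count(seq); the overall
+    # DQ@t pools counts across sequences before dividing, so longer sequences
+    # carry proportionally more weight.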
+ dq_per_seq_at_threshold = {} + dq_at_threshold = {} + for threshold_index, threshold in enumerate(self._depth_threshold): + dq_per_seq_at_threshold[threshold] = [0] * len(self._ground_truth) + total_count = 0 + inlier_count = 0 + # Follow the order of computing STQ by enumerating _ground_truth. + for index, sequence_id in enumerate(self._ground_truth): + sequence_inlier = self._depth_inlier_counts[threshold_index][ + sequence_id] + sequence_total = self._depth_total_counts[sequence_id] + if sequence_total > 0: + dq_per_seq_at_threshold[threshold][ + index] = sequence_inlier / sequence_total + total_count += sequence_total + inlier_count += sequence_inlier + if total_count == 0: + dq_at_threshold[threshold] = 0 + else: + dq_at_threshold[threshold] = inlier_count / total_count + # Compute DQ as the geometric mean of DQ's at different thresholds. + dq = 1 + for _, threshold in enumerate(self._depth_threshold): + dq *= dq_at_threshold[threshold] + dq = dq ** (1 / len(self._depth_threshold)) + dq_results = {} + dq_results['DQ'] = dq + for _, threshold in enumerate(self._depth_threshold): + dq_results['DQ@{}'.format(threshold)] = dq_at_threshold[threshold] + dq_results['DQ_per_seq@{}'.format( + threshold)] = dq_per_seq_at_threshold[threshold] + # Combine STQ and DQ to get DSTQ. + dstq_results = {} + dstq_results['DSTQ'] = (stq_results['STQ'] ** 2 * dq) ** (1/3) + for _, threshold in enumerate(self._depth_threshold): + dstq_results['DSTQ@{}'.format(threshold)] = ( + stq_results['STQ'] ** 2 * dq_at_threshold[threshold]) ** (1/3) + dstq_results['DSTQ_per_seq@{}'.format(threshold)] = [ + (stq_result**2 * dq_result)**(1 / 3) for stq_result, dq_result in zip( + stq_results['STQ_per_seq'], dq_per_seq_at_threshold[threshold]) + ] + # Merge all the results. + dstq_results.update(stq_results) + dstq_results.update(dq_results) + return dstq_results + + def reset_states(self): + """Resets all states that accumulated data.""" + super().reset_states() + self._depth_total_counts = collections.OrderedDict() + self._depth_inlier_counts = [] + for _ in range(len(self._depth_threshold)): + self._depth_inlier_counts.append(collections.OrderedDict()) diff --git a/evaluation/depth_aware_segmentation_and_tracking_quality_test.py b/evaluation/depth_aware_segmentation_and_tracking_quality_test.py new file mode 100644 index 0000000000000000000000000000000000000000..222ea0bc62f46fd36c3044515682416d9424e5df --- /dev/null +++ b/evaluation/depth_aware_segmentation_and_tracking_quality_test.py @@ -0,0 +1,283 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for depth_aware_segmentation_and_tracking_quality.""" + +import numpy as np +import tensorflow as tf + +from deeplab2.evaluation import depth_aware_segmentation_and_tracking_quality as dstq + + +class DepthAwareSegmentationAndTrackingQualityTest(tf.test.TestCase): + + def test_complex_example(self): + n_classes = 3 + ignore_label = 255 + # classes = ['sky', 'vegetation', 'cars']. 
+ things_list = [2] + max_instances_per_category = 1000 + + ground_truth_semantic_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 2, 0, 1, 1, 1], + [0, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 1, 1]]) + ground_truth_semantic_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 2, 0, 0, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 1, 0], + [2, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1]]) + ground_truth_semantic_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [2, 0, 1, 1, 1, 0, 0, 0], + [2, 2, 1, 1, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 1, 0], + [2, 2, 2, 1, 1, 1, 1, 1], + [2, 2, 2, 1, 1, 1, 1, 1]]) + ground_truth_semantic = np.stack([ + ground_truth_semantic_1, ground_truth_semantic_2, + ground_truth_semantic_3 + ]) + + ground_truth_instance_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 2, 0, 0, 0, 0], + [0, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 2, 2, 0, 0], + [2, 2, 2, 2, 2, 2, 2, 0], + [2, 2, 2, 2, 2, 2, 2, 0], + [2, 2, 2, 2, 2, 2, 0, 0]]) + ground_truth_instance_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 2, 0, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0], + [2, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0]]) + ground_truth_instance_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [2, 0, 0, 0, 0, 0, 0, 0], + [2, 2, 0, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0]]) + + ground_truth_instance = np.stack([ + ground_truth_instance_1, ground_truth_instance_2, + ground_truth_instance_3 + ]) + ground_truth = (ground_truth_semantic * max_instances_per_category + + ground_truth_instance) + + prediction_semantic_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0], + [0, 0, 0, 2, 2, 1, 1, 1], + [0, 2, 2, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 1]]) + prediction_semantic_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 1, 1, 0, 0], + [0, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1]]) + prediction_semantic_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0, 0], + [0, 0, 1, 1, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1]]) + prediction_semantic = np.stack( + [prediction_semantic_1, prediction_semantic_2, prediction_semantic_3]) + + prediction_instance_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 2, 2, 0, 0, 0], + [0, 2, 2, 2, 2, 1, 0, 0], + [2, 2, 2, 2, 2, 1, 1, 0], + [2, 2, 2, 2, 1, 1, 1, 0], + [2, 2, 2, 2, 1, 1, 1, 0]]) + prediction_instance_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 2, 2, 2, 0, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0], + [2, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 1, 1, 0, 0], + [2, 2, 2, 2, 1, 0, 0, 0]]) + prediction_instance_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 
2, 0, 0, 0, 0, 0],
+                                      [2, 2, 2, 2, 0, 0, 0, 0],
+                                      [2, 2, 2, 2, 0, 0, 0, 0]])
+    prediction_instance = np.stack(
+        [prediction_instance_1, prediction_instance_2, prediction_instance_3])
+    prediction = (prediction_semantic * max_instances_per_category +
+                  prediction_instance)
+
+    ground_truth_depth = np.array(
+        [[56.1, 50.9, 54.0, 63.6, 68.6, 50.9, 50.9, 58.1],
+         [62.6, 52.1, 00.0, 60.9, 62.4, 52.6, 56.3, 63.4],
+         [57.1, 61.2, 63.8, 63.1, 52.3, 54.3, 52.1, 51.4],
+         [65.8, 50.5, 58.9, 54.3, 00.0, 65.4, 63.8, 56.8],
+         [50.6, 56.5, 53.0, 66.9, 51.8, 58.6, 65.9, 66.4],
+         [53.5, 56.2, 53.6, 50.6, 64.6, 51.1, 68.7, 50.3],
+         [69.0, 65.3, 66.4, 51.9, 68.3, 50.5, 00.0, 67.4],
+         [59.7, 51.3, 50.1, 67.2, 68.8, 62.8, 64.9, 59.5]])
+    prediction_depth = np.array(
+        [[67.5, 36.9, 65.7, 77.9, 75.0, 45.1, 68.2, 63.3],
+         [43.8, 63.0, 79.4, 78.1, 82.2, 36.9, 59.2, 83.2],
+         [70.6, 73.2, 77.8, 71.3, 41.3, 47.5, 58.8, 64.8],
+         [60.5, 51.7, 72.2, 49.8, 56.1, 60.7, 72.2, 73.0],
+         [34.5, 55.7, 46.7, 47.4, 69.6, 43.5, 82.3, 84.8],
+         [46.9, 39.5, 35.4, 61.3, 79.4, 42.2, 48.9, 56.3],
+         [57.0, 75.0, 84.2, 46.3, 67.4, 55.5, 46.9, 70.0],
+         [62.3, 58.3, 59.4, 74.5, 70.6, 54.6, 78.6, 48.1]])
+
+    with self.subTest('No valid depth labels'):
+      # Compute DSTQuality.
+      dstq_metric = dstq.DSTQuality(
+          n_classes, things_list, ignore_label, max_instances_per_category,
+          256 * 256, (1.25, 1.1))
+      no_valid_ground_truth_depth = ground_truth_depth * 0
+
+      for i in range(3):
+        dstq_metric.update_state(
+            tf.convert_to_tensor(ground_truth[i, ...], dtype=tf.int32),
+            tf.convert_to_tensor(prediction[i, ...], dtype=tf.int32),
+            tf.convert_to_tensor(no_valid_ground_truth_depth,
+                                 dtype=tf.float32),
+            tf.convert_to_tensor(prediction_depth, dtype=tf.float32),
+            1)
+      result = dstq_metric.result()
+
+      # Check if additional implementations alter the STQ results.
+      # The example is copied from the complex example for testing STQ.
+      # The results are expected to be unchanged.
+      np.testing.assert_almost_equal(result['STQ'], 0.66841773352)
+      np.testing.assert_almost_equal(result['AQ'], 0.55366581415)
+      np.testing.assert_almost_equal(result['IoU'], 0.8069529580309542)
+      np.testing.assert_almost_equal(result['STQ_per_seq'], [0.66841773352])
+      np.testing.assert_almost_equal(result['AQ_per_seq'], [0.55366581415])
+      np.testing.assert_almost_equal(result['IoU_per_seq'],
                                     [0.8069529580309542])
+      np.testing.assert_almost_equal(result['ID_per_seq'], [1])
+      np.testing.assert_almost_equal(result['Length_per_seq'], [3])
+      # As there are no valid depth labels, all depth metrics should be 0.
+      np.testing.assert_almost_equal(result['DSTQ'], 0.0)
+      np.testing.assert_almost_equal(result['DSTQ@1.1'], 0.0)
+      np.testing.assert_almost_equal(result['DSTQ@1.25'], 0.0)
+      np.testing.assert_almost_equal(result['DSTQ_per_seq@1.1'], [0.0])
+      np.testing.assert_almost_equal(result['DSTQ_per_seq@1.25'], [0.0])
+      np.testing.assert_almost_equal(result['DQ'], 0.0)
+      np.testing.assert_almost_equal(result['DQ@1.1'], 0.0)
+      np.testing.assert_almost_equal(result['DQ@1.25'], 0.0)
+      np.testing.assert_almost_equal(result['DQ_per_seq@1.1'], [0.0])
+      np.testing.assert_almost_equal(result['DQ_per_seq@1.25'], [0.0])
+
+    with self.subTest('Default depth thresholds'):
+      # Compute DSTQuality.
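+      # The offset of 256 * 256 = 65536 comfortably exceeds the minimum of
+      # n_classes * max_instances_per_category = 3000 unique labels that the
+      # base STQ metric requires.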
+ dstq_metric = dstq.DSTQuality( + n_classes, things_list, ignore_label, max_instances_per_category, + 256 * 256, (1.25, 1.1)) + + for i in range(3): + dstq_metric.update_state( + tf.convert_to_tensor(ground_truth[i, ...], dtype=tf.int32), + tf.convert_to_tensor(prediction[i, ...], dtype=tf.int32), + tf.convert_to_tensor(ground_truth_depth, dtype=tf.float32), + tf.convert_to_tensor(prediction_depth, dtype=tf.float32), + 1) + + result = dstq_metric.result() + # Prepare groundtruth metrics. + valid_depth_labels_total = np.sum(ground_truth_depth > 0) + valid_depth_labels = ground_truth_depth[ground_truth_depth > 0] + valid_depth_pred = prediction_depth[ground_truth_depth > 0] + valid_depth_error = np.maximum(valid_depth_pred / valid_depth_labels, + valid_depth_labels / valid_depth_pred) + dq_1_1 = np.sum(valid_depth_error <= 1.1) / valid_depth_labels_total + dq_1_25 = np.sum(valid_depth_error <= 1.25) / valid_depth_labels_total + + # Check if additional implementations alter the STQ results. + # The example is copied from the complex example for testing STQ. + # The results are expected to be unchanged. + np.testing.assert_almost_equal(result['STQ'], 0.66841773352) + np.testing.assert_almost_equal(result['AQ'], 0.55366581415) + np.testing.assert_almost_equal(result['IoU'], 0.8069529580309542) + np.testing.assert_almost_equal(result['STQ_per_seq'], [0.66841773352]) + np.testing.assert_almost_equal(result['AQ_per_seq'], [0.55366581415]) + np.testing.assert_almost_equal(result['IoU_per_seq'], + [0.8069529580309542]) + np.testing.assert_almost_equal(result['ID_per_seq'], [1]) + np.testing.assert_almost_equal(result['Length_per_seq'], [3]) + # Results are checked by groundtruth or equations. + np.testing.assert_almost_equal(result['DSTQ'] ** 3, + result['STQ'] ** 2 * result['DQ']) + np.testing.assert_almost_equal(result['DSTQ@1.1'] ** 3, + result['STQ'] ** 2 * result['DQ@1.1']) + np.testing.assert_almost_equal(result['DSTQ@1.25'] ** 3, + result['STQ'] ** 2 * result['DQ@1.25']) + np.testing.assert_almost_equal(result['DSTQ_per_seq@1.1'], + [result['DSTQ@1.1']]) + np.testing.assert_almost_equal(result['DSTQ_per_seq@1.25'], + [result['DSTQ@1.25']]) + np.testing.assert_almost_equal(result['DQ'] ** 2, + result['DQ@1.1'] * result['DQ@1.25']) + np.testing.assert_almost_equal(result['DQ@1.1'], dq_1_1) + np.testing.assert_almost_equal(result['DQ@1.25'], dq_1_25) + np.testing.assert_almost_equal(result['DQ_per_seq@1.1'], + [result['DQ@1.1']]) + np.testing.assert_almost_equal(result['DQ_per_seq@1.25'], + [result['DQ@1.25']]) + # Results are checked by real numbers. 
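+      # Hand count: ground_truth_depth has 61 valid (non-zero) pixels, of
+      # which 13 predictions fall within a factor of 1.1 and 42 within a
+      # factor of 1.25, i.e. DQ@1.1 = 13/61 and DQ@1.25 = 42/61.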
+ np.testing.assert_almost_equal(result['DSTQ'], 0.5552059833215103) + np.testing.assert_almost_equal(result['DSTQ@1.1'], 0.45663565048742255) + np.testing.assert_almost_equal(result['DSTQ@1.25'], + 0.6750539157136957) + np.testing.assert_almost_equal(result['DSTQ_per_seq@1.1'], + [0.45663565048742255]) + np.testing.assert_almost_equal(result['DSTQ_per_seq@1.25'], + [0.6750539157136957]) + np.testing.assert_almost_equal(result['DQ'], 0.3830597195261614) + np.testing.assert_almost_equal(result['DQ@1.1'], 0.21311475409836064) + np.testing.assert_almost_equal(result['DQ@1.25'], 0.6885245901639344) + np.testing.assert_almost_equal(result['DQ_per_seq@1.1'], + [0.21311475409836064]) + np.testing.assert_almost_equal(result['DQ_per_seq@1.25'], + [0.6885245901639344]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/evaluation/panoptic_quality.py b/evaluation/panoptic_quality.py new file mode 100644 index 0000000000000000000000000000000000000000..8f8d089a0f725176acfd5f2b9fc3ffc63bdd802a --- /dev/null +++ b/evaluation/panoptic_quality.py @@ -0,0 +1,266 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implementation of the Panoptic Quality metric. + +Panoptic Quality is an instance-based metric for evaluating the task of +image parsing, aka panoptic segmentation. + +Please see the paper for details: +"Panoptic Segmentation", Alexander Kirillov, Kaiming He, Ross Girshick, +Carsten Rother and Piotr Dollar. arXiv:1801.00868, 2018. +""" + +from typing import Any, List, Mapping, Optional, Tuple + +import numpy as np +import tensorflow as tf + + +def _ids_to_counts(id_array: np.ndarray) -> Mapping[int, int]: + """Given a numpy array, a mapping from each unique entry to its count.""" + ids, counts = np.unique(id_array, return_counts=True) + return dict(zip(ids, counts)) + + +class PanopticQuality(tf.keras.metrics.Metric): + """Metric class for Panoptic Quality. + + "Panoptic Segmentation" by Alexander Kirillov, Kaiming He, Ross Girshick, + Carsten Rother, Piotr Dollar. + https://arxiv.org/abs/1801.00868 + + Stand-alone usage: + + pq_obj = panoptic_quality.PanopticQuality(num_classes, + max_instances_per_category, ignored_label) + pq_obj.update_state(y_true_1, y_pred_1) + pq_obj.update_state(y_true_2, y_pred_2) + ... + result = pq_obj.result().numpy() + """ + + def __init__(self, + num_classes: int, + ignored_label: int, + max_instances_per_category: int, + offset: int, + name: str = 'panoptic_quality', + **kwargs): + """Initialization of the PanopticQuality metric. + + Args: + num_classes: Number of classes in the dataset as an integer. + ignored_label: The class id to be ignored in evaluation as an integer or + integer tensor. + max_instances_per_category: The maximum number of instances for each class + as an integer or integer tensor. + offset: The maximum number of unique labels as an integer or integer + tensor. + name: An optional variable_scope name. 
(default: 'panoptic_quality')
+      **kwargs: The keyword arguments that are passed on to the base
+        `tf.keras.metrics.Metric` class.
+    """
+    super(PanopticQuality, self).__init__(name=name, **kwargs)
+    self.num_classes = num_classes
+    self.ignored_label = ignored_label
+    self.max_instances_per_category = max_instances_per_category
+    self.total_iou = self.add_weight(
+        'total_iou', shape=(num_classes,), initializer=tf.zeros_initializer)
+    self.total_tp = self.add_weight(
+        'total_tp', shape=(num_classes,), initializer=tf.zeros_initializer)
+    self.total_fn = self.add_weight(
+        'total_fn', shape=(num_classes,), initializer=tf.zeros_initializer)
+    self.total_fp = self.add_weight(
+        'total_fp', shape=(num_classes,), initializer=tf.zeros_initializer)
+    self.offset = offset
+
+  def compare_and_accumulate(
+      self, gt_panoptic_label: tf.Tensor, pred_panoptic_label: tf.Tensor
+  ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Compares predicted segmentation with groundtruth, accumulates its metric.
+
+    It is not assumed that instance ids are unique across different categories.
+    See for example combine_semantic_and_instance_predictions.py in official
+    PanopticAPI evaluation code for issues to consider when fusing category
+    and instance labels.
+
+    Instance ids of the ignored category have the meaning that id 0 is "void"
+    and the remaining ones are crowd instances.
+
+    Args:
+      gt_panoptic_label: A tensor that combines label array from categories and
+        instances for ground truth.
+      pred_panoptic_label: A tensor that combines label array from categories
+        and instances for the prediction.
+
+    Returns:
+      A tuple of per-class numpy arrays (iou, tp, fn, fp) holding the values
+      accumulated from this comparison.
+    """
+    iou_per_class = np.zeros(self.num_classes, dtype=np.float64)
+    tp_per_class = np.zeros(self.num_classes, dtype=np.float64)
+    fn_per_class = np.zeros(self.num_classes, dtype=np.float64)
+    fp_per_class = np.zeros(self.num_classes, dtype=np.float64)
+
+    # Pre-calculate areas for all groundtruth and predicted segments.
+    gt_segment_areas = _ids_to_counts(gt_panoptic_label.numpy())
+    pred_segment_areas = _ids_to_counts(pred_panoptic_label.numpy())
+
+    # We assume the ignored segment has instance id = 0.
+    ignored_panoptic_id = self.ignored_label * self.max_instances_per_category
+
+    # Next, combine the groundtruth and predicted labels. Dividing up the
+    # pixels based on which groundtruth segment and which predicted segment
+    # they belong to, this will assign a different 64-bit integer label to
+    # each choice of (groundtruth segment, predicted segment), encoded as
+    # gt_panoptic_label * offset + pred_panoptic_label.
+    intersection_id_array = tf.cast(gt_panoptic_label,
+                                    tf.int64) * self.offset + tf.cast(
+                                        pred_panoptic_label, tf.int64)
+
+    # For every combination of (groundtruth segment, predicted segment) with a
+    # non-empty intersection, this counts the number of pixels in that
+    # intersection.
+    intersection_areas = _ids_to_counts(intersection_id_array.numpy())
+
+    # Compute overall ignored overlap.
+    def prediction_ignored_overlap(pred_panoptic_label):
+      intersection_id = ignored_panoptic_id * self.offset + pred_panoptic_label
+      return intersection_areas.get(intersection_id, 0)
+
+    # Sets that track which groundtruth and predicted segments have already
+    # been matched with an overlapping segment on the other side.
+    gt_matched = set()
+    pred_matched = set()
+
+    # Calculate IoU per pair of intersecting segments of the same category.
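+    # Each intersection id decodes back into its two panoptic labels via
+    # intersection_id // offset (groundtruth) and intersection_id % offset
+    # (prediction); each panoptic label in turn splits into (category,
+    # instance) via // and % max_instances_per_category.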
+    for intersection_id, intersection_area in intersection_areas.items():
+      gt_panoptic_label = intersection_id // self.offset
+      pred_panoptic_label = intersection_id % self.offset
+
+      gt_category = gt_panoptic_label // self.max_instances_per_category
+      pred_category = pred_panoptic_label // self.max_instances_per_category
+      if gt_category != pred_category:
+        continue
+      if pred_category == self.ignored_label:
+        continue
+
+      # The union between the groundtruth and predicted segments being
+      # compared does not include the portion of the predicted segment that
+      # consists of groundtruth "void" pixels.
+      union = (
+          gt_segment_areas[gt_panoptic_label] +
+          pred_segment_areas[pred_panoptic_label] - intersection_area -
+          prediction_ignored_overlap(pred_panoptic_label))
+      iou = intersection_area / union
+      if iou > 0.5:
+        tp_per_class[gt_category] += 1
+        iou_per_class[gt_category] += iou
+        gt_matched.add(gt_panoptic_label)
+        pred_matched.add(pred_panoptic_label)
+
+    # Count false negatives for each category.
+    for gt_panoptic_label in gt_segment_areas:
+      if gt_panoptic_label in gt_matched:
+        continue
+      category = gt_panoptic_label // self.max_instances_per_category
+      # Failing to detect a void segment is not a false negative.
+      if category == self.ignored_label:
+        continue
+      fn_per_class[category] += 1
+
+    # Count false positives for each category.
+    for pred_panoptic_label in pred_segment_areas:
+      if pred_panoptic_label in pred_matched:
+        continue
+      # A false positive is not penalized if it is mostly ignored in the
+      # groundtruth.
+      if (prediction_ignored_overlap(pred_panoptic_label) /
+          pred_segment_areas[pred_panoptic_label]) > 0.5:
+        continue
+      category = pred_panoptic_label // self.max_instances_per_category
+      if category == self.ignored_label:
+        continue
+      fp_per_class[category] += 1
+    return iou_per_class, tp_per_class, fn_per_class, fp_per_class
+
+  def update_state(
+      self,
+      y_true: tf.Tensor,
+      y_pred: tf.Tensor,
+      sample_weight: Optional[tf.Tensor] = None) -> List[tf.Operation]:
+    """Accumulates the panoptic quality statistics.
+
+    Args:
+      y_true: The ground truth panoptic label map (defined as semantic_map *
+        max_instances_per_category + instance_map).
+      y_pred: The predicted panoptic label map (defined as semantic_map *
+        max_instances_per_category + instance_map).
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be
+        a `Tensor` whose rank is either 0, or the same rank as `y_true`, and
+        must be broadcastable to `y_true`.
+
+    Returns:
+      Update ops for iou, tp, fn, fp.
+    """
+    result = self.compare_and_accumulate(y_true, y_pred)
+    iou, tp, fn, fp = tuple(result)
+    update_iou_op = self.total_iou.assign_add(iou)
+    update_tp_op = self.total_tp.assign_add(tp)
+    update_fn_op = self.total_fn.assign_add(fn)
+    update_fp_op = self.total_fp.assign_add(fp)
+    return [update_iou_op, update_tp_op, update_fn_op, update_fp_op]
+
+  def result(self) -> tf.Tensor:
+    """Computes the panoptic quality."""
+    sq = tf.math.divide_no_nan(self.total_iou, self.total_tp)
+    rq = tf.math.divide_no_nan(
+        self.total_tp,
+        self.total_tp + 0.5 * self.total_fn + 0.5 * self.total_fp)
+    pq = tf.math.multiply(sq, rq)
+
+    # Find the valid classes that will be used for evaluation. We will
+    # ignore classes which have (tp + fn + fp) equal to 0.
+    # The "ignore" label will be included in this based on logic that skips
+    # counting those instances/regions.
+    valid_classes = tf.not_equal(self.total_tp + self.total_fn + self.total_fp,
+                                 0)
+
+    # Compute averages over classes.
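+    # Per class, PQ factors as SQ * RQ; stacking [pq, sq, rq, tp, fn, fp]
+    # and averaging over only the valid classes keeps absent classes from
+    # dragging the summary down.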
+ qualities = tf.stack( + [pq, sq, rq, self.total_tp, self.total_fn, self.total_fp], axis=0) + summarized_qualities = tf.math.reduce_mean( + tf.boolean_mask(qualities, valid_classes, axis=1), axis=1) + + return summarized_qualities + + def reset_states(self) -> None: + """See base class.""" + tf.keras.backend.set_value(self.total_iou, np.zeros(self.num_classes)) + tf.keras.backend.set_value(self.total_tp, np.zeros(self.num_classes)) + tf.keras.backend.set_value(self.total_fn, np.zeros(self.num_classes)) + tf.keras.backend.set_value(self.total_fp, np.zeros(self.num_classes)) + + def get_config(self) -> Mapping[str, Any]: + """See base class.""" + config = { + 'num_classes': self.num_classes, + 'ignored_label': self.ignored_label, + 'max_instances_per_category': self.max_instances_per_category, + 'offset': self.offset, + } + base_config = super(PanopticQuality, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/evaluation/panoptic_quality_test.py b/evaluation/panoptic_quality_test.py new file mode 100644 index 0000000000000000000000000000000000000000..ecef73fd8d93dbcac295f9f5431c1ba4cc08398b --- /dev/null +++ b/evaluation/panoptic_quality_test.py @@ -0,0 +1,214 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for panoptic_quality metrics.""" +import collections + +from absl import logging +import numpy as np +import tensorflow as tf + +from deeplab2.evaluation import panoptic_quality +from deeplab2.evaluation import test_utils + +# See the definition of the color names at: +# https://en.wikipedia.org/wiki/Web_colors. +_CLASS_COLOR_MAP = { + (0, 0, 0): 0, + (0, 0, 255): 1, # Person (blue). + (255, 0, 0): 2, # Bear (red). + (0, 255, 0): 3, # Tree (lime). + (255, 0, 255): 4, # Bird (fuchsia). + (0, 255, 255): 5, # Sky (aqua). + (255, 255, 0): 6, # Cat (yellow). 
+} + + +def combine_maps(semantic_map, instance_map, label_divisor): + combined_map = instance_map + semantic_map * label_divisor + return tf.cast(combined_map, tf.int32) + + +class PanopticQualityMetricTest(tf.test.TestCase): + + def test_streaming_metric_on_single_image(self): + max_instances_per_category = 1000 + instance_class_map = { + 0: 0, + 47: 1, + 97: 1, + 133: 1, + 150: 1, + 174: 1, + 198: 2, + 215: 1, + 244: 1, + 255: 1, + } + gt_instances, gt_classes = test_utils.panoptic_segmentation_with_class_map( + 'team_gt_instance.png', instance_class_map) + + pred_classes = test_utils.read_segmentation_with_rgb_color_map( + 'team_pred_class.png', _CLASS_COLOR_MAP) + pred_instances = test_utils.read_test_image( + 'team_pred_instance.png', image_format='L') + + pq_obj = panoptic_quality.PanopticQuality( + num_classes=3, + max_instances_per_category=max_instances_per_category, + ignored_label=0, offset=256*256) + + y_true = combine_maps(gt_classes, gt_instances, max_instances_per_category) + y_pred = combine_maps(pred_classes, pred_instances, + max_instances_per_category) + pq_obj.update_state(y_true, y_pred) + result = pq_obj.result().numpy() + self.assertAlmostEqual(result[0], 0.62156284, places=4) + self.assertAlmostEqual(result[1], 0.64664984, places=4) + self.assertAlmostEqual(result[2], 0.9666667, places=4) + self.assertEqual(result[3], 4.) + self.assertAlmostEqual(result[4], 0.5) + self.assertEqual(result[5], 0.) + + def test_streaming_metric_on_multiple_images(self): + num_classes = 7 + + bird_gt_instance_class_map = { + 92: 5, + 176: 3, + 255: 4, + } + cat_gt_instance_class_map = { + 0: 0, + 255: 6, + } + team_gt_instance_class_map = { + 0: 0, + 47: 1, + 97: 1, + 133: 1, + 150: 1, + 174: 1, + 198: 2, + 215: 1, + 244: 1, + 255: 1, + } + max_instances_per_category = 256 + test_image = collections.namedtuple( + 'TestImage', + ['gt_class_map', 'gt_path', 'pred_inst_path', 'pred_class_path']) + test_images = [ + test_image(bird_gt_instance_class_map, 'bird_gt.png', + 'bird_pred_instance.png', 'bird_pred_class.png'), + test_image(cat_gt_instance_class_map, 'cat_gt.png', + 'cat_pred_instance.png', 'cat_pred_class.png'), + test_image(team_gt_instance_class_map, 'team_gt_instance.png', + 'team_pred_instance.png', 'team_pred_class.png'), + ] + + gt_classes = [] + gt_instances = [] + pred_classes = [] + pred_instances = [] + for test_image in test_images: + (image_gt_instances, + image_gt_classes) = test_utils.panoptic_segmentation_with_class_map( + test_image.gt_path, test_image.gt_class_map) + gt_classes.append(image_gt_classes) + gt_instances.append(image_gt_instances) + + pred_classes.append( + test_utils.read_segmentation_with_rgb_color_map( + test_image.pred_class_path, _CLASS_COLOR_MAP)) + pred_instances.append( + test_utils.read_test_image(test_image.pred_inst_path, + image_format='L')) + + pq_obj = panoptic_quality.PanopticQuality( + num_classes=num_classes, + max_instances_per_category=max_instances_per_category, + ignored_label=0, offset=256*256) + for pred_class, pred_instance, gt_class, gt_instance in zip( + pred_classes, pred_instances, gt_classes, gt_instances): + y_true = combine_maps(gt_class, gt_instance, max_instances_per_category) + y_pred = combine_maps(pred_class, pred_instance, + max_instances_per_category) + pq_obj.update_state(y_true, y_pred) + result = pq_obj.result().numpy() + + self.assertAlmostEqual(result[0], 0.76855499, places=4) + self.assertAlmostEqual(result[1], 0.7769174, places=4) + self.assertAlmostEqual(result[2], 0.98888892, places=4) + 
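+    # result holds the class-averaged [pq, sq, rq, tp, fn, fp]; the last
+    # three entries are means over valid classes, not totals.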
self.assertEqual(result[3], 2.) + self.assertAlmostEqual(result[4], 1. / 6, places=4) + self.assertEqual(result[5], 0.) + + def test_predicted_non_contiguous_ignore_label(self): + max_instances_per_category = 256 + pq_obj = panoptic_quality.PanopticQuality( + num_classes=3, + max_instances_per_category=max_instances_per_category, + ignored_label=9, + offset=256 * 256) + + gt_class = [ + [0, 9, 9], + [1, 2, 2], + [1, 9, 9], + ] + gt_instance = [ + [0, 2, 2], + [1, 0, 0], + [1, 0, 0], + ] + y_true = combine_maps( + np.array(gt_class), np.array(gt_instance), max_instances_per_category) + logging.info('y_true=\n%s', y_true) + + pred_class = [ + [0, 0, 9], + [1, 1, 1], + [1, 9, 9], + ] + pred_instance = [ + [0, 0, 0], + [0, 1, 1], + [0, 1, 1], + ] + y_pred = combine_maps( + np.array(pred_class), np.array(pred_instance), + max_instances_per_category) + logging.info('y_pred=\n%s', y_pred) + + pq_obj.update_state(y_true, y_pred) + result = pq_obj.result().numpy() + + # pq + self.assertAlmostEqual(result[0], 2. / 9, places=4) + # sq + self.assertAlmostEqual(result[1], 1. / 3, places=4) + # rq + self.assertAlmostEqual(result[2], 2. / 9, places=4) + # tp + self.assertAlmostEqual(result[3], 1. / 3, places=4) + # fn + self.assertAlmostEqual(result[4], 2. / 3, places=4) + # fp + self.assertAlmostEqual(result[5], 2. / 3, places=4) + + +if __name__ == '__main__': + tf.test.main() diff --git a/evaluation/segmentation_and_tracking_quality.py b/evaluation/segmentation_and_tracking_quality.py new file mode 100644 index 0000000000000000000000000000000000000000..c6c3171c8c3e98cc265b296f7b9e44df190f0d9d --- /dev/null +++ b/evaluation/segmentation_and_tracking_quality.py @@ -0,0 +1,282 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implementation of the Segmentation and Tracking Quality (STQ) metric.""" + +import collections +from typing import MutableMapping, Sequence, Dict, Text, Any +import numpy as np +import tensorflow as tf + + +def _update_dict_stats(stat_dict: MutableMapping[int, tf.Tensor], + id_array: tf.Tensor): + """Updates a given dict with corresponding counts.""" + ids, _, counts = tf.unique_with_counts(id_array) + for idx, count in zip(ids.numpy(), counts): + if idx in stat_dict: + stat_dict[idx] += count + else: + stat_dict[idx] = count + + +class STQuality(object): + """Metric class for the Segmentation and Tracking Quality (STQ). + + The metric computes the geometric mean of two terms. + - Association Quality: This term measures the quality of the track ID + assignment for `thing` classes. It is formulated as a weighted IoU + measure. + - Segmentation Quality: This term measures the semantic segmentation quality. + The standard class IoU measure is used for this. + + Example usage: + + stq_obj = segmentation_tracking_quality.STQuality(num_classes, things_list, + ignore_label, max_instances_per_category, offset) + stq_obj.update_state(y_true_1, y_pred_1) + stq_obj.update_state(y_true_2, y_pred_2) + ... 
+  result = stq_obj.result().numpy()
+  """
+
+  def __init__(self,
+               num_classes: int,
+               things_list: Sequence[int],
+               ignore_label: int,
+               max_instances_per_category: int,
+               offset: int,
+               name='stq'
+               ):
+    """Initialization of the STQ metric.
+
+    Args:
+      num_classes: Number of classes in the dataset as an integer.
+      things_list: A sequence of class ids that belong to `things`.
+      ignore_label: The class id to be ignored in evaluation as an integer or
+        integer tensor.
+      max_instances_per_category: The maximum number of instances for each
+        class as an integer or integer tensor.
+      offset: The maximum number of unique labels as an integer or integer
+        tensor.
+      name: An optional name. (default: 'stq')
+    """
+    self._name = name
+    self._num_classes = num_classes
+    self._ignore_label = ignore_label
+    self._things_list = things_list
+    self._max_instances_per_category = max_instances_per_category
+
+    if ignore_label >= num_classes:
+      self._confusion_matrix_size = num_classes + 1
+      self._include_indices = np.arange(self._num_classes)
+    else:
+      self._confusion_matrix_size = num_classes
+      self._include_indices = np.array(
+          [i for i in range(num_classes) if i != self._ignore_label])
+
+    self._iou_confusion_matrix_per_sequence = collections.OrderedDict()
+    self._predictions = collections.OrderedDict()
+    self._ground_truth = collections.OrderedDict()
+    self._intersections = collections.OrderedDict()
+    self._sequence_length = collections.OrderedDict()
+    self._offset = offset
+    lower_bound = num_classes * max_instances_per_category
+    if offset < lower_bound:
+      raise ValueError('The provided offset %d is too small. No guarantees '
+                       'about the correctness of the results can be made. '
+                       'Please choose an offset that is higher than '
+                       'num_classes * max_instances_per_category = %d' %
+                       (offset, lower_bound))
+
+  def update_state(self, y_true: tf.Tensor, y_pred: tf.Tensor,
+                   sequence_id=0):
+    """Accumulates the segmentation and tracking quality statistics.
+
+    Args:
+      y_true: The ground-truth panoptic label map for a particular video frame
+        (defined as semantic_map * max_instances_per_category + instance_map).
+      y_pred: The predicted panoptic label map for a particular video frame
+        (defined as semantic_map * max_instances_per_category + instance_map).
+      sequence_id: The optional ID of the sequence the frames belong to. When
+        no sequence is given, all frames are considered to belong to the same
+        sequence (default: 0).
+    """
+    y_true = tf.cast(y_true, dtype=tf.int64)
+    y_pred = tf.cast(y_pred, dtype=tf.int64)
+    semantic_label = y_true // self._max_instances_per_category
+    semantic_prediction = y_pred // self._max_instances_per_category
+    # Check if the ignore value is outside the range [0, num_classes]. If yes,
+    # map `_ignore_label` to `_num_classes`, so it can be used to create the
+    # confusion matrix.
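+    # For example, with num_classes = 3 and ignore_label = 255, pixels
+    # labeled 255 are remapped to class id 3, the extra row/column of the
+    # (num_classes + 1)-sized confusion matrix.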
+    if self._ignore_label > self._num_classes:
+      semantic_label = tf.where(
+          tf.not_equal(semantic_label, self._ignore_label), semantic_label,
+          self._num_classes)
+      semantic_prediction = tf.where(
+          tf.not_equal(semantic_prediction, self._ignore_label),
+          semantic_prediction, self._num_classes)
+    if sequence_id in self._iou_confusion_matrix_per_sequence:
+      self._iou_confusion_matrix_per_sequence[sequence_id] += (
+          tf.math.confusion_matrix(
+              tf.reshape(semantic_label, [-1]),
+              tf.reshape(semantic_prediction, [-1]),
+              self._confusion_matrix_size,
+              dtype=tf.int64))
+      self._sequence_length[sequence_id] += 1
+    else:
+      self._iou_confusion_matrix_per_sequence[sequence_id] = (
+          tf.math.confusion_matrix(
+              tf.reshape(semantic_label, [-1]),
+              tf.reshape(semantic_prediction, [-1]),
+              self._confusion_matrix_size,
+              dtype=tf.int64))
+      self._predictions[sequence_id] = {}
+      self._ground_truth[sequence_id] = {}
+      self._intersections[sequence_id] = {}
+      self._sequence_length[sequence_id] = 1
+
+    instance_label = y_true % self._max_instances_per_category
+
+    label_mask = tf.zeros_like(semantic_label, dtype=tf.bool)
+    prediction_mask = tf.zeros_like(semantic_prediction, dtype=tf.bool)
+    for things_class_id in self._things_list:
+      label_mask = tf.logical_or(label_mask,
+                                 tf.equal(semantic_label, things_class_id))
+      prediction_mask = tf.logical_or(
+          prediction_mask, tf.equal(semantic_prediction, things_class_id))
+
+    # Select the `crowd` region of the current class. This region is encoded
+    # with instance id `0`.
+    is_crowd = tf.logical_and(tf.equal(instance_label, 0), label_mask)
+    # Select the non-crowd region of the corresponding class, as the `crowd`
+    # region is ignored for the tracking term.
+    label_mask = tf.logical_and(label_mask, tf.logical_not(is_crowd))
+    # Do not punish id assignment for regions that are annotated as `crowd`
+    # in the ground-truth.
+    prediction_mask = tf.logical_and(prediction_mask,
+                                     tf.logical_not(is_crowd))
+
+    seq_preds = self._predictions[sequence_id]
+    seq_gts = self._ground_truth[sequence_id]
+    seq_intersects = self._intersections[sequence_id]
+
+    # Compute and update areas of ground-truth, predictions and intersections.
+    _update_dict_stats(seq_preds, y_pred[prediction_mask])
+    _update_dict_stats(seq_gts, y_true[label_mask])
+
+    non_crowd_intersection = tf.logical_and(label_mask, prediction_mask)
+    intersection_ids = (
+        y_true[non_crowd_intersection] * self._offset +
+        y_pred[non_crowd_intersection])
+    _update_dict_stats(seq_intersects, intersection_ids)
+
+  def result(self) -> Dict[Text, Any]:
+    """Computes the segmentation and tracking quality.
+
+    Returns:
+      A dictionary containing:
+        - 'STQ': The total STQ score.
+        - 'AQ': The total association quality (AQ) score.
+        - 'IoU': The total mean IoU.
+        - 'STQ_per_seq': A list of the STQ score per sequence.
+        - 'AQ_per_seq': A list of the AQ score per sequence.
+        - 'IoU_per_seq': A list of mean IoU per sequence.
+        - 'ID_per_seq': A list of sequence Ids to map list index to sequence.
+        - 'Length_per_seq': A list of the length of each sequence.
+ """ + # Compute association quality (AQ) + num_tubes_per_seq = [0] * len(self._ground_truth) + aq_per_seq = [0] * len(self._ground_truth) + iou_per_seq = [0] * len(self._ground_truth) + id_per_seq = [''] * len(self._ground_truth) + + for index, sequence_id in enumerate(self._ground_truth): + outer_sum = 0.0 + predictions = self._predictions[sequence_id] + ground_truth = self._ground_truth[sequence_id] + intersections = self._intersections[sequence_id] + num_tubes_per_seq[index] = len(ground_truth) + id_per_seq[index] = sequence_id + + for gt_id, gt_size in ground_truth.items(): + inner_sum = 0.0 + for pr_id, pr_size in predictions.items(): + tpa_key = self._offset * gt_id + pr_id + if tpa_key in intersections: + tpa = intersections[tpa_key].numpy() + fpa = pr_size.numpy() - tpa + fna = gt_size.numpy() - tpa + inner_sum += tpa * (tpa / (tpa + fpa + fna)) + + outer_sum += 1.0 / gt_size.numpy() * inner_sum + aq_per_seq[index] = outer_sum + + aq_mean = np.sum(aq_per_seq) / np.maximum(np.sum(num_tubes_per_seq), 1e-15) + aq_per_seq = aq_per_seq / np.maximum(num_tubes_per_seq, 1e-15) + + # Compute IoU scores. + # The rows correspond to ground-truth and the columns to predictions. + # Remove fp from confusion matrix for the void/ignore class. + total_confusion = np.zeros( + (self._confusion_matrix_size, self._confusion_matrix_size), + dtype=np.int64) + for index, confusion in enumerate( + self._iou_confusion_matrix_per_sequence.values()): + confusion = confusion.numpy() + removal_matrix = np.zeros_like(confusion) + removal_matrix[self._include_indices, :] = 1.0 + confusion *= removal_matrix + total_confusion += confusion + + # `intersections` corresponds to true positives. + intersections = confusion.diagonal() + fps = confusion.sum(axis=0) - intersections + fns = confusion.sum(axis=1) - intersections + unions = intersections + fps + fns + + num_classes = np.count_nonzero(unions) + ious = (intersections.astype(np.double) / + np.maximum(unions, 1e-15).astype(np.double)) + iou_per_seq[index] = np.sum(ious) / num_classes + + # `intersections` corresponds to true positives. + intersections = total_confusion.diagonal() + fps = total_confusion.sum(axis=0) - intersections + fns = total_confusion.sum(axis=1) - intersections + unions = intersections + fps + fns + + num_classes = np.count_nonzero(unions) + ious = (intersections.astype(np.double) / + np.maximum(unions, 1e-15).astype(np.double)) + iou_mean = np.sum(ious) / num_classes + + st_quality = np.sqrt(aq_mean * iou_mean) + st_quality_per_seq = np.sqrt(aq_per_seq * iou_per_seq) + return {'STQ': st_quality, + 'AQ': aq_mean, + 'IoU': float(iou_mean), + 'STQ_per_seq': st_quality_per_seq, + 'AQ_per_seq': aq_per_seq, + 'IoU_per_seq': iou_per_seq, + 'ID_per_seq': id_per_seq, + 'Length_per_seq': list(self._sequence_length.values()), + } + + def reset_states(self): + """Resets all states that accumulated data.""" + self._iou_confusion_matrix_per_sequence = collections.OrderedDict() + self._predictions = collections.OrderedDict() + self._ground_truth = collections.OrderedDict() + self._intersections = collections.OrderedDict() + self._sequence_length = collections.OrderedDict() diff --git a/evaluation/segmentation_and_tracking_quality_test.py b/evaluation/segmentation_and_tracking_quality_test.py new file mode 100644 index 0000000000000000000000000000000000000000..4f1a03293ffaf6b900342147ce5f68970ead690f --- /dev/null +++ b/evaluation/segmentation_and_tracking_quality_test.py @@ -0,0 +1,281 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for segmentation_tracking_quality.""" + +import numpy as np +import tensorflow as tf + +from deeplab2.evaluation import segmentation_and_tracking_quality as stq + + +def _compute_metric_and_compare(metric, ground_truth, prediction, + expected_result): + metric.update_state( + tf.convert_to_tensor(ground_truth), tf.convert_to_tensor(prediction), 1) + result = metric.result() + metric.reset_states() + np.testing.assert_almost_equal(result['STQ'], expected_result[0]) + np.testing.assert_almost_equal(result['AQ'], expected_result[1]) + np.testing.assert_almost_equal(result['IoU'], expected_result[2]) + np.testing.assert_almost_equal(result['STQ_per_seq'], [expected_result[0]]) + np.testing.assert_almost_equal(result['AQ_per_seq'], [expected_result[1]]) + np.testing.assert_almost_equal(result['IoU_per_seq'], [expected_result[2]]) + np.testing.assert_almost_equal(result['ID_per_seq'], [1]) + np.testing.assert_almost_equal(result['Length_per_seq'], [1]) + + +class STQualityTest(tf.test.TestCase): + + def test_complex_example(self): + n_classes = 3 + ignore_label = 255 + # classes = ['sky', 'vegetation', 'cars']. + things_list = [2] + max_instances_per_category = 1000 + + ground_truth_semantic_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 2, 0, 1, 1, 1], + [0, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 1, 1]]) + ground_truth_semantic_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 2, 0, 0, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 1, 0], + [2, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1]]) + ground_truth_semantic_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [2, 0, 1, 1, 1, 0, 0, 0], + [2, 2, 1, 1, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 1, 0], + [2, 2, 2, 1, 1, 1, 1, 1], + [2, 2, 2, 1, 1, 1, 1, 1]]) + ground_truth_semantic = np.stack([ + ground_truth_semantic_1, ground_truth_semantic_2, + ground_truth_semantic_3 + ]) + + ground_truth_instance_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 2, 0, 0, 0, 0], + [0, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 2, 2, 0, 0], + [2, 2, 2, 2, 2, 2, 2, 0], + [2, 2, 2, 2, 2, 2, 2, 0], + [2, 2, 2, 2, 2, 2, 0, 0]]) + ground_truth_instance_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 2, 0, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0], + [2, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0]]) + ground_truth_instance_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [2, 0, 0, 0, 0, 0, 0, 0], + [2, 2, 0, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0]]) + + ground_truth_instance = np.stack([ + ground_truth_instance_1, ground_truth_instance_2, + ground_truth_instance_3 + 
]) + ground_truth = (ground_truth_semantic * max_instances_per_category + + ground_truth_instance) + + prediction_semantic_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0], + [0, 0, 0, 2, 2, 1, 1, 1], + [0, 2, 2, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 1]]) + prediction_semantic_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 1, 1, 0, 0], + [0, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1]]) + prediction_semantic_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0, 0], + [0, 0, 1, 1, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1]]) + prediction_semantic = np.stack( + [prediction_semantic_1, prediction_semantic_2, prediction_semantic_3]) + + prediction_instance_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 2, 2, 0, 0, 0], + [0, 2, 2, 2, 2, 1, 0, 0], + [2, 2, 2, 2, 2, 1, 1, 0], + [2, 2, 2, 2, 1, 1, 1, 0], + [2, 2, 2, 2, 1, 1, 1, 0]]) + prediction_instance_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 2, 2, 2, 0, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0], + [2, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 1, 1, 0, 0], + [2, 2, 2, 2, 1, 0, 0, 0]]) + prediction_instance_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0]]) + prediction_instance = np.stack( + [prediction_instance_1, prediction_instance_2, prediction_instance_3]) + prediction = (prediction_semantic * max_instances_per_category + + prediction_instance) + + # Compute STQuality. + stq_metric = stq.STQuality( + n_classes, things_list, ignore_label, max_instances_per_category, + 256 * 256) + + for i in range(3): + stq_metric.update_state( + tf.convert_to_tensor(ground_truth[i, ...], dtype=tf.int32), + tf.convert_to_tensor(prediction[i, ...], dtype=tf.int32), + 1) + + result = stq_metric.result() + + np.testing.assert_almost_equal(result['STQ'], 0.66841773352) + np.testing.assert_almost_equal(result['AQ'], 0.55366581415) + np.testing.assert_almost_equal(result['IoU'], 0.8069529580309542) + np.testing.assert_almost_equal(result['STQ_per_seq'], [0.66841773352]) + np.testing.assert_almost_equal(result['AQ_per_seq'], [0.55366581415]) + np.testing.assert_almost_equal(result['IoU_per_seq'], [0.8069529580309542]) + np.testing.assert_almost_equal(result['ID_per_seq'], [1]) + np.testing.assert_almost_equal(result['Length_per_seq'], [3]) + + def test_basic_examples(self): + n_classes = 2 + ignore_label = 255 + # classes = ['cars', 'sky']. + things_list = [0] + max_instances_per_category = 1000 + + # Since the semantic label is `0`, the instance ID is enough. 
+ ground_truth_track = np.array([[1, 1, 1, 1, 1]]) + + stq_metric = stq.STQuality( + n_classes, things_list, ignore_label, max_instances_per_category, + 256 * 256) + + with self.subTest('Example 0'): + predicted_track = np.array([[1, 1, 1, 1, 1]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [1.0, 1.0, 1.0]) + + with self.subTest('Example 1'): + predicted_track = np.array([[1, 1, 2, 2, 2]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [0.72111026, 0.52, 1.0]) + + with self.subTest('Example 2'): + predicted_track = np.array([[1, 2, 2, 2, 2]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [0.82462113, 0.68, 1.0]) + + with self.subTest('Example 3'): + predicted_track = np.array([[1, 2, 3, 4, 5]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [0.447213596, 0.2, 1.0]) + + with self.subTest('Example 4'): + predicted_track = np.array([[1, 2, 1, 2, 2]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [0.72111026, 0.52, 1.0]) + + with self.subTest('Example 5'): + predicted_track = ( + np.array([[0, 1, 1, 1, 1]]) + + np.array([[1, 0, 0, 0, 0]]) * max_instances_per_category) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [0.50596443, 0.64, 0.4]) + + # First label is `crowd`. + ground_truth_track = np.array([[0, 1, 1, 1, 1, 1]]) + + with self.subTest('Example 6'): + predicted_track = np.array([[1, 1, 1, 1, 1, 1]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [1.0, 1.0, 1.0]) + + with self.subTest('Example 7'): + predicted_track = np.array([[2, 2, 2, 2, 1, 1]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [0.72111026, 0.52, 1.0]) + + with self.subTest('Example 8'): + predicted_track = ( + np.array([[2, 2, 0, 1, 1, 1]]) + + np.array([[0, 0, 1, 0, 0, 0]]) * max_instances_per_category) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, + [0.40824829, 0.4, 5.0 / 12.0]) + + # First label is `sky`. + ground_truth_track = ( + np.array([[0, 1, 1, 1, 1]]) + + np.array([[1, 0, 0, 0, 0]]) * max_instances_per_category) + + with self.subTest('Example 9'): + predicted_track = np.array([[1, 1, 1, 1, 1]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [0.56568542, 0.8, 0.4]) + + with self.subTest('Example 10'): + predicted_track = np.array([[2, 2, 2, 1, 1]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, + [0.42426407, 0.45, 0.4]) + + with self.subTest('Example 11'): + predicted_track = ( + np.array([[2, 2, 0, 1, 1]]) + + np.array([[0, 0, 1, 0, 0]]) * max_instances_per_category) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, + [0.3, 0.3, 0.3]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/evaluation/test_utils.py b/evaluation/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0308eb618688c761a60b7a6bc07d0281dcbace70 --- /dev/null +++ b/evaluation/test_utils.py @@ -0,0 +1,121 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility functions to set up unit tests on Panoptic Segmentation code.""" + +import os +from typing import Mapping, Optional, Tuple + +from absl import flags +import numpy as np +from PIL import Image + +import tensorflow as tf + +FLAGS = flags.FLAGS + +_TEST_DATA_DIR = ('deeplab2/' + 'evaluation/testdata') + + +def read_test_image(testdata_path: str, + image_format: Optional[str] = None) -> np.ndarray: + """Loads a test image. + + Args: + testdata_path: Image path relative to panoptic_segmentation/testdata as a + string. + image_format: Format of the image. Can be one of 'RGBA', 'RGB', or 'L'. + + Returns: + The image, as a numpy array. + """ + image_path = os.path.join(_TEST_DATA_DIR, testdata_path) + with tf.io.gfile.GFile(image_path, 'rb') as f: + image = Image.open(f) + if image_format is not None: + image = image.convert(image_format) + return np.array(image) + + +def read_segmentation_with_rgb_color_map( + image_testdata_path: str, + rgb_to_semantic_label: Mapping[Tuple[int, int, int], int], + output_dtype: Optional[np.dtype] = None) -> np.ndarray: + """Reads a test segmentation as an image and a map from colors to labels. + + Args: + image_testdata_path: Image path relative to panoptic_segmentation/testdata + as a string. + rgb_to_semantic_label: Mapping from RGB colors to integer labels as a + dictionary. + output_dtype: Type of the output labels. If None, defaults to the type of + the provided color map. + + Returns: + A 2D numpy array of labels. + + Raises: + ValueError: On an incomplete `rgb_to_semantic_label`. + """ + rgb_image = read_test_image(image_testdata_path, image_format='RGB') + if len(rgb_image.shape) != 3 or rgb_image.shape[2] != 3: + raise AssertionError('Expected RGB image, actual shape is %s' % + (rgb_image.shape,)) + + num_pixels = rgb_image.shape[0] * rgb_image.shape[1] + unique_colors = np.unique(np.reshape(rgb_image, [num_pixels, 3]), axis=0) + if not set(map(tuple, unique_colors)).issubset(rgb_to_semantic_label.keys()): + raise ValueError('RGB image has colors not in color map.') + + output_dtype = output_dtype or type( + next(iter(rgb_to_semantic_label.values()))) + output_labels = np.empty(rgb_image.shape[:2], dtype=output_dtype) + for rgb_color, int_label in rgb_to_semantic_label.items(): + color_array = np.array(rgb_color, ndmin=3) + output_labels[np.all(rgb_image == color_array, axis=2)] = int_label + return output_labels + + +def panoptic_segmentation_with_class_map( + instance_testdata_path: str, instance_label_to_semantic_label: Mapping[int, + int] +) -> Tuple[np.ndarray, np.ndarray]: + """Reads in a panoptic segmentation with an instance map and a map to classes. + + Args: + instance_testdata_path: Path to a grayscale instance map, given as a string + and relative to panoptic_segmentation/testdata. + instance_label_to_semantic_label: A map from instance labels to class + labels. + + Returns: + A tuple `(instance_labels, class_labels)` of numpy arrays. + + Raises: + ValueError: On a mismatched set of instances in + the + `instance_label_to_semantic_label`. 
+ """ + instance_labels = read_test_image(instance_testdata_path, image_format='L') + if set(np.unique(instance_labels)) != set( + instance_label_to_semantic_label.keys()): + raise ValueError('Provided class map does not match present instance ids.') + + class_labels = np.empty_like(instance_labels) + for instance_id, class_id in instance_label_to_semantic_label.items(): + class_labels[instance_labels == instance_id] = class_id + + return instance_labels, class_labels diff --git a/evaluation/test_utils_test.py b/evaluation/test_utils_test.py new file mode 100644 index 0000000000000000000000000000000000000000..0bdb32281b2c65dbfe1e7e875c59f7a5a13acb0f --- /dev/null +++ b/evaluation/test_utils_test.py @@ -0,0 +1,67 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for test_utils.""" +import numpy as np +import tensorflow as tf + +from deeplab2.evaluation import test_utils + + +class TestUtilsTest(tf.test.TestCase): + + def test_read_test_image(self): + image_array = test_utils.read_test_image('team_pred_class.png') + self.assertSequenceEqual(image_array.shape, (231, 345, 4)) + + def test_reads_segmentation_with_color_map(self): + rgb_to_semantic_label = {(0, 0, 0): 0, (0, 0, 255): 1, (255, 0, 0): 23} + labels = test_utils.read_segmentation_with_rgb_color_map( + 'team_pred_class.png', rgb_to_semantic_label) + + input_image = test_utils.read_test_image('team_pred_class.png') + np.testing.assert_array_equal( + labels == 0, + np.logical_and(input_image[:, :, 0] == 0, input_image[:, :, 2] == 0)) + np.testing.assert_array_equal(labels == 1, input_image[:, :, 2] == 255) + np.testing.assert_array_equal(labels == 23, input_image[:, :, 0] == 255) + + def test_reads_gt_segmentation(self): + instance_label_to_semantic_label = { + 0: 0, + 47: 1, + 97: 1, + 133: 1, + 150: 1, + 174: 1, + 198: 23, + 215: 1, + 244: 1, + 255: 1, + } + instances, classes = test_utils.panoptic_segmentation_with_class_map( + 'team_gt_instance.png', instance_label_to_semantic_label) + + expected_label_shape = (231, 345) + self.assertSequenceEqual(instances.shape, expected_label_shape) + self.assertSequenceEqual(classes.shape, expected_label_shape) + np.testing.assert_array_equal(instances == 0, classes == 0) + np.testing.assert_array_equal(instances == 198, classes == 23) + np.testing.assert_array_equal( + np.logical_and(instances != 0, instances != 198), classes == 1) + + +if __name__ == '__main__': + tf.test.main() diff --git a/evaluation/testdata/README.md b/evaluation/testdata/README.md new file mode 100644 index 0000000000000000000000000000000000000000..37927ec0faf0f1151758df5f0bd68bf7406f1b2e --- /dev/null +++ b/evaluation/testdata/README.md @@ -0,0 +1,11 @@ +# Segmentation Evalaution Test Data + +## Source Images + +* [team_input.png](team_input.png) \ + Source: + https://ai.googleblog.com/2018/03/semantic-image-segmentation-with.html +* [cat_input.jpg](cat_input.jpg) \ + Source: https://www.flickr.com/photos/magdalena_b/4995858743 +* 
[bird_input.jpg](bird_input.jpg) \ + Source: https://www.flickr.com/photos/chivinskia/40619099560 diff --git a/evaluation/testdata/bird_gt.png b/evaluation/testdata/bird_gt.png new file mode 100644 index 0000000000000000000000000000000000000000..05d854915d1809abe3ba10f03c20e75706e0bb17 Binary files /dev/null and b/evaluation/testdata/bird_gt.png differ diff --git a/evaluation/testdata/bird_pred_class.png b/evaluation/testdata/bird_pred_class.png new file mode 100644 index 0000000000000000000000000000000000000000..07351bf061115d0990486cbb086b6b9ec53e691b Binary files /dev/null and b/evaluation/testdata/bird_pred_class.png differ diff --git a/evaluation/testdata/bird_pred_instance.png b/evaluation/testdata/bird_pred_instance.png new file mode 100644 index 0000000000000000000000000000000000000000..faa1371f52510fb6f15fecb0eecc3441b2c8eadb Binary files /dev/null and b/evaluation/testdata/bird_pred_instance.png differ diff --git a/evaluation/testdata/cat_gt.png b/evaluation/testdata/cat_gt.png new file mode 100644 index 0000000000000000000000000000000000000000..41f60111f3de899a9e1ca3a646bea72d86b3009f Binary files /dev/null and b/evaluation/testdata/cat_gt.png differ diff --git a/evaluation/testdata/cat_pred_class.png b/evaluation/testdata/cat_pred_class.png new file mode 100644 index 0000000000000000000000000000000000000000..3728c68ced20312567e70540b667b53269000318 Binary files /dev/null and b/evaluation/testdata/cat_pred_class.png differ diff --git a/evaluation/testdata/cat_pred_instance.png b/evaluation/testdata/cat_pred_instance.png new file mode 100644 index 0000000000000000000000000000000000000000..ebd9ba4855f5c88a3b336d50e21d864a37175bbe Binary files /dev/null and b/evaluation/testdata/cat_pred_instance.png differ diff --git a/evaluation/testdata/team_gt_instance.png b/evaluation/testdata/team_gt_instance.png new file mode 100644 index 0000000000000000000000000000000000000000..97abb55273ce409a5fbaa85cb999f0725d457dbf Binary files /dev/null and b/evaluation/testdata/team_gt_instance.png differ diff --git a/evaluation/testdata/team_pred_class.png b/evaluation/testdata/team_pred_class.png new file mode 100644 index 0000000000000000000000000000000000000000..2ed78de2cbd923e6530f08fc2c47bf8377cfaf69 Binary files /dev/null and b/evaluation/testdata/team_pred_class.png differ diff --git a/evaluation/testdata/team_pred_instance.png b/evaluation/testdata/team_pred_instance.png new file mode 100644 index 0000000000000000000000000000000000000000..264606a4d8822108481132ff9e990d826c64a274 Binary files /dev/null and b/evaluation/testdata/team_pred_instance.png differ diff --git a/evaluation/video_panoptic_quality.py b/evaluation/video_panoptic_quality.py new file mode 100644 index 0000000000000000000000000000000000000000..02294e6ac56ac4c3a704445e266d874eedf1cf57 --- /dev/null +++ b/evaluation/video_panoptic_quality.py @@ -0,0 +1,98 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implementation of the Video Panoptic Quality metric. 
+
+Video Panoptic Quality is an instance-based metric for evaluating the task
+of video panoptic segmentation.
+Please see the paper for details:
+Dahun Kim, Sanghyun Woo, Joon-Young Lee, and In So Kweon.
+"Video panoptic segmentation." In CVPR, 2020.
+"""
+
+from typing import List, Tuple
+
+import numpy as np
+import tensorflow as tf
+from deeplab2.evaluation import panoptic_quality
+
+
+class VideoPanopticQuality(panoptic_quality.PanopticQuality):
+  """Metric class for Video Panoptic Quality.
+
+  Dahun Kim, Sanghyun Woo, Joon-Young Lee, and In So Kweon.
+  "Video panoptic segmentation." In CVPR, 2020.
+
+  Video Panoptic Quality can be modeled as Image Panoptic Quality, where the
+  sequence of predictions is horizontally concatenated into one image, and the
+  sequence of ground-truth labels into another. Therefore, this class inherits
+  the image panoptic quality class and changes the implementation to compare
+  the concatenated label maps.
+
+  Siyuan Qiao, Yukun Zhu, Hartwig Adam, Alan Yuille, and Liang-Chieh Chen.
+  "ViP-DeepLab: Learning Visual Perception with Depth-aware Video Panoptic
+  Segmentation." In CVPR, 2021.
+
+  Stand-alone usage:
+    vpq_obj = video_panoptic_quality.VideoPanopticQuality(
+      num_classes, max_instances_per_category, ignored_label)
+    vpq_obj.update_state(y_true_1, y_pred_1)
+    vpq_obj.update_state(y_true_2, y_pred_2)
+    ...
+    result = vpq_obj.result().numpy()
+  """
+
+  def __init__(self,
+               num_classes: int,
+               ignored_label: int,
+               max_instances_per_category: int,
+               offset: int,
+               name: str = 'video_panoptic_quality',
+               **kwargs):
+    """Initialization of the VideoPanopticQuality metric.
+
+    Args:
+      num_classes: Number of classes in the dataset as an integer.
+      ignored_label: The class id to be ignored in evaluation as an integer or
+        integer tensor.
+      max_instances_per_category: The maximum number of instances for each class
+        as an integer or integer tensor.
+      offset: The maximum number of unique labels as an integer or integer
+        tensor.
+      name: An optional variable_scope name. (default: 'video_panoptic_quality')
+      **kwargs: The keyword arguments that are passed on to `fn`.
+    """
+    super().__init__(num_classes, ignored_label, max_instances_per_category,
+                     offset, name, **kwargs)
+
+  def compare_and_accumulate(
+      self, gt_panoptic_labels: List[tf.Tensor],
+      pred_panoptic_labels: List[tf.Tensor]
+  ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Compares predicted segmentation with ground truth and accumulates metrics.
+
+    Args:
+      gt_panoptic_labels: A list of tensors for the ground-truth
+        video panoptic segmentation labels.
+      pred_panoptic_labels: A list of tensors for video panoptic
+        segmentation predictions.
+
+    Returns:
+      A tuple of the accumulated metrics (iou, tp, fn, fp) over all
+      comparisons.
+    """
+    gt_panoptic_label = tf.concat(gt_panoptic_labels, axis=1)
+    pred_panoptic_label = tf.concat(pred_panoptic_labels, axis=1)
+    return super(VideoPanopticQuality, self).compare_and_accumulate(
+        gt_panoptic_label, pred_panoptic_label)
diff --git a/evaluator.proto b/evaluator.proto
new file mode 100644
index 0000000000000000000000000000000000000000..e0d72a2ba1b692caea9109f162af5d994c4303f8
--- /dev/null
+++ b/evaluator.proto
@@ -0,0 +1,95 @@
+// Copyright 2021 The Deeplab2 Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package deeplab2; + +// Next ID: 22 +message EvaluatorOptions { + // Set the number of steps to run evaluation. -1 corresponds to a run over the + // full dataset. + optional int32 eval_steps = 1 [default = -1]; + // Set the number of train steps after which eval should run in interleaved + // mode. + optional int32 eval_interval = 2 [default = 5000]; + // Set the number of seconds to wait at most for the next checkpoint. -1 means + // the job will wait forever. + optional int32 continuous_eval_timeout = 3 [default = -1]; + // Set whether to run evaluation as a tf function. + optional bool use_tf_function = 4 [default = true]; + // Set the area size of stuff segments to discard. + optional int32 stuff_area_limit = 6 [default = 0]; + // Set the area size of thing segments to discard (set to ignore_label). Note + // that this option is currently only supported in MaX-DeepLab. + optional int32 thing_area_limit = 19 [default = 0]; + // Set the threshold for the transformer class confidence. + optional float transformer_class_confidence_threshold = 20 [default = 0.7]; + // Set the threshold for the per-pixel mask confidence. Note that this option + // is currently only supported in MaX-DeepLab. + optional float pixel_confidence_threshold = 21 [default = 0.4]; + // Set the threshold of the center heatmap for post-processing. + optional float center_score_threshold = 7 [default = 0.1]; + // Set the kernel size of the nms kernel for the center heatmap. + optional int32 nms_kernel = 8 [default = 3]; + // Set the number of top centers to keep. -1 corresponds to keeping all + // centers. + optional int32 keep_k_centers = 9 [default = 400]; + // Enable saving predictions to disk. + optional bool save_predictions = 10 [default = false]; + // Override the storing location. By default, predictions are written to + // `experiment_root` + `experiment_name` + `vis`. + optional string override_save_dir = 11; + // Set the number of samples to visualize. + optional int32 num_vis_samples = 12 [default = 10]; + // Enable saving raw predictions for the whole dataset. The output path is the + // save_dir + `raw_semantic`/`raw_panoptic`. + optional bool save_raw_predictions = 13 [default = false]; + // The format of raw panoptic predictions. This flag is used together with + // `save_raw_predictions`. When save_raw_predictions is True, this field + // specifies the format of saved raw panoptic predictions. Supports: + // - 'two_channel_png': The popular format, also supported by the official + // COCO panoptic API (https://github.com/cocodataset/panopticapi), where + // the saved PNG image contains R-channel for semantic labels and + // G-channel for instance IDs. + // - 'three_channel_png': A simple extension of the 'two_channel_png' format, + // and is adopted in some video panoptic segmentation datasets (for + // example, KITTI-STEP and MOTChallenge-STEP), where the saved PNG image + // contains R-channel for semantic labels, G-channel for the values of + // (instance ID // 256), and B-channel for (instance ID % 256). 
+  //  - 'two_channel_numpy_array': A more flexible format (unconstrained by the
+  //    PNG channel size), where the panoptic predictions are saved as a numpy
+  //    array in the two channel format (i.e., first channel encodes the
+  //    semantic class and the second channel the instance ID).
+  optional string raw_panoptic_format = 17 [default = 'two_channel_png'];
+  // Enable conversion of train IDs to eval IDs for raw predictions.
+  optional bool convert_raw_to_eval_ids = 14 [default = true];
+  // Add flipped images for evaluation or not. This is used for multi-scale
+  // inference (usually used together with `eval_scales`). If True, another
+  // flipped image will be used during inference.
+  optional bool add_flipped_images = 5 [default = false];
+  // The scales to resize images for inference. Change it to, e.g. [0.5, 0.75,
+  // 1.0, 1.25, 1.5, 1.75], for multi-scale inference.
+  repeated float eval_scales = 15 [packed = true];
+  // Boolean, if true, use TensorFlow operation (CUDA kernel) to merge
+  // semantic and instance segmentation (for the final panoptic segmentation).
+  // Defaults to true, as our GPU implementation is much faster. Set to false
+  // if you could not successfully compile TensorFlow with this operation.
+  optional bool merge_semantic_and_instance_with_tf_op = 16 [default = true];
+  // Displays detailed metrics on instance segmentation AP. This includes e.g.
+  // AP at a matching IoU threshold of 0.5, or the AP of small objects only,
+  // etc. If false, will only display a summary AP metric that is an average
+  // over IoU thresholds and all objects.
+  optional bool detailed_ap_metrics = 18 [default = false];
+}
diff --git a/export_model.py b/export_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..176721bfe4d370686ed45d2a658c7948f75c64f0
--- /dev/null
+++ b/export_model.py
@@ -0,0 +1,157 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Script to export a DeepLab model to a saved model."""
+
+import functools
+from typing import Any, MutableMapping, Sequence, Text
+
+from absl import app
+from absl import flags
+import tensorflow as tf
+
+from google.protobuf import text_format
+from deeplab2 import config_pb2
+from deeplab2.data import dataset
+from deeplab2.data.preprocessing import input_preprocessing
+from deeplab2.model import utils
+from deeplab2.trainer import train_lib
+
+
+_FLAGS_EXPERIMENT_OPTION_PATH = flags.DEFINE_string(
+    'experiment_option_path',
+    default='',
+    help='Path to the experiment option text proto.')
+
+_FLAGS_CKPT_PATH = flags.DEFINE_string(
+    'checkpoint_path',
+    default='',
+    help='Path to the saved checkpoint.')
+
+_FLAGS_OUTPUT_PATH = flags.DEFINE_string(
+    'output_path',
+    default='',
+    help='Output directory path for the exported saved model.')
+
+_FLAGS_MERGE_WITH_TF_OP = flags.DEFINE_boolean(
+    'merge_with_tf_op',
+    default=False,
+    help='Whether to use the customized TF op for merging semantic and '
+    'instance predictions. Set it to True to reproduce the numbers as '
+    'reported in the paper, but the saved model would require a specifically '
+    'compiled TensorFlow to run.')
+
+
+class DeepLabModule(tf.Module):
+  """Class that runs DeepLab inference end-to-end."""
+
+  def __init__(self, config: config_pb2.ExperimentOptions, ckpt_path: Text,
+               use_tf_op: bool = False):
+    super().__init__(name='DeepLabModule')
+
+    dataset_options = config.eval_dataset_options
+    dataset_name = dataset_options.dataset
+    crop_height, crop_width = dataset_options.crop_size
+
+    config.evaluator_options.merge_semantic_and_instance_with_tf_op = use_tf_op
+    # Disable drop path and recompute grad as they are only used in training.
+    config.model_options.backbone.drop_path_keep_prob = 1.0
+
+    deeplab_model = train_lib.create_deeplab_model(
+        config,
+        dataset.MAP_NAME_TO_DATASET_INFO[dataset_name])
+    self._is_motion_deeplab = (
+        config.model_options.WhichOneof('meta_architecture') ==
+        'motion_deeplab')
+
+    # For now, we only support a batch size of 1 for the saved model.
+    input_shape = train_lib.build_deeplab_model(
+        deeplab_model, (crop_height, crop_width), batch_size=1)
+    self._input_depth = input_shape[-1]
+
+    checkpoint = tf.train.Checkpoint(**deeplab_model.checkpoint_items)
+    # Not all saved variables (e.g. variables from the optimizer) will be
+    # restored. Call `expect_partial()` to suppress the warning.
+    checkpoint.restore(ckpt_path).expect_partial()
+    self._model = deeplab_model
+
+    self._preprocess_fn = functools.partial(
+        input_preprocessing.preprocess_image_and_label,
+        label=None,
+        crop_height=crop_height,
+        crop_width=crop_width,
+        prev_label=None,
+        min_resize_value=dataset_options.min_resize_value,
+        max_resize_value=dataset_options.max_resize_value,
+        resize_factor=dataset_options.resize_factor,
+        is_training=False)
+
+  def get_input_spec(self):
+    """Returns the TensorSpec of the input tensor needed for inference."""
+    # We expect a single 3D, uint8 tensor with shape [height, width, channels].
+    return tf.TensorSpec(shape=[None, None, self._input_depth], dtype=tf.uint8)
+
+  @tf.function
+  def __call__(self, input_tensor: tf.Tensor) -> MutableMapping[Text, Any]:
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: A uint8 input tensor of type tf.Tensor with shape [height,
+        width, channels].
+
+    Returns:
+      A dictionary containing the results of the specified DeepLab architecture.
+      The results are bilinearly upsampled to input size before returning.
+    """
+    input_size = [tf.shape(input_tensor)[0], tf.shape(input_tensor)[1]]
+
+    if self._is_motion_deeplab:
+      # For motion deeplab, split the input tensor into the current and
+      # previous frames before preprocessing, and re-assemble them afterwards.
+      image, prev_image = tf.split(input_tensor, 2, axis=2)
+      (resized_image, processed_image, _, processed_prev_image,
+       _) = self._preprocess_fn(image=image, prev_image=prev_image)
+      processed_image = tf.concat(
+          [processed_image, processed_prev_image], axis=2)
+    else:
+      (resized_image, processed_image, _, _, _) = self._preprocess_fn(
+          image=input_tensor)
+
+    resized_size = tf.shape(resized_image)[0:2]
+    # Make the input tensor 4D to fit the model input requirements.
+    outputs = self._model(tf.expand_dims(processed_image, 0), training=False)
+    # We only undo-preprocess for those defined in tuples in model/utils.py.
+    return utils.undo_preprocessing(outputs, resized_size,
+                                    input_size)
+
+
+def main(argv: Sequence[str]) -> None:
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+
+  config = config_pb2.ExperimentOptions()
+  with tf.io.gfile.GFile(_FLAGS_EXPERIMENT_OPTION_PATH.value, 'r') as f:
+    text_format.Parse(f.read(), config)
+
+  module = DeepLabModule(
+      config, _FLAGS_CKPT_PATH.value, _FLAGS_MERGE_WITH_TF_OP.value)
+
+  signatures = module.__call__.get_concrete_function(module.get_input_spec())
+  tf.saved_model.save(
+      module, _FLAGS_OUTPUT_PATH.value, signatures=signatures)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/g3doc/change_logs.md b/g3doc/change_logs.md
new file mode 100644
index 0000000000000000000000000000000000000000..339995cd2c82674d225d595e090c19206f73092a
--- /dev/null
+++ b/g3doc/change_logs.md
@@ -0,0 +1,6 @@
+# Change logs
+
+* June 7th, 2021: Add Hungarian matching support on TPU for MaX-DeepLab. Our
+  TF2 version is based on Jiquan Ngiam's original Lingvo TensorFlow
+  implementation and Amil Merchant's TF1 version modifications.
+* June 1st, 2021: "Hello, World!", DeepLab2 made publicly available.
diff --git a/g3doc/faq.md b/g3doc/faq.md
new file mode 100644
index 0000000000000000000000000000000000000000..bd9d72274bd08525e197649ccdd412a688128bb6
--- /dev/null
+++ b/g3doc/faq.md
@@ -0,0 +1,98 @@
+# FAQ
+
+________________________________________________________________________________
+
+**Q1: What should I do if I encounter OOM (out-of-memory) while training the
+models?**
+
+**A1**: To avoid OOM, you could try:
+
+1. reducing the training crop size (i.e., the flag `crop_size` in
+   `train_dataset_options`, and see Q2 for more details), which reduces the
+   input size during training,
+
+2. using a larger output stride (e.g., 32) in the backbone (i.e., the flag
+   `output_stride` in `model_options`, and see Q3 for more details), which
+   reduces the usage of atrous convolution,
+
+3. using a smaller backbone, such as ResNet-50.
+
+________________________________________________________________________________
+
+**Q2: What `crop_size` do I need to set?**
+
+**A2**: The DeepLab framework always uses `crop_size` equal to `output_stride` *
+k + 1, where k is an integer.
+
+* During inference/evaluation, since the DeepLab framework uses whole-image
+  inference, we need to set k so that the resulting `crop_size` (in
+  `eval_dataset_options`) is slightly larger than the largest image dimension
+  in the dataset. For example, we set eval_crop_size = 1025x2049 for Cityscapes
+  images, whose image dimension is always equal to 1024x2048.
+
+* During training, we could set k to be any integer as long as it fits in your
+  device memory. However, we notice a better performance when we have the same
+  `crop_size` during training and evaluation (i.e., also use the whole-image
+  crop size during training).
+
+________________________________________________________________________________
+
+**Q3: What output stride should I use in the backbone?**
+
+**A3**: Using a different output stride leads to a different accuracy-and-memory
+trade-off. For example, DeepLabv1 uses output stride = 8, but it requires a lot
+of device memory. In the DeepLabv3+ paper, we found that using output stride =
+16 strikes the best accuracy-and-memory trade-off, which is therefore our
+default setting. If you wish to further reduce the memory usage, you could set
+the output stride to 32. Additionally, we suggest adjusting the `atrous_rates`
+in the ASPP module as follows.
+
+* If `backbone.output_stride` = 32, use `atrous_rates` = [3, 6, 9].
+
+* If `backbone.output_stride` = 16, use `atrous_rates` = [6, 12, 18].
+
+* If `backbone.output_stride` = 8, use `atrous_rates` = [12, 24, 36].
+
+Note that these settings may not be optimal. You may need to adjust them to
+better fit your dataset.
+
+________________________________________________________________________________
+
+**Q4: Why are the results reported by the provided evaluation code slightly
+different from the official evaluation code (e.g.,
+[Cityscapes](https://github.com/mcordts/cityscapesScripts))?**
+
+**A4**: In order to run everything end-to-end in the TensorFlow system (e.g.,
+the on-line evaluation during training), we re-implemented the evaluation codes
+in TensorFlow. Additionally, our whole system, including the training and
+evaluation pipelines, uses the panoptic label format (i.e., `panoptic_label =
+semantic_label * label_divisor + instance_id`, where the `label_divisor` should
+be larger than the maximum number of instances per image), instead of the JSON
+[COCO formats](https://cocodataset.org/#format-data). These two changes, along
+with rounding and similar issues, result in some minor differences. Therefore,
+our re-implemented evaluation code is mainly used for TensorFlow integration
+(e.g., the support of on-line evaluation in TensorBoard). Users should run the
+corresponding official evaluation code in order to compare with other published
+papers. Note that all the reported numbers in our papers are evaluated with the
+official evaluation code.
+
+To facilitate the conversion between prediction formats, we also provide
+instructions for running the official evaluation codes on
+[Cityscapes](setup/cityscapes_test_server_evaluation.md) and
+[COCO](setup/coco_test_server_evaluation.md).
+
+________________________________________________________________________________
+
+**Q5: What should I do if I cannot compile TensorFlow with the provided
+efficient merging operation `merge_semantic_and_instance_maps`?**
+
+**A5**: In this case, we provide a fallback solution that implements the
+merging operation with pure tf functions. This fallback solution does not
+require any TensorFlow compilation. However, note that compared to our provided
+TensorFlow merging operation `merge_semantic_and_instance_maps`, its inference
+speed is slower and the resulting segmentation performance may also be slightly
+lower.
+
+To use the pure-tf-function version of `merge_semantic_and_instance_maps`, set
+`merge_semantic_and_instance_with_tf_op` to `false` in your config's
+`evaluator_options`.
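+
+________________________________________________________________________________
+
+To make the panoptic label format in Q4 concrete, here is a minimal NumPy
+sketch (an illustration, not part of the library) that encodes and decodes a
+panoptic label map. The `label_divisor` of 1000 is only an example value; it
+must exceed the maximum number of instances per image.
+
+```python
+import numpy as np
+
+label_divisor = 1000  # Example value; must exceed max instances per image.
+semantic_label = np.array([[2, 2], [1, 1]])  # Per-pixel semantic class ids.
+instance_id = np.array([[1, 2], [0, 0]])     # 0 for regions without instances.
+
+# Encode: a single integer per pixel carries both the class and the instance.
+panoptic_label = semantic_label * label_divisor + instance_id
+
+# Decode the two component maps back out.
+assert (panoptic_label // label_divisor == semantic_label).all()
+assert (panoptic_label % label_divisor == instance_id).all()
+```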
diff --git a/g3doc/img/axial_deeplab/axial_block.png b/g3doc/img/axial_deeplab/axial_block.png
new file mode 100644
index 0000000000000000000000000000000000000000..1126fa1df6ecafd6f894de426093af588854d7b3
Binary files /dev/null and b/g3doc/img/axial_deeplab/axial_block.png differ
diff --git a/g3doc/img/axial_deeplab/nonlocal_block.png b/g3doc/img/axial_deeplab/nonlocal_block.png
new file mode 100644
index 0000000000000000000000000000000000000000..d0fd31722fe452e1463e5bbe426544c142877b5e
Binary files /dev/null and b/g3doc/img/axial_deeplab/nonlocal_block.png differ
diff --git a/g3doc/img/axial_deeplab/position_sensitive_axial_block.png b/g3doc/img/axial_deeplab/position_sensitive_axial_block.png
new file mode 100644
index 0000000000000000000000000000000000000000..812f33f192e857ebb6be2aed84adddeec7578cf5
Binary files /dev/null and b/g3doc/img/axial_deeplab/position_sensitive_axial_block.png differ
diff --git a/g3doc/img/max_deeplab/overview.png b/g3doc/img/max_deeplab/overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..cee68f7fb42ca1800975e9901221fc2c933f3d2f
Binary files /dev/null and b/g3doc/img/max_deeplab/overview.png differ
diff --git a/g3doc/img/max_deeplab/overview_simple.png b/g3doc/img/max_deeplab/overview_simple.png
new file mode 100644
index 0000000000000000000000000000000000000000..c5693bfca5eb3fb08f1018a4a3e150e7daa28d19
Binary files /dev/null and b/g3doc/img/max_deeplab/overview_simple.png differ
diff --git a/g3doc/img/panoptic_deeplab.png b/g3doc/img/panoptic_deeplab.png
new file mode 100644
index 0000000000000000000000000000000000000000..31194fd257ae14b575a96556ad80af5e2e96593c
Binary files /dev/null and b/g3doc/img/panoptic_deeplab.png differ
diff --git a/g3doc/img/step/kitti_step_annotation.png b/g3doc/img/step/kitti_step_annotation.png
new file mode 100644
index 0000000000000000000000000000000000000000..4793a78ea703eb81b257aadcf899b53641ee4f96
Binary files /dev/null and b/g3doc/img/step/kitti_step_annotation.png differ
diff --git a/g3doc/projects/axial_deeplab.md b/g3doc/projects/axial_deeplab.md
new file mode 100644
index 0000000000000000000000000000000000000000..c99a064659501e57e6b2e595236ea7299c73c9d5
--- /dev/null
+++ b/g3doc/projects/axial_deeplab.md
@@ -0,0 +1,168 @@
+# Axial-DeepLab
+
+Axial-DeepLab, improving over Panoptic-DeepLab, incorporates the powerful
+axial self-attention modules [1], also known as the encoder of Axial
+Transformers [2], for general dense prediction tasks. In this document,
+we demonstrate the effectiveness of Axial-DeepLab on the task of panoptic
+segmentation [6], unifying semantic segmentation and instance segmentation.
+
+To reduce the computational complexity of 2D self-attention (especially
+prominent for dense pixel prediction tasks), and further to allow us to
+perform attention within a larger or even global region, we factorize the 2D
+self-attention [1, 3, 4] into **two** 1D self-attentions [2, 5]. We then
+effectively integrate the **axial-attention** into a residual block [7], as
+illustrated in Fig. 1.
+
+*[Figure 1. An axial-attention (residual) block, which consists of two
+axial-attention layers operating along height- and width-axis
+sequentially.]*
+
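+To make the factorization concrete, the following is a minimal NumPy sketch of
+the idea (not the library's implementation): plain single-head dot-product
+self-attention applied along the height axis and then the width axis, omitting
+the learned query/key/value projections and the position-sensitive terms used
+in the paper.
+
+```python
+import numpy as np
+
+
+def _attend_1d(x):
+  # x: [batch, length, channels]; plain dot-product self-attention.
+  scores = x @ x.transpose(0, 2, 1) / np.sqrt(x.shape[-1])
+  weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
+  weights /= weights.sum(axis=-1, keepdims=True)
+  return weights @ x
+
+
+def axial_attention(x):
+  # x: [height, width, channels].
+  # Height axis: every column is attended to as an independent 1D sequence.
+  x = _attend_1d(x.transpose(1, 0, 2)).transpose(1, 0, 2)
+  # Width axis: every row is attended to as an independent 1D sequence.
+  return _attend_1d(x)
+
+
+features = np.random.rand(4, 5, 8)
+print(axial_attention(features).shape)  # (4, 5, 8)
+```
+
+Two 1D attentions over an H x W map cost O(HW(H + W)) rather than the
+O((HW)^2) of full 2D self-attention, which is what makes larger or even global
+attention regions affordable.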
" + +gr.Interface( + inference, + [gr.inputs.Image(type="pil", label="Input")], + gr.outputs.Image(type="plot", label="Output"), + title=title, + description=description, + article=article, + examples=[ + ["city1.jpg"], + ["city2.jpg"] + ]).launch() diff --git a/model.proto b/model.proto new file mode 100644 index 0000000000000000000000000000000000000000..c4dd1a8afb9cb0c6b73803dd43a0c583b45e9997 --- /dev/null +++ b/model.proto @@ -0,0 +1,198 @@ +// Copyright 2021 The Deeplab2 Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package deeplab2; + +option java_multiple_files = true; + +/********** Submessages used to config model options **********/ +// Configure the decoder model options. +message DecoderOptions { + // Set the features key for the high-level features, e.g. 'res5'. + optional string feature_key = 1; + // Set the number of filters in each convolution of the decoder. + optional int32 decoder_channels = 2 [default = 256]; + // Set the decoder convolution type. Support 'depthwise_separable_conv' and + // 'standard_conv'. + optional string decoder_conv_type = 5 [default = 'depthwise_separable_conv']; + // Set the number of filters in each convolution of the ASPP. + optional int32 aspp_channels = 3 [default = 256]; + // Set the list of atrous rates used in the ASPP. Note that this field has + // to be of length 3 (to specify the three 3x3 atrous convolutions in ASPP), + // and it is effective only when `aspp_use_only_1x1_proj_conv` is false. + repeated int32 atrous_rates = 4; + // The ASPP module uses only 1x1 projection convolution (i.e., the ASPP five + // branches consisting of one 1x1 convolution, three 3x3 atrous convolutions + // with specified `atrous_rates`, and the global average pooling are turned + // off, when `aspp_use_only_1x1_proj_conv` is true), equivalent to applying + // only one 1x1 convolution to reduce the feature map channels (obtained from + // encoder backbone) to the specified `aspp_channels`. This field is mainly + // used (i.e., set to true) when the encoder backbone is already able to + // efficiently capture long-range information, e.g., by axial attention blocks + // (for reference, see configs/cityscapes/axial_deeplab). + optional bool aspp_use_only_1x1_proj_conv = 6 [default = false]; +} + +// Configure the low level features to use. +message LowLevelOptions { + // Set the name of the low-level feature, e.g. 'res2'. + optional string feature_key = 1; + // Set the number of filters for the 1x1 projection convolution. + optional int32 channels_project = 2; +} + +// Configure the head options. +message HeadOptions { + // Set the number of filters in the last convolution, e.g. 1 or NUM_CLASSES. + optional int32 output_channels = 1; + // Set the number of filters in the 5x5 convolution, e.g. 256 or 32. + optional int32 head_channels = 2; + // Set the head convolution type. 
Support 'depthwise_separable_conv' and + // 'standard_conv' + optional string head_conv_type = 3 [default = 'depthwise_separable_conv']; +} + +// Configure the instance branch. +message InstanceOptions { + // Set whether to use the instance branch. + optional bool enable = 1 [default = true]; + + // Set the low level options used in instance branch. The list of + // LowLevelOptions must be ordered lower resolution to higher resolution. + // Leaving it empty will use the same low level options as the semantic + // branch. + repeated LowLevelOptions low_level_override = 2; + // Set the decoder options of the instance branch. Leaving it empty will use + // the same decoder options as the semantic branch. + optional DecoderOptions instance_decoder_override = 3; + + // Configure instance center head. + optional HeadOptions center_head = 4; + // Configure instance regression head. + optional HeadOptions regression_head = 5; + + // Configure next-frame instance regression head. + optional HeadOptions next_regression_head = 6; +} + +// Configure the model options. +// Next ID: 12 +message ModelOptions { + // Configure model backbone. + message BackboneOptions { + // Set the name of the specific architecture of the family. + optional string name = 1 [default = 'resnet50']; + // Set the output stride of the encoder. + optional int32 output_stride = 2 [default = 32]; + // Set path to pretrained weights to load pretrained weights. + optional string pretrained_weights = 3; + // Set whether to use the squeeze-and-excite operation. + optional bool use_squeeze_and_excite = 4 [default = false]; + // Set the drop path keep probability for training. Default not to use. + optional float drop_path_keep_prob = 5 [default = 1.0]; + // Set the drop path schedule. Currently support (1) 'constant': use the + // same drop path probability for all blocks, and (2) 'linear': linearly + // decrease the drop path probability from 1.0 at the 0-th stage (or STEM) + // to drop_path_keep_prob at the last block. + optional string drop_path_schedule = 6 [default = 'constant']; + // Set the STEM width_multiplier, controlloing STEM convolution channels. + optional float stem_width_multiplier = 7 [default = 1.0]; + // Set the backbone (except STEM) width_multiplier, controlling backbone + // (except STEM) convolution channels. + optional float backbone_width_multiplier = 8 [default = 1.0]; + // Set the backbone (except STEM) layer_multiplier, controlling the number + // of layers in the backbone (except STEM). + optional float backbone_layer_multiplier = 9 [default = 1.0]; + // Use the Switchable Atrous Convolution (SAC) beyond the specified stride. + // For example, if use_sac_beyond_stride = 16, SAC will be applied to the + // network stage whose original output stride >= 16 (i.e., 16 and 32, or + // the last two stages). Set to -1 to disable it. + optional int32 use_sac_beyond_stride = 10 [default = -1]; + } + // Set the model option for the backbone encoder model. + optional BackboneOptions backbone = 1; + + // Shared decoder settings across different meta architectures. + optional DecoderOptions decoder = 2; + + // Meta-architecture specific settings. + message DeeplabV3Options { + // Set the number of classes for the last convolution to predict logits. + optional int32 num_classes = 1; + } + + message DeeplabV3PlusOptions { + // Set the low level options used in this decoder. The list of + // LowLevelOptions must be ordered from higher to lower levels. 
+ optional LowLevelOptions low_level = 1; + + // Set the number of classes for the last convolution to predict logits. + optional int32 num_classes = 2; + } + + message PanopticDeeplabOptions { + // Set the low level options used in this decoder. The list of + // LowLevelOptions must be ordered lower resolution to higher resolution. + repeated LowLevelOptions low_level = 1; + // Set the model options for the instance branch. + optional InstanceOptions instance = 2; + // Set the model options of the semantic head. + optional HeadOptions semantic_head = 3; + } + + message MotionDeepLabOptions { + // Set the low level options used in this decoder. The list of + // LowLevelOptions must be ordered lower resolution to higher resolution. + repeated LowLevelOptions low_level = 1; + // Set the model options for the instance branch. + optional InstanceOptions instance = 2; + // Set the model options of the semantic head. + optional HeadOptions semantic_head = 3; + // Set the model options for the motion head. + optional HeadOptions motion_head = 4; + } + + message MaXDeepLabOptions { + // Set the head options of the mask head. + optional HeadOptions pixel_space_head = 1; + // Set the low level options used in the semantic decoder. The list of + // LowLevelOptions must be ordered lower resolution to higher resolution. + repeated LowLevelOptions auxiliary_low_level = 2; + // Set the head options of the semantic head. + optional HeadOptions auxiliary_semantic_head = 3; + } + + oneof meta_architecture { + DeeplabV3Options deeplab_v3 = 3; + DeeplabV3PlusOptions deeplab_v3_plus = 4; + PanopticDeeplabOptions panoptic_deeplab = 5; + MotionDeepLabOptions motion_deeplab = 7; + MaXDeepLabOptions max_deeplab = 10; + PanopticDeeplabOptions vip_deeplab = 11; + } + // Set the checkpoint to load. + optional string initial_checkpoint = 6; + // Set whether to restore the last convolution of the semantic head when + // loading from the initial checkpoint. Setting this flag to false is useful + // when an initial checkpoint was trained on a dataset with different classes. + optional bool restore_semantic_last_layer_from_initial_checkpoint = 8 + [default = true]; + // Set whether to restore the last convolution of the instance heads when + // loading from the initial checkpoint. Depending on the meta architecture, + // this includes center heatmap, center regression and motion regression. + optional bool restore_instance_last_layer_from_initial_checkpoint = 9 + [default = true]; +} diff --git a/model/__init__.py b/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03 --- /dev/null +++ b/model/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ diff --git a/model/builder.py b/model/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..9983b3e7f38597a384aa99e9ab9a32158c3eef46 --- /dev/null +++ b/model/builder.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains functions to build encoder and decoder.""" +import tensorflow as tf + +from deeplab2 import config_pb2 +from deeplab2.model.decoder import deeplabv3 +from deeplab2.model.decoder import deeplabv3plus +from deeplab2.model.decoder import max_deeplab +from deeplab2.model.decoder import motion_deeplab_decoder +from deeplab2.model.decoder import panoptic_deeplab +from deeplab2.model.decoder import vip_deeplab_decoder +from deeplab2.model.encoder import axial_resnet_instances +from deeplab2.model.encoder import mobilenet + + +def create_encoder(backbone_options: config_pb2.ModelOptions.BackboneOptions, + bn_layer: tf.keras.layers.Layer, + conv_kernel_weight_decay: float = 0.0) -> tf.keras.Model: + """Creates an encoder. + + Args: + backbone_options: A proto config of type + config_pb2.ModelOptions.BackboneOptions. + bn_layer: A tf.keras.layers.Layer that computes the normalization. + conv_kernel_weight_decay: A float, the weight decay for convolution kernels. + + Returns: + An instance of tf.keras.Model containing the encoder. + + Raises: + ValueError: An error occurs when the specified encoder meta architecture is + not supported. + """ + if ('resnet' in backbone_options.name or + 'swidernet' in backbone_options.name or + 'axial_deeplab' in backbone_options.name or + 'max_deeplab' in backbone_options.name): + return create_resnet_encoder( + backbone_options, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay) + elif 'mobilenet' in backbone_options.name: + return create_mobilenet_encoder( + backbone_options, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay) + raise ValueError('The specified encoder %s is not a valid encoder.' % + backbone_options.name) + + +def create_mobilenet_encoder( + backbone_options: config_pb2.ModelOptions.BackboneOptions, + bn_layer: tf.keras.layers.Layer, + conv_kernel_weight_decay: float = 0.0) -> tf.keras.Model: + """Creates a MobileNet encoder specified by name. + + Args: + backbone_options: A proto config of type + config_pb2.ModelOptions.BackboneOptions. + bn_layer: A tf.keras.layers.Layer that computes the normalization. + conv_kernel_weight_decay: A float, the weight decay for convolution kernels. + + Returns: + An instance of tf.keras.Model containing the MobileNet encoder. + """ + if backbone_options.name.lower() == 'mobilenet_v3_large': + backbone = mobilenet.MobileNetV3Large + elif backbone_options.name.lower() == 'mobilenet_v3_small': + backbone = mobilenet.MobileNetV3Small + else: + raise ValueError('The specified encoder %s is not a valid encoder.' 
% + backbone_options.name) + assert backbone_options.use_squeeze_and_excite + assert backbone_options.drop_path_keep_prob == 1 + assert backbone_options.use_sac_beyond_stride == -1 + assert backbone_options.backbone_layer_multiplier == 1 + return backbone( + output_stride=backbone_options.output_stride, + width_multiplier=backbone_options.backbone_width_multiplier, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay) + + +def create_resnet_encoder( + backbone_options: config_pb2.ModelOptions.BackboneOptions, + bn_layer: tf.keras.layers.Layer, + conv_kernel_weight_decay: float = 0.0) -> tf.keras.Model: + """Creates a ResNet encoder specified by name. + + Args: + backbone_options: A proto config of type + config_pb2.ModelOptions.BackboneOptions. + bn_layer: A tf.keras.layers.Layer that computes the normalization. + conv_kernel_weight_decay: A float, the weight decay for convolution kernels. + + Returns: + An instance of tf.keras.Model containing the ResNet encoder. + """ + return axial_resnet_instances.get_model( + backbone_options.name, + output_stride=backbone_options.output_stride, + stem_width_multiplier=backbone_options.stem_width_multiplier, + width_multiplier=backbone_options.backbone_width_multiplier, + backbone_layer_multiplier=backbone_options.backbone_layer_multiplier, + block_group_config={ + 'use_squeeze_and_excite': backbone_options.use_squeeze_and_excite, + 'drop_path_keep_prob': backbone_options.drop_path_keep_prob, + 'drop_path_schedule': backbone_options.drop_path_schedule, + 'use_sac_beyond_stride': backbone_options.use_sac_beyond_stride}, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay) + + +def create_decoder(model_options: config_pb2.ModelOptions, + bn_layer: tf.keras.layers.Layer, + ignore_label: int) -> tf.keras.Model: + """Creates a DeepLab decoder. + + Args: + model_options: A proto config of type config_pb2.ModelOptions. + bn_layer: A tf.keras.layers.Layer that computes the normalization. + ignore_label: An integer specifying the ignore label. + + Returns: + An instance of tf.keras.layers.Layer containing the decoder. + + Raises: + ValueError: An error occurs when the specified meta architecture is not + supported. + """ + meta_architecture = model_options.WhichOneof('meta_architecture') + if meta_architecture == 'deeplab_v3': + return deeplabv3.DeepLabV3( + model_options.decoder, model_options.deeplab_v3, bn_layer=bn_layer) + elif meta_architecture == 'deeplab_v3_plus': + return deeplabv3plus.DeepLabV3Plus( + model_options.decoder, model_options.deeplab_v3_plus, bn_layer=bn_layer) + elif meta_architecture == 'panoptic_deeplab': + return panoptic_deeplab.PanopticDeepLab( + model_options.decoder, + model_options.panoptic_deeplab, + bn_layer=bn_layer) + elif meta_architecture == 'motion_deeplab': + return motion_deeplab_decoder.MotionDeepLabDecoder( + model_options.decoder, + model_options.motion_deeplab, + bn_layer=bn_layer) + elif meta_architecture == 'vip_deeplab': + return vip_deeplab_decoder.ViPDeepLabDecoder( + model_options.decoder, + model_options.vip_deeplab, + bn_layer=bn_layer) + elif meta_architecture == 'max_deeplab': + return max_deeplab.MaXDeepLab( + model_options.decoder, + model_options.max_deeplab, + ignore_label=ignore_label, + bn_layer=bn_layer) + raise ValueError('The specified meta architecture %s is not implemented.' 
% + meta_architecture) diff --git a/model/builder_test.py b/model/builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..6fd603127caf05c0c72bc892c8bb93a7c81393be --- /dev/null +++ b/model/builder_test.py @@ -0,0 +1,80 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for model.builder.""" + +import os +from absl.testing import parameterized + +import tensorflow as tf + +from google.protobuf import text_format +from deeplab2 import config_pb2 +from deeplab2.model import builder +from deeplab2.model.decoder import motion_deeplab_decoder +from deeplab2.model.encoder import axial_resnet_instances +from deeplab2.model.encoder import mobilenet +# resources dependency + + +_CONFIG_PATH = 'deeplab2/configs/example' + + +def _read_proto_file(filename, proto): + filename = filename # OSS: removed internal filename loading. + with tf.io.gfile.GFile(filename, 'r') as proto_file: + return text_format.ParseLines(proto_file, proto) + + +class BuilderTest(tf.test.TestCase, parameterized.TestCase): + + def test_resnet50_encoder_creation(self): + backbone_options = config_pb2.ModelOptions.BackboneOptions( + name='resnet50', output_stride=32) + encoder = builder.create_encoder( + backbone_options, + tf.keras.layers.experimental.SyncBatchNormalization) + self.assertIsInstance(encoder, axial_resnet_instances.ResNet50) + + @parameterized.parameters('mobilenet_v3_large', 'mobilenet_v3_small') + def test_mobilenet_encoder_creation(self, model_name): + backbone_options = config_pb2.ModelOptions.BackboneOptions( + name=model_name, use_squeeze_and_excite=True, output_stride=32) + encoder = builder.create_encoder( + backbone_options, + tf.keras.layers.experimental.SyncBatchNormalization) + self.assertIsInstance(encoder, mobilenet.MobileNet) + + def test_resnet_encoder_creation(self): + backbone_options = config_pb2.ModelOptions.BackboneOptions( + name='max_deeplab_s', output_stride=32) + encoder = builder.create_resnet_encoder( + backbone_options, + bn_layer=tf.keras.layers.experimental.SyncBatchNormalization) + self.assertIsInstance(encoder, axial_resnet_instances.MaXDeepLabS) + + def test_decoder_creation(self): + proto_filename = os.path.join( + _CONFIG_PATH, 'example_kitti-step_motion_deeplab.textproto') + model_options = _read_proto_file(proto_filename, config_pb2.ModelOptions()) + motion_decoder = builder.create_decoder( + model_options, tf.keras.layers.experimental.SyncBatchNormalization, + ignore_label=255) + self.assertIsInstance(motion_decoder, + motion_deeplab_decoder.MotionDeepLabDecoder) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/decoder/__init__.py b/model/decoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03 --- /dev/null +++ b/model/decoder/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/model/decoder/aspp.py b/model/decoder/aspp.py
new file mode 100644
index 0000000000000000000000000000000000000000..32cc3e4f66c6ede6a6f32922933d32a0724c7f80
--- /dev/null
+++ b/model/decoder/aspp.py
@@ -0,0 +1,289 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains code to build an ASPP layer.
+
+Reference:
+  - [Rethinking Atrous Convolution for Semantic Image Segmentation](
+        https://arxiv.org/pdf/1706.05587.pdf)
+  - [ParseNet: Looking Wider to See Better](
+        https://arxiv.org/pdf/1506.04579.pdf).
+"""
+from absl import logging
+import tensorflow as tf
+
+from deeplab2.model import utils
+from deeplab2.model.layers import convolutions
+
+
+layers = tf.keras.layers
+backend = tf.keras.backend
+
+
+class ASPPConv(tf.keras.layers.Layer):
+  """An atrous convolution for ASPP."""
+
+  def __init__(self,
+               output_channels,
+               atrous_rate,
+               name,
+               bn_layer=tf.keras.layers.BatchNormalization):
+    """Creates an atrous convolution layer for the ASPP.
+
+    This layer consists of an atrous convolution followed by a BatchNorm layer
+    and a ReLU activation.
+
+    Args:
+      output_channels: An integer specifying the number of output channels of
+        the convolution.
+      atrous_rate: An integer specifying the atrous/dilation rate of the
+        convolution.
+      name: A string specifying the name of this layer.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+    """
+    super(ASPPConv, self).__init__(name=name)
+
+    self._conv_bn_act = convolutions.Conv2DSame(
+        output_channels,
+        kernel_size=3,
+        name='conv_bn_act',
+        atrous_rate=atrous_rate,
+        use_bias=False,
+        use_bn=True,
+        bn_layer=bn_layer,
+        activation='relu')
+
+  def call(self, input_tensor, training=False):
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: An input tensor of type tf.Tensor with shape [batch,
+        height, width, channels].
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      The output tensor.
+    """
+    return self._conv_bn_act(input_tensor, training=training)
+
+
+class ASPPPool(tf.keras.layers.Layer):
+  """A pooling layer for ASPP."""
+
+  def __init__(self,
+               output_channels,
+               name,
+               bn_layer=tf.keras.layers.BatchNormalization):
+    """Creates a pooling layer for the ASPP.
+
+    This layer consists of global average pooling, followed by a convolution,
+    a BatchNorm layer, and a ReLU activation.
+
+    Args:
+      output_channels: An integer specifying the number of output channels of
+        the convolution.
+      name: A string specifying the name of this layer.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+    """
+    super(ASPPPool, self).__init__(name=name)
+
+    self._pool_size = (None, None)
+    self._conv_bn_act = convolutions.Conv2DSame(
+        output_channels,
+        kernel_size=1,
+        name='conv_bn_act',
+        use_bias=False,
+        use_bn=True,
+        bn_layer=bn_layer,
+        activation='relu')
+
+  def set_pool_size(self, pool_size):
+    """Sets the pooling size of the pooling layer.
+
+    The default behavior of the pooling layer is global average pooling. A
+    custom pooling size can be set here.
+
+    Args:
+      pool_size: A tuple specifying the pooling size of the pooling layer.
+
+    Raises:
+      ValueError: If exactly one pooling dimension is set to None.
+    """
+    # If exactly one pooling dimension is None, raise an error.
+    if None in pool_size and pool_size != (None, None):
+      raise ValueError('The ASPP pooling layer requires that the pooling size '
+                       'is set explicitly for both dimensions. If global '
+                       'average pooling is intended, call '
+                       'reset_pooling_layer() or set both dimensions to None.')
+
+    self._pool_size = pool_size
+    logging.info('Global average pooling in the ASPP pooling layer was '
+                 'replaced with tiled average pooling using the provided '
+                 'pool_size. Please make sure this behavior is intended.')
+
+  def get_pool_size(self):
+    return self._pool_size
+
+  def reset_pooling_layer(self):
+    """Resets the pooling layer to global average pooling."""
+    self._pool_size = (None, None)
+
+  def call(self, input_tensor, training=False):
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: An input tensor of type tf.Tensor with shape [batch,
+        height, width, channels].
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      The output tensor.
+    """
+    if tuple(self._pool_size) == (None, None):
+      # Global image pooling.
+      pool_size = input_tensor.shape[1:3]
+    else:
+      # Tiled image pooling.
+      pool_size = self._pool_size
+
+    x = backend.pool2d(input_tensor, pool_size, padding='valid',
+                       pool_mode='avg')
+    x = self._conv_bn_act(x, training=training)
+
+    target_h = tf.shape(input_tensor)[1]
+    target_w = tf.shape(input_tensor)[2]
+
+    x = utils.resize_align_corners(x, [target_h, target_w])
+    return x
+
+
+class ASPP(tf.keras.layers.Layer):
+  """An atrous spatial pyramid pooling layer."""
+
+  def __init__(self,
+               output_channels,
+               atrous_rates,
+               aspp_use_only_1x1_proj_conv=False,
+               name='ASPP',
+               bn_layer=tf.keras.layers.BatchNormalization):
+    """Creates an ASPP layer.
+
+    Args:
+      output_channels: An integer specifying the number of output channels of
+        each ASPP convolution layer.
+      atrous_rates: A list of three integers specifying the atrous/dilation
+        rate of each ASPP convolution layer.
+      aspp_use_only_1x1_proj_conv: Boolean, specifying whether the five ASPP
+        branches are turned off. If True, the ASPP module degenerates to a
+        single 1x1 convolution, projecting the input channels to
+        `output_channels`.
+      name: A string specifying the name of this layer (default: 'ASPP').
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+
+    Raises:
+      ValueError: If `aspp_use_only_1x1_proj_conv` is False and atrous_rates
+        does not contain exactly 3 elements.
+    """
+    super(ASPP, self).__init__(name=name)
+
+    if not aspp_use_only_1x1_proj_conv and len(atrous_rates) != 3:
+      raise ValueError(
+          'The ASPP layers need exactly 3 atrous rates, but %d were given' %
+          len(atrous_rates))
+    self._aspp_use_only_1x1_proj_conv = aspp_use_only_1x1_proj_conv
+
+    # The projection convolution is always used.
+    self._proj_conv_bn_act = convolutions.Conv2DSame(
+        output_channels,
+        kernel_size=1,
+        name='proj_conv_bn_act',
+        use_bias=False,
+        use_bn=True,
+        bn_layer=bn_layer,
+        activation='relu')
+
+    if not aspp_use_only_1x1_proj_conv:
+      self._conv_bn_act = convolutions.Conv2DSame(
+          output_channels,
+          kernel_size=1,
+          name='conv_bn_act',
+          use_bias=False,
+          use_bn=True,
+          bn_layer=bn_layer,
+          activation='relu')
+      rate1, rate2, rate3 = atrous_rates
+      self._aspp_conv1 = ASPPConv(output_channels, rate1, name='aspp_conv1',
+                                  bn_layer=bn_layer)
+      self._aspp_conv2 = ASPPConv(output_channels, rate2, name='aspp_conv2',
+                                  bn_layer=bn_layer)
+      self._aspp_conv3 = ASPPConv(output_channels, rate3, name='aspp_conv3',
+                                  bn_layer=bn_layer)
+      self._aspp_pool = ASPPPool(output_channels, name='aspp_pool',
+                                 bn_layer=bn_layer)
+      # Dropout is needed only when the five ASPP branches are used.
+      self._proj_drop = layers.Dropout(rate=0.1)
+
+  def set_pool_size(self, pool_size):
+    """Sets the pooling size of the ASPP pooling layer.
+
+    The default behavior of the pooling layer is global average pooling. A
+    custom pooling size can be set here.
+
+    Args:
+      pool_size: A tuple specifying the pooling size of the ASPP pooling
+        layer.
+    """
+    if not self._aspp_use_only_1x1_proj_conv:
+      self._aspp_pool.set_pool_size(pool_size)
+
+  def get_pool_size(self):
+    if not self._aspp_use_only_1x1_proj_conv:
+      return self._aspp_pool.get_pool_size()
+    else:
+      return (None, None)
+
+  def reset_pooling_layer(self):
+    """Resets the pooling layer to global average pooling."""
+    # The pooling layer only exists when the five ASPP branches are used.
+    if not self._aspp_use_only_1x1_proj_conv:
+      self._aspp_pool.reset_pooling_layer()
+
+  def call(self, input_tensor, training=False):
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: An input tensor of type tf.Tensor with shape [batch,
+        height, width, channels].
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      The output tensor.
+    """
+    if self._aspp_use_only_1x1_proj_conv:
+      x = self._proj_conv_bn_act(input_tensor, training=training)
+    else:
+      # Apply the ASPP module.
+      results = []
+      results.append(self._conv_bn_act(input_tensor, training=training))
+      results.append(self._aspp_conv1(input_tensor, training=training))
+      results.append(self._aspp_conv2(input_tensor, training=training))
+      results.append(self._aspp_conv3(input_tensor, training=training))
+      results.append(self._aspp_pool(input_tensor, training=training))
+      x = tf.concat(results, 3)
+      x = self._proj_conv_bn_act(x, training=training)
+      x = self._proj_drop(x, training=training)
+    return x
diff --git a/model/decoder/aspp_test.py b/model/decoder/aspp_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..75f952b3daece31efec51c05ce4387837002b216
--- /dev/null
+++ b/model/decoder/aspp_test.py
@@ -0,0 +1,91 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for aspp.""" +import tensorflow as tf + +from deeplab2.model.decoder import aspp +from deeplab2.utils import test_utils + + +class AsppTest(tf.test.TestCase): + + def test_aspp_pool_error(self): + pool = aspp.ASPPPool(output_channels=64, name='') + + # Should pass without an error. + pool.set_pool_size((None, None)) + + with self.assertRaises(ValueError): + # Should raise an error. + pool.set_pool_size((2, None)) + + def test_aspp_conv_atrous_rate_shape(self): + atrous_rates = [2, 6, 12, 18] + for rate in atrous_rates: + conv = aspp.ASPPConv(output_channels=64, atrous_rate=rate, name='') + input_tensor = tf.random.uniform(shape=(2, 12, 12, 3)) + + output = conv(input_tensor) + expected_shape = [2, 12, 12, 64] + self.assertListEqual(output.shape.as_list(), expected_shape) + + def test_aspp_conv_non_negative(self): + conv = aspp.ASPPConv(output_channels=12, atrous_rate=2, name='') + input_tensor = tf.random.uniform(shape=(2, 17, 17, 3)) + + output = conv(input_tensor) + self.assertTrue((output.numpy() >= 0.0).all()) + + def test_aspp_pool_shape(self): + pool = aspp.ASPPPool(output_channels=64, name='') + input_tensor = tf.random.uniform(shape=(2, 12, 12, 3)) + + output = pool(input_tensor) + expected_shape = [2, 12, 12, 64] + self.assertListEqual(output.shape.as_list(), expected_shape) + + def test_aspp_pool_non_negative(self): + pool = aspp.ASPPPool(output_channels=12, name='') + input_tensor = tf.random.uniform(shape=(2, 17, 17, 3)) + + output = pool(input_tensor) + self.assertTrue((output.numpy() >= 0.0).all()) + + def test_aspp_wrong_atrous_rate(self): + with self.assertRaises(ValueError): + _ = aspp.ASPP(output_channels=64, atrous_rates=[1, 2, 3, 4]) + + @test_utils.test_all_strategies + def test_aspp_shape(self, strategy): + with strategy.scope(): + for bn_layer in test_utils.NORMALIZATION_LAYERS: + aspp_layer = aspp.ASPP( + output_channels=64, atrous_rates=[6, 12, 18], bn_layer=bn_layer) + input_tensor = tf.random.uniform(shape=(2, 32, 32, 3)) + + output = aspp_layer(input_tensor) + expected_shape = [2, 32, 32, 64] + self.assertListEqual(output.shape.as_list(), expected_shape) + + def test_aspp_non_negative(self): + aspp_layer = aspp.ASPP(output_channels=32, atrous_rates=[4, 8, 16]) + input_tensor = tf.random.uniform(shape=(2, 32, 32, 3)) + + output = aspp_layer(input_tensor) + self.assertTrue((output.numpy() >= 0.0).all()) + +if __name__ == '__main__': + tf.test.main() diff --git a/model/decoder/deeplabv3.py b/model/decoder/deeplabv3.py new file mode 100644 index 0000000000000000000000000000000000000000..f3217543510dcae2f89f396534ab4a0c15ccff0a --- /dev/null +++ b/model/decoder/deeplabv3.py @@ -0,0 +1,121 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains code to build a DeepLabV3. + +Reference: + - [Rethinking Atrous Convolution for Semantic Image Segmentation]( + https://arxiv.org/pdf/1706.05587.pdf) +""" +import tensorflow as tf + +from deeplab2 import common +from deeplab2.model.decoder import aspp +from deeplab2.model.layers import convolutions + + +layers = tf.keras.layers + + +class DeepLabV3(layers.Layer): + """A DeepLabV3 model. + + This model takes in features from an encoder and performs multi-scale context + aggregation with the help of an ASPP layer. Finally, a classification head is + used to predict a semantic segmentation. + """ + + def __init__(self, + decoder_options, + deeplabv3_options, + bn_layer=tf.keras.layers.BatchNormalization): + """Creates a DeepLabV3 decoder of type layers.Layer. + + Args: + decoder_options: Decoder options as defined in config_pb2.DecoderOptions. + deeplabv3_options: Model options as defined in + config_pb2.ModelOptions.DeeplabV3Options. + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). + """ + super(DeepLabV3, self).__init__(name='DeepLabV3') + + self._feature_name = decoder_options.feature_key + self._aspp = aspp.ASPP(decoder_options.aspp_channels, + decoder_options.atrous_rates, + bn_layer=bn_layer) + + self._classifier_conv_bn_act = convolutions.Conv2DSame( + decoder_options.decoder_channels, + kernel_size=3, + name='classifier_conv_bn_act', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='relu') + + self._final_conv = convolutions.Conv2DSame( + deeplabv3_options.num_classes, kernel_size=1, name='final_conv') + + def set_pool_size(self, pool_size): + """Sets the pooling size of the ASPP pooling layer. + + Args: + pool_size: A tuple specifying the pooling size of the ASPP pooling layer. + """ + self._aspp.set_pool_size(pool_size) + + def get_pool_size(self): + return self._aspp.get_pool_size() + + def reset_pooling_layer(self): + """Resets the ASPP pooling layer to global average pooling.""" + self._aspp.reset_pooling_layer() + + def call(self, features, training=False): + """Performs a forward pass. + + Args: + features: A single input tf.Tensor or an input dict of tf.Tensor with + shape [batch, height, width, channels]. If passed a dict, different keys + should point to different features extracted by the encoder, e.g. + low-level or high-level features. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + A dictionary containing the semantic prediction under key + common.PRED_SEMANTIC_LOGITS_KEY. 
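+      The logits keep the spatial size of the input feature, i.e., they have
+      shape [batch, height, width, num_classes].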
+ """ + if isinstance(features, tf.Tensor): + feature = features + else: + feature = features[self._feature_name] + + x = self._aspp(feature, training=training) + + x = self._classifier_conv_bn_act(x, training=training) + + return {common.PRED_SEMANTIC_LOGITS_KEY: self._final_conv(x)} + + @property + def checkpoint_items(self): + items = { + common.CKPT_DEEPLABV3_ASPP: self._aspp, + common.CKPT_DEEPLABV3_CLASSIFIER_CONV_BN_ACT: + self._classifier_conv_bn_act, + common.CKPT_SEMANTIC_LAST_LAYER: self._final_conv, + } + return items diff --git a/model/decoder/deeplabv3_test.py b/model/decoder/deeplabv3_test.py new file mode 100644 index 0000000000000000000000000000000000000000..9cf6698585cb0ce5d14b53021cbe631ad26a1848 --- /dev/null +++ b/model/decoder/deeplabv3_test.py @@ -0,0 +1,143 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for deeplabv3.""" + +import numpy as np +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.model.decoder import deeplabv3 +from deeplab2.utils import test_utils + + +def _create_deeplabv3_model(feature_key, decoder_channels, aspp_channels, + atrous_rates, num_classes, **kwargs): + decoder_options = config_pb2.DecoderOptions( + feature_key=feature_key, + decoder_channels=decoder_channels, + aspp_channels=aspp_channels, + atrous_rates=atrous_rates) + deeplabv3_options = config_pb2.ModelOptions.DeeplabV3Options( + num_classes=num_classes) + return deeplabv3.DeepLabV3(decoder_options, deeplabv3_options, **kwargs) + + +class Deeplabv3Test(tf.test.TestCase): + + def test_deeplabv3_feature_key_not_present(self): + deeplabv3_decoder = _create_deeplabv3_model( + feature_key='not_in_features_dict', + aspp_channels=64, + decoder_channels=48, + atrous_rates=[6, 12, 18], + num_classes=80) + input_dict = dict() + input_dict['not_the_same_key'] = tf.random.uniform(shape=(2, 65, 65, 32)) + + with self.assertRaises(KeyError): + _ = deeplabv3_decoder(input_dict) + + def test_deeplabv3_output_shape(self): + list_of_num_classes = [2, 19, 133] + for num_classes in list_of_num_classes: + deeplabv3_decoder = _create_deeplabv3_model( + feature_key='not_used', + aspp_channels=64, + decoder_channels=48, + atrous_rates=[6, 12, 18], + num_classes=num_classes) + input_tensor = tf.random.uniform(shape=(2, 65, 65, 32)) + expected_shape = [2, 65, 65, num_classes] + + logit_tensor = deeplabv3_decoder(input_tensor) + self.assertListEqual( + logit_tensor[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_shape) + + @test_utils.test_all_strategies + def test_sync_bn(self, strategy): + input_tensor = tf.random.uniform(shape=(2, 65, 65, 32)) + with strategy.scope(): + for bn_layer in test_utils.NORMALIZATION_LAYERS: + deeplabv3_decoder = _create_deeplabv3_model( + feature_key='not_used', + aspp_channels=64, + decoder_channels=48, + atrous_rates=[6, 12, 18], + num_classes=19, + bn_layer=bn_layer) + _ = deeplabv3_decoder(input_tensor) + + def 
test_deeplabv3_feature_extraction_consistency(self): + deeplabv3_decoder = _create_deeplabv3_model( + aspp_channels=64, + decoder_channels=48, + atrous_rates=[6, 12, 18], + num_classes=80, + feature_key='feature_key') + input_tensor = tf.random.uniform(shape=(2, 65, 65, 32)) + input_dict = dict() + input_dict['feature_key'] = input_tensor + + reference_logits_tensor = deeplabv3_decoder(input_tensor, training=False) + logits_tensor_to_compare = deeplabv3_decoder(input_dict, training=False) + + np.testing.assert_equal( + reference_logits_tensor[common.PRED_SEMANTIC_LOGITS_KEY].numpy(), + logits_tensor_to_compare[common.PRED_SEMANTIC_LOGITS_KEY].numpy()) + + def test_deeplabv3_pool_size_setter(self): + deeplabv3_decoder = _create_deeplabv3_model( + feature_key='not_used', + aspp_channels=64, + decoder_channels=48, + atrous_rates=[6, 12, 18], + num_classes=80) + pool_size = (10, 10) + deeplabv3_decoder.set_pool_size(pool_size) + + self.assertTupleEqual(deeplabv3_decoder._aspp._aspp_pool._pool_size, + pool_size) + + def test_deeplabv3_pool_size_resetter(self): + deeplabv3_decoder = _create_deeplabv3_model( + feature_key='not_used', + aspp_channels=64, + decoder_channels=48, + atrous_rates=[6, 12, 18], + num_classes=80) + pool_size = (None, None) + deeplabv3_decoder.reset_pooling_layer() + + self.assertTupleEqual(deeplabv3_decoder._aspp._aspp_pool._pool_size, + pool_size) + + def test_deeplabv3_ckpt_items(self): + deeplabv3_decoder = _create_deeplabv3_model( + feature_key='not_used', + aspp_channels=64, + decoder_channels=48, + atrous_rates=[6, 12, 18], + num_classes=80) + ckpt_dict = deeplabv3_decoder.checkpoint_items + self.assertIn(common.CKPT_DEEPLABV3_ASPP, ckpt_dict) + self.assertIn(common.CKPT_DEEPLABV3_CLASSIFIER_CONV_BN_ACT, ckpt_dict) + self.assertIn(common.CKPT_SEMANTIC_LAST_LAYER, ckpt_dict) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/decoder/deeplabv3plus.py b/model/decoder/deeplabv3plus.py new file mode 100644 index 0000000000000000000000000000000000000000..35d66f8ad95fc7ab6e3bbf54774a0c50cf105bbb --- /dev/null +++ b/model/decoder/deeplabv3plus.py @@ -0,0 +1,145 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains code to build a DeepLabV3Plus. + +Reference: + - [Encoder-Decoder with Atrous Separable Convolution for Semantic Image + Segmentation](https://arxiv.org/pdf/1802.02611.pdf) +""" + +import tensorflow as tf + +from deeplab2 import common +from deeplab2.model import utils +from deeplab2.model.decoder import aspp +from deeplab2.model.layers import convolutions + + +layers = tf.keras.layers + + +class DeepLabV3Plus(tf.keras.layers.Layer): + """A DeepLabV3+ decoder model. + + This model takes in low- and high-level features from an encoder and performs + multi-scale context aggregation with the help of an ASPP layer on high-level + features. 
These are concatenated with the low-level features and used as input + to the classification head that is used to predict a semantic segmentation. + """ + + def __init__(self, + decoder_options, + deeplabv3plus_options, + bn_layer=tf.keras.layers.BatchNormalization): + """Creates a DeepLabV3+ decoder of type tf.keras.layers.Layer. + + Args: + decoder_options: Decoder options as defined in config_pb2.DecoderOptions. + deeplabv3plus_options: Model options as defined in + config_pb2.ModelOptions.DeeplabV3PlusOptions. + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). + """ + super(DeepLabV3Plus, self).__init__(name='DeepLabv3Plus') + + self._high_level_feature_name = decoder_options.feature_key + self._low_level_feature_name = deeplabv3plus_options.low_level.feature_key + self._aspp = aspp.ASPP(decoder_options.aspp_channels, + decoder_options.atrous_rates, + bn_layer=bn_layer) + + # Layers for low-level feature transformation. + self._project_conv_bn_act = convolutions.Conv2DSame( + deeplabv3plus_options.low_level.channels_project, + kernel_size=1, + name='project_conv_bn_act', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='relu') + + # Layers for fusing low- and high-level features. + self._fuse = convolutions.StackedConv2DSame( + conv_type='depthwise_separable_conv', + num_layers=2, + output_channels=decoder_options.decoder_channels, + kernel_size=3, + name='fuse', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='relu') + + self._final_conv = convolutions.Conv2DSame( + deeplabv3plus_options.num_classes, kernel_size=1, name='final_conv') + + def reset_pooling_layer(self): + """Resets the ASPP pooling layer to global average pooling.""" + self._aspp.reset_pooling_layer() + + def set_pool_size(self, pool_size): + """Sets the pooling size of the ASPP pooling layer. + + Args: + pool_size: A tuple specifying the pooling size of the ASPP pooling layer. + """ + self._aspp.set_pool_size(pool_size) + + def get_pool_size(self): + return self._aspp.get_pool_size() + + @property + def checkpoint_items(self): + items = { + common.CKPT_DEEPLABV3PLUS_ASPP: self._aspp, + common.CKPT_DEEPLABV3PLUS_PROJECT_CONV_BN_ACT: + self._project_conv_bn_act, + common.CKPT_DEEPLABV3PLUS_FUSE: self._fuse, + common.CKPT_SEMANTIC_LAST_LAYER: self._final_conv, + } + return items + + def call(self, features, training=False): + """Performs a forward pass. + + Args: + features: An input dict of tf.Tensor with shape [batch, height, width, + channels]. Different keys should point to different features extracted + by the encoder, e.g. low-level or high-level features. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + A dictionary containing the semantic prediction under key + common.PRED_SEMANTIC_LOGITS_KEY. 
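+      The logits are produced at the spatial resolution of the low-level
+      features, to which the ASPP output is upsampled before fusion.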
+    """
+    low_level_features = features[self._low_level_feature_name]
+    high_level_features = features[self._high_level_feature_name]
+
+    high_level_features = self._aspp(high_level_features, training=training)
+
+    low_level_features = self._project_conv_bn_act(low_level_features,
+                                                   training=training)
+
+    target_h = tf.shape(low_level_features)[1]
+    target_w = tf.shape(low_level_features)[2]
+
+    high_level_features = utils.resize_align_corners(
+        high_level_features, [target_h, target_w])
+    x = tf.concat([high_level_features, low_level_features], 3)
+    # Pass the training flag so the BatchNorm layers in the fusion block
+    # behave correctly during training.
+    x = self._fuse(x, training=training)
+
+    return {common.PRED_SEMANTIC_LOGITS_KEY: self._final_conv(x)}
diff --git a/model/decoder/deeplabv3plus_test.py b/model/decoder/deeplabv3plus_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1419b55acc0a5973e414ca7a12d2716d0f838b57
--- /dev/null
+++ b/model/decoder/deeplabv3plus_test.py
@@ -0,0 +1,169 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for deeplabv3plus."""
+
+import numpy as np
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2 import config_pb2
+from deeplab2.model.decoder import deeplabv3plus
+from deeplab2.utils import test_utils
+
+
+def _create_deeplabv3plus_model(high_level_feature_name,
+                                low_level_feature_name,
+                                low_level_channels_project,
+                                aspp_output_channels, decoder_output_channels,
+                                atrous_rates, num_classes, **kwargs):
+  decoder_options = config_pb2.DecoderOptions(
+      feature_key=high_level_feature_name,
+      decoder_channels=decoder_output_channels,
+      aspp_channels=aspp_output_channels,
+      atrous_rates=atrous_rates)
+  deeplabv3plus_options = config_pb2.ModelOptions.DeeplabV3PlusOptions(
+      low_level=config_pb2.LowLevelOptions(
+          feature_key=low_level_feature_name,
+          channels_project=low_level_channels_project),
+      num_classes=num_classes)
+  return deeplabv3plus.DeepLabV3Plus(decoder_options, deeplabv3plus_options,
+                                     **kwargs)
+
+
+class Deeplabv3PlusTest(tf.test.TestCase):
+
+  def test_deeplabv3plus_feature_key_not_present(self):
+    deeplabv3plus_decoder = _create_deeplabv3plus_model(
+        high_level_feature_name='not_in_features_dict',
+        low_level_feature_name='in_feature_dict',
+        low_level_channels_project=128,
+        aspp_output_channels=64,
+        decoder_output_channels=64,
+        atrous_rates=[6, 12, 18],
+        num_classes=80)
+    input_dict = dict()
+    input_dict['in_feature_dict'] = tf.random.uniform(shape=(2, 65, 65, 32))
+
+    with self.assertRaises(KeyError):
+      _ = deeplabv3plus_decoder(input_dict)
+
+  def test_deeplabv3plus_output_shape(self):
+    list_of_num_classes = [2, 19, 133]
+    for num_classes in list_of_num_classes:
+      deeplabv3plus_decoder = _create_deeplabv3plus_model(
+          high_level_feature_name='high',
+          low_level_feature_name='low',
+          low_level_channels_project=128,
+          aspp_output_channels=64,
+          decoder_output_channels=128,
+          atrous_rates=[6, 12, 18],
+          num_classes=num_classes)
+      input_dict = dict()
+      input_dict['high'] = tf.random.uniform(shape=(2, 65, 65, 32))
+      input_dict['low'] =
tf.random.uniform(shape=(2, 129, 129, 16)) + expected_shape = [2, 129, 129, num_classes] + + logit_tensor = deeplabv3plus_decoder(input_dict) + self.assertListEqual( + logit_tensor[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_shape) + + def test_deeplabv3plus_feature_extraction_consistency(self): + deeplabv3plus_decoder = _create_deeplabv3plus_model( + high_level_feature_name='high', + low_level_feature_name='low', + low_level_channels_project=128, + aspp_output_channels=96, + decoder_output_channels=64, + atrous_rates=[6, 12, 18], + num_classes=80) + input_dict = dict() + input_dict['high'] = tf.random.uniform(shape=(2, 65, 65, 32)) + input_dict['low'] = tf.random.uniform(shape=(2, 129, 129, 16)) + + reference_logits_tensor = deeplabv3plus_decoder( + input_dict, training=False) + logits_tensor_to_compare = deeplabv3plus_decoder(input_dict, training=False) + + np.testing.assert_equal( + reference_logits_tensor[common.PRED_SEMANTIC_LOGITS_KEY].numpy(), + logits_tensor_to_compare[common.PRED_SEMANTIC_LOGITS_KEY].numpy()) + + def test_deeplabv3plus_pool_size_setter(self): + deeplabv3plus_decoder = _create_deeplabv3plus_model( + high_level_feature_name='high', + low_level_feature_name='low', + low_level_channels_project=128, + aspp_output_channels=96, + decoder_output_channels=64, + atrous_rates=[6, 12, 18], + num_classes=80) + pool_size = (10, 10) + deeplabv3plus_decoder.set_pool_size(pool_size) + + self.assertTupleEqual(deeplabv3plus_decoder._aspp._aspp_pool._pool_size, + pool_size) + + @test_utils.test_all_strategies + def test_deeplabv3plus_sync_bn(self, strategy): + input_dict = dict() + input_dict['high'] = tf.random.uniform(shape=(2, 65, 65, 32)) + input_dict['low'] = tf.random.uniform(shape=(2, 129, 129, 16)) + with strategy.scope(): + for bn_layer in test_utils.NORMALIZATION_LAYERS: + deeplabv3plus_decoder = _create_deeplabv3plus_model( + high_level_feature_name='high', + low_level_feature_name='low', + low_level_channels_project=128, + aspp_output_channels=96, + decoder_output_channels=64, + atrous_rates=[6, 12, 18], + num_classes=80, + bn_layer=bn_layer) + _ = deeplabv3plus_decoder(input_dict) + + def test_deeplabv3plus_pool_size_resetter(self): + deeplabv3plus_decoder = _create_deeplabv3plus_model( + high_level_feature_name='high', + low_level_feature_name='low', + low_level_channels_project=128, + aspp_output_channels=96, + decoder_output_channels=64, + atrous_rates=[6, 12, 18], + num_classes=80) + pool_size = (None, None) + deeplabv3plus_decoder.reset_pooling_layer() + + self.assertTupleEqual(deeplabv3plus_decoder._aspp._aspp_pool._pool_size, + pool_size) + + def test_deeplabv3plus_ckpt_items(self): + deeplabv3plus_decoder = _create_deeplabv3plus_model( + high_level_feature_name='high', + low_level_feature_name='low', + low_level_channels_project=128, + aspp_output_channels=96, + decoder_output_channels=64, + atrous_rates=[6, 12, 18], + num_classes=80) + ckpt_dict = deeplabv3plus_decoder.checkpoint_items + self.assertIn(common.CKPT_DEEPLABV3PLUS_ASPP, ckpt_dict) + self.assertIn(common.CKPT_DEEPLABV3PLUS_PROJECT_CONV_BN_ACT, ckpt_dict) + self.assertIn(common.CKPT_DEEPLABV3PLUS_FUSE, ckpt_dict) + self.assertIn(common.CKPT_SEMANTIC_LAST_LAYER, ckpt_dict) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/decoder/max_deeplab.py b/model/decoder/max_deeplab.py new file mode 100644 index 0000000000000000000000000000000000000000..b8c61a09a8445fe6406806bdabb4b0b932dd6f23 --- /dev/null +++ b/model/decoder/max_deeplab.py @@ -0,0 +1,328 @@ +# coding=utf-8 +# 
Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains code to build MaX-DeepLab output heads.
+
+Reference:
+  MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers",
+  CVPR 2021. https://arxiv.org/abs/2012.00759
+    Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
+"""
+import math
+
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2.model.decoder import panoptic_deeplab
+from deeplab2.model.layers import convolutions
+
+_PIXEL_SPACE_FEATURE_KEY = 'pixel_space_feature'
+
+
+def _get_transformer_class_head_num_classes(
+    auxiliary_semantic_head_output_channels,
+    ignore_label):
+  """Computes the number of classes for the transformer class head.
+
+  The transformer class head predicts non-void classes (i.e., thing classes
+  and stuff classes) and a void (i.e., ∅, no object) class. If the auxiliary
+  semantic head output channels already include the void class, e.g., on
+  COCO, we directly use the semantic output channels. Otherwise, e.g., on
+  Cityscapes, we add 1 (the void class) to the transformer class head.
+
+  Args:
+    auxiliary_semantic_head_output_channels: An integer, the number of output
+      channels of the auxiliary semantic head (it should be the same as the
+      num_classes field of the dataset information).
+    ignore_label: An integer specifying the ignore label, e.g., 255.
+
+  Returns:
+    num_classes: An integer, the number of classes for the transformer class
+      head.
+  """
+  if ignore_label >= auxiliary_semantic_head_output_channels:
+    return auxiliary_semantic_head_output_channels + 1
+  else:
+    return auxiliary_semantic_head_output_channels
+
+
+def add_bias_towards_void(transformer_class_logits, void_prior_prob=0.9):
+  """Adds an initial bias towards the void (no object) class to class logits.
+
+  We initialize the void class with a large probability, similar to Section
+  3.3 of the Focal Loss paper.
+
+  Reference:
+    Focal Loss for Dense Object Detection, ICCV 2017.
+      https://arxiv.org/abs/1708.02002
+        Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, Piotr Dollár.
+
+  Args:
+    transformer_class_logits: A [batch, num_mask_slots, num_classes] tensor,
+      the class logits predicted by the transformer. It concatenates
+      (num_classes - 1) non-void classes, including both thing classes and
+      stuff classes, and the void class (the last channel). If the dataset
+      class IDs do not follow this order, MaX-DeepLab loss functions will
+      handle the mapping and thus the architecture still supports any dataset.
+    void_prior_prob: A float, the desired probability (after softmax) of the
+      void class at initialization. Defaults to 0.9 as in MaX-DeepLab.
+
+  Returns:
+    updated_transformer_class_logits: A [batch, num_mask_slots, num_classes]
+      tensor with the updated logits.
+
+  Raises:
+    ValueError: If the rank of transformer_class_logits is not 3.
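+
+  For example, with num_classes = 3 and void_prior_prob = 0.9, the void bias
+  is initialized to log(2 * 0.9 / 0.1) ≈ 2.89, so that a zero-logit
+  prediction assigns a softmax probability of roughly 0.9 to the void class.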
+ """ + class_logits_shape = transformer_class_logits.get_shape().as_list() + if len(class_logits_shape) != 3: + raise ValueError('Input transformer_class_logits should have rank 3.') + + init_bias = [0.0] * class_logits_shape[-1] + init_bias[-1] = math.log( + (class_logits_shape[-1] - 1) * void_prior_prob / (1 - void_prior_prob)) + + # Broadcasting the 1D init_bias to the 3D transformer_class_logits. + return transformer_class_logits + tf.constant(init_bias, dtype=tf.float32) + + +def batch_norm_on_an_extra_axis(inputs, bn_layer): + """Applies a batch norm layer on an extra axis. + + This batch norm will be used on the pixel space mask logits in MaX-DeepLab to + avoid careful initialization of previous layers and careful scaling of the + resulting outputs. In addition, applying batch norm on an extra axis does not + introduce an extra gamma and beta for each mask slot. Instead, the current + gamma and beta are shared for all mask slots and do not introduce biases on + mask slots. + + Args: + inputs: A [batch, height, width, num_mask_slots] tensor. + bn_layer: A batch norm tf.keras.layers.Layer on the last axis. + + Returns: + outputs: A [batch, height, width, num_mask_slots] tensor. + """ + expanded_inputs = tf.expand_dims(inputs, axis=-1) + outputs = bn_layer(expanded_inputs) + return tf.squeeze(outputs, axis=-1) + + +class MaXDeepLab(tf.keras.layers.Layer): + """A MaX-DeepLab head layer.""" + + def __init__(self, + decoder_options, + max_deeplab_options, + ignore_label, + bn_layer=tf.keras.layers.BatchNormalization): + """Initializes a MaX-DeepLab head. + + Args: + decoder_options: Decoder options as defined in config_pb2.DecoderOptions. + max_deeplab_options: Model options as defined in + config_pb2.ModelOptions.MaXDeepLabOptions. + ignore_label: An integer specifying the ignore label. + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). 
+
+    """
+    super(MaXDeepLab, self).__init__(name='MaXDeepLab')
+
+    low_level_feature_keys = [
+        item.feature_key for item in max_deeplab_options.auxiliary_low_level
+    ]
+    low_level_channels_project = [
+        item.channels_project
+        for item in max_deeplab_options.auxiliary_low_level
+    ]
+
+    self._auxiliary_semantic_decoder = (
+        panoptic_deeplab.PanopticDeepLabSingleDecoder(
+            high_level_feature_name=decoder_options.feature_key,
+            low_level_feature_names=low_level_feature_keys,
+            low_level_channels_project=low_level_channels_project,
+            aspp_output_channels=decoder_options.aspp_channels,
+            decoder_output_channels=decoder_options.decoder_channels,
+            atrous_rates=decoder_options.atrous_rates,
+            name='auxiliary_semantic_decoder',
+            aspp_use_only_1x1_proj_conv=decoder_options
+            .aspp_use_only_1x1_proj_conv,
+            decoder_conv_type=decoder_options.decoder_conv_type,
+            bn_layer=bn_layer))
+    self._auxiliary_semantic_head = panoptic_deeplab.PanopticDeepLabSingleHead(
+        max_deeplab_options.auxiliary_semantic_head.head_channels,
+        max_deeplab_options.auxiliary_semantic_head.output_channels,
+        common.PRED_SEMANTIC_LOGITS_KEY,
+        name='auxiliary_semantic_head',
+        conv_type=max_deeplab_options.auxiliary_semantic_head.head_conv_type,
+        bn_layer=bn_layer)
+    self._pixel_space_head = panoptic_deeplab.PanopticDeepLabSingleHead(
+        max_deeplab_options.pixel_space_head.head_channels,
+        max_deeplab_options.pixel_space_head.output_channels,
+        _PIXEL_SPACE_FEATURE_KEY,
+        name='pixel_space_head',
+        conv_type=max_deeplab_options.pixel_space_head.head_conv_type,
+        bn_layer=bn_layer)
+
+    self._transformer_mask_head = convolutions.Conv1D(
+        output_channels=max_deeplab_options.pixel_space_head.output_channels,
+        name='transformer_mask_head',
+        use_bias=False,
+        # Use bn to avoid careful initialization.
+        use_bn=True,
+        bn_layer=bn_layer,
+        bn_gamma_initializer='ones',
+        activation=None,
+        kernel_initializer='he_normal',
+        kernel_size=1,
+        padding='valid')
+    # The transformer class head predicts non-void classes (i.e., thing
+    # classes and stuff classes) and a void (i.e., ∅, no object) class.
+    num_classes = _get_transformer_class_head_num_classes(
+        max_deeplab_options.auxiliary_semantic_head.output_channels,
+        ignore_label=ignore_label)
+    self._transformer_class_head = convolutions.Conv1D(
+        output_channels=num_classes,
+        name='transformer_class_head',
+        # Use a conv bias rather than bn on this final class logit output.
+        use_bias=True,
+        use_bn=False,
+        activation=None,
+        # Follow the common ImageNet class initialization with stddev 0.01.
+        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01),
+        kernel_size=1,
+        padding='valid')
+
+    self._pixel_space_feature_batch_norm = bn_layer(
+        axis=-1, name='pixel_space_feature_batch_norm',
+        gamma_initializer=tf.keras.initializers.Constant(1.0))
+    # Use a batch norm to avoid careful initialization of the mask outputs.
+    self._pixel_space_mask_batch_norm = bn_layer(
+        axis=-1, name='pixel_space_mask_batch_norm',
+        # Initialize the pixel space mask with a low temperature.
+        gamma_initializer=tf.keras.initializers.Constant(0.1))
+
+  def reset_pooling_layer(self):
+    """Resets the ASPP pooling layers to global average pooling."""
+    self._auxiliary_semantic_decoder.reset_pooling_layer()
+
+  def set_pool_size(self, pool_size):
+    """Sets the pooling size of the ASPP pooling layers.
+
+    Args:
+      pool_size: A tuple specifying the pooling size of the ASPP pooling
+        layers.
+ """ + self._auxiliary_semantic_decoder.set_pool_size(pool_size) + + def get_pool_size(self): + return self._auxiliary_semantic_decoder.get_pool_size() + + @property + def checkpoint_items(self): + items = { + common.CKPT_SEMANTIC_DECODER: + self._auxiliary_semantic_decoder, + common.CKPT_SEMANTIC_HEAD_WITHOUT_LAST_LAYER: + self._auxiliary_semantic_head.conv_block, + common.CKPT_SEMANTIC_LAST_LAYER: + self._auxiliary_semantic_head.final_conv, + common.CKPT_PIXEL_SPACE_HEAD: + self._pixel_space_head, + common.CKPT_TRANSFORMER_MASK_HEAD: + self._transformer_mask_head, + common.CKPT_TRANSFORMER_CLASS_HEAD: + self._transformer_class_head, + common.CKPT_PIXEL_SPACE_FEATURE_BATCH_NORM: + self._pixel_space_feature_batch_norm, + common.CKPT_PIXEL_SPACE_MASK_BATCH_NORM: + self._pixel_space_mask_batch_norm, + } + return items + + def call(self, features, training=False): + """Performs a forward pass. + + Args: + features: An input dict of tf.Tensor with shape [batch, height, width, + channels] or [batch, length, channels]. Different keys should point to + different features extracted by the encoder, e.g., low-level or + high-level features. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + A dictionary containing the auxiliary semantic segmentation logits, the + pixel space normalized feature, the pixel space mask logits, and the + mask transformer class logits. + """ + results = {} + semantic_features = features['feature_semantic'] + panoptic_features = features['feature_panoptic'] + transformer_class_feature = features['transformer_class_feature'] + transformer_mask_feature = features['transformer_mask_feature'] + + # Auxiliary semantic head. + semantic_shape = semantic_features.get_shape().as_list() + panoptic_shape = panoptic_features.get_shape().as_list() + # MaX-DeepLab always predicts panoptic feature at high resolution (e.g., + # stride 4 or stride 2), but the auxiliary semantic feature could be at low + # resolution (e.g., stride 16 or stride 32), in the absence of the stacked + # decoder (L == 0). In this case, we use an auxiliary semantic decoder on + # top of the semantic feature, in order to add the auxiliary semantic loss. + if semantic_shape[1:3] != panoptic_shape[1:3]: + semantic_features = self._auxiliary_semantic_decoder( + features, training=training) + auxiliary_semantic_results = self._auxiliary_semantic_head( + semantic_features, training=training) + results.update(auxiliary_semantic_results) + + # Pixel space head. + pixel_space_feature = self._pixel_space_head( + panoptic_features, training=training)[_PIXEL_SPACE_FEATURE_KEY] + pixel_space_feature = self._pixel_space_feature_batch_norm( + pixel_space_feature) + pixel_space_normalized_feature = tf.math.l2_normalize( + pixel_space_feature, axis=-1) + results[common.PRED_PIXEL_SPACE_NORMALIZED_FEATURE_KEY] = ( + pixel_space_normalized_feature) + + # Transformer class head. + transformer_class_logits = self._transformer_class_head( + transformer_class_feature) + # Bias towards the void class at initialization. + transformer_class_logits = add_bias_towards_void( + transformer_class_logits) + results[common.PRED_TRANSFORMER_CLASS_LOGITS_KEY] = transformer_class_logits + + # Transformer mask kernel. + transformer_mask_kernel = self._transformer_mask_head( + transformer_mask_feature) + + # Convolutional mask head. The pixel space mask logits are the matrix + # multiplication (or convolution) of the pixel space normalized feature and + # the transformer mask kernel. 
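+    # Shape sketch: pixel_space_normalized_feature is [batch, height, width,
+    # channels] and transformer_mask_kernel is [batch, num_mask_slots,
+    # channels], so the einsum below produces one mask logit map per mask
+    # slot, i.e., a [batch, height, width, num_mask_slots] tensor.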
+ pixel_space_mask_logits = tf.einsum( + 'bhwd,bid->bhwi', + pixel_space_normalized_feature, + transformer_mask_kernel) + # The above multiplication constructs a second-order operation which is + # sensitive to the feature scales and initializations. In order to avoid + # careful initialization or scaling of the layers, we apply batch norms on + # top of pixel_space_feature, transformer_mask_kernel, and the resulting + # pixel_space_mask_logits. + pixel_space_mask_logits = batch_norm_on_an_extra_axis( + pixel_space_mask_logits, self._pixel_space_mask_batch_norm) + results[common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY] = ( + pixel_space_mask_logits) + + return results diff --git a/model/decoder/max_deeplab_test.py b/model/decoder/max_deeplab_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b14c2b5ef8629c07b91480bcb6119829c166b584 --- /dev/null +++ b/model/decoder/max_deeplab_test.py @@ -0,0 +1,89 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for max_deeplab.""" + +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.model.decoder import max_deeplab + + +def _create_max_deeplab_example_proto(num_non_void_classes=19): + semantic_decoder = config_pb2.DecoderOptions( + feature_key='feature_semantic', atrous_rates=[6, 12, 18]) + auxiliary_semantic_head = config_pb2.HeadOptions( + output_channels=num_non_void_classes, head_channels=256) + pixel_space_head = config_pb2.HeadOptions( + output_channels=128, head_channels=256) + max_deeplab_options = config_pb2.ModelOptions.MaXDeepLabOptions( + pixel_space_head=pixel_space_head, + auxiliary_semantic_head=auxiliary_semantic_head) + # Add features from lowest to highest. 
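+  # (In the test inputs below, 'res3' is the lower-resolution feature and
+  # 'res2' the higher-resolution one.)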
+ max_deeplab_options.auxiliary_low_level.add( + feature_key='res3', channels_project=64) + max_deeplab_options.auxiliary_low_level.add( + feature_key='res2', channels_project=32) + return config_pb2.ModelOptions( + decoder=semantic_decoder, max_deeplab=max_deeplab_options) + + +class MaXDeeplabTest(tf.test.TestCase): + + def test_max_deeplab_decoder_output_shape(self): + num_non_void_classes = 19 + num_mask_slots = 127 + model_options = _create_max_deeplab_example_proto( + num_non_void_classes=num_non_void_classes) + decoder = max_deeplab.MaXDeepLab( + max_deeplab_options=model_options.max_deeplab, + ignore_label=255, + decoder_options=model_options.decoder) + + input_dict = { + 'res2': + tf.random.uniform([2, 17, 17, 256]), + 'res3': + tf.random.uniform([2, 9, 9, 512]), + 'transformer_class_feature': + tf.random.uniform([2, num_mask_slots, 256]), + 'transformer_mask_feature': + tf.random.uniform([2, num_mask_slots, 256]), + 'feature_panoptic': + tf.random.uniform([2, 17, 17, 256]), + 'feature_semantic': + tf.random.uniform([2, 5, 5, 2048]) + } + resulting_dict = decoder(input_dict) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + [2, 17, 17, 19]) # Stride 4 + self.assertListEqual( + resulting_dict[ + common.PRED_PIXEL_SPACE_NORMALIZED_FEATURE_KEY].shape.as_list(), + [2, 17, 17, 128]) # Stride 4 + self.assertListEqual( + resulting_dict[ + common.PRED_TRANSFORMER_CLASS_LOGITS_KEY].shape.as_list(), + # Non-void classes and a void class. + [2, num_mask_slots, num_non_void_classes + 1]) + self.assertListEqual( + resulting_dict[common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY].shape.as_list(), + [2, 17, 17, num_mask_slots]) # Stride 4. + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/decoder/motion_deeplab_decoder.py b/model/decoder/motion_deeplab_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..6b84c79a56c246b41b0a01e9f3abe3ee0fdfa218 --- /dev/null +++ b/model/decoder/motion_deeplab_decoder.py @@ -0,0 +1,216 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains the code for the Motion-DeepLab decoder.""" + +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.model.decoder import panoptic_deeplab + + +class MotionDeepLabDecoder(tf.keras.layers.Layer): + """A Motion-DeepLab decoder layer. + + This layer takes low- and high-level features as input and uses a dual-ASPP + and dual-decoder structure to aggregate features for semantic and instance + segmentation. On top of the decoders, four heads are used to predict semantic + segmentation, instance center probabilities, instance center regression, and + previous frame offset regression per pixel. + """ + + def __init__( + self, + decoder_options: config_pb2.DecoderOptions, + motion_deeplab_options: config_pb2.ModelOptions.MotionDeepLabOptions, + bn_layer=tf.keras.layers.BatchNormalization): + """Initializes a Motion-DeepLab decoder. 
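+
+    The decoder follows the Panoptic-DeepLab dual-decoder design, with an
+    additional motion head that regresses every pixel to its instance center
+    in the previous frame.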
+
+    Args:
+      decoder_options: Decoder options as defined in config_pb2.DecoderOptions.
+      motion_deeplab_options: Model options as defined in
+        config_pb2.ModelOptions.MotionDeepLabOptions.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+    """
+    super(MotionDeepLabDecoder, self).__init__(name='MotionDeepLabDecoder')
+
+    low_level_feature_keys = [
+        item.feature_key for item in motion_deeplab_options.low_level
+    ]
+    low_level_channels_project = [
+        item.channels_project for item in motion_deeplab_options.low_level
+    ]
+
+    self._semantic_decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder(
+        decoder_options.feature_key,
+        low_level_feature_keys,
+        low_level_channels_project,
+        decoder_options.aspp_channels,
+        decoder_options.decoder_channels,
+        decoder_options.atrous_rates,
+        name='semantic_decoder',
+        bn_layer=bn_layer)
+    self._semantic_head = panoptic_deeplab.PanopticDeepLabSingleHead(
+        motion_deeplab_options.semantic_head.head_channels,
+        motion_deeplab_options.semantic_head.output_channels,
+        common.PRED_SEMANTIC_LOGITS_KEY,
+        name='semantic_head',
+        bn_layer=bn_layer)
+
+    self._instance_decoder = None
+    self._instance_center_head = None
+    self._instance_regression_head = None
+    self._motion_regression_head = None
+
+    if motion_deeplab_options.instance.low_level_override:
+      low_level_options = motion_deeplab_options.instance.low_level_override
+    else:
+      low_level_options = motion_deeplab_options.low_level
+
+    # If instance_decoder is set, use those options; otherwise reuse the
+    # architecture as defined for the semantic decoder.
+    if motion_deeplab_options.instance.HasField('instance_decoder_override'):
+      decoder_options = (motion_deeplab_options.instance
+                         .instance_decoder_override)
+
+    low_level_feature_keys = [item.feature_key for item in low_level_options]
+    low_level_channels_project = [
+        item.channels_project for item in low_level_options
+    ]
+
+    self._instance_decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder(
+        decoder_options.feature_key,
+        low_level_feature_keys,
+        low_level_channels_project,
+        decoder_options.aspp_channels,
+        decoder_options.decoder_channels,
+        decoder_options.atrous_rates,
+        name='instance_decoder',
+        bn_layer=bn_layer)
+    self._instance_center_head = panoptic_deeplab.PanopticDeepLabSingleHead(
+        motion_deeplab_options.instance.center_head.head_channels,
+        motion_deeplab_options.instance.center_head.output_channels,
+        common.PRED_CENTER_HEATMAP_KEY,
+        name='instance_center_head',
+        bn_layer=bn_layer)
+    self._instance_regression_head = panoptic_deeplab.PanopticDeepLabSingleHead(
+        motion_deeplab_options.instance.regression_head.head_channels,
+        motion_deeplab_options.instance.regression_head.output_channels,
+        common.PRED_OFFSET_MAP_KEY,
+        name='instance_regression_head',
+        bn_layer=bn_layer)
+
+    # The motion head regresses every pixel to its center in the previous
+    # frame.
+    self._motion_regression_head = panoptic_deeplab.PanopticDeepLabSingleHead(
+        motion_deeplab_options.motion_head.head_channels,
+        motion_deeplab_options.motion_head.output_channels,
+        common.PRED_FRAME_OFFSET_MAP_KEY,
+        name='motion_regression_head',
+        bn_layer=bn_layer)
+
+  def reset_pooling_layer(self):
+    """Resets the ASPP pooling layers to global average pooling."""
+    self._semantic_decoder.reset_pooling_layer()
+    if self._instance_decoder is not None:
+      self._instance_decoder.reset_pooling_layer()
+
+  def set_pool_size(self, pool_size):
+    """Sets the pooling size of the ASPP pooling layers.
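+
+    This is used, e.g., to adapt the pooling size to a scaled input resolution
+    during multi-scale inference.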
+ + Args: + pool_size: A tuple specifying the pooling size of the ASPP pooling layers. + """ + self._semantic_decoder.set_pool_size(pool_size) + if self._instance_decoder is not None: + self._instance_decoder.set_pool_size(pool_size) + + def get_pool_size(self): + return self._semantic_decoder.get_pool_size() + + def call(self, features, training=False): + """Performs a forward pass. + + Args: + features: An input dict of tf.Tensor with shape [batch, height, width, + channels]. Different keys should point to different features extracted + by the encoder, e.g. low-level or high-level features. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + A dictionary containing the results of the semantic segmentation head and + depending on the configuration also of the instance segmentation head. + """ + + semantic_features = self._semantic_decoder(features, training=training) + results = self._semantic_head(semantic_features, training=training) + + if self._instance_decoder is not None: + instance_features = self._instance_decoder(features, training=training) + instance_center_predictions = self._instance_center_head( + instance_features, training=training) + instance_regression_predictions = self._instance_regression_head( + instance_features, training=training) + motion_regression_predictions = self._motion_regression_head( + instance_features, training=training) + if results.keys() & motion_regression_predictions.keys(): + raise ValueError('The keys of the semantic branch and the instance ' + 'motion branch overlap. Please use unique keys.') + results.update(motion_regression_predictions) + + if results.keys() & instance_center_predictions.keys(): + raise ValueError('The keys of the semantic branch and the instance ' + 'center branch overlap. Please use unique keys.') + results.update(instance_center_predictions) + + if results.keys() & instance_regression_predictions.keys(): + raise ValueError('The keys of the semantic branch and the instance ' + 'regression branch overlap. Please use unique keys.') + results.update(instance_regression_predictions) + + return results + + @property + def checkpoint_items(self): + items = { + common.CKPT_SEMANTIC_DECODER: + self._semantic_decoder, + common.CKPT_SEMANTIC_HEAD_WITHOUT_LAST_LAYER: + self._semantic_head.conv_block, + common.CKPT_SEMANTIC_LAST_LAYER: + self._semantic_head.final_conv + } + if self._instance_decoder is not None: + instance_items = { + common.CKPT_INSTANCE_DECODER: + self._instance_decoder, + common.CKPT_INSTANCE_CENTER_HEAD_WITHOUT_LAST_LAYER: + self._instance_center_head.conv_block, + common.CKPT_INSTANCE_CENTER_HEAD_LAST_LAYER: + self._instance_center_head.final_conv, + common.CKPT_INSTANCE_REGRESSION_HEAD_WITHOUT_LAST_LAYER: + self._instance_regression_head.conv_block, + common.CKPT_INSTANCE_REGRESSION_HEAD_LAST_LAYER: + self._instance_regression_head.final_conv, + common.CKPT_MOTION_REGRESSION_HEAD_WITHOUT_LAST_LAYER: + self._motion_regression_head.conv_block, + common.CKPT_MOTION_REGRESSION_HEAD_LAST_LAYER: + self._motion_regression_head.final_conv, + } + items.update(instance_items) + return items diff --git a/model/decoder/panoptic_deeplab.py b/model/decoder/panoptic_deeplab.py new file mode 100644 index 0000000000000000000000000000000000000000..4ccbeaff5f49789f93188ec03f49eeec06bbe0b2 --- /dev/null +++ b/model/decoder/panoptic_deeplab.py @@ -0,0 +1,445 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains code to build a Panoptic-DeepLab decoder.
+
+Reference:
+  - [Panoptic-DeepLab: A Simple, Strong, and Fast Baseline for Bottom-Up
+      Panoptic Segmentation](https://arxiv.org/pdf/1911.10194)
+"""
+from absl import logging
+
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2.model import utils
+from deeplab2.model.decoder import aspp
+from deeplab2.model.layers import convolutions
+
+
+layers = tf.keras.layers
+
+
+class PanopticDeepLabSingleDecoder(layers.Layer):
+  """A single Panoptic-DeepLab decoder layer.
+
+  This layer takes low- and high-level features as input and uses an ASPP
+  followed by a fusion block to decode features for a single task, e.g.,
+  semantic segmentation or instance segmentation.
+  """
+
+  def __init__(self,
+               high_level_feature_name,
+               low_level_feature_names,
+               low_level_channels_project,
+               aspp_output_channels,
+               decoder_output_channels,
+               atrous_rates,
+               name,
+               aspp_use_only_1x1_proj_conv=False,
+               decoder_conv_type='depthwise_separable_conv',
+               bn_layer=tf.keras.layers.BatchNormalization):
+    """Initializes a single Panoptic-DeepLab decoder layer.
+
+    Args:
+      high_level_feature_name: A string specifying the name of the high-level
+        feature coming from an encoder.
+      low_level_feature_names: A list of strings specifying the names of the
+        low-level features coming from an encoder. An order from highest to
+        lowest level is expected, e.g. ['res3', 'res2'].
+      low_level_channels_project: A list of integers specifying the number of
+        filters used for processing each of the low-level features.
+      aspp_output_channels: An integer specifying the number of filters in the
+        ASPP convolution layers.
+      decoder_output_channels: An integer specifying the number of filters in
+        the decoder convolution layers.
+      atrous_rates: A list of three integers specifying the atrous rate for the
+        ASPP layers.
+      name: A string specifying the name of the layer.
+      aspp_use_only_1x1_proj_conv: Boolean, specifying whether the five ASPP
+        branches are turned off. If True, the ASPP module degenerates to a
+        single 1x1 convolution, projecting the input channels to
+        `output_channels`.
+      decoder_conv_type: String, specifying the decoder convolution type.
+        Supports 'depthwise_separable_conv' and 'standard_conv'.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+
+    Raises:
+      ValueError: An error occurs when the length of low_level_feature_names
+        differs from the length of low_level_channels_project.
+ """ + super(PanopticDeepLabSingleDecoder, self).__init__(name=name) + self._channel_axis = 3 + + self._aspp = aspp.ASPP( + aspp_output_channels, + atrous_rates, + aspp_use_only_1x1_proj_conv=aspp_use_only_1x1_proj_conv, + name='aspp', + bn_layer=bn_layer) + self._high_level_feature_name = high_level_feature_name + + if len(low_level_feature_names) != len(low_level_channels_project): + raise ValueError('The Panoptic-DeepLab decoder requires the same number ' + 'of low-level features as the number of low-level ' + 'projection channels. But got %d and %d.' + % (len(low_level_feature_names), + len(low_level_channels_project))) + + self._low_level_feature_names = low_level_feature_names + + for i, channels_project in enumerate(low_level_channels_project): + # Check if channel sizes increases and issue a warning. + if i > 0 and low_level_channels_project[i - 1] < channels_project: + logging.warning( + 'The low level projection channels usually do not ' + 'increase for features with higher spatial resolution. ' + 'Please make sure, this behavior is intended.') + current_low_level_conv_name, current_fusion_conv_name = ( + utils.get_low_level_conv_fusion_conv_current_names(i)) + utils.safe_setattr( + self, current_low_level_conv_name, convolutions.Conv2DSame( + channels_project, + kernel_size=1, + name=utils.get_layer_name(current_low_level_conv_name), + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='relu')) + + utils.safe_setattr( + self, current_fusion_conv_name, convolutions.StackedConv2DSame( + conv_type=decoder_conv_type, + num_layers=1, + output_channels=decoder_output_channels, + kernel_size=5, + name=utils.get_layer_name(current_fusion_conv_name), + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='relu')) + + def call(self, features, training=False): + """Performs a forward pass. + + Args: + features: An input dict of tf.Tensor with shape [batch, height, width, + channels]. Different keys should point to different features extracted + by the encoder, e.g. low-level or high-level features. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + Refined features as instance of tf.Tensor. + """ + + high_level_features = features[self._high_level_feature_name] + combined_features = self._aspp(high_level_features, training=training) + + # Fuse low-level features with high-level features. + for i in range(len(self._low_level_feature_names)): + current_low_level_conv_name, current_fusion_conv_name = ( + utils.get_low_level_conv_fusion_conv_current_names(i)) + # Iterate from the highest level of the low level features to the lowest + # level, i.e. take the features with the smallest spatial size first. 
+      low_level_features = features[self._low_level_feature_names[i]]
+      low_level_features = getattr(self, current_low_level_conv_name)(
+          low_level_features, training=training)
+
+      target_h = tf.shape(low_level_features)[1]
+      target_w = tf.shape(low_level_features)[2]
+      source_h = tf.shape(combined_features)[1]
+      source_w = tf.shape(combined_features)[2]
+
+      tf.assert_less(
+          source_h - 1,
+          target_h,
+          message='Features must not be down-sampled in the decoder.')
+      tf.assert_less(
+          source_w - 1,
+          target_w,
+          message='Features must not be down-sampled in the decoder.')
+
+      combined_features = utils.resize_align_corners(combined_features,
+                                                     [target_h, target_w])
+
+      combined_features = tf.concat([combined_features, low_level_features],
+                                    self._channel_axis)
+      combined_features = getattr(self, current_fusion_conv_name)(
+          combined_features, training=training)
+
+    return combined_features
+
+  def reset_pooling_layer(self):
+    """Resets the ASPP pooling layer to global average pooling."""
+    self._aspp.reset_pooling_layer()
+
+  def set_pool_size(self, pool_size):
+    """Sets the pooling size of the ASPP pooling layer.
+
+    Args:
+      pool_size: A tuple specifying the pooling size of the ASPP pooling layer.
+    """
+    self._aspp.set_pool_size(pool_size)
+
+  def get_pool_size(self):
+    return self._aspp.get_pool_size()
+
+
+class PanopticDeepLabSingleHead(layers.Layer):
+  """A single Panoptic-DeepLab head layer.
+
+  This layer takes in the enriched features from a decoder and adds two
+  convolutions on top.
+  """
+
+  def __init__(self,
+               intermediate_channels,
+               output_channels,
+               pred_key,
+               name,
+               conv_type='depthwise_separable_conv',
+               bn_layer=tf.keras.layers.BatchNormalization):
+    """Initializes a single Panoptic-DeepLab head.
+
+    Args:
+      intermediate_channels: An integer specifying the number of filters of the
+        first 5x5 convolution.
+      output_channels: An integer specifying the number of filters of the
+        second 1x1 convolution.
+      pred_key: A string specifying the key of the output dictionary.
+      name: A string specifying the name of this head.
+      conv_type: String, specifying the head convolution type. Supports
+        'depthwise_separable_conv' and 'standard_conv'.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+    """
+    super(PanopticDeepLabSingleHead, self).__init__(name=name)
+    self._pred_key = pred_key
+
+    self.conv_block = convolutions.StackedConv2DSame(
+        conv_type=conv_type,
+        num_layers=1,
+        output_channels=intermediate_channels,
+        kernel_size=5,
+        name='conv_block',
+        use_bias=False,
+        use_bn=True,
+        bn_layer=bn_layer,
+        activation='relu')
+    self.final_conv = layers.Conv2D(
+        output_channels,
+        kernel_size=1,
+        name='final_conv',
+        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01))
+
+  def call(self, features, training=False):
+    """Performs a forward pass.
+
+    Args:
+      features: A tf.Tensor with shape [batch, height, width, channels].
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      A dictionary containing the predictions under the specified key.
+    """
+    x = self.conv_block(features, training=training)
+    return {self._pred_key: self.final_conv(x)}
+
+
+class PanopticDeepLab(layers.Layer):
+  """A Panoptic-DeepLab decoder layer.
+
+  This layer takes low- and high-level features as input and uses a dual-ASPP
+  and dual-decoder structure to aggregate features for semantic and instance
+  segmentation.
On top of the decoders, three heads are used to predict semantic + segmentation, instance center probabilities, and instance center regression + per pixel. + """ + + def __init__(self, + decoder_options, + panoptic_deeplab_options, + bn_layer=tf.keras.layers.BatchNormalization): + """Initializes a Panoptic-DeepLab decoder. + + Args: + decoder_options: Decoder options as defined in config_pb2.DecoderOptions. + panoptic_deeplab_options: Model options as defined in + config_pb2.ModelOptions.PanopticDeeplabOptions. + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). + """ + super(PanopticDeepLab, self).__init__(name='PanopticDeepLab') + + low_level_feature_keys = [ + item.feature_key for item in panoptic_deeplab_options.low_level + ] + low_level_channels_project = [ + item.channels_project for item in panoptic_deeplab_options.low_level + ] + + self._semantic_decoder = PanopticDeepLabSingleDecoder( + high_level_feature_name=decoder_options.feature_key, + low_level_feature_names=low_level_feature_keys, + low_level_channels_project=low_level_channels_project, + aspp_output_channels=decoder_options.aspp_channels, + decoder_output_channels=decoder_options.decoder_channels, + atrous_rates=decoder_options.atrous_rates, + name='semantic_decoder', + aspp_use_only_1x1_proj_conv=decoder_options.aspp_use_only_1x1_proj_conv, + decoder_conv_type=decoder_options.decoder_conv_type, + bn_layer=bn_layer) + self._semantic_head = PanopticDeepLabSingleHead( + panoptic_deeplab_options.semantic_head.head_channels, + panoptic_deeplab_options.semantic_head.output_channels, + common.PRED_SEMANTIC_LOGITS_KEY, + name='semantic_head', + conv_type=panoptic_deeplab_options.semantic_head.head_conv_type, + bn_layer=bn_layer) + + self._instance_decoder = None + self._instance_center_head = None + self._instance_regression_head = None + + if panoptic_deeplab_options.instance.enable: + if panoptic_deeplab_options.instance.low_level_override: + low_level_options = panoptic_deeplab_options.instance.low_level_override + else: + low_level_options = panoptic_deeplab_options.low_level + + # If instance_decoder is set, use those options; otherwise reuse the + # architecture as defined for the semantic decoder. 
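+      # Note that the local `decoder_options` is rebound here, so the instance
+      # decoder and heads below pick up the override, while the semantic
+      # decoder above keeps the original options.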
+ if panoptic_deeplab_options.instance.HasField( + 'instance_decoder_override'): + decoder_options = (panoptic_deeplab_options.instance + .instance_decoder_override) + + low_level_feature_keys = [item.feature_key for item in low_level_options] + low_level_channels_project = [ + item.channels_project for item in low_level_options + ] + + self._instance_decoder = PanopticDeepLabSingleDecoder( + high_level_feature_name=decoder_options.feature_key, + low_level_feature_names=low_level_feature_keys, + low_level_channels_project=low_level_channels_project, + aspp_output_channels=decoder_options.aspp_channels, + decoder_output_channels=decoder_options.decoder_channels, + atrous_rates=decoder_options.atrous_rates, + name='instance_decoder', + aspp_use_only_1x1_proj_conv=( + decoder_options.aspp_use_only_1x1_proj_conv), + decoder_conv_type=decoder_options.decoder_conv_type, + bn_layer=bn_layer) + self._instance_center_head = PanopticDeepLabSingleHead( + panoptic_deeplab_options.instance.center_head.head_channels, + panoptic_deeplab_options.instance.center_head.output_channels, + common.PRED_CENTER_HEATMAP_KEY, + name='instance_center_head', + conv_type=( + panoptic_deeplab_options.instance.center_head.head_conv_type), + bn_layer=bn_layer) + self._instance_regression_head = PanopticDeepLabSingleHead( + panoptic_deeplab_options.instance.regression_head.head_channels, + panoptic_deeplab_options.instance.regression_head.output_channels, + common.PRED_OFFSET_MAP_KEY, + name='instance_regression_head', + conv_type=( + panoptic_deeplab_options.instance.regression_head.head_conv_type), + bn_layer=bn_layer) + + def reset_pooling_layer(self): + """Resets the ASPP pooling layers to global average pooling.""" + self._semantic_decoder.reset_pooling_layer() + if self._instance_decoder is not None: + self._instance_decoder.reset_pooling_layer() + + def set_pool_size(self, pool_size): + """Sets the pooling size of the ASPP pooling layers. + + Args: + pool_size: A tuple specifying the pooling size of the ASPP pooling layers. + """ + self._semantic_decoder.set_pool_size(pool_size) + if self._instance_decoder is not None: + self._instance_decoder.set_pool_size(pool_size) + + def get_pool_size(self): + return self._semantic_decoder.get_pool_size() + + @property + def checkpoint_items(self): + items = { + common.CKPT_SEMANTIC_DECODER: + self._semantic_decoder, + common.CKPT_SEMANTIC_HEAD_WITHOUT_LAST_LAYER: + self._semantic_head.conv_block, + common.CKPT_SEMANTIC_LAST_LAYER: + self._semantic_head.final_conv + } + if self._instance_decoder is not None: + instance_items = { + common.CKPT_INSTANCE_DECODER: + self._instance_decoder, + common.CKPT_INSTANCE_CENTER_HEAD_WITHOUT_LAST_LAYER: + self._instance_center_head.conv_block, + common.CKPT_INSTANCE_CENTER_HEAD_LAST_LAYER: + self._instance_center_head.final_conv, + common.CKPT_INSTANCE_REGRESSION_HEAD_WITHOUT_LAST_LAYER: + self._instance_regression_head.conv_block, + common.CKPT_INSTANCE_REGRESSION_HEAD_LAST_LAYER: + self._instance_regression_head.final_conv, + } + items.update(instance_items) + return items + + def call(self, features, training=False): + """Performs a forward pass. + + Args: + features: An input dict of tf.Tensor with shape [batch, height, width, + channels]. Different keys should point to different features extracted + by the encoder, e.g. low-level or high-level features. + training: A boolean flag indicating whether training behavior should be + used (default: False). 
+ + Returns: + A dictionary containing the results of the semantic segmentation head and + depending on the configuration also of the instance segmentation head. + """ + + semantic_features = self._semantic_decoder(features, training=training) + results = self._semantic_head(semantic_features, training=training) + + if self._instance_decoder is not None: + instance_features = self._instance_decoder(features, training=training) + instance_center_predictions = self._instance_center_head( + instance_features, training=training) + instance_regression_predictions = self._instance_regression_head( + instance_features, training=training) + + if results.keys() & instance_center_predictions.keys(): + raise ValueError('The keys of the semantic branch and the instance ' + 'center branch overlap. Please use unique keys.') + results.update(instance_center_predictions) + + if results.keys() & instance_regression_predictions.keys(): + raise ValueError('The keys of the semantic branch and the instance ' + 'regression branch overlap. Please use unique keys.') + results.update(instance_regression_predictions) + + return results diff --git a/model/decoder/panoptic_deeplab_test.py b/model/decoder/panoptic_deeplab_test.py new file mode 100644 index 0000000000000000000000000000000000000000..ff5a8bf69e542371618dbcc17fe9d17c63f0e1be --- /dev/null +++ b/model/decoder/panoptic_deeplab_test.py @@ -0,0 +1,267 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for panoptic_deeplab.""" + +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.model.decoder import panoptic_deeplab +from deeplab2.utils import test_utils + + +def _create_panoptic_deeplab_example_proto(num_classes=19): + semantic_decoder = config_pb2.DecoderOptions( + feature_key='res5', atrous_rates=[6, 12, 18]) + semantic_head = config_pb2.HeadOptions( + output_channels=num_classes, head_channels=256) + + instance_decoder = config_pb2.DecoderOptions( + feature_key='res5', decoder_channels=128, atrous_rates=[6, 12, 18]) + center_head = config_pb2.HeadOptions( + output_channels=1, head_channels=32) + regression_head = config_pb2.HeadOptions( + output_channels=2, head_channels=32) + + instance_branch = config_pb2.InstanceOptions( + instance_decoder_override=instance_decoder, + center_head=center_head, + regression_head=regression_head) + + panoptic_deeplab_options = config_pb2.ModelOptions.PanopticDeeplabOptions( + semantic_head=semantic_head, instance=instance_branch) + # Add features from lowest to highest. 
+ panoptic_deeplab_options.low_level.add( + feature_key='res3', channels_project=64) + panoptic_deeplab_options.low_level.add( + feature_key='res2', channels_project=32) + + return config_pb2.ModelOptions( + decoder=semantic_decoder, panoptic_deeplab=panoptic_deeplab_options) + + +def _create_expected_shape(input_shape, output_channels): + output_shape = input_shape.copy() + output_shape[3] = output_channels + return output_shape + + +class PanopticDeeplabTest(tf.test.TestCase): + + def test_panoptic_deeplab_single_decoder_init_errors(self): + with self.assertRaises(ValueError): + _ = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name='test', + low_level_feature_names=['only_one_name'], # Error: Only one name. + low_level_channels_project=[64, 32], + aspp_output_channels=256, + decoder_output_channels=256, + atrous_rates=[6, 12, 18], + name='test_decoder') + + with self.assertRaises(ValueError): + _ = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name='test', + low_level_feature_names=['one', 'two'], + low_level_channels_project=[64], # Error: Only one projection size. + aspp_output_channels=256, + decoder_output_channels=256, + atrous_rates=[6, 12, 18], + name='test_decoder') + + def test_panoptic_deeplab_single_decoder_call_errors(self): + decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name='high', + low_level_feature_names=['low_one', 'low_two'], + low_level_channels_project=[64, 32], + aspp_output_channels=256, + decoder_output_channels=256, + atrous_rates=[6, 12, 18], + name='test_decoder') + + with self.assertRaises(KeyError): + input_dict = {'not_high': tf.random.uniform(shape=(2, 32, 32, 512)), + 'low_one': tf.random.uniform(shape=(2, 128, 128, 128)), + 'low_two': tf.random.uniform(shape=(2, 256, 256, 64))} + _ = decoder(input_dict) + with self.assertRaises(KeyError): + input_dict = {'high': tf.random.uniform(shape=(2, 32, 32, 512)), + 'not_low_one': tf.random.uniform(shape=(2, 128, 128, 128)), + 'low_two': tf.random.uniform(shape=(2, 256, 256, 64))} + _ = decoder(input_dict) + with self.assertRaises(KeyError): + input_dict = {'high': tf.random.uniform(shape=(2, 32, 32, 512)), + 'low_one': tf.random.uniform(shape=(2, 128, 128, 128)), + 'not_low_two': tf.random.uniform(shape=(2, 256, 256, 64))} + _ = decoder(input_dict) + + def test_panoptic_deeplab_single_decoder_reset_pooling(self): + decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name='high', + low_level_feature_names=['low_one', 'low_two'], + low_level_channels_project=[64, 32], + aspp_output_channels=256, + decoder_output_channels=256, + atrous_rates=[6, 12, 18], + name='test_decoder') + pool_size = (None, None) + decoder.reset_pooling_layer() + + self.assertTupleEqual(decoder._aspp._aspp_pool._pool_size, + pool_size) + + def test_panoptic_deeplab_single_decoder_set_pooling(self): + decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name='high', + low_level_feature_names=['low_one', 'low_two'], + low_level_channels_project=[64, 32], + aspp_output_channels=256, + decoder_output_channels=256, + atrous_rates=[6, 12, 18], + name='test_decoder') + + pool_size = (10, 10) + decoder.set_pool_size(pool_size) + + self.assertTupleEqual(decoder._aspp._aspp_pool._pool_size, + pool_size) + + def test_panoptic_deeplab_single_decoder_output_shape(self): + decoder_channels = 256 + decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name='high', + low_level_feature_names=['low_one', 
'low_two'], + low_level_channels_project=[64, 32], + aspp_output_channels=256, + decoder_output_channels=decoder_channels, + atrous_rates=[6, 12, 18], + name='test_decoder') + + input_shapes_list = [[[2, 128, 128, 128], [2, 256, 256, 64], + [2, 32, 32, 512]], + [[2, 129, 129, 128], [2, 257, 257, 64], + [2, 33, 33, 512]]] + + for shapes in input_shapes_list: + input_dict = {'low_one': tf.random.uniform(shape=shapes[0]), + 'low_two': tf.random.uniform(shape=shapes[1]), + 'high': tf.random.uniform(shape=shapes[2])} + + expected_shape = _create_expected_shape(shapes[1], decoder_channels) + + resulting_tensor = decoder(input_dict) + self.assertListEqual(resulting_tensor.shape.as_list(), expected_shape) + + def test_panoptic_deeplab_single_head_output_shape(self): + output_channels = 19 + head = panoptic_deeplab.PanopticDeepLabSingleHead( + intermediate_channels=256, + output_channels=output_channels, + pred_key='pred', + name='test_head') + + input_shapes_list = [[2, 256, 256, 48], [2, 257, 257, 48]] + for shape in input_shapes_list: + input_tensor = tf.random.uniform(shape=shape) + expected_shape = _create_expected_shape(shape, output_channels) + + resulting_tensor = head(input_tensor) + self.assertListEqual(resulting_tensor['pred'].shape.as_list(), + expected_shape) + + def test_panoptic_deeplab_decoder_output_shape(self): + num_classes = 31 + model_options = _create_panoptic_deeplab_example_proto( + num_classes=num_classes) + decoder = panoptic_deeplab.PanopticDeepLab( + panoptic_deeplab_options=model_options.panoptic_deeplab, + decoder_options=model_options.decoder) + + input_shapes_list = [[[2, 256, 256, 64], [2, 128, 128, 128], + [2, 32, 32, 512]], + [[2, 257, 257, 64], [2, 129, 129, 128], + [2, 33, 33, 512]]] + + for shapes in input_shapes_list: + input_dict = {'res2': tf.random.uniform(shape=shapes[0]), + 'res3': tf.random.uniform(shape=shapes[1]), + 'res5': tf.random.uniform(shape=shapes[2])} + + expected_semantic_shape = _create_expected_shape(shapes[0], num_classes) + expected_instance_center_shape = _create_expected_shape(shapes[0], 1) + expected_instance_regression_shape = _create_expected_shape(shapes[0], 2) + + resulting_dict = decoder(input_dict) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_semantic_shape) + self.assertListEqual( + resulting_dict[common.PRED_CENTER_HEATMAP_KEY].shape.as_list(), + expected_instance_center_shape) + self.assertListEqual( + resulting_dict[common.PRED_OFFSET_MAP_KEY].shape.as_list(), + expected_instance_regression_shape) + + @test_utils.test_all_strategies + def test_panoptic_deeplab_sync_bn(self, strategy): + num_classes = 31 + model_options = _create_panoptic_deeplab_example_proto( + num_classes=num_classes) + input_dict = {'res2': tf.random.uniform(shape=[2, 257, 257, 64]), + 'res3': tf.random.uniform(shape=[2, 129, 129, 128]), + 'res5': tf.random.uniform(shape=[2, 33, 33, 512])} + + with strategy.scope(): + for bn_layer in test_utils.NORMALIZATION_LAYERS: + decoder = panoptic_deeplab.PanopticDeepLab( + panoptic_deeplab_options=model_options.panoptic_deeplab, + decoder_options=model_options.decoder, + bn_layer=bn_layer) + _ = decoder(input_dict) + + def test_panoptic_deeplab_single_decoder_logging_feature_order(self): + with self.assertLogs(level='WARN'): + _ = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name='high', + low_level_feature_names=['low_two', 'low_one'], + low_level_channels_project=[32, 64], # Potentially wrong order. 
+          aspp_output_channels=256,
+          decoder_output_channels=256,
+          atrous_rates=[6, 12, 18],
+          name='test_decoder')
+
+  def test_panoptic_deeplab_decoder_ckpt_items(self):
+    num_classes = 31
+    model_options = _create_panoptic_deeplab_example_proto(
+        num_classes=num_classes)
+    decoder = panoptic_deeplab.PanopticDeepLab(
+        panoptic_deeplab_options=model_options.panoptic_deeplab,
+        decoder_options=model_options.decoder)
+    ckpt_dict = decoder.checkpoint_items
+    self.assertIn(common.CKPT_SEMANTIC_DECODER, ckpt_dict)
+    self.assertIn(common.CKPT_SEMANTIC_HEAD_WITHOUT_LAST_LAYER, ckpt_dict)
+    self.assertIn(common.CKPT_SEMANTIC_LAST_LAYER, ckpt_dict)
+    self.assertIn(common.CKPT_INSTANCE_DECODER, ckpt_dict)
+    self.assertIn(common.CKPT_INSTANCE_REGRESSION_HEAD_WITHOUT_LAST_LAYER,
+                  ckpt_dict)
+    self.assertIn(common.CKPT_INSTANCE_REGRESSION_HEAD_LAST_LAYER, ckpt_dict)
+    self.assertIn(common.CKPT_INSTANCE_CENTER_HEAD_WITHOUT_LAST_LAYER,
+                  ckpt_dict)
+    self.assertIn(common.CKPT_INSTANCE_CENTER_HEAD_LAST_LAYER, ckpt_dict)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/decoder/vip_deeplab_decoder.py b/model/decoder/vip_deeplab_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..a12bb1ca4e9ba75b6f4b2a275beb838f0376f863
--- /dev/null
+++ b/model/decoder/vip_deeplab_decoder.py
@@ -0,0 +1,279 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains code to build a ViP-DeepLab decoder.
+
+Reference:
+  - [ViP-DeepLab: Learning Visual Perception with Depth-aware Video
+      Panoptic Segmentation](https://arxiv.org/abs/2012.05258)
+"""
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2.model.decoder import panoptic_deeplab
+
+
+layers = tf.keras.layers
+
+
+class ViPDeepLabDecoder(layers.Layer):
+  """A ViP-DeepLab decoder layer.
+
+  This layer takes low- and high-level features as input and uses a dual-ASPP
+  and dual-decoder structure to aggregate features for semantic and instance
+  segmentation. On top of the decoders, three heads are used to predict
+  semantic segmentation, instance center probabilities, and instance center
+  regression per pixel. It also has a branch to predict the next-frame
+  instance center regression. Unlike the ViP-DeepLab paper, which uses
+  Cascade-ASPP, this reimplementation only uses ASPP.
+  """
+
+  def __init__(self,
+               decoder_options,
+               vip_deeplab_options,
+               bn_layer=tf.keras.layers.BatchNormalization):
+    """Initializes a ViP-DeepLab decoder.
+
+    Args:
+      decoder_options: Decoder options as defined in config_pb2.DecoderOptions.
+      vip_deeplab_options: Model options as defined in
+        config_pb2.ModelOptions.ViPDeeplabOptions.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+ """ + super(ViPDeepLabDecoder, self).__init__(name='ViPDeepLab') + + low_level_feature_keys = [ + item.feature_key for item in vip_deeplab_options.low_level + ] + low_level_channels_project = [ + item.channels_project for item in vip_deeplab_options.low_level + ] + + self._semantic_decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name=decoder_options.feature_key, + low_level_feature_names=low_level_feature_keys, + low_level_channels_project=low_level_channels_project, + aspp_output_channels=decoder_options.aspp_channels, + decoder_output_channels=decoder_options.decoder_channels, + atrous_rates=decoder_options.atrous_rates, + name='semantic_decoder', + aspp_use_only_1x1_proj_conv=decoder_options.aspp_use_only_1x1_proj_conv, + decoder_conv_type=decoder_options.decoder_conv_type, + bn_layer=bn_layer) + self._semantic_head = panoptic_deeplab.PanopticDeepLabSingleHead( + vip_deeplab_options.semantic_head.head_channels, + vip_deeplab_options.semantic_head.output_channels, + common.PRED_SEMANTIC_LOGITS_KEY, + name='semantic_head', + conv_type=vip_deeplab_options.semantic_head.head_conv_type, + bn_layer=bn_layer) + + self._instance_decoder = None + self._instance_center_head = None + self._instance_regression_head = None + self._next_instance_decoder = None + self._next_instance_regression_head = None + + if vip_deeplab_options.instance.enable: + if vip_deeplab_options.instance.low_level_override: + low_level_options = vip_deeplab_options.instance.low_level_override + else: + low_level_options = vip_deeplab_options.low_level + + # If instance_decoder is set, use those options; otherwise reuse the + # architecture as defined for the semantic decoder. + if vip_deeplab_options.instance.HasField( + 'instance_decoder_override'): + decoder_options = (vip_deeplab_options.instance + .instance_decoder_override) + + low_level_feature_keys = [item.feature_key for item in low_level_options] + low_level_channels_project = [ + item.channels_project for item in low_level_options + ] + + self._instance_decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name=decoder_options.feature_key, + low_level_feature_names=low_level_feature_keys, + low_level_channels_project=low_level_channels_project, + aspp_output_channels=decoder_options.aspp_channels, + decoder_output_channels=decoder_options.decoder_channels, + atrous_rates=decoder_options.atrous_rates, + name='instance_decoder', + aspp_use_only_1x1_proj_conv=( + decoder_options.aspp_use_only_1x1_proj_conv), + decoder_conv_type=decoder_options.decoder_conv_type, + bn_layer=bn_layer) + self._instance_center_head = panoptic_deeplab.PanopticDeepLabSingleHead( + vip_deeplab_options.instance.center_head.head_channels, + vip_deeplab_options.instance.center_head.output_channels, + common.PRED_CENTER_HEATMAP_KEY, + name='instance_center_head', + conv_type=( + vip_deeplab_options.instance.center_head.head_conv_type), + bn_layer=bn_layer) + self._instance_regression_head = ( + panoptic_deeplab.PanopticDeepLabSingleHead( + vip_deeplab_options.instance.regression_head.head_channels, + vip_deeplab_options.instance.regression_head.output_channels, + common.PRED_OFFSET_MAP_KEY, + name='instance_regression_head', + conv_type=( + vip_deeplab_options.instance.regression_head.head_conv_type), + bn_layer=bn_layer)) + + if vip_deeplab_options.instance.HasField('next_regression_head'): + self._next_instance_decoder = ( + panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name=decoder_options.feature_key, + 
low_level_feature_names=low_level_feature_keys, + low_level_channels_project=low_level_channels_project, + aspp_output_channels=decoder_options.aspp_channels, + decoder_output_channels=decoder_options.decoder_channels, + atrous_rates=decoder_options.atrous_rates, + name='next_instance_decoder', + aspp_use_only_1x1_proj_conv=( + decoder_options.aspp_use_only_1x1_proj_conv), + decoder_conv_type=decoder_options.decoder_conv_type, + bn_layer=bn_layer)) + self._next_instance_regression_head = ( + panoptic_deeplab.PanopticDeepLabSingleHead( + (vip_deeplab_options.instance.next_regression_head + .head_channels), + (vip_deeplab_options.instance.next_regression_head + .output_channels), + common.PRED_NEXT_OFFSET_MAP_KEY, + name='next_instance_regression_head', + conv_type=(vip_deeplab_options.instance.next_regression_head + .head_conv_type), + bn_layer=bn_layer)) + self._next_high_level_feature_name = decoder_options.feature_key + + def reset_pooling_layer(self): + """Resets the ASPP pooling layers to global average pooling.""" + self._semantic_decoder.reset_pooling_layer() + if self._instance_decoder is not None: + self._instance_decoder.reset_pooling_layer() + if self._next_instance_decoder is not None: + self._next_instance_decoder.reset_pooling_layer() + + def set_pool_size(self, pool_size): + """Sets the pooling size of the ASPP pooling layers. + + Args: + pool_size: A tuple specifying the pooling size of the ASPP pooling layers. + """ + self._semantic_decoder.set_pool_size(pool_size) + if self._instance_decoder is not None: + self._instance_decoder.set_pool_size(pool_size) + if self._next_instance_decoder is not None: + self._next_instance_decoder.set_pool_size(pool_size) + + def get_pool_size(self): + return self._semantic_decoder.get_pool_size() + + @property + def checkpoint_items(self): + items = { + common.CKPT_SEMANTIC_DECODER: + self._semantic_decoder, + common.CKPT_SEMANTIC_HEAD_WITHOUT_LAST_LAYER: + self._semantic_head.conv_block, + common.CKPT_SEMANTIC_LAST_LAYER: + self._semantic_head.final_conv + } + if self._instance_decoder is not None: + instance_items = { + common.CKPT_INSTANCE_DECODER: + self._instance_decoder, + common.CKPT_INSTANCE_CENTER_HEAD_WITHOUT_LAST_LAYER: + self._instance_center_head.conv_block, + common.CKPT_INSTANCE_CENTER_HEAD_LAST_LAYER: + self._instance_center_head.final_conv, + common.CKPT_INSTANCE_REGRESSION_HEAD_WITHOUT_LAST_LAYER: + self._instance_regression_head.conv_block, + common.CKPT_INSTANCE_REGRESSION_HEAD_LAST_LAYER: + self._instance_regression_head.final_conv, + } + items.update(instance_items) + if self._next_instance_decoder is not None: + next_instance_items = { + common.CKPT_NEXT_INSTANCE_DECODER: + self._next_instance_decoder, + common.CKPT_NEXT_INSTANCE_REGRESSION_HEAD_WITHOUT_LAST_LAYER: + self._next_instance_regression_head.conv_block, + common.CKPT_NEXT_INSTANCE_REGRESSION_HEAD_LAST_LAYER: + self._next_instance_regression_head.final_conv, + } + items.update(next_instance_items) + return items + + def call(self, features, next_features, training=False): + """Performs a forward pass. + + Args: + features: An input dict of tf.Tensor with shape [batch, height, width, + channels]. Different keys should point to different features extracted + by the encoder, e.g. low-level or high-level features. + next_features: An input dict of tf.Tensor similar to features. The + features are computed with the next frame as input. + training: A boolean flag indicating whether training behavior should be + used (default: False). 
+
+    Returns:
+      A dictionary containing the results of the semantic segmentation head and
+      depending on the configuration also of the instance segmentation head.
+    """
+
+    semantic_features = self._semantic_decoder(features, training=training)
+    results = self._semantic_head(semantic_features, training=training)
+
+    if self._instance_decoder is not None:
+      instance_features = self._instance_decoder(features, training=training)
+      instance_center_predictions = self._instance_center_head(
+          instance_features, training=training)
+      instance_regression_predictions = self._instance_regression_head(
+          instance_features, training=training)
+
+      if results.keys() & instance_center_predictions.keys():
+        raise ValueError('The keys of the semantic branch and the instance '
+                         'center branch overlap. Please use unique keys.')
+      results.update(instance_center_predictions)
+
+      if results.keys() & instance_regression_predictions.keys():
+        raise ValueError('The keys of the semantic branch and the instance '
+                         'regression branch overlap. Please use unique keys.')
+      results.update(instance_regression_predictions)
+
+    if self._next_instance_decoder is not None:
+      # We update the high-level features in next_features with the
+      # concatenation of the high-level features from both features and
+      # next_features.
+      high_level_feature_name = self._next_high_level_feature_name
+      high_level_features = features[high_level_feature_name]
+      next_high_level_features = next_features[high_level_feature_name]
+      next_high_level_features = tf.concat(
+          [high_level_features, next_high_level_features], axis=3)
+      next_features[high_level_feature_name] = next_high_level_features
+      next_regression_features = self._next_instance_decoder(
+          next_features, training=training)
+      next_regression_predictions = self._next_instance_regression_head(
+          next_regression_features, training=training)
+      if results.keys() & next_regression_predictions.keys():
+        raise ValueError('The keys of the next regression branch overlap. '
+                         'Please use unique keys.')
+      results.update(next_regression_predictions)
+
+    return results
diff --git a/model/deeplab.py b/model/deeplab.py
new file mode 100644
index 0000000000000000000000000000000000000000..617908e7469ba77e5458156aca948162b22752b2
--- /dev/null
+++ b/model/deeplab.py
@@ -0,0 +1,280 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains the DeepLab meta architecture."""
+import collections
+import functools
+from typing import Any, Dict, Text, Tuple
+
+from absl import logging
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2 import config_pb2
+from deeplab2.data import dataset
+from deeplab2.model import builder
+from deeplab2.model import utils
+from deeplab2.model.post_processor import post_processor_builder
+
+_OFFSET_OUTPUT = 'offset'
+
+
+class DeepLab(tf.keras.Model):
+  """This class represents the DeepLab meta architecture.
+ + This class supports four architectures of the DeepLab family: DeepLab V3, + DeepLab V3+, Panoptic-DeepLab, and MaX-DeepLab. The exact architecture must be + defined during initialization. + """ + + def __init__(self, + config: config_pb2.ExperimentOptions, + dataset_descriptor: dataset.DatasetDescriptor): + """Initializes a DeepLab architecture. + + Args: + config: A config_pb2.ExperimentOptions configuration. + dataset_descriptor: A dataset.DatasetDescriptor. + + Raises: + ValueError: If MaX-DeepLab is used with multi-scale inference. + """ + super(DeepLab, self).__init__(name='DeepLab') + + if config.trainer_options.solver_options.use_sync_batchnorm: + logging.info('Synchronized Batchnorm is used.') + bn_layer = functools.partial( + tf.keras.layers.experimental.SyncBatchNormalization, + momentum=config.trainer_options.solver_options.batchnorm_momentum, + epsilon=config.trainer_options.solver_options.batchnorm_epsilon) + else: + logging.info('Standard (unsynchronized) Batchnorm is used.') + bn_layer = functools.partial( + tf.keras.layers.BatchNormalization, + momentum=config.trainer_options.solver_options.batchnorm_momentum, + epsilon=config.trainer_options.solver_options.batchnorm_epsilon) + + # Divide weight decay by 2 to match the implementation of tf.nn.l2_loss. In + # this way, we allow our users to use a normal weight decay (e.g., 1e-4 for + # ResNet variants) in the config textproto. Then, we pass the adjusted + # weight decay (e.g., 5e-5 for ResNets) to keras in order to exactly match + # the commonly used tf.nn.l2_loss in TF1. References: + # https://github.com/tensorflow/models/blob/68ee72ae785274156b9e943df4145b257cd78b32/official/vision/beta/tasks/image_classification.py#L41 + # https://www.tensorflow.org/api_docs/python/tf/keras/regularizers/l2 + # https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss + self._encoder = builder.create_encoder( + config.model_options.backbone, bn_layer, + conv_kernel_weight_decay=( + config.trainer_options.solver_options.weight_decay / 2)) + + self._decoder = builder.create_decoder( + config.model_options, bn_layer, dataset_descriptor.ignore_label) + + self._is_max_deeplab = ( + config.model_options.WhichOneof('meta_architecture') == 'max_deeplab') + self._post_processor = post_processor_builder.get_post_processor( + config, dataset_descriptor) + + # The ASPP pooling size is always set to train crop size, which is found to + # be experimentally better. + pool_size = config.train_dataset_options.crop_size + output_stride = float(config.model_options.backbone.output_stride) + pool_size = tuple( + utils.scale_mutable_sequence(pool_size, 1.0 / output_stride)) + logging.info('Setting pooling size to %s', pool_size) + self.set_pool_size(pool_size) + + # Variables for multi-scale inference. + self._add_flipped_images = config.evaluator_options.add_flipped_images + if not config.evaluator_options.eval_scales: + self._eval_scales = [1.0] + else: + self._eval_scales = config.evaluator_options.eval_scales + if self._is_max_deeplab and ( + self._add_flipped_images or len(self._eval_scales) > 1): + raise ValueError( + 'MaX-DeepLab does not support multi-scale inference yet.') + + def call(self, + input_tensor: tf.Tensor, + training: bool = False) -> Dict[Text, Any]: + """Performs a forward pass. + + Args: + input_tensor: An input tensor of type tf.Tensor with shape [batch, height, + width, channels]. The input tensor should contain batches of RGB images. 
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      A dictionary containing the results of the specified DeepLab
+      architecture. The results are bilinearly upsampled to the input size
+      before returning.
+    """
+    # Normalize the input in the same way as Inception. We normalize it outside
+    # the encoder so that we can extend encoders to different backbones without
+    # copying the normalization to each encoder. We normalize it after data
+    # preprocessing because it is faster on TPUs than on host CPUs. The
+    # normalization should not increase TPU memory consumption because it does
+    # not require gradient.
+    input_tensor = input_tensor / 127.5 - 1.0
+    # Get the static spatial shape of the input tensor.
+    _, input_h, input_w, _ = input_tensor.get_shape().as_list()
+    if training:
+      result_dict = self._decoder(
+          self._encoder(input_tensor, training=training), training=training)
+      result_dict = self._resize_predictions(
+          result_dict,
+          target_h=input_h,
+          target_w=input_w)
+    else:
+      result_dict = collections.defaultdict(list)
+      # Evaluation mode where one could perform multi-scale inference.
+      scale_1_pool_size = self.get_pool_size()
+      logging.info('Eval with scales %s', self._eval_scales)
+      for eval_scale in self._eval_scales:
+        # Get the scaled images/pool_size for each scale.
+        scaled_images, scaled_pool_size = (
+            self._scale_images_and_pool_size(
+                input_tensor, list(scale_1_pool_size), eval_scale))
+        # Update the ASPP pool size for different eval scales.
+        self.set_pool_size(tuple(scaled_pool_size))
+        logging.info('Eval scale %s; setting pooling size to %s',
+                     eval_scale, scaled_pool_size)
+        pred_dict = self._decoder(
+            self._encoder(scaled_images, training=training), training=training)
+        # MaX-DeepLab skips this resizing and upsamples the mask outputs in
+        # self._post_processor.
+        pred_dict = self._resize_predictions(
+            pred_dict,
+            target_h=input_h,
+            target_w=input_w)
+        # Change the semantic logits to probabilities with softmax. Note that
+        # one could remove the semantic logits for faster inference. We still
+        # keep them since they will be used to compute the evaluation loss.
+        pred_dict[common.PRED_SEMANTIC_PROBS_KEY] = tf.nn.softmax(
+            pred_dict[common.PRED_SEMANTIC_LOGITS_KEY])
+        # Store the predictions from each scale.
+        for output_type, output_value in pred_dict.items():
+          result_dict[output_type].append(output_value)
+        if self._add_flipped_images:
+          pred_dict_reverse = self._decoder(
+              self._encoder(tf.reverse(scaled_images, [2]), training=training),
+              training=training)
+          pred_dict_reverse = self._resize_predictions(
+              pred_dict_reverse,
+              target_h=input_h,
+              target_w=input_w,
+              reverse=True)
+          # Change the semantic logits to probabilities with softmax.
+          pred_dict_reverse[common.PRED_SEMANTIC_PROBS_KEY] = tf.nn.softmax(
+              pred_dict_reverse[common.PRED_SEMANTIC_LOGITS_KEY])
+          # Store the predictions from each scale.
+          for output_type, output_value in pred_dict_reverse.items():
+            result_dict[output_type].append(output_value)
+      # Set back the pool_size for scale 1.0, the original setting.
+      self.set_pool_size(tuple(scale_1_pool_size))
+      # Average results across scales.
+      for output_type, output_value in result_dict.items():
+        result_dict[output_type] = tf.reduce_mean(
+            tf.stack(output_value, axis=0), axis=0)
+      # Post-process the results.
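+      # The post-processor consumes the merged multi-scale predictions and
+      # adds its own outputs (e.g., the final panoptic prediction) to the
+      # result dict.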
+      result_dict.update(self._post_processor(result_dict))
+
+    if common.PRED_CENTER_HEATMAP_KEY in result_dict:
+      result_dict[common.PRED_CENTER_HEATMAP_KEY] = tf.squeeze(
+          result_dict[common.PRED_CENTER_HEATMAP_KEY], axis=3)
+    return result_dict
+
+  def reset_pooling_layer(self):
+    """Resets the ASPP pooling layer to global average pooling."""
+    self._decoder.reset_pooling_layer()
+
+  def set_pool_size(self, pool_size: Tuple[int, int]):
+    """Sets the pooling size of the ASPP pooling layer.
+
+    Args:
+      pool_size: A tuple specifying the pooling size of the ASPP pooling layer.
+    """
+    self._decoder.set_pool_size(pool_size)
+
+  def get_pool_size(self):
+    return self._decoder.get_pool_size()
+
+  @property
+  def checkpoint_items(self) -> Dict[Text, Any]:
+    items = dict(encoder=self._encoder)
+    items.update(self._decoder.checkpoint_items)
+    return items
+
+  def _resize_predictions(self, result_dict, target_h, target_w,
+                          reverse=False):
+    """Resizes predictions to the target height and width.
+
+    This function resizes the items in the result_dict to the target height and
+    width. The items are optionally reversed w.r.t. width if `reverse` is True.
+
+    Args:
+      result_dict: A dictionary storing prediction results to be resized.
+      target_h: An integer, the target height.
+      target_w: An integer, the target width.
+      reverse: A boolean, reversing the prediction result w.r.t. width.
+
+    Returns:
+      Resized (or optionally reversed) result_dict.
+    """
+    # By default, MaX-DeepLab does not upsample any output during training in
+    # order to save GPU/TPU memory, but upsampling might lead to better
+    # performance.
+    if self._is_max_deeplab:
+      return result_dict
+    for key, value in result_dict.items():
+      if reverse:
+        value = tf.reverse(value, [2])
+        # Special care to offsets: need to flip x-offsets.
+        if _OFFSET_OUTPUT in key:
+          offset_y, offset_x = tf.split(
+              value=value, num_or_size_splits=2, axis=3)
+          offset_x *= -1
+          value = tf.concat([offset_y, offset_x], 3)
+      if _OFFSET_OUTPUT in key:
+        result_dict[key] = utils.resize_and_rescale_offsets(
+            value, [target_h, target_w])
+      else:
+        result_dict[key] = utils.resize_bilinear(
+            value, [target_h, target_w])
+    return result_dict
+
+  def _scale_images_and_pool_size(self, images, pool_size, scale):
+    """Scales images and pool_size w.r.t. scale.
+
+    Args:
+      images: An input tensor with shape [batch, height, width, 3].
+      pool_size: A list with two elements, specifying the pooling size
+        of ASPP pooling layer.
+      scale: A float, used to scale the input images and pool_size.
+
+    Returns:
+      Scaled images, and pool_size.
+    """
+    if scale == 1.0:
+      scaled_images = images
+      scaled_pool_size = pool_size
+    else:
+      image_size = images.get_shape().as_list()[1:3]
+      scaled_image_size = utils.scale_mutable_sequence(image_size, scale)
+      scaled_images = utils.resize_bilinear(images, scaled_image_size)
+      scaled_pool_size = [None, None]
+      if pool_size != [None, None]:
+        scaled_pool_size = utils.scale_mutable_sequence(pool_size, scale)
+    return scaled_images, scaled_pool_size
diff --git a/model/deeplab_test.py b/model/deeplab_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab90af61157ae23f92890d49f1490e1bb1cd7a30
--- /dev/null
+++ b/model/deeplab_test.py
@@ -0,0 +1,252 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for deeplab.""" + +import os + +import numpy as np +import tensorflow as tf + +from google.protobuf import text_format +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.data import dataset +from deeplab2.model import deeplab +from deeplab2.model import utils +# resources dependency + +_CONFIG_PATH = 'deeplab2/configs/example' + + +def _read_proto_file(filename, proto): + filename = filename # OSS: removed internal filename loading. + with tf.io.gfile.GFile(filename, 'r') as proto_file: + return text_format.ParseLines(proto_file, proto) + + +def _create_model_from_test_proto(file_name, + dataset_name='cityscapes_panoptic'): + proto_filename = os.path.join(_CONFIG_PATH, file_name) + config = _read_proto_file(proto_filename, config_pb2.ExperimentOptions()) + return deeplab.DeepLab(config, + dataset.MAP_NAME_TO_DATASET_INFO[dataset_name] + ), config + + +class DeeplabTest(tf.test.TestCase): + + def test_deeplab_with_deeplabv3(self): + model, experiment_options = _create_model_from_test_proto( + 'example_cityscapes_deeplabv3.textproto') + train_crop_size = tuple( + experiment_options.train_dataset_options.crop_size) + input_tensor = tf.random.uniform( + shape=(2, train_crop_size[0], train_crop_size[1], 3)) + expected_semantic_shape = [ + 2, train_crop_size[0], train_crop_size[1], + experiment_options.model_options.deeplab_v3.num_classes] + resulting_dict = model(input_tensor) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_semantic_shape) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in model.trainable_weights]) + self.assertEqual(num_params, 39638355) + + def test_deeplab_with_deeplabv3plus(self): + model, experiment_options = _create_model_from_test_proto( + 'example_cityscapes_deeplabv3plus.textproto') + train_crop_size = tuple( + experiment_options.train_dataset_options.crop_size) + input_tensor = tf.random.uniform( + shape=(2, train_crop_size[0], train_crop_size[1], 3)) + expected_semantic_shape = [ + 2, train_crop_size[0], train_crop_size[1], + experiment_options.model_options.deeplab_v3_plus.num_classes] + resulting_dict = model(input_tensor) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_semantic_shape) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in model.trainable_weights]) + self.assertEqual(num_params, 39210947) + + def test_deeplab_with_deeplabv3_mv3l(self): + model, experiment_options = _create_model_from_test_proto( + 'example_cityscapes_deeplabv3_mv3l.textproto') + train_crop_size = tuple( + experiment_options.train_dataset_options.crop_size) + input_tensor = tf.random.uniform( + shape=(2, train_crop_size[0], train_crop_size[1], 3)) + expected_semantic_shape = [ + 2, train_crop_size[0], train_crop_size[1], + experiment_options.model_options.deeplab_v3.num_classes] + resulting_dict = model(input_tensor) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_semantic_shape) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in 
model.trainable_weights]) + self.assertEqual(num_params, 11024963) + + def test_deeplab_with_panoptic_deeplab(self): + model, experiment_options = _create_model_from_test_proto( + 'example_cityscapes_panoptic_deeplab.textproto') + train_crop_size = tuple( + experiment_options.train_dataset_options.crop_size) + input_tensor = tf.random.uniform( + shape=(2, train_crop_size[0], train_crop_size[1], 3)) + expected_semantic_shape = [ + 2, train_crop_size[0], train_crop_size[1], + experiment_options.model_options.panoptic_deeplab.semantic_head. + output_channels] + expected_instance_center_shape = [ + 2, train_crop_size[0], train_crop_size[1]] + expected_instance_regression_shape = [ + 2, train_crop_size[0], train_crop_size[1], 2] + resulting_dict = model(input_tensor) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_semantic_shape) + self.assertListEqual( + resulting_dict[common.PRED_INSTANCE_SCORES_KEY].shape.as_list(), + expected_instance_center_shape) + self.assertListEqual( + resulting_dict[common.PRED_CENTER_HEATMAP_KEY].shape.as_list(), + expected_instance_center_shape) + self.assertListEqual( + resulting_dict[common.PRED_OFFSET_MAP_KEY].shape.as_list(), + expected_instance_regression_shape) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in model.trainable_weights]) + self.assertEqual(num_params, 54973702) + + def test_deeplab_with_panoptic_deeplab_mv3l(self): + model, experiment_options = _create_model_from_test_proto( + 'example_cityscapes_panoptic_deeplab_mv3l.textproto') + train_crop_size = tuple( + experiment_options.train_dataset_options.crop_size) + input_tensor = tf.random.uniform( + shape=(2, train_crop_size[0], train_crop_size[1], 3)) + expected_semantic_shape = [ + 2, train_crop_size[0], train_crop_size[1], + experiment_options.model_options.panoptic_deeplab.semantic_head. + output_channels] + expected_instance_center_shape = [ + 2, train_crop_size[0], train_crop_size[1]] + expected_instance_regression_shape = [ + 2, train_crop_size[0], train_crop_size[1], 2] + resulting_dict = model(input_tensor) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_semantic_shape) + self.assertListEqual( + resulting_dict[common.PRED_INSTANCE_SCORES_KEY].shape.as_list(), + expected_instance_center_shape) + self.assertListEqual( + resulting_dict[common.PRED_CENTER_HEATMAP_KEY].shape.as_list(), + expected_instance_center_shape) + self.assertListEqual( + resulting_dict[common.PRED_OFFSET_MAP_KEY].shape.as_list(), + expected_instance_regression_shape) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in model.trainable_weights]) + self.assertEqual(num_params, 18226550) + + def test_deeplab_with_max_deeplab(self): + model, experiment_options = _create_model_from_test_proto( + 'example_coco_max_deeplab.textproto', dataset_name='coco_panoptic') + train_crop_size = tuple( + experiment_options.train_dataset_options.crop_size) + input_tensor = tf.random.uniform( + shape=(2, train_crop_size[0], train_crop_size[1], 3)) + stride_4_size = utils.scale_mutable_sequence(train_crop_size, 0.25) + expected_semantic_shape = [ + 2, stride_4_size[0], stride_4_size[1], experiment_options.model_options. + max_deeplab.auxiliary_semantic_head.output_channels] + expected_transformer_class_logits_shape = [ + 2, 128, experiment_options.model_options. 
+ max_deeplab.auxiliary_semantic_head.output_channels] + expected_pixel_space_normalized_feature_shape = [ + 2, stride_4_size[0], stride_4_size[1], experiment_options.model_options. + max_deeplab.pixel_space_head.output_channels] + expected_pixel_space_mask_logits_shape = [ + 2, stride_4_size[0], stride_4_size[1], 128] + resulting_dict = model(input_tensor, training=True) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_semantic_shape) + self.assertListEqual( + resulting_dict[ + common.PRED_TRANSFORMER_CLASS_LOGITS_KEY].shape.as_list(), + expected_transformer_class_logits_shape) + self.assertListEqual( + resulting_dict[ + common.PRED_PIXEL_SPACE_NORMALIZED_FEATURE_KEY].shape.as_list(), + expected_pixel_space_normalized_feature_shape) + self.assertListEqual( + resulting_dict[common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY].shape.as_list(), + expected_pixel_space_mask_logits_shape) + num_params = 0 + for v in model.trainable_weights: + params = np.prod(v.get_shape().as_list()) + # Exclude the auxiliary semantic head. + if 'auxiliary_semantic' not in v.name: + num_params += params + self.assertEqual(num_params, 61900200) # 61.9M in the paper. + + def test_deeplab_errors(self): + proto_filename = os.path.join( + _CONFIG_PATH, 'example_cityscapes_panoptic_deeplab.textproto') + experiment_options = _read_proto_file(proto_filename, + config_pb2.ExperimentOptions()) + + with self.subTest('ResNet error.'): + with self.assertRaises(ValueError): + experiment_options.model_options.backbone.name = 'not_a_resnet_backbone' + _ = deeplab.DeepLab(experiment_options, + dataset.CITYSCAPES_PANOPTIC_INFORMATION) + + with self.subTest('Encoder family error.'): + with self.assertRaises(ValueError): + experiment_options.model_options.backbone.name = 'not_a_backbone' + _ = deeplab.DeepLab(experiment_options, + dataset.CITYSCAPES_PANOPTIC_INFORMATION) + + def test_deeplab_set_pooling(self): + model, _ = _create_model_from_test_proto( + 'example_cityscapes_panoptic_deeplab.textproto') + pool_size = (10, 10) + model.set_pool_size(pool_size) + + self.assertTupleEqual( + model._decoder._semantic_decoder._aspp._aspp_pool._pool_size, pool_size) + self.assertTupleEqual( + model._decoder._instance_decoder._aspp._aspp_pool._pool_size, pool_size) + + def test_deeplab_reset_pooling(self): + model, _ = _create_model_from_test_proto( + 'example_cityscapes_panoptic_deeplab.textproto') + model.reset_pooling_layer() + pool_size = (None, None) + self.assertTupleEqual( + model._decoder._semantic_decoder._aspp._aspp_pool._pool_size, pool_size) + self.assertTupleEqual( + model._decoder._instance_decoder._aspp._aspp_pool._pool_size, pool_size) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/encoder/__init__.py b/model/encoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03 --- /dev/null +++ b/model/encoder/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/model/encoder/axial_resnet.py b/model/encoder/axial_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..5e54ec52c73a4ed32f882b44717a163800938787 --- /dev/null +++ b/model/encoder/axial_resnet.py @@ -0,0 +1,776 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implements Axial-ResNets proposed in Axial-DeepLab [1]. + +[1] Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. +""" + +import tensorflow as tf + +from deeplab2.model import utils +from deeplab2.model.layers import activations +from deeplab2.model.layers import axial_block_groups +from deeplab2.model.layers import convolutions +from deeplab2.model.layers import resized_fuse +from deeplab2.model.layers import stems + +# Add a suffix in layer names that indicate if the current layer is a part of +# the backbone or an extra layer, i.e. if the current layer will be pretrained +# or not. This name will be used when we apply 10x larger learning rates for +# extra parameters that have not been pretrained, in panoptic segmentation. +# This keyword is reserved and should not be a part of the variable names in a +# classification pretrained backbone. +EXTRA = 'extra' +# Similarly, we will apply 10x larger learning rates on the memory feature. +# This global variable name will be accessed when we build the optimizers. This +# keyword is reserved and should not be a part of the variable names in a +# classification pretrained backbone. +MEMORY_FEATURE = 'memory_feature' + + +class AxialResNet(tf.keras.Model): + """An Axial-ResNet model as proposed in Axial-DeepLab [1] and MaX-DeepLab [2]. + + An Axial-ResNet [1] replaces 3x3 convolutions in a Resnet by axial-attention + layers. A dual-path transformer [2] and a stacked decoder [2] can be used + optionally. In addition, this class supports scaling models with SWideRNet [3] + and augmenting convolutions with Switchable Atrous Convolution [4]. + + Reference: + [1] Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + [2] MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. + [3] Scaling Wide Residual Networks for Panoptic Segmentation, + https://arxiv.org/abs/2011.11675 + Liang-Chieh Chen, Huiyu Wang, Siyuan Qiao. + [4] DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable + Atrous Convolution, CVPR 2021. https://arxiv.org/abs/2006.02334 + Siyuan Qiao, Liang-Chieh Chen, Alan Yuille. 
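+
+  A minimal usage sketch (illustrative only; the constructor defaults
+  correspond to MaX-DeepLab-S for panoptic segmentation):
+
+    model = AxialResNet(name='max_deeplab_s')
+    endpoints = model(tf.zeros([2, 65, 65, 3]), training=False)
+    panoptic_feature = endpoints['feature_panoptic']
+    semantic_feature = endpoints['feature_semantic']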
+ """ + + def __init__(self, + name, + num_blocks=(3, 4, 6, 3), + backbone_layer_multiplier=1.0, + width_multiplier=1.0, + stem_width_multiplier=1.0, + output_stride=16, + classification_mode=False, + backbone_type='resnet_beta', + use_axial_beyond_stride=16, + backbone_use_transformer_beyond_stride=32, + extra_decoder_use_transformer_beyond_stride=32, + backbone_decoder_num_stacks=0, + backbone_decoder_blocks_per_stage=1, + extra_decoder_num_stacks=0, + extra_decoder_blocks_per_stage=1, + max_num_mask_slots=128, + num_mask_slots=128, + memory_channels=256, + base_transformer_expansion=1.0, + global_feed_forward_network_channels=256, + high_resolution_output_stride=4, + activation='relu', + block_group_config=None, + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0): + """Initializes an AxialResNet model. + + Args: + name: A string, the name of the model. + num_blocks: A list of 4 integers. It denotes the number of blocks to + include in the last 4 stages or block groups. Each group consists of + blocks that output features of the same resolution. Defaults to (3, 4, + 6, 3) as in MaX-DeepLab-S. + backbone_layer_multiplier: A float, layer_multiplier for the backbone, + excluding the STEM. This flag controls the number of layers. Defaults to + 1.0 as in MaX-DeepLab-S. + width_multiplier: A float, the channel multiplier for the block groups. + Defaults to 1.0 as in MaX-DeepLab-S. + stem_width_multiplier: A float, the channel multiplier for stem + convolutions. Defaults to 1.0 as in MaX-DeepLab-S. + output_stride: An integer, the maximum ratio of input to output spatial + resolution. Defaults to 16 as in MaX-DeepLab-S. + classification_mode: A boolean, whether to perform in a classification + mode. If it is True, this function directly returns backbone feature + endpoints. Note that these feature endpoints can also be used directly + for Panoptic-DeepLab or Motion-DeepLab. If it is False, this function + builds MaX-DeepLab extra decoder layers and extra transformer layers. + Defaults to False as in MaX-DeepLab. + backbone_type: A string, the type of backbone. Supports 'resnet', + 'resnet_beta', and 'wider_resnet'. It controls both the stem type and + the residual block type. Defaults to 'resnet_beta' as in MaX-DeepLab-S. + use_axial_beyond_stride: An integer, the stride beyond which we use axial + attention. Set to 0 if no axial attention is desired. Defaults to 16 as + in MaX-DeepLab. + backbone_use_transformer_beyond_stride: An integer, the stride beyond + which we use a memory path transformer block on top of a regular pixel + path block, in the backbone. Set to 0 if no transformer block is desired + in the backbone. Defaults to 32 as in MaX-DeepLab-S. + extra_decoder_use_transformer_beyond_stride: An integer, the stride beyond + which we use a memory path transformer block on top of a regular pixel + path block, in the extra decoder stages. Set to 0 if no transformer + block is desired in the extra decoder stages. Defaults to 32 as in + MaX-DeepLab-S. + backbone_decoder_num_stacks: An integer, the number of decoder stacks + (introduced in MaX-DeepLab) that we use in the backbone. The stacked + decoders are applied in a stacked hour-glass style. Defaults to 0 as in + MaX-DeepLab-S. + backbone_decoder_blocks_per_stage: An integer, the number of consecutive + residual blocks to apply for each decoder stage, in the backbone. + Defaults to 1 as in MaX-DeepLab-S. 
+      extra_decoder_num_stacks: An integer, the number of decoder stacks
+        (introduced in MaX-DeepLab) that we use in the extra decoder layers. It
+        is different from backbone_decoder_num_stacks in that the extra
+        decoder stacks will be trained from scratch on segmentation tasks,
+        instead of pretrained on ImageNet classification. Defaults to 0 as in
+        MaX-DeepLab-S.
+      extra_decoder_blocks_per_stage: An integer, the number of consecutive
+        residual blocks to apply for each decoder stage, in the extra decoder
+        stages. Defaults to 1 as in MaX-DeepLab-S.
+      max_num_mask_slots: An integer, the maximum possible number of mask slots
+        that will be used. This will be used in a pretraining-finetuning use
+        case with different num_mask_slots: We can set max_num_mask_slots to
+        the maximum possible num_mask_slots, and then the saved checkpoint can
+        be loaded for finetuning with a different num_mask_slots. Defaults to
+        128 as in MaX-DeepLab.
+      num_mask_slots: An integer, the number of mask slots that will be used.
+        Defaults to 128 as in MaX-DeepLab-S.
+      memory_channels: An integer, the number of channels for the whole memory
+        path. Defaults to 256 as in MaX-DeepLab-S.
+      base_transformer_expansion: A float, the base width expansion rate for
+        transformer layers. Defaults to 1.0 as in MaX-DeepLab-S.
+      global_feed_forward_network_channels: An integer, the number of channels
+        in the final global feed forward network, i.e. the mask feature head
+        and the mask class head. Defaults to 256 as in MaX-DeepLab-S.
+      high_resolution_output_stride: An integer, the final decoding output
+        stride. Defaults to 4 as in MaX-DeepLab-S.
+      activation: A string, the type of activation function to apply. Supports
+        'relu', 'swish' (or 'silu'), 'gelu', 'approximated_gelu', and 'elu'.
+      block_group_config: An argument dictionary that will be passed to
+        block_group.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+
+    Raises:
+      ValueError: If backbone_type is not one of 'resnet', 'resnet_beta', or
+        'wider_resnet'.
+      ValueError: If extra_decoder_blocks_per_stage is not greater than zero.
+    """
+    super(AxialResNet, self).__init__(name=name)
+
+    if extra_decoder_blocks_per_stage <= 0:
+      raise ValueError(
+          'extra_decoder_blocks_per_stage should be greater than zero.')
+    if block_group_config is None:
+      block_group_config = {}
+
+    # Compute parameter lists for block_groups. We consider five stages so that
+    # it is general enough to cover fully axial resnets and wider resnets.
+    total_strides_list = [1, 2, 4, 8, 16]
+
+    # Append 3 blocks for the first stage of fully axial resnets and wider
+    # resnets.
+    num_blocks_list = [3] + utils.scale_int_list(list(num_blocks),
+                                                 backbone_layer_multiplier)
+    strides_list = [2] * 5
+
+    # Expand the transformer and the block filters with the stride.
+    transformer_expansions_list = []
+    filters_list = []
+    for index, stride in enumerate(total_strides_list):
+      # Reduce the number of channels when we apply transformer to low level
+      # features (stride = 2, 4, or 8). The base_transformer_expansion is used
+      # for stride = 16, i.e. the standard output_stride for MaX-DeepLab-S.
+      transformer_expansions_list.append(base_transformer_expansion * stride /
+                                         16.0)
+      # Compute the base number of filters in each stage.
For example, the last + # stage of ResNet50 has an input stride of 16, then we compute the base + # number of filters for a bottleneck block as 16 * 32 = 512, which is the + # number of filters for the 3x3 convolution in those blocks. + if backbone_type == 'wider_resnet' and index == 0: + # SWideRNet variants use stem_width_multiplier for the first block. + filters_list.append(int(round(stride * 32 * stem_width_multiplier))) + else: + filters_list.append(int(round(stride * 32 * width_multiplier))) + + self._num_mask_slots = None + # Initialize memory_feature only when a transformer block is used. + self._use_memory_feature = (backbone_use_transformer_beyond_stride or + (extra_decoder_use_transformer_beyond_stride and + (not classification_mode))) + if self._use_memory_feature: + self._memory_feature_shape = (1, max_num_mask_slots, memory_channels) + self._memory_feature_initializer = ( + tf.keras.initializers.TruncatedNormal(stddev=1.0)) + self._memory_feature_regularizer = tf.keras.regularizers.l2( + conv_kernel_weight_decay) + if num_mask_slots: + self._num_mask_slots = num_mask_slots + + # Use a convolutional stem except fully axial cases. + stem_channels = int(round(64 * stem_width_multiplier)) + self._activation_fn = activations.get_activation(activation) + if use_axial_beyond_stride == 1: + self._stem = tf.identity + first_block_index = 0 + elif backbone_type.lower() == 'wider_resnet': + self._stem = convolutions.Conv2DSame( + output_channels=stem_channels, + kernel_size=3, + name='stem', + strides=2, + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='none', + conv_kernel_weight_decay=conv_kernel_weight_decay) + # Wider ResNet has five residual block stages, so we start from index 0. + first_block_index = 0 + # Since we have applied the first strided convolution here, we do not use + # a stride for the first stage (which will operate on stride 2). + strides_list[0] = 1 + total_strides_list[0] = 2 + elif backbone_type.lower() == 'resnet_beta': + self._stem = stems.InceptionSTEM( + bn_layer=bn_layer, + width_multiplier=stem_width_multiplier, + conv_kernel_weight_decay=conv_kernel_weight_decay, + activation=activation) + first_block_index = 1 + elif backbone_type.lower() == 'resnet': + self._stem = convolutions.Conv2DSame( + output_channels=stem_channels, + kernel_size=7, + name='stem', + strides=2, + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='none', + conv_kernel_weight_decay=conv_kernel_weight_decay) + first_block_index = 1 + else: + raise ValueError(backbone_type + ' is not supported.') + + self._first_block_index = first_block_index + # Apply standard ResNet block groups. We use first_block_index to + # distinguish models with 4 stages and those with 5 stages. 
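+    # (first_block_index is 0 for fully axial and 'wider_resnet' variants,
+    # which run five block groups, and 1 for the 'resnet' and 'resnet_beta'
+    # stems, which run the standard four.)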
+ for index in range(first_block_index, 5): + current_name = '_stage{}'.format(index + 1) + utils.safe_setattr(self, current_name, axial_block_groups.BlockGroup( + filters=filters_list[index], + num_blocks=num_blocks_list[index], + name=utils.get_layer_name(current_name), + original_resnet_stride=strides_list[index], + original_resnet_input_stride=total_strides_list[index], + output_stride=output_stride, + backbone_type=backbone_type, + use_axial_beyond_stride=use_axial_beyond_stride, + use_transformer_beyond_stride=( + backbone_use_transformer_beyond_stride), + transformer_expansion=transformer_expansions_list[index], + activation=activation, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay, + **block_group_config)) + self._backbone_decoder_num_stacks = backbone_decoder_num_stacks + self._classification_mode = classification_mode + self._extra_decoder_num_stacks = extra_decoder_num_stacks + self._output_stride = output_stride + self._high_resolution_output_stride = high_resolution_output_stride + self._width_multiplier = width_multiplier + self._activation = activation + self._bn_layer = bn_layer + self._conv_kernel_weight_decay = conv_kernel_weight_decay + self._backbone_use_transformer_beyond_stride = ( + backbone_use_transformer_beyond_stride) + self._extra_decoder_use_transformer_beyond_stride = ( + extra_decoder_use_transformer_beyond_stride) + + # Keep track of the current stack so that we know when to stop. + current_stack = 0 + # Track whether we are building the backbone. This will affect the backbone + # related arguments, local learning rate, and so on. + current_is_backbone = True + + if backbone_decoder_num_stacks == 0: + # No stacked decoder is used in the backbone, so we have finished building + # the backbone. We either return the classification endpoints, or continue + # building a non-backbone decoder for panoptic segmentation. + if self._classification_mode: + return + else: + current_is_backbone = False + if not current_is_backbone: + # Now that we have finished building the backbone and no stacked decoder + # is used in the backbone, so we start to build extra (i.e., non-backbone) + # layers for panoptic segmentation. + current_name = '_stage5_' + EXTRA + utils.safe_setattr( + self, current_name, axial_block_groups.BlockGroup( + filters=filters_list[-1], + num_blocks=extra_decoder_blocks_per_stage, + name=utils.get_layer_name(current_name), + original_resnet_stride=1, + original_resnet_input_stride=32, + output_stride=output_stride, + backbone_type=backbone_type, + use_axial_beyond_stride=use_axial_beyond_stride, + use_transformer_beyond_stride=( + extra_decoder_use_transformer_beyond_stride), + transformer_expansion=base_transformer_expansion, + activation=activation, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay, + **block_group_config)) + + # Compute parameter lists for stacked decoder. + total_decoder_num_stacks = ( + backbone_decoder_num_stacks + extra_decoder_num_stacks) + + # Use a function to compute the next stride. + next_stride_fn = lambda x: x // 2 + current_decoder_stride = output_stride + decoder_stage = 0 + + # Exit if we have enough stacks and reach the decoding output stride. 
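+    # For example, with output_stride = 16, high_resolution_output_stride = 4,
+    # and one decoder stack in total, current_decoder_stride visits
+    # 16 -> 8 -> 4 -> 8 -> 16 -> 8 -> 4, i.e. one hour-glass stack followed by
+    # the final decoding back to stride 4.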
+ while (current_stack < total_decoder_num_stacks or + current_decoder_stride > high_resolution_output_stride): + decoder_stage += 1 + current_decoder_stride = next_stride_fn(current_decoder_stride) + + if current_decoder_stride == output_stride: + current_stack += 1 + # Always use blocks from the last resnet stage if the current stride is + # output stride (the largest stride). + original_resnet_input_stride = 32 + + # Switch the decoder direction if we reach the largest stride. + next_stride_fn = lambda x: x // 2 + else: + original_resnet_input_stride = current_decoder_stride + + # Scale channels according to the strides. + decoder_channels = original_resnet_input_stride * 64 * width_multiplier + current_transformer_expansion = ( + base_transformer_expansion * current_decoder_stride / 16.0) + + # Apply a decoder block group for building the backbone. + if current_is_backbone: + current_name = '_decoder_stage{}'.format(decoder_stage) + utils.safe_setattr( + self, current_name, axial_block_groups.BlockGroup( + filters=decoder_channels // 4, + num_blocks=backbone_decoder_blocks_per_stage, + name=utils.get_layer_name(current_name), + original_resnet_stride=1, + original_resnet_input_stride=original_resnet_input_stride, + output_stride=output_stride, + backbone_type=backbone_type, + use_axial_beyond_stride=use_axial_beyond_stride, + use_transformer_beyond_stride=( + backbone_use_transformer_beyond_stride), + transformer_expansion=current_transformer_expansion, + activation=activation, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay, + **block_group_config)) + + if (current_decoder_stride == output_stride and + current_stack == backbone_decoder_num_stacks): + # Now that we have finished building the backbone, we either return the + # classification endpoints, or continue building a non-backbone decoder + # for panoptic segmentation. + if classification_mode: + return + else: + current_is_backbone = False + + # Apply a decoder block group for building the extra layers. + if not current_is_backbone: + # Continue building an extra (i.e., non-backbone) decoder for panoptic + # segmentation. + current_name = '_decoder_stage{}_{}'.format(decoder_stage, EXTRA) + utils.safe_setattr( + self, current_name, axial_block_groups.BlockGroup( + filters=decoder_channels // 4, + num_blocks=extra_decoder_blocks_per_stage, + name=utils.get_layer_name(current_name), + original_resnet_stride=1, + original_resnet_input_stride=original_resnet_input_stride, + output_stride=output_stride, + backbone_type=backbone_type, + use_axial_beyond_stride=use_axial_beyond_stride, + use_transformer_beyond_stride=( + extra_decoder_use_transformer_beyond_stride), + transformer_expansion=current_transformer_expansion, + activation=activation, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay, + **block_group_config)) + if current_decoder_stride == high_resolution_output_stride: + next_stride_fn = lambda x: x * 2 + + # Assert that we have already returned if we are building a classifier. + assert not classification_mode + if (backbone_use_transformer_beyond_stride or + extra_decoder_use_transformer_beyond_stride): + # Build extra memory path feed forward networks for the class feature and + # the mask feature. 
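+      # (These two Conv1D feed forward networks map the final memory feature
+      # to the 'transformer_class_feature' and 'transformer_mask_feature'
+      # endpoints returned by call_extra_endpoints.)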
+ current_name = '_class_feature_' + EXTRA + utils.safe_setattr( + self, current_name, convolutions.Conv1D( + global_feed_forward_network_channels, + utils.get_layer_name(current_name), + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation=activation, + conv_kernel_weight_decay=conv_kernel_weight_decay)) + current_name = '_mask_feature_' + EXTRA + utils.safe_setattr( + self, current_name, convolutions.Conv1D( + global_feed_forward_network_channels, + utils.get_layer_name(current_name), + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation=activation, + conv_kernel_weight_decay=conv_kernel_weight_decay)) + + def build(self, input_shape): + """Builds model weights and input shape dependent sub-layers.""" + if self._use_memory_feature: + self._memory_feature = self.add_weight( + name=MEMORY_FEATURE, + shape=self._memory_feature_shape, + initializer=self._memory_feature_initializer, + regularizer=self._memory_feature_regularizer) + else: + self._memory_feature = None + + # Go through the loop to build the ResizedFuse layers. + current_stack = 0 + # Track whether we are building the backbone. This will affect the backbone + # related arguments, local learning rate, and so on. + current_is_backbone = self._backbone_decoder_num_stacks != 0 + total_decoder_num_stacks = ( + self._backbone_decoder_num_stacks + self._extra_decoder_num_stacks) + next_stride_fn = lambda x: x // 2 + current_decoder_stride = self._output_stride + decoder_stage = 0 + while (current_stack < total_decoder_num_stacks or + current_decoder_stride > self._high_resolution_output_stride): + decoder_stage += 1 + current_decoder_stride = next_stride_fn(current_decoder_stride) + if current_decoder_stride == self._output_stride: + current_stack += 1 + original_resnet_input_stride = 32 + next_stride_fn = lambda x: x // 2 + else: + original_resnet_input_stride = current_decoder_stride + # Compute the decoder_channels according to original_resnet_input_stride. + # For example, at stride 4 with width multiplier = 1, we use 4 * 64 = 256 + # channels, which is the same as a standard ResNet. + decoder_channels = int(round( + original_resnet_input_stride * 64 * self._width_multiplier)) + decoder_height, decoder_width = utils.scale_mutable_sequence( + input_shape[1:3], 1.0 / current_decoder_stride) + if current_is_backbone: + current_name = '_decoder_stage{}_resized_fuse'.format(decoder_stage) + else: + current_name = '_decoder_stage{}_{}_resized_fuse'.format( + decoder_stage, EXTRA) + utils.safe_setattr( + self, current_name, resized_fuse.ResizedFuse( + name=utils.get_layer_name(current_name), + height=decoder_height, + width=decoder_width, + num_channels=decoder_channels, + activation=self._activation, + bn_layer=self._bn_layer, + conv_kernel_weight_decay=self._conv_kernel_weight_decay)) + if (current_decoder_stride == self._output_stride and + current_stack == self._backbone_decoder_num_stacks): + # Now that we have finished building the backbone, we either return the + # classification endpoints, or continue building a non-backbone decoder + # for panoptic segmentation. + if self._classification_mode: + return + current_is_backbone = False + if current_decoder_stride == self._high_resolution_output_stride: + next_stride_fn = lambda x: x * 2 + + def call_encoder_before_stacked_decoder(self, inputs, training=False): + """Performs a forward pass of the encoder before stacking decoders. + + Args: + inputs: An input [batch, height, width, channel] tensor. + training: A boolean, whether the model is in training mode. 
+ + Returns: + current_output: An output tensor with shape [batch, new_height, new_width, + new_channel]. + activated_output: An activated output tensor with shape [batch, + new_height, new_width, new_channel]. + memory_feature: None if no transformer is used. A [batch, num_memory, + memory_channel] tensor if transformer is used. + endpoints: A dict, the network endpoints that might be used by DeepLab. + """ + memory_feature = self._memory_feature + if self._use_memory_feature: + if self._num_mask_slots: + memory_feature = self._memory_feature[:, :self._num_mask_slots, :] + memory_feature = tf.tile(memory_feature, + [tf.shape(inputs)[0], 1, 1]) + + endpoints = {} + output = self._stem(inputs) + activated_output = self._activation_fn(output) + endpoints['stage1'] = output + endpoints['res1'] = activated_output + + # Apply standard ResNet block groups. We use first_block_index to + # distinguish models with 4 stages and those with 5 stages. + for index in range(self._first_block_index, 5): + current_name = '_stage{}'.format(index + 1) + current_output, activated_output, memory_feature = ( + getattr(self, current_name)( + (activated_output, memory_feature), training=training)) + endpoints[utils.get_layer_name(current_name)] = current_output + activated_output_name = 'res{}'.format(index + 1) + endpoints[activated_output_name] = activated_output + return current_output, activated_output, memory_feature, endpoints + + def call_stacked_decoder(self, + current_output, + activated_output, + memory_feature, + endpoints, + training=False): + """Performs a forward pass of the stacked decoders. + + Args: + current_output: An output tensor with shape [batch, new_height, new_width, + new_channel]. + activated_output: An activated output tensor with shape [batch, + new_height, new_width, new_channel]. + memory_feature: None if no transformer is used. A [batch, num_memory, + memory_channel] tensor if transformer is used. + endpoints: A dict, the network endpoints that might be used by DeepLab. + training: A boolean, whether the model is in training mode. + + Returns: + memory_feature: None if no transformer is used. A [batch, num_memory, + memory_channel] tensor if transformer is used. + high_resolution_outputs: A list of decoded tensors with + high_resolution_output_stride. + backbone_output: An output tensor of the backbone, with output_stride. + endpoints: A dict, the network endpoints that might be used by DeepLab. + """ + # Keep track of the current stack so that we know when to stop. + current_stack = 0 + # Track whether we are building the backbone. This will affect the backbone + # related arguments, local learning rate, and so on. + current_is_backbone = True + high_resolution_outputs = [] + + if self._backbone_decoder_num_stacks == 0: + # Keep track of the backbone output, since it might be used as the + # semantic feature output. + backbone_output = activated_output + # Now that we have finished building the backbone, we either return the + # classification logits, or continue building a non-backbone decoder for + # panoptic segmentation. + if self._classification_mode: + endpoints['backbone_output'] = backbone_output + return None, None, None, endpoints + else: + current_is_backbone = False + + if not current_is_backbone: + # Build extra layers if we have finished building the backbone. 
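+      # (This applies the '_stage5_extra' block group created in __init__. The
+      # EXTRA suffix marks layers that are not ImageNet pretrained and thus
+      # receive 10x larger learning rates, as noted at the top of this file.)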
+ current_name = '_stage5_' + EXTRA + current_output, activated_output, memory_feature = ( + getattr(self, current_name)( + (activated_output, memory_feature), training=training)) + + # Compute parameter lists for stacked decoder. + total_decoder_num_stacks = ( + self._backbone_decoder_num_stacks + self._extra_decoder_num_stacks) + + # Keep track of all endpoints that will be used in the stacked decoder. + stride_to_features = {} + stride_to_features[min(2, self._output_stride)] = [endpoints['stage1']] + stride_to_features[min(4, self._output_stride)] = [endpoints['stage2']] + stride_to_features[min(8, self._output_stride)] = [endpoints['stage3']] + stride_to_features[min(16, self._output_stride)] = [endpoints['stage4']] + # Only keep the last endpoint from the backbone with the same resolution, + # i.e., if the output stride is 16, the current output will override + # the stride 16 endpoint, endpoints['res4']. + stride_to_features[min(32, self._output_stride)] = [current_output] + + # Use a function to compute the next stride. + next_stride_fn = lambda x: x // 2 + current_decoder_stride = self._output_stride + decoder_stage = 0 + + # Exit if we have enough stacks and reach the decoding output stride. + while (current_stack < total_decoder_num_stacks or + current_decoder_stride > self._high_resolution_output_stride): + decoder_stage += 1 + current_decoder_stride = next_stride_fn(current_decoder_stride) + + if current_decoder_stride == self._output_stride: + current_stack += 1 + # Switch the decoder direction if we reach the largest stride. + next_stride_fn = lambda x: x // 2 + + # Include the current feature and two previous features from the target + # resolution in the decoder. We select two because it contains one upward + # feature and one downward feature, but better choices are possible. + decoder_features_list = ( + [current_output] + + stride_to_features[current_decoder_stride][-2:]) + + # Fuse and resize features with striding, resizing and 1x1 convolutions. + if current_is_backbone: + current_name = '_decoder_stage{}_resized_fuse'.format(decoder_stage) + else: + current_name = '_decoder_stage{}_{}_resized_fuse'.format( + decoder_stage, EXTRA) + activated_output = getattr(self, current_name)( + decoder_features_list, training=training) + + # Apply a decoder block group for building the backbone. + if current_is_backbone: + current_name = '_decoder_stage{}'.format(decoder_stage) + current_output, activated_output, memory_feature = ( + getattr(self, current_name)( + (activated_output, memory_feature), training=training)) + + if (current_decoder_stride == self._output_stride and + current_stack == self._backbone_decoder_num_stacks): + # Keep track of the backbone output, since it might be used as the + # semantic feature output. + backbone_output = activated_output + # Now that we have finished building the backbone, we either return the + # classification logits, or continue building a non-backbone decoder for + # panoptic segmentation. + if self._classification_mode: + endpoints['backbone_output'] = backbone_output + return None, None, None, endpoints + else: + current_is_backbone = False + + # Apply a decoder block group for building the extra layers. + if not current_is_backbone: + current_name = '_decoder_stage{}_{}'.format(decoder_stage, EXTRA) + current_output, activated_output, memory_feature = ( + getattr(self, current_name)( + (activated_output, memory_feature), training=training)) + + # Append the current feature into the feature dict for possible later + # usage. 
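+      # (Later decoder stages read back the last two features stored at each
+      # stride when assembling decoder_features_list above.)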
+ stride_to_features[current_decoder_stride].append(current_output) + if current_decoder_stride == self._high_resolution_output_stride: + high_resolution_outputs.append(activated_output) + next_stride_fn = lambda x: x * 2 + return memory_feature, high_resolution_outputs, backbone_output, endpoints + + def call_extra_endpoints(self, + memory_feature, + high_resolution_outputs, + backbone_output, + endpoints, + training=False): + """Performs a forward pass to generate extra endpoints. + + Args: + memory_feature: None if no transformer is used. A [batch, num_memory, + memory_channel] tensor if transformer is used. + high_resolution_outputs: A list of decoded tensors with + high_resolution_output_stride. + backbone_output: An output tensor of the backbone, with output_stride. + endpoints: A dict, the network endpoints that might be used by DeepLab. + training: A boolean, whether the model is in training mode. + + Returns: + endpoints: A dict, the network endpoints that might be used by DeepLab. + """ + # Assert that we have already returned if we are building a classifier. + assert not self._classification_mode + if (self._backbone_use_transformer_beyond_stride or + self._extra_decoder_use_transformer_beyond_stride): + # Build extra memory path feed forward networks for the class feature and + # the mask feature. + class_feature = getattr(self, '_class_feature_' + EXTRA)( + memory_feature, training=training) + mask_feature = getattr(self, '_mask_feature_' + EXTRA)( + memory_feature, training=training) + endpoints['transformer_class_feature'] = class_feature + endpoints['transformer_mask_feature'] = mask_feature + + # Output the last high resolution feature as panoptic feature. + endpoints['feature_panoptic'] = high_resolution_outputs[-1] + + # Avoid sharing our panoptic feature with the semantic auxiliary loss. So we + # use the backbone feature or the decoded backbone feature for the semantic + # segmentation head (i.e. the auxiliary loss). + if self._extra_decoder_num_stacks: + endpoints['feature_semantic'] = ( + high_resolution_outputs[self._backbone_decoder_num_stacks]) + else: + endpoints['feature_semantic'] = backbone_output + endpoints['backbone_output'] = backbone_output + return endpoints + + def call(self, inputs, training=False): + """Performs a forward pass. + + Args: + inputs: An input [batch, height, width, channel] tensor. + training: A boolean, whether the model is in training mode. + + Returns: + endpoints: A dict, the network endpoints that might be used by DeepLab. + """ + current_output, activated_output, memory_feature, endpoints = ( + self.call_encoder_before_stacked_decoder(inputs, training=training)) + memory_feature, high_resolution_outputs, backbone_output, endpoints = ( + self.call_stacked_decoder(current_output, + activated_output, + memory_feature, + endpoints, + training=training)) + if self._classification_mode: + return endpoints + endpoints = self.call_extra_endpoints(memory_feature, + high_resolution_outputs, + backbone_output, + endpoints, + training=training) + return endpoints diff --git a/model/encoder/axial_resnet_instances.py b/model/encoder/axial_resnet_instances.py new file mode 100644 index 0000000000000000000000000000000000000000..a110c11cd9a97aec27be98b85b5136af291004ef --- /dev/null +++ b/model/encoder/axial_resnet_instances.py @@ -0,0 +1,493 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Contains Axial-ResNet model instances for Axial-DeepLab and MaX-DeepLab. + +Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. +""" + +import abc +import collections.abc +import copy + +from absl import logging +import tensorflow as tf + +from deeplab2.model.encoder import axial_resnet + + +def _get_default_config(): + """Gets the default config for Axial-ResNets.""" + # The default config dictionary for an Axial-ResNet is the MaX-DeepLab-S + # architecture for panoptic segmentation. This default config dictionary also + # exactly matches the default arguments of the functions. + default_config = { + 'num_blocks': [3, 4, 6, 3], + 'backbone_layer_multiplier': 1.0, + 'width_multiplier': 1.0, + 'stem_width_multiplier': 1.0, + 'output_stride': 16, + 'classification_mode': False, + 'backbone_type': 'resnet_beta', + 'use_axial_beyond_stride': 16, + 'backbone_use_transformer_beyond_stride': 32, + 'extra_decoder_use_transformer_beyond_stride': 32, + 'backbone_decoder_num_stacks': 0, + 'backbone_decoder_blocks_per_stage': 1, + 'extra_decoder_num_stacks': 0, + 'extra_decoder_blocks_per_stage': 1, + 'max_num_mask_slots': 128, + 'num_mask_slots': 128, + 'memory_channels': 256, + 'base_transformer_expansion': 1.0, + 'global_feed_forward_network_channels': 256, + 'high_resolution_output_stride': 4, + 'activation': 'relu', + 'block_group_config': { + 'attention_bottleneck_expansion': 2, + 'drop_path_keep_prob': 0.8, + 'drop_path_beyond_stride': 16, + 'drop_path_schedule': 'constant', + 'positional_encoding_type': None, + 'use_global_beyond_stride': 0, + 'use_sac_beyond_stride': 0, + 'use_squeeze_and_excite': False, + 'conv_use_recompute_grad': False, + 'axial_use_recompute_grad': True, + 'recompute_within_stride': 0, + 'transformer_use_recompute_grad': False, + 'axial_layer_config': { + 'query_shape': (129, 129), + 'key_expansion': 1, + 'value_expansion': 2, + 'memory_flange': (32, 32), + 'double_global_attention': False, + 'num_heads': 8, + 'use_query_rpe_similarity': True, + 'use_key_rpe_similarity': True, + 'use_content_similarity': True, + 'retrieve_value_rpe': True, + 'retrieve_value_content': True, + 'initialization_std_for_query_key_rpe': 1.0, + 'initialization_std_for_value_rpe': 1.0, + 'self_attention_activation': 'softmax', + }, + 'dual_path_transformer_layer_config': { + 'num_heads': 8, + 'bottleneck_expansion': 2, + 'key_expansion': 1, + 'value_expansion': 2, + 'feed_forward_network_channels': 2048, + 'use_memory_self_attention': True, + 'use_pixel2memory_feedback_attention': True, + 'transformer_activation': 'softmax', + }, + }, + 'bn_layer': tf.keras.layers.BatchNormalization, + 'conv_kernel_weight_decay': 0.0, + } + return default_config + + +def override(config_dict, override_dict): + """Recursively overrides a 
config dict with another.""" + output_dict = copy.deepcopy(config_dict) + for key, value in override_dict.items(): + if isinstance(value, collections.abc.Mapping): + output_dict[key] = override(config_dict.get(key, {}), value) + else: + output_dict[key] = value + return output_dict + + +class AxialResNetInstance(axial_resnet.AxialResNet): + """A base Axial-ResNet model.""" + + @classmethod + @abc.abstractmethod + def _get_config(cls): + pass + + def __init__(self, name, **kwargs): + """Builds an Axial-ResNet model.""" + # Get the config of the current model. + current_config = self._get_config() + + # Override the default config with the current config. This line can be + # omitted because the default config equals the default arguments of the + # functions that build the model. But we make all the configs explicit here. + current_config = override(_get_default_config(), current_config) + + # Finally, override the current model config with keyword arguments. In this + # way, we still respect arguments passed as keyword arguments, such as + # classification_mode, output_stride, etc. + current_config = override(current_config, kwargs) + logging.info('Axial-ResNet final config: %s', current_config) + super(AxialResNetInstance, self).__init__(name, **current_config) + + +class MaXDeepLabS(AxialResNetInstance): + """MaX-DeepLab-S for panoptic segmentation. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + # Return an empty dictionary as the default values are all set for + # MaX-DeepLab-S. + return {} + + +class MaXDeepLabL(AxialResNetInstance): + """MaX-DeepLab-L for panoptic segmentation. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + return { + 'num_blocks': [3, 6, 3, 3], + 'backbone_type': 'wider_resnet', + 'backbone_use_transformer_beyond_stride': 16, + 'extra_decoder_use_transformer_beyond_stride': 16, + 'backbone_decoder_num_stacks': 1, + 'extra_decoder_num_stacks': 1, + 'extra_decoder_blocks_per_stage': 3, + 'memory_channels': 512, + 'base_transformer_expansion': 2.0, + 'global_feed_forward_network_channels': 512, + 'block_group_config': { + 'attention_bottleneck_expansion': 4, + 'drop_path_beyond_stride': 4, + 'axial_layer_config': { + 'key_expansion': 2, + 'value_expansion': 4, + }, + }, + } + + +class MaXDeepLabSBackbone(MaXDeepLabS): + """MaX-DeepLab-S backbone for image classification pretraining. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. 
https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + base_config = super(MaXDeepLabSBackbone, cls)._get_config() + # Override the config of MaXDeepLabS. + override_config = { + 'classification_mode': True, + # The transformer blocks are not ImageNet pretrained. They are randomly + # initialized and trained from scratch for panoptic segmentation. + 'backbone_use_transformer_beyond_stride': 0, + } + return override(base_config, override_config) + + +class MaXDeepLabLBackbone(MaXDeepLabL): + """MaX-DeepLab-L backbone for image classification pretraining. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + base_config = super(MaXDeepLabLBackbone, cls)._get_config() + # Override the config of MaXDeepLabL. + override_config = { + 'classification_mode': True, + # The transformer blocks are not ImageNet pretrained. They are randomly + # initialized and trained from scratch for panoptic segmentation. + 'backbone_use_transformer_beyond_stride': 0, + } + return override(base_config, override_config) + + +class ResNet50(AxialResNetInstance): + """A ResNet-50 instance. + + Note that the implementation is different from the original ResNet-50 in: + (1) We apply strided convolutions in the first 3x3 convolution of the first + residual block of a stage. + (2) We replace the strided max pooling layer in the stem by applying strided + convolution in the immediate next residual block. + """ + + @classmethod + def _get_config(cls): + return { + 'classification_mode': True, + 'backbone_type': 'resnet', + 'use_axial_beyond_stride': 0, + 'backbone_use_transformer_beyond_stride': 0, + 'block_group_config': { + 'drop_path_keep_prob': 1.0, + }, + } + + +class ResNet50Beta(ResNet50): + """A ResNet-50 but with inception stem. + + Note that the implementation is different from the original ResNet-50 in: + (1) We apply strided convolutions in the first 3x3 convolution of the first + residual block of a stage. + (2) We replace the strided max pooling layer in the stem by applying strided + convolution in the immediate next residual block. + """ + + @classmethod + def _get_config(cls): + base_config = super(ResNet50Beta, cls)._get_config() + # Override the config of ResNet50. + override_config = { + 'backbone_type': 'resnet_beta', + } + return override(base_config, override_config) + + +class AxialResNetL(ResNet50): + """Axial-ResNet-L for image classification only. + + Axial-ResNet-L is a ResNet50 with use_axial_beyond_stride = 2. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + base_config = super(AxialResNetL, cls)._get_config() + # Override the config of ResNet50. + override_config = { + 'use_axial_beyond_stride': 2, + } + return override(base_config, override_config) + + +class AxialResNetS(ResNet50): + """Axial-ResNet-S for image classification only. 
+ + Axial-ResNet-S is a ResNet50 with use_axial_beyond_stride = 2 and + width_multiplier = 0.5. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + base_config = super(AxialResNetS, cls)._get_config() + # Override the config of ResNet50. + override_config = { + 'width_multiplier': 0.5, + 'use_axial_beyond_stride': 2, + } + return override(base_config, override_config) + + +class AxialDeepLabL(ResNet50Beta): + """Axial-DeepLab-L for panoptic segmentation. + + Axial-DeepLab-L is a ResNet50Beta with use_axial_beyond_stride = 2. + Axial-DeepLab-L is also equivalent to Axial-ResNet-L with an inception stem. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + base_config = super(AxialDeepLabL, cls)._get_config() + override_config = { + 'use_axial_beyond_stride': 2, + } + return override(base_config, override_config) + + +class AxialDeepLabS(ResNet50Beta): + """Axial-DeepLab-S for panoptic segmentation. + + Axial-DeepLab-S is a ResNet50Beta with use_axial_beyond_stride = 2 and + width_multiplier = 0.5. + Axial-DeepLab-S is also equivalent to Axial-ResNet-S with an inception stem. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + base_config = super(AxialDeepLabS, cls)._get_config() + override_config = { + 'width_multiplier': 0.5, + 'use_axial_beyond_stride': 2, + } + return override(base_config, override_config) + + +class SWideRNet(AxialResNetInstance): + """A SWideRNet instance. + + Note that the implementation is different from the original SWideRNet in: + (1) We apply strided convolutions in the first residual block of a stage, + instead of the last residual block. + (2) We replace the strided max pooling layer in the stem by applying strided + convolution in the immediate next residual block. + (3) We (optionally) use squeeze and excitation in all five stages, instead + of the last four stages only. + + Reference: + Scaling Wide Residual Networks for Panoptic Segmentation, + https://arxiv.org/abs/2011.11675 + Liang-Chieh Chen, Huiyu Wang, Siyuan Qiao. + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + return { + 'num_blocks': [3, 6, 3, 3], + 'classification_mode': True, + 'backbone_type': 'wider_resnet', + 'use_axial_beyond_stride': 0, + 'backbone_use_transformer_beyond_stride': 0, + 'block_group_config': { + 'drop_path_beyond_stride': 4, + 'conv_use_recompute_grad': True, + }, + } + + +class AxialSWideRNet(SWideRNet): + """SWideRNet with axial attention blocks in the last two stages. + + Note that the implementation is different from the original SWideRNet in: + (1) We apply strided convolutions in the first residual block of a stage, + instead of the last residual block. 
+ (2) We replace the strided max pooling layer in the stem by applying strided + convolution in the immediate next residual block. + (3) We (optionally) use squeeze and excitation in all five stages, instead + of the last four stages only. + + Reference: + Scaling Wide Residual Networks for Panoptic Segmentation, + https://arxiv.org/abs/2011.11675 + Liang-Chieh Chen, Huiyu Wang, Siyuan Qiao. + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + base_config = super(AxialSWideRNet, cls)._get_config() + override_config = { + 'use_axial_beyond_stride': 16, + 'block_group_config': { + 'attention_bottleneck_expansion': 4, + 'axial_layer_config': { + 'key_expansion': 2, + 'value_expansion': 4, + }, + }, + } + return override(base_config, override_config) + + +def get_model(name, **kwargs): + """Gets the model instance given the model name.""" + name_lower = name.lower() + if name_lower == 'max_deeplab_s': + return MaXDeepLabS(name_lower, **kwargs) + elif name_lower == 'max_deeplab_l': + return MaXDeepLabL(name_lower, **kwargs) + elif name_lower == 'max_deeplab_s_backbone': + return MaXDeepLabSBackbone(name_lower, **kwargs) + elif name_lower == 'max_deeplab_l_backbone': + return MaXDeepLabLBackbone(name_lower, **kwargs) + elif name_lower == 'resnet50': + return ResNet50(name_lower, **kwargs) + elif name_lower == 'resnet50_beta': + return ResNet50Beta(name_lower, **kwargs) + elif name_lower == 'swidernet' or name_lower == 'wide_resnet41': + return SWideRNet(name_lower, **kwargs) + elif name_lower == 'axial_swidernet': + return AxialSWideRNet(name_lower, **kwargs) + elif name_lower == 'axial_resnet_s': + return AxialResNetS(name_lower, **kwargs) + elif name_lower == 'axial_resnet_l': + return AxialResNetL(name_lower, **kwargs) + elif name_lower == 'axial_deeplab_s': + return AxialDeepLabS(name_lower, **kwargs) + elif name_lower == 'axial_deeplab_l': + return AxialDeepLabL(name_lower, **kwargs) + else: + raise ValueError(name_lower + ' is not supported.') diff --git a/model/encoder/axial_resnet_instances_test.py b/model/encoder/axial_resnet_instances_test.py new file mode 100644 index 0000000000000000000000000000000000000000..0a13f4a8eb02873b4088990faba87160ac1ed2c0 --- /dev/null +++ b/model/encoder/axial_resnet_instances_test.py @@ -0,0 +1,234 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for axial_resnet_instances.""" + +import os + +from absl import flags +from absl.testing import parameterized +import numpy as np +import tensorflow as tf + +from deeplab2.model import test_utils +from deeplab2.model.encoder import axial_resnet_instances + +FLAGS = flags.FLAGS + + +class AxialResnetInstancesTest(tf.test.TestCase, parameterized.TestCase): + + # The parameter count does not include the classification head. 
+ @parameterized.parameters( + ('resnet50', 1, 23508032), + ('resnet50_beta', 1, 23631808), # 123776 more than resnet50 + ('max_deeplab_s_backbone', 1, 41343424), + ('max_deeplab_l_backbone', 1, 175115392), + ('axial_resnet_s', 1, 11466912), + ('axial_resnet_l', 1, 43714048), # 127872 fewer than axial_deeplab_l + ('axial_deeplab_s', 1, 11565856), + ('axial_deeplab_l', 1, 43841920), + ('swidernet', 1, 109014080), # SWideRNet-(1,1,1) without SE or SAC + ('swidernet', 3, 333245504), # Should be more than 3 x 109014080 + ('swidernet', 4.5, 487453760), # Rounded down to [13, 27, 13, 13] + ('axial_swidernet', 1, 136399392), + ('axial_swidernet', 3, 393935520), + ('axial_swidernet', 4.5, 570346912), + ) + def test_model_output_shape_and_num_params( + self, model_name, backbone_layer_multiplier, expected_num_params): + model = axial_resnet_instances.get_model( + model_name, + backbone_layer_multiplier=backbone_layer_multiplier, + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0001) + output = model(tf.keras.Input(shape=(224, 224, 3))) + if model_name in ('axial_resnet_s', 'axial_deeplab_s'): + self.assertListEqual(output['res5'].get_shape().as_list(), + [None, 14, 14, 1024]) + else: + self.assertListEqual(output['res5'].get_shape().as_list(), + [None, 14, 14, 2048]) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in model.trainable_weights]) + self.assertEqual(num_params, expected_num_params) + + def test_resnet50_variable_checkpoint_names(self): + model = axial_resnet_instances.get_model( + 'resnet50', + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0001) + model(tf.keras.Input(shape=(224, 224, 3))) + variable_names = [w.name for w in model.trainable_weights] + test_variable_name = 'resnet50/stage4/block6/conv3_bn/batch_norm/beta:0' + self.assertIn(test_variable_name, variable_names) + temp_dir = self.create_tempdir() + temp_path = os.path.join(temp_dir, 'ckpt') + checkpoint = tf.train.Checkpoint(encoder=model) + checkpoint.save(temp_path) + latest_checkpoint = tf.train.latest_checkpoint(temp_dir) + reader = tf.train.load_checkpoint(latest_checkpoint) + checkpoint_names = reader.get_variable_to_shape_map().keys() + test_checkpoint_name = 'encoder/_stage4/_block6/_conv3_bn/_batch_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE' + self.assertIn(test_checkpoint_name, checkpoint_names) + + def test_max_deeplab_s_output_shape_and_num_params(self): + model = axial_resnet_instances.get_model( + 'max_deeplab_s', + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0001) + endpoints = model(tf.keras.Input(shape=(65, 65, 3))) + self.assertListEqual(endpoints['backbone_output'].get_shape().as_list(), + [None, 5, 5, 2048]) + self.assertListEqual( + endpoints['transformer_class_feature'].get_shape().as_list(), + [None, 128, 256]) + self.assertListEqual( + endpoints['transformer_mask_feature'].get_shape().as_list(), + [None, 128, 256]) + self.assertListEqual(endpoints['feature_panoptic'].get_shape().as_list(), + [None, 17, 17, 256]) + self.assertListEqual(endpoints['feature_semantic'].get_shape().as_list(), + [None, 5, 5, 2048]) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in model.trainable_weights]) + self.assertEqual(num_params, 61726624) + + def test_max_deeplab_l_output_shape_and_num_params(self): + model = axial_resnet_instances.get_model( + 'max_deeplab_l', + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0001) + endpoints = model(tf.keras.Input(shape=(65, 65, 3))) + 
self.assertListEqual(endpoints['backbone_output'].get_shape().as_list(),
+                         [None, 5, 5, 2048])
+    self.assertListEqual(
+        endpoints['transformer_class_feature'].get_shape().as_list(),
+        [None, 128, 512])
+    self.assertListEqual(
+        endpoints['transformer_mask_feature'].get_shape().as_list(),
+        [None, 128, 512])
+    self.assertListEqual(endpoints['feature_panoptic'].get_shape().as_list(),
+                         [None, 17, 17, 256])
+    self.assertListEqual(endpoints['feature_semantic'].get_shape().as_list(),
+                         [None, 17, 17, 256])
+    num_params = np.sum(
+        [np.prod(v.get_shape().as_list()) for v in model.trainable_weights])
+    self.assertEqual(num_params, 450523232)
+
+  def test_global_attention_absolute_positional_encoding_names(self):
+    model = axial_resnet_instances.get_model(
+        'max_deeplab_s_backbone',
+        block_group_config={'use_global_beyond_stride': 16,
+                            'positional_encoding_type': '1D',
+                            'axial_layer_config': {
+                                'use_query_rpe_similarity': False,
+                                'use_key_rpe_similarity': False,
+                                'retrieve_value_rpe': False}},
+        bn_layer=tf.keras.layers.BatchNormalization,
+        conv_kernel_weight_decay=0.0001)
+    model(tf.keras.Input(shape=(224, 224, 3)))
+    variable_names = [w.name for w in model.trainable_weights]
+    test_variable_name1 = 'max_deeplab_s_backbone/stage4/add_absolute_positional_encoding/height_axis_embeddings:0'
+    test_variable_name2 = 'max_deeplab_s_backbone/stage4/block2/attention/global/qkv_kernel:0'
+    self.assertIn(test_variable_name1, variable_names)
+    self.assertIn(test_variable_name2, variable_names)
+
+  @parameterized.product(
+      (dict(model_name='resnet50', backbone_layer_multiplier=1),
+       dict(model_name='resnet50_beta', backbone_layer_multiplier=1),
+       dict(model_name='wide_resnet41', backbone_layer_multiplier=1),
+       dict(model_name='swidernet', backbone_layer_multiplier=2)),
+      output_stride=[4, 8, 16, 32])
+  def test_model_atrous_consistency_with_output_stride_four(
+      self, model_name, backbone_layer_multiplier, output_stride):
+    tf.random.set_seed(0)
+
+    # Create the input.
+    pixel_inputs = test_utils.create_test_input(1, 225, 225, 3)
+
+    # Create the model.
+    model_1 = axial_resnet_instances.get_model(
+        model_name,
+        backbone_layer_multiplier=backbone_layer_multiplier,
+        bn_layer=tf.keras.layers.BatchNormalization,
+        conv_kernel_weight_decay=0.0001,
+        output_stride=4)
+
+    # Create the weights.
+    model_1(pixel_inputs, training=False)
+
+    # Set the batch norm gamma as non-zero so that the 3x3 convolution affects
+    # the output.
+    for weight in model_1.trainable_weights:
+      if '/gamma:0' in weight.name:
+        weight.assign(tf.ones_like(weight))
+
+    # Dense feature extraction followed by subsampling.
+    pixel_outputs = model_1(pixel_inputs, training=False)['res5']
+    downsampling_stride = output_stride // 4
+    expected = pixel_outputs[:, ::downsampling_stride, ::downsampling_stride, :]
+
+    # Feature extraction at the nominal network rate.
+    model_2 = axial_resnet_instances.get_model(
+        model_name,
+        backbone_layer_multiplier=backbone_layer_multiplier,
+        bn_layer=tf.keras.layers.BatchNormalization,
+        conv_kernel_weight_decay=0.0001,
+        output_stride=output_stride)
+    # Create the weights.
+    model_2(pixel_inputs, training=False)
+    # Make the two networks use the same weights.
+    model_2.set_weights(model_1.get_weights())
+    output = model_2(pixel_inputs, training=False)['res5']
+
+    # Normalize the outputs. Since we set batch_norm gamma to 1, the output
+    # activations can explode to a large standard deviation, which sometimes
+    # causes numerical errors beyond the tolerances.
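+    # (Dividing both tensors by the same scalar leaves their relative
+    # difference unchanged while keeping magnitudes near one, so the
+    # atol/rtol comparison below remains meaningful.)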
+ normalizing_factor = tf.math.reduce_std(expected) + # Compare normalized outputs. + self.assertAllClose(output / normalizing_factor, + expected / normalizing_factor, + atol=1e-4, rtol=1e-4) + + @parameterized.parameters( + ('resnet50',), + ('resnet50_beta',), + ('max_deeplab_s_backbone',), + ('max_deeplab_l_backbone',), + ('axial_resnet_s',), + ('axial_resnet_l',), + ('axial_deeplab_s',), + ('axial_deeplab_l',), + ('swidernet',), + ('axial_swidernet',), + ) + def test_model_export(self, model_name): + model = axial_resnet_instances.get_model( + model_name, + output_stride=16, + backbone_layer_multiplier=1.0, + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0001, + # Disable drop path as it is not compatible with model exporting. + block_group_config={'drop_path_keep_prob': 1.0}) + model(tf.keras.Input([257, 257, 3], batch_size=1), training=False) + export_dir = os.path.join( + FLAGS.test_tmpdir, 'test_model_export', model_name) + model.save(export_dir) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/encoder/axial_resnet_test.py b/model/encoder/axial_resnet_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c50b66261951164560725bd530288cededfdb8cd --- /dev/null +++ b/model/encoder/axial_resnet_test.py @@ -0,0 +1,46 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for axial_resnet.""" + +import numpy as np +import tensorflow as tf + +from deeplab2.model.encoder import axial_resnet + + +class AxialResNetTest(tf.test.TestCase): + + def test_axial_resnet_correct_output_shape(self): + model = axial_resnet.AxialResNet('max_deeplab_s') + endpoints = model(tf.zeros([2, 65, 65, 3]), training=False) + self.assertListEqual(endpoints['backbone_output'].get_shape().as_list(), + [2, 5, 5, 2048]) + self.assertListEqual( + endpoints['transformer_class_feature'].get_shape().as_list(), + [2, 128, 256]) + self.assertListEqual( + endpoints['transformer_mask_feature'].get_shape().as_list(), + [2, 128, 256]) + self.assertListEqual(endpoints['feature_panoptic'].get_shape().as_list(), + [2, 17, 17, 256]) + self.assertListEqual(endpoints['feature_semantic'].get_shape().as_list(), + [2, 5, 5, 2048]) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in model.trainable_weights]) + self.assertEqual(num_params, 61726624) + +if __name__ == '__main__': + tf.test.main() diff --git a/model/encoder/mobilenet.py b/model/encoder/mobilenet.py new file mode 100644 index 0000000000000000000000000000000000000000..5bb1a8d1a3a1ac0c4a59b53f3c663d62cc95a689 --- /dev/null +++ b/model/encoder/mobilenet.py @@ -0,0 +1,410 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MobileNetV3 models for Deep Labeling. + +Reference: + Howard, A., Sandler, M., et al. Searching for mobilenetv3. In ICCV, 2019 +""" +from typing import Any, Callable, Mapping, Optional, Sequence + +import tensorflow as tf + +from deeplab2.model import utils +from deeplab2.model.layers import blocks +from deeplab2.model.layers import convolutions + +# The default input image channels. +_INPUT_CHANNELS = 3 + + +MNV3Small_BLOCK_SPECS = { + 'spec_name': 'MobileNetV3Small', + 'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters', + 'activation', 'se_ratio', 'expand_ratio', + 'is_endpoint'], + 'block_specs': [ + ('conv_bn', 3, 2, 16, + 'hard_swish', None, None, True), + ('inverted_bottleneck', 3, 2, 16, + 'relu', 0.25, 1, True), + ('inverted_bottleneck', 3, 2, 24, + 'relu', None, 72. / 16, False), + ('inverted_bottleneck', 3, 1, 24, + 'relu', None, 88. / 24, True), + ('inverted_bottleneck', 5, 2, 40, + 'hard_swish', 0.25, 4., False), + ('inverted_bottleneck', 5, 1, 40, + 'hard_swish', 0.25, 6., False), + ('inverted_bottleneck', 5, 1, 40, + 'hard_swish', 0.25, 6., False), + ('inverted_bottleneck', 5, 1, 48, + 'hard_swish', 0.25, 3., False), + ('inverted_bottleneck', 5, 1, 48, + 'hard_swish', 0.25, 3., True), + ('inverted_bottleneck', 5, 2, 96, + 'hard_swish', 0.25, 6., False), + ('inverted_bottleneck', 5, 1, 96, + 'hard_swish', 0.25, 6., False), + ('inverted_bottleneck', 5, 1, 96, + 'hard_swish', 0.25, 6., False), + ('conv_bn', 1, 1, 576, + 'hard_swish', None, None, True), + ] +} + + +MNV3Large_BLOCK_SPECS = { + 'spec_name': 'MobileNetV3Large', + 'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters', + 'activation', 'se_ratio', 'expand_ratio', + 'is_endpoint'], + 'block_specs': [ + ('conv_bn', 3, 2, 16, + 'hard_swish', None, None, False), + ('inverted_bottleneck', 3, 1, 16, + 'relu', None, 1., True), + ('inverted_bottleneck', 3, 2, 24, + 'relu', None, 4., False), + ('inverted_bottleneck', 3, 1, 24, + 'relu', None, 3., True), + ('inverted_bottleneck', 5, 2, 40, + 'relu', 0.25, 3., False), + ('inverted_bottleneck', 5, 1, 40, + 'relu', 0.25, 3., False), + ('inverted_bottleneck', 5, 1, 40, + 'relu', 0.25, 3., True), + ('inverted_bottleneck', 3, 2, 80, + 'hard_swish', None, 6., False), + ('inverted_bottleneck', 3, 1, 80, + 'hard_swish', None, 2.5, False), + ('inverted_bottleneck', 3, 1, 80, + 'hard_swish', None, 2.3, False), + ('inverted_bottleneck', 3, 1, 80, + 'hard_swish', None, 2.3, False), + ('inverted_bottleneck', 3, 1, 112, + 'hard_swish', 0.25, 6., False), + ('inverted_bottleneck', 3, 1, 112, + 'hard_swish', 0.25, 6., True), + ('inverted_bottleneck', 5, 2, 160, + 'hard_swish', 0.25, 6., False), + ('inverted_bottleneck', 5, 1, 160, + 'hard_swish', 0.25, 6., False), + ('inverted_bottleneck', 5, 1, 160, + 'hard_swish', 0.25, 6., False), + ('conv_bn', 1, 1, 960, + 'hard_swish', None, None, True), + ] +} + + +SUPPORTED_SPECS_MAP = { + 'MobileNetV3Large': MNV3Large_BLOCK_SPECS, + 'MobileNetV3Small': MNV3Small_BLOCK_SPECS, +} + + +# pylint: disable=invalid-name +def _block_spec_decoder(specs: Mapping[Any, Any], + width_multiplier: float, + divisible_by: 
int = 8) -> Sequence[Mapping[str, Any]]:
+  """Decodes specs for a block.
+
+  Args:
+    specs: A `dict` specification of block specs of a mobilenet version.
+    width_multiplier: A `float` multiplier for the filter size for all
+      convolution ops. The value must be greater than zero. Typical usage will
+      be to set this value in (0, 1) to reduce the number of parameters or
+      computation cost of the model.
+    divisible_by: An `int` that ensures all inner dimensions are divisible by
+      this number.
+
+  Returns:
+    A list of block specs, each a dictionary that defines the structure of a
+    layer.
+  """
+
+  spec_name = specs['spec_name']
+  block_spec_schema = specs['block_spec_schema']
+  block_specs = specs['block_specs']
+
+  if not block_specs:
+    raise ValueError(
+        'The block spec cannot be empty for {}!'.format(spec_name))
+
+  if len(block_specs[0]) != len(block_spec_schema):
+    raise ValueError('The block spec values {} do not match with '
+                     'the schema {}'.format(block_specs[0], block_spec_schema))
+
+  decoded_specs = []
+
+  for spec in block_specs:
+    spec_dict = dict(zip(block_spec_schema, spec))
+    decoded_specs.append(spec_dict)
+
+  for ds in decoded_specs:
+    ds['filters'] = utils.make_divisible(
+        value=ds['filters'] * width_multiplier,
+        divisor=divisible_by,
+        min_value=8)
+
+  return decoded_specs
+# pylint: enable=invalid-name
+
+
+class MobileNet(tf.keras.Model):
+  """Creates a MobileNetV3 family model."""
+
+  def __init__(
+      self,
+      model_id: str = 'MobileNetV3Small',
+      width_multiplier: float = 1.0,
+      output_stride: Optional[int] = None,
+      min_width: int = 8,
+      divisible_by: int = 8,
+      regularize_depthwise: bool = False,
+      bn_layer: Callable[..., Any] = tf.keras.layers.BatchNormalization,
+      conv_kernel_weight_decay: float = 0.0,
+      name: str = 'MobileNetV3'):
+    """Initializes a MobileNet V3 model.
+
+    Args:
+      model_id: A `str` of MobileNet version. The supported values are
+        `MobileNetV3Large`, `MobileNetV3Small`.
+      width_multiplier: A `float` of multiplier for the filters (number of
+        channels) for all convolution ops. The value must be greater than zero.
+        Typical usage will be to set this value in (0, 1) to reduce the number
+        of parameters or computation cost of the model.
+      output_stride: An `int` that specifies the requested ratio of input to
+        output spatial resolution. If not None, then we invoke atrous
+        convolution if necessary to prevent the network from reducing the
+        spatial resolution of activation maps. The output_stride should be
+        divisible by 4.
+      min_width: An `int` of minimum width (number of channels) for all
+        convolution ops. Enforced when width_multiplier < 1, and not an active
+        constraint when width_multiplier >= 1.
+      divisible_by: An `int` that ensures all intermediate feature dimensions
+        are divisible by this number.
+      regularize_depthwise: If True, apply regularization on depthwise conv.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+      name: Model name.
+
+    Raises:
+      ValueError: The MobileNet version is not supported.
+      ValueError: width_multiplier is not greater than zero.
+      ValueError: Output stride must be None or a multiple of 4.
+      ValueError: Unknown block type i for layer j.
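+
+    Example (an illustrative sketch, not an excerpt from the test suite):
+
+      backbone = MobileNet(model_id='MobileNetV3Small', output_stride=16)
+      endpoints = backbone(tf.ones([1, 224, 224, 3]), training=False)
+      # endpoints maps 'res1'...'res5' to intermediate feature tensors.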
+ """ + if model_id not in SUPPORTED_SPECS_MAP: + raise ValueError('The MobileNet version {} ' + 'is not supported'.format(model_id)) + + if width_multiplier <= 0: + raise ValueError('width_multiplier is not greater than zero.') + + if (output_stride is not None and + (output_stride <= 1 or (output_stride > 1 and output_stride % 4))): + raise ValueError('Output stride must be None or a multiple of 4.') + + super().__init__(name=name) + + self._model_id = model_id + self._width_multiplier = width_multiplier + self._min_width = min_width + self._output_stride = output_stride + self._divisible_by = divisible_by + self._regularize_depthwise = regularize_depthwise + self._bn_layer = bn_layer + self._conv_kernel_weight_decay = conv_kernel_weight_decay + self._blocks = [] + self._endpoint_names = [] + + block_specs = SUPPORTED_SPECS_MAP.get(model_id) + self._decoded_specs = _block_spec_decoder( + specs=block_specs, + width_multiplier=self._width_multiplier, + divisible_by=self._divisible_by) + + self._mobilenet_base() + + def _mobilenet_base(self): + """Builds the base MobileNet architecture.""" + + # The current_stride variable keeps track of the output stride of the + # activations, i.e., the running product of convolution strides up to the + # current network layer. This allows us to invoke atrous convolution + # whenever applying the next convolution would result in the activations + # having output stride larger than the target output_stride. + current_stride = 1 + + # The atrous convolution rate parameter. + rate = 1 + + endpoint_level = 1 + in_filters = _INPUT_CHANNELS + for i, block_def in enumerate(self._decoded_specs): + # We only need to build up to 'res5' endpoint for segmentation task. + if endpoint_level > 5 and not self._classification_mode: + break + + block_name = '{}_{}'.format(block_def['block_fn'], i + 1) + + if (self._output_stride is not None and + current_stride == self._output_stride): + # If we have reached the target output_stride, then we need to employ + # atrous convolution with stride=1 and multiply the atrous rate by the + # current unit's stride for use in subsequent layers. + layer_stride = 1 + layer_rate = rate + rate = ( + rate * block_def['strides'] + if block_def['strides'] is not None else rate) + else: + layer_stride = block_def['strides'] + layer_rate = 1 + current_stride = ( + current_stride * block_def['strides'] + if block_def['strides'] is not None else current_stride) + + if block_def['block_fn'] == 'conv_bn': + + self._blocks.append( + convolutions.Conv2DSame( + output_channels=block_def['filters'], + kernel_size=block_def['kernel_size'], + strides=layer_stride, + atrous_rate=layer_rate, + activation=block_def['activation'], + use_bias=False, + bn_layer=self._bn_layer, + use_bn=True, + conv_kernel_weight_decay=self._conv_kernel_weight_decay, + name=block_name, + )) + + elif block_def['block_fn'] == 'inverted_bottleneck': + atrous_rate = 1 + # There is no need to apply atrous convolution to any 1x1 convolution. 
+        if layer_rate > 1 and block_def['kernel_size'] != 1:
+          atrous_rate = layer_rate
+        self._blocks.append(
+            blocks.InvertedBottleneckBlock(
+                in_filters=in_filters,
+                out_filters=block_def['filters'],
+                expand_ratio=block_def['expand_ratio'],
+                strides=layer_stride,
+                kernel_size=block_def['kernel_size'],
+                se_ratio=block_def['se_ratio'],
+                activation=block_def['activation'],
+                expand_se_in_filters=True,
+                depthwise_activation=None,
+                atrous_rate=atrous_rate,
+                divisible_by=self._divisible_by,
+                regularize_depthwise=self._regularize_depthwise,
+                use_depthwise=True,
+                # Note that whether the residual connection is actually used
+                # is also conditional on the in_filters and out_filters sizes,
+                # even if use_residual=True: e.g., when in_filters !=
+                # out_filters, no residual connection will be created.
+                use_residual=(block_def['strides'] == 1),
+                bn_layer=self._bn_layer,
+                conv_kernel_weight_decay=self._conv_kernel_weight_decay,
+                name=block_name,
+            ))
+
+      else:
+        raise ValueError('Unknown block type {} for layer {}'.format(
+            block_def['block_fn'], i))
+
+      # Register in_filters for the next block.
+      in_filters = block_def['filters']
+
+      if block_def['is_endpoint']:
+        # Name the endpoint to be 'res{1...5}' to align with ResNet. This
+        # simplifies segmentation head implementation.
+        self._endpoint_names.append('res' + str(endpoint_level))
+        endpoint_level += 1
+      else:
+        self._endpoint_names.append(None)
+
+  def call(self, input_tensor: tf.Tensor, training: bool = False):
+    """Performs a forward pass through MobileNet."""
+    net = input_tensor
+    endpoints = {}
+    for block, endpoint_name in zip(self._blocks, self._endpoint_names):
+      net = block(net, training=training)
+      if endpoint_name is not None:
+        endpoints[endpoint_name] = net
+    return endpoints
+
+
+def MobileNetV3Small(
+    width_multiplier: float = 1.0,
+    output_stride: int = 32,
+    bn_layer: Callable[..., Any] = tf.keras.layers.BatchNormalization,
+    conv_kernel_weight_decay: float = 0.0,
+    name: str = 'MobileNetV3Small') -> tf.keras.Model:
+  """Creates a MobileNetV3Small model.
+
+  Args:
+    width_multiplier: A float, the channel (width) multiplier for the whole
+      model.
+    output_stride: An optional integer specifying the output stride of the
+      network.
+    bn_layer: An optional tf.keras.layers.Layer that computes the
+      normalization (default: tf.keras.layers.BatchNormalization).
+    conv_kernel_weight_decay: A float, the weight decay for convolution
+      kernels.
+    name: Model name.
+
+  Returns:
+    The MobileNetV3Small model as an instance of tf.keras.Model.
+  """
+  model = MobileNet(model_id='MobileNetV3Small',
+                    width_multiplier=width_multiplier,
+                    output_stride=output_stride,
+                    bn_layer=bn_layer,
+                    conv_kernel_weight_decay=conv_kernel_weight_decay,
+                    name=name)
+  return model
+
+
+def MobileNetV3Large(
+    width_multiplier: float = 1.0,
+    output_stride: int = 32,
+    bn_layer: Callable[..., Any] = tf.keras.layers.BatchNormalization,
+    conv_kernel_weight_decay: float = 0.0,
+    name: str = 'MobileNetV3Large') -> tf.keras.Model:
+  """Creates a MobileNetV3Large model.
+
+  Args:
+    width_multiplier: A float, the channel (width) multiplier for the whole
+      model.
+    output_stride: An optional integer specifying the output stride of the
+      network.
+    bn_layer: An optional tf.keras.layers.Layer that computes the
+      normalization (default: tf.keras.layers.BatchNormalization).
+    conv_kernel_weight_decay: A float, the weight decay for convolution
+      kernels.
+    name: Model name.
+
+  Returns:
+    The MobileNetV3Large model as an instance of tf.keras.Model.
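+
+  Example (illustrative; the shape follows from a 224x224 input at the
+  default output_stride of 32):
+
+    model = MobileNetV3Large(width_multiplier=1.0)
+    endpoints = model(tf.ones([1, 224, 224, 3]), training=False)
+    # endpoints['res5'] has shape [1, 7, 7, 960].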
+ """ + model = MobileNet(model_id='MobileNetV3Large', + width_multiplier=width_multiplier, + output_stride=output_stride, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay, + name=name) + return model diff --git a/model/encoder/mobilenet_test.py b/model/encoder/mobilenet_test.py new file mode 100644 index 0000000000000000000000000000000000000000..fced57f8d09f808a4fb2bc16e9c56e7ceade1846 --- /dev/null +++ b/model/encoder/mobilenet_test.py @@ -0,0 +1,129 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for mobilenet.""" + +from absl.testing import parameterized + +import tensorflow as tf + +from deeplab2.model import test_utils +from deeplab2.model.encoder import mobilenet + + +class MobilenetTest(tf.test.TestCase, parameterized.TestCase): + + @parameterized.parameters('MobileNetV3Small', 'MobileNetV3Large') + def test_mobilenetv3_construct_graph(self, model_name): + tf.keras.backend.set_image_data_format('channels_last') + input_size = 128 + + mobilenet_models = { + 'MobileNetV3Small': mobilenet.MobileNetV3Small, + 'MobileNetV3Large': mobilenet.MobileNetV3Large, + } + mobilenet_channels = { + # The number of filters of layers having outputs been collected + # for filter_size_scale = 1.0 + 'MobileNetV3Small': [16, 24, 48, 576], + 'MobileNetV3Large': [24, 40, 112, 960], + } + network = mobilenet_models[str(model_name)](width_multiplier=1.0) + + inputs = tf.ones([1, input_size, input_size, 3]) + endpoints = network(inputs) + + for idx, num_filter in enumerate(mobilenet_channels[model_name]): + self.assertAllEqual( + [1, input_size / 2 ** (idx+2), input_size / 2 ** (idx+2), num_filter], + endpoints['res'+str(idx+2)].shape.as_list()) + + @parameterized.product( + model_name=['MobileNetV3Small', 'MobileNetV3Large'], + output_stride=[4, 8, 16, 32]) + def test_mobilenetv3_atrous_endpoint_shape(self, model_name, output_stride): + tf.keras.backend.set_image_data_format('channels_last') + input_size = 321 + batch_size = 2 + + mobilenet_models = { + 'MobileNetV3Small': mobilenet.MobileNetV3Small, + 'MobileNetV3Large': mobilenet.MobileNetV3Large, + } + stride_spatial_shapes_map = { + 4: [81, 81, 81, 81], + 8: [81, 41, 41, 41], + 16: [81, 41, 21, 21], + 32: [81, 41, 21, 11], + } + mobilenet_channels = { + # The number of filters of layers having outputs been collected + # for filter_size_scale = 1.0 + 'MobileNetV3Small': [16, 24, 48, 576], + 'MobileNetV3Large': [24, 40, 112, 960], + } + network = mobilenet_models[str(model_name)]( + width_multiplier=1.0, + output_stride=output_stride) + spatial_shapes = stride_spatial_shapes_map[output_stride] + + inputs = tf.ones([batch_size, input_size, input_size, 3]) + endpoints = network(inputs) + + for idx, num_filters in enumerate(mobilenet_channels[model_name]): + expected_shape = [ + batch_size, spatial_shapes[idx], spatial_shapes[idx], num_filters + ] + self.assertAllEqual( + expected_shape, + endpoints['res'+str(idx+2)].shape.as_list()) + + 
+  @parameterized.parameters('MobileNetV3Small', 'MobileNetV3Large')
+  def test_mobilenet_reload_weights(self, model_name):
+    tf.keras.backend.set_image_data_format('channels_last')
+    mobilenet_models = {
+        'MobileNetV3Small': mobilenet.MobileNetV3Small,
+        'MobileNetV3Large': mobilenet.MobileNetV3Large,
+    }
+
+    tf.random.set_seed(0)
+    pixel_inputs = test_utils.create_test_input(1, 320, 320, 3)
+
+    network1 = mobilenet_models[model_name](
+        width_multiplier=1.0,
+        output_stride=32,
+        name='m1')
+    network1(pixel_inputs, False)
+    outputs1 = network1(pixel_inputs, False)
+    pixel_outputs = outputs1['res5']
+
+    # A second, identically configured network.
+    network2 = mobilenet_models[model_name](
+        width_multiplier=1.0,
+        output_stride=32,
+        name='m2')
+    network2(pixel_inputs, False)
+    # Make the two networks use the same weights.
+    network2.set_weights(network1.get_weights())
+    outputs2 = network2(pixel_inputs, False)
+    expected = outputs2['res5']
+
+    self.assertAllClose(network1.get_weights(), network2.get_weights(),
+                        atol=1e-4, rtol=1e-4)
+    self.assertAllClose(pixel_outputs, expected, atol=1e-4, rtol=1e-4)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/layers/__init__.py b/model/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03
--- /dev/null
+++ b/model/layers/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/model/layers/activations.py b/model/layers/activations.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b47a4378440dee008f5f176856906b0f6716046
--- /dev/null
+++ b/model/layers/activations.py
@@ -0,0 +1,132 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Defines a set of useful activation functions."""
+import functools
+import tensorflow as tf
+
+
+def gelu(input_tensor, approximate=False):
+  """Gaussian Error Linear Unit.
+
+  Reference:
+    Gaussian Error Linear Units (GELUs), Dan Hendrycks, Kevin Gimpel,
+    arXiv 2016.
+
+  Args:
+    input_tensor: A tensor with an arbitrary shape.
+    approximate: A boolean, whether to enable approximation.
+
+  Returns:
+    The activated input tensor.
+  """
+  return tf.keras.activations.gelu(input_tensor, approximate=approximate)
+
+
+def hard_sigmoid(input_tensor):
+  """Hard sigmoid activation function.
+
+  Args:
+    input_tensor: A tensor with an arbitrary shape.
+ + Returns: + The activated input tensor. + """ + input_tensor = tf.convert_to_tensor(input_tensor) + return tf.nn.relu6(input_tensor + tf.constant(3.)) * 0.16667 + + +def relu6(input_tensor): + """Relu6 activation function. + + Args: + input_tensor: A tensor with an arbitrary shape. + + Returns: + The activated input tensor. + """ + input_tensor = tf.convert_to_tensor(input_tensor) + return tf.nn.relu6(input_tensor) + + +def swish(input_tensor): + """Swish or SiLU activation function. + + Args: + input_tensor: A tensor with an arbitrary shape. + + Returns: + The activated input tensor. + """ + input_tensor = tf.convert_to_tensor(input_tensor) + return tf.nn.silu(input_tensor) + + +def hard_swish(input_tensor): + """Hard Swish function. + + Args: + input_tensor: A tensor with an arbitrary shape. + + Returns: + The activated input tensor. + """ + input_tensor = tf.convert_to_tensor(input_tensor) + return input_tensor * tf.nn.relu6( + input_tensor + tf.constant(3.)) * (1. / 6.) + + +def identity(input_tensor): + """Identity function. + + Useful for helping in quantization. + + Args: + input_tensor: A tensor with an arbitrary shape. + + Returns: + The activated input tensor. + """ + input_tensor = tf.convert_to_tensor(input_tensor) + return tf.identity(input_tensor) + + +def get_activation(identifier): + """Gets activation function via input identifier. + + This function returns the specified customized activation function, if there + is any. Otherwise, tf.keras.activations.get is called. + + Args: + identifier: A string, name of the activation function. + + Returns: + The specified activation function. + """ + if isinstance(identifier, str): + name_to_fn = { + 'gelu': functools.partial(gelu, approximate=False), + 'approximated_gelu': functools.partial(gelu, approximate=True), + 'silu': swish, + 'swish': swish, + 'hard_swish': hard_swish, + 'relu6': relu6, + 'hard_sigmoid': hard_sigmoid, + 'identity': identity, + 'none': identity, + } + identifier = str(identifier).lower() + if identifier in name_to_fn: + return name_to_fn[identifier] + return tf.keras.activations.get(identifier) diff --git a/model/layers/activations_test.py b/model/layers/activations_test.py new file mode 100644 index 0000000000000000000000000000000000000000..d0c867c9fcad5218e28f3fb4f082274d1c48c173 --- /dev/null +++ b/model/layers/activations_test.py @@ -0,0 +1,36 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for activations.py.""" +import tensorflow as tf + +from deeplab2.model.layers import activations + + +class ActivationsTest(tf.test.TestCase): + + def test_gelu(self): + expected_data = [[0.14967535, 0., -0.10032465], + [-0.15880796, -0.04540223, 2.9963627]] + gelu_data = activations.gelu([[.25, 0, -.25], [-1, -2, 3]], + approximate=True) + self.assertAllClose(expected_data, gelu_data) + gelu_data_via_get_activation = activations.get_activation( + 'approximated_gelu')([[.25, 0, -.25], [-1, -2, 3]]) + self.assertAllClose(expected_data, gelu_data_via_get_activation) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/axial_block_groups.py b/model/layers/axial_block_groups.py new file mode 100644 index 0000000000000000000000000000000000000000..594b26381fc99960f6dd5c656b0b63a71a4be6bb --- /dev/null +++ b/model/layers/axial_block_groups.py @@ -0,0 +1,443 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implements convolutional and attentional residual block groups.""" + +import math +import tensorflow as tf + +from deeplab2.model import utils +from deeplab2.model.layers import activations +from deeplab2.model.layers import axial_blocks +from deeplab2.model.layers import drop_path +from deeplab2.model.layers import dual_path_transformer +from deeplab2.model.layers import positional_encodings +from deeplab2.model.layers import recompute_grad as recompute_grad_lib + +# We will apply 10x larger learning rates on transformer layers. This global +# variable name will be accessed when we build the optimizers. This keyword is +# reserved and should not be a part of the variable names in a classification +# pretrained backbone. +TRANSFORMER = 'transformer' + + +def _get_current_names(index): + current_name = '_block{}'.format(index + 1) + transformer_current_name = '_block{}_{}'.format(index + 1, TRANSFORMER) + return current_name, transformer_current_name + + +class BlockGroup(tf.keras.layers.Layer): + """Applies a group of residual blocks with dual path transformer layers [1]. + + An optional dual-path transformer layer is inserted after each residual block. + The transformer layer performs memory2pixel attention, pixel2memory attention, + and memory2memory self-attention, while the standard residual block applies + the pixel2pixel axial-attention, global-attention, or spatial convolution. + + Reference: + [1] MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers, + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. 
+ """ + + def __init__(self, + filters, + num_blocks, + name, + original_resnet_stride, + original_resnet_input_stride, + output_stride=16, + backbone_type='resnet_beta', + positional_encoding_type=None, + use_global_beyond_stride=0, + use_axial_beyond_stride=16, + use_transformer_beyond_stride=32, + use_sac_beyond_stride=0, + use_squeeze_and_excite=False, + conv_use_recompute_grad=False, + axial_use_recompute_grad=True, + recompute_within_stride=0, + transformer_use_recompute_grad=False, + transformer_expansion=1, + drop_path_keep_prob=0.8, + drop_path_beyond_stride=16, + drop_path_schedule='constant', + activation='relu', + attention_bottleneck_expansion=2, + axial_layer_config=None, + dual_path_transformer_layer_config=None, + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0): + """Initializes a BlockGroup layer. + + Args: + filters: An integer, the base number of channels for this block group. + num_blocks: An integer, the number of blocks for this block group. + name: A string, the name of the block group. + original_resnet_stride: An integer, the original resnet stride for this + block, usually 1 or 2. The stride will be applied if + original_resnet_input_stride is smaller than the desired output_stride. + Otherwise, the stride will not be applied, and atrous convolution will + be used after the first block. + original_resnet_input_stride: An integer, the total input stride in the + original resnet. For example, the total input stride for the last stage + of the original resnet is 16, and the total output stride is 32. This + stride differs from the true stride of the feature in that we might use + atrous convolution to change both the input and output stride to, e.g. + 8, but its original resnet input stride remains the same. In this case, + we also use the original resnet input stride to compute the atrous rate. + output_stride: An integer, the desired output_stride for the ResNet. + backbone_type: A string, the type of the backbone. Supports 'resnet', + 'resnet_beta', and 'wider_resnet'. The 'resnet' refers to the original + resnet with a 7x7 convolutional stem. The 'resnet_beta' means a resnet + but with an inception stem. The 'wider_resnet' is a wider variant of + resnet with extensively used 3x3 convolutions. + positional_encoding_type: A string, type of the positional encoding. + Support '2D', '1D', and None. + use_global_beyond_stride: An integer, the stride beyond which we use + global attention. Set to 0 if no global attention is desired. Defaults + to 0, i.e. we do not use global attention. + use_axial_beyond_stride: An integer, the stride beyond which we use axial + attention. Note that use_global_beyond_stride has a higher priority, + i.e. we use global attention if the stride is also beyond + use_global_beyond_stride. Set to 0 if no axial attention is desired. + Defaults to 16 as in MaX-DeepLab. + use_transformer_beyond_stride: An integer, the stride beyond which we use + a transformer layer. Set to 0 if no transformer is desired. Defaults to + 32 as in MaX-DeepLab-S. + use_sac_beyond_stride: An integer. Use the Switchable Atrous Convolution + (SAC) beyond the specified stride. For example, if + `use_sac_beyond_stride` = 16, SAC will be applied to the network stage + whose output stride >= 16 (i.e., 16 and 32). Set to 0 or -1 to disable + it. Defaults to 0 as SAC is not used in MaX-DeepLab. + use_squeeze_and_excite: A boolean, whether squeeze-and-excite (SE) is + used. Defaults to False as SE is not used in MaX-DeepLab. 
+ conv_use_recompute_grad: A boolean, whether to use the gradient + checkpointing trick for convolutional blocks. This trick reduces + accelerator memory usage, but takes longer to compute gradients. + Defaults to False since convolutional layers are memory efficient. + axial_use_recompute_grad: A boolean, whether to use the gradient + checkpointing trick for axial blocks. This trick reduces accelerator + memory usage, but takes longer to compute gradients. Defaults to True + since it saves memory for axial blocks. + recompute_within_stride: An integer, the stride within which we use the + gradient checkpointing trick. This trick reduces accelerator memory + usage, but takes longer to compute gradients. Defaults to 0 (do not + recompute any layer). + transformer_use_recompute_grad: A boolean, whether to use the gradient + checkpointing trick for dual-path transformer blocks. This trick reduces + accelerator memory usage, but takes longer to compute gradients. + Defaults to False. + transformer_expansion: An integer, the expansion ratio for the transformer + bottleneck. + drop_path_keep_prob: A float, the keep probability for dropping path. + Defaults to 0.8 as in MaX-DeepLab-S. + drop_path_beyond_stride: An integer, the stride beyond which we apply drop + path augmentation. Defaults to 16 as in MaX-DeepLab-S. + drop_path_schedule: A string, the drop path schedule. Currently, we + support 'constant': use the same drop path keep probability for all + stages, and 'linear': linearly decrease the drop path keep probability + from 1.0 at 0-th stage (or STEM) to `drop_path_keep_prob` at last stage. + activation: A string, type of activation function to apply. Support + 'relu', 'swish' (or 'silu'), 'gelu', 'approximated_gelu', and 'elu'. + attention_bottleneck_expansion: An integer, the expansion ratio for + axial attention blocks. + axial_layer_config: A dict, an argument dictionary for the axial layer. + dual_path_transformer_layer_config: A dict, an argument dictionary for the + transformer. + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). + conv_kernel_weight_decay: A float, the weight decay for convolution + kernels. + + Raises: + ValueError: If backbone_type is not one of 'resnet', 'resnet_beta', or + 'wider_resnet'. + ValueError: original_resnet_input_stride is not power of 2. + ValueError: output_stride is not power of 2. + """ + if original_resnet_input_stride & (original_resnet_input_stride - 1): + raise ValueError('original_resnet_input_stride is not power of 2.') + if output_stride & (output_stride - 1): + raise ValueError('output_stride is not power of 2.') + + super(BlockGroup, self).__init__(name=name) + self._add_absolute_positional_encoding = None + self._activation_fn = activations.get_activation(activation) + self._num_blocks = num_blocks + self._drop_path_keep_prob = [] + self._recompute_grad = [] + self._transformer_use_recompute_grad = transformer_use_recompute_grad + if dual_path_transformer_layer_config is None: + dual_path_transformer_layer_config = {} + original_resnet_current_stride = original_resnet_input_stride + + use_sac = (original_resnet_input_stride * original_resnet_stride >= + use_sac_beyond_stride > 0) + + recompute_grad = (original_resnet_input_stride * original_resnet_stride <= + recompute_within_stride) + + for index in range(num_blocks): + current_name, transformer_current_name = _get_current_names(index) + + # Compute the current strides. 
If there is a stride for this block group, + # we do it in the first residual block. + if index == 0 and original_resnet_input_stride < output_stride: + current_strides = original_resnet_stride + else: + current_strides = 1 + + # Compute the current atrous rate. + if original_resnet_current_stride > output_stride: + atrous_rate = original_resnet_current_stride // output_stride + else: + atrous_rate = 1 + + # Compute the atrous rate for the second conv in the first basic block. + if (index == 0 and original_resnet_input_stride * original_resnet_stride > + output_stride): + basic_block_second_conv_atrous_rate = ( + original_resnet_input_stride * original_resnet_stride // + output_stride) + else: + basic_block_second_conv_atrous_rate = atrous_rate + + # Compute the current drop_path_keep_prob. + current_stage = math.log2(original_resnet_current_stride) - 1 + if original_resnet_current_stride >= drop_path_beyond_stride: + current_drop_path_keep_prob = drop_path.get_drop_path_keep_prob( + drop_path_keep_prob, drop_path_schedule, + current_stage=int(round(current_stage)), + num_stages=4) + else: + current_drop_path_keep_prob = 1.0 + + # Compute which block_fn to use for this residual block. + if original_resnet_current_stride >= use_global_beyond_stride > 0: + attention_type = 'global' + recompute_grad = axial_use_recompute_grad or recompute_grad + filters_list = [filters * attention_bottleneck_expansion, + filters, + filters * 4] + elif original_resnet_current_stride >= use_axial_beyond_stride > 0: + attention_type = 'axial' + recompute_grad = axial_use_recompute_grad or recompute_grad + filters_list = [filters * attention_bottleneck_expansion, + filters, + filters * 4] + elif backbone_type == 'resnet' or backbone_type == 'resnet_beta': + attention_type = None + recompute_grad = conv_use_recompute_grad or recompute_grad + filters_list = [filters, + filters, + filters * 4] + elif backbone_type == 'wider_resnet': + if original_resnet_input_stride * original_resnet_stride < 32: + # Wider-ResNet uses conv basic blocks except the last stage. + attention_type = None + recompute_grad = conv_use_recompute_grad or recompute_grad + filters_list = [filters * 4, + filters * 4] + else: + # Wider-ResNet uses an expanded bottleneck block in the last stage. + attention_type = None + recompute_grad = conv_use_recompute_grad or recompute_grad + filters_list = [filters, + filters * 2, + filters * 4] + else: + raise ValueError(backbone_type + ' is not supported.') + + self._drop_path_keep_prob.append(current_drop_path_keep_prob) + # Apply the residual block. + # The inputs to block_fn should be activated features. + block_fn = axial_blocks.AxialBlock( + filters_list, + kernel_size=3, + strides=current_strides, + atrous_rate=atrous_rate, + use_squeeze_and_excite=use_squeeze_and_excite, + use_sac=use_sac, + bn_layer=bn_layer, + activation=activation, + name=current_name[1:], + conv_kernel_weight_decay=conv_kernel_weight_decay, + basic_block_second_conv_atrous_rate=( + basic_block_second_conv_atrous_rate), + attention_type=attention_type, + axial_layer_config=axial_layer_config) + self._recompute_grad.append(recompute_grad) + utils.safe_setattr(self, current_name, block_fn) + + # Modify the original_resnet_stride according to the strides. + if index == 0 and original_resnet_stride > 1: + original_resnet_current_stride *= original_resnet_stride + # Add absolute positional encoding if we will apply global attention + # beyond this stride. 
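+      # (Global attention, unlike the axial-attention blocks, which carry
+      # relative positional encodings internally, has no notion of position
+      # on its own, so an absolute positional encoding is added first.)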
+ if original_resnet_current_stride == use_global_beyond_stride > 0: + self._add_absolute_positional_encoding = ( + positional_encodings.AddAbsolutePositionalEncoding( + 'add_absolute_positional_encoding', + positional_encoding_type, bn_layer, conv_kernel_weight_decay)) + if original_resnet_current_stride >= use_transformer_beyond_stride > 0: + # Apply a dual-path transformer. + transformer_block_fn = dual_path_transformer.DualPathTransformerLayer( + name=transformer_current_name[1:], + filters=int(128 * transformer_expansion), + activation=activation, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay, + **dual_path_transformer_layer_config) + utils.safe_setattr(self, transformer_current_name, transformer_block_fn) + else: + utils.safe_setattr(self, transformer_current_name, None) + # Avoid using recompute_grad for the first call that builds the sub-layers. + # Otherwise, recompute_grad will not track newly built model parameters. + self._first_building_call = True + + def call(self, inputs, training=False): + """Performs a forward pass. + + Args: + inputs: two tensors. The first tensor is a pixel_space_input with shape + [batch, height, width, pixel_channels]. The second tensor is + memory_space_input with shape [batch, length, memory_channels]. This + input will be used only if a transformer is used. Otherwise, the input + is returned unmodified. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + output: An output [batch, height, width, filters * 4] tensor. + activated_output: An activated output [batch, height, width, filters * 4] + tensor. + memory_space_output: A memory space output [batch, length, + memory_channels] tensor. + """ + # The pixel space inputs are activated features. + activated_features, memory_space_output = inputs + + # Recompute_grad takes only float tensors as inputs. It does not allow + # bools or boolean tensors. For this reason, we cast training to a float + # tensor and cast it back after we go through the recompute_grad wrap. + float_tensor_training = tf.cast(training, tf.float32) + + for index in range(self._num_blocks): + current_name, transformer_current_name = _get_current_names(index) + block_fn_no_recompute = getattr( + self, current_name) + transformer_block_fn_no_recompute = getattr( + self, transformer_current_name) + current_drop_path_keep_prob = self._drop_path_keep_prob[index] + + # Wrap the layer if we want to recompute it in the backward pass. + if (self._recompute_grad[index] and training): + # The seed is not actually used since we do not have any random + # operation in the recomputed function. The purpose of the provided seed + # is to prevent recompute_grad from generating a new seed variable which + # is not compatible with model exporting. + block_fn = recompute_grad_lib.recompute_grad( + block_fn_no_recompute, seed=tf.constant(0, tf.int32)) + else: + block_fn = block_fn_no_recompute + + # The inputs to block_fn should be activated features. + block_fn_inputs = [activated_features, float_tensor_training] + # We have to define drop_path_masks outside the layer call and pass it + # into the layer, because tf.recompute_grad (gradient checkpointing) does + # not allow any randomness within the function call. In addition, + # recompute_grad functions can only take Tensors as inputs, so we do not + # pass the drop_path_random_mask (when it is None) into block_fn. 
+ if current_drop_path_keep_prob < 1.0 and training: + drop_path_random_mask = drop_path.generate_drop_path_random_mask( + activated_features, current_drop_path_keep_prob) + + block_fn_inputs.append(drop_path_random_mask) + + # Build the sub-layers when the block_fn is called for the first time. + # Otherwise, recompute_grad will not track newly built model parameters. + if self._first_building_call: + _ = block_fn_no_recompute(tuple(block_fn_inputs)) + # Apply the residual block. + features, activated_features = block_fn(tuple(block_fn_inputs)) + + if index == 0 and self._add_absolute_positional_encoding is not None: + features = self._add_absolute_positional_encoding(features, + training=training) + activated_features = self._activation_fn(features) + + if transformer_block_fn_no_recompute is not None: + # Reshape pixel space features from 4D to 3D. + _, height, width, channels = features.get_shape().as_list() + features = tf.reshape( + features, [-1, height * width, channels]) + + # Wrap the layer if we want to recompute it in the backward pass. + if (self._transformer_use_recompute_grad and training): + # The seed is not actually used since we do not have any random + # operation in the recomputed function. The purpose of the provided + # seed is to prevent recompute_grad from generating a new seed + # variable which is not compatible with model exporting. + transformer_block_fn = recompute_grad_lib.recompute_grad( + transformer_block_fn_no_recompute, seed=tf.constant(0, tf.int32)) + else: + transformer_block_fn = transformer_block_fn_no_recompute + + transformer_block_fn_input_list = [ + features, memory_space_output, float_tensor_training] + # We have to define drop_path_masks outside the layer call and pass it + # into the layer, because recompute_grad (gradient checkpointing) does + # not allow any randomness within the function call. In addition, + # recompute_grad functions can only take Tensors as inputs, so we do not + # pass the drop_path_masks (when they are None) into + # transformer_block_fn. + if current_drop_path_keep_prob < 1.0 and training: + # Drop path random mask for pixel space attention. + pixel_space_drop_path_mask = drop_path.generate_drop_path_random_mask( + memory_space_output, current_drop_path_keep_prob) + # Drop path random mask for memory space attention. + memory_space_attention_drop_path_mask = ( + drop_path.generate_drop_path_random_mask( + memory_space_output, current_drop_path_keep_prob)) + # Drop path random mask for memory space feed-forward network. + memory_space_feed_forward_network_drop_path_mask = ( + drop_path.generate_drop_path_random_mask( + memory_space_output, current_drop_path_keep_prob)) + transformer_block_fn_input_list += [ + pixel_space_drop_path_mask, + memory_space_attention_drop_path_mask, + memory_space_feed_forward_network_drop_path_mask] + + # Build the sub-layers when the transformer_block_fn is called for the + # first time. Otherwise, recompute_grad will not track newly built model + # parameters. + if self._first_building_call: + _ = transformer_block_fn_no_recompute( + tuple(transformer_block_fn_input_list)) + # Apply a dual-path transformer. + features, activated_features, memory_space_output = ( + transformer_block_fn(tuple(transformer_block_fn_input_list))) + + # Reshape pixel space features back to 4D. 
+ features = tf.reshape(features, [-1, height, width, channels]) + activated_features = tf.reshape(activated_features, + [-1, height, width, channels]) + # Now the first call has finished and the sub-layers have been built. + self._first_building_call = False + # We also return the non-activated output so that the function is compatible + # with a decoder that takes a non-activated tensor as input. + return features, activated_features, memory_space_output diff --git a/model/layers/axial_block_groups_test.py b/model/layers/axial_block_groups_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b1283bc2f2623035e5b8374ade1974db6d474141 --- /dev/null +++ b/model/layers/axial_block_groups_test.py @@ -0,0 +1,182 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for axial_block_groups.""" + +import numpy as np +import tensorflow as tf + +from deeplab2.model import test_utils +from deeplab2.model.layers import axial_block_groups + + +class AxialBlockGroupsTest(tf.test.TestCase): + + def test_axial_attention_follows_bottleneck_block(self): + layer = axial_block_groups.BlockGroup( + filters=512, + num_blocks=2, + name='block_group', + original_resnet_stride=2, + original_resnet_input_stride=16, + use_axial_beyond_stride=32, + output_stride=16) + _, pixel_output, memory_output = layer((tf.zeros([2, 65, 65, 1024]), + tf.zeros([2, 128, 147]))) + self.assertListEqual(pixel_output.get_shape().as_list(), + [2, 65, 65, 2048]) + self.assertListEqual(memory_output.get_shape().as_list(), + [2, 128, 147]) + + def test_global_attention_follows_basic_block(self): + layer = axial_block_groups.BlockGroup( + filters=256, + num_blocks=2, + name='block_group', + backbone_type='wider_resnet', + original_resnet_stride=2, + original_resnet_input_stride=8, + use_global_beyond_stride=16, + positional_encoding_type='1D') + + _, pixel_output, memory_output = layer((tf.zeros([2, 65, 65, 32]), + tf.zeros([2, 128, 147]))) + self.assertListEqual(pixel_output.get_shape().as_list(), + [2, 33, 33, 1024]) + self.assertListEqual(memory_output.get_shape().as_list(), + [2, 128, 147]) + + def test_atrous_consistency_basic_block(self): + tf.random.set_seed(0) + pixel_inputs = test_utils.create_test_input(2, 11, 11, 3) + # Dense feature extraction followed by subsampling. + layer1 = axial_block_groups.BlockGroup( + filters=2, + num_blocks=2, + name='stage3', + backbone_type='wider_resnet', + original_resnet_stride=2, + original_resnet_input_stride=8, + output_stride=8, + use_axial_beyond_stride=0, + use_global_beyond_stride=0, + use_transformer_beyond_stride=0) + # Create the weights + layer1((pixel_inputs, None)) + weights = layer1.get_weights() + # Set the batch norm gamma as non-zero so that the 3x3 convolution affects + # the output. 
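+    # (With the default zero-initialized gamma, every residual branch would
+    # output zero and the blocks would reduce to identity mappings, so the
+    # atrous rate under test could never influence the comparison.)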
+ for index in range(len(weights)): + if np.sum(weights[index]) == 0.0: + weights[index] = weights[index] + 1 + layer1.set_weights(weights) + _, pixel_outputs, _ = layer1((pixel_inputs, None)) + output = pixel_outputs[:, ::2, ::2, :] + # Feature extraction at the nominal network rate. + layer2 = axial_block_groups.BlockGroup( + filters=2, + num_blocks=2, + name='stage3', + backbone_type='wider_resnet', + original_resnet_stride=2, + original_resnet_input_stride=8, + output_stride=16, + use_axial_beyond_stride=0, + use_global_beyond_stride=0, + use_transformer_beyond_stride=0) + # Create the weights + layer2((pixel_inputs, None)) + # Make the two networks use the same weights. + layer2.set_weights(layer1.get_weights()) + _, expected, _ = layer2((pixel_inputs, None)) + self.assertAllClose(output, expected, atol=1e-4, rtol=1e-4) + + def test_atrous_consistency_bottleneck_block(self): + tf.random.set_seed(0) + pixel_inputs = test_utils.create_test_input(2, 11, 11, 3) + # Dense feature extraction followed by subsampling. + layer1 = axial_block_groups.BlockGroup( + filters=2, + num_blocks=2, + name='stage3', + backbone_type='wider_resnet', + original_resnet_stride=2, + original_resnet_input_stride=16, + output_stride=16, + use_axial_beyond_stride=0, + use_global_beyond_stride=0, + use_transformer_beyond_stride=0) + # Create the weights + layer1((pixel_inputs, None)) + weights = layer1.get_weights() + # Set the batch norm gamma as non-zero so that the 3x3 convolution affects + # the output. + for index in range(len(weights)): + if np.sum(weights[index]) == 0.0: + weights[index] = weights[index] + 1 + layer1.set_weights(weights) + _, pixel_outputs, _ = layer1((pixel_inputs, None)) + output = pixel_outputs[:, ::2, ::2, :] + # Feature extraction at the nominal network rate. + layer2 = axial_block_groups.BlockGroup( + filters=2, + num_blocks=2, + name='stage3', + backbone_type='wider_resnet', + original_resnet_stride=2, + original_resnet_input_stride=16, + output_stride=32, + use_axial_beyond_stride=0, + use_global_beyond_stride=0, + use_transformer_beyond_stride=0) + # Create the weights + layer2((pixel_inputs, None)) + # Make the two networks use the same weights. 
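+    # (Any difference between the two outputs can then only come from the
+    # output_stride configuration under test.)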
+ layer2.set_weights(layer1.get_weights()) + _, expected, _ = layer2((pixel_inputs, None)) + self.assertAllClose(output, expected, atol=1e-4, rtol=1e-4) + + def test_use_se_sac_recompute_drop_path_schedule(self): + _ = axial_block_groups.BlockGroup( + filters=512, + num_blocks=2, + name='block_group', + original_resnet_stride=2, + original_resnet_input_stride=8, + use_axial_beyond_stride=0, + use_squeeze_and_excite=True, # True + use_sac_beyond_stride=16, # True + recompute_within_stride=16, # True + drop_path_beyond_stride=16, + drop_path_schedule='linear', # 1.0, 0.85 + output_stride=16) + + def test_nouse_se_sac_recompute_drop_path_schedule(self): + _ = axial_block_groups.BlockGroup( + filters=512, + num_blocks=2, + name='block_group', + original_resnet_stride=2, + original_resnet_input_stride=8, + use_axial_beyond_stride=0, + use_squeeze_and_excite=False, # False + use_sac_beyond_stride=32, # False + recompute_within_stride=8, # False + drop_path_beyond_stride=32, # 1.0, 1.0 + drop_path_schedule='constant', + output_stride=16) + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/axial_blocks.py b/model/layers/axial_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..bb21189461979d87aa5a8294959053a5960dfe76 --- /dev/null +++ b/model/layers/axial_blocks.py @@ -0,0 +1,308 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implements Axial-Blocks proposed in Axial-DeepLab [1]. + +Axial-Blocks are based on residual bottleneck blocks, but with the 3x3 +convolution replaced with two axial-attention layers, one on the height-axis, +followed by the other on the width-axis. + +[1] Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. +""" +import tensorflow as tf + +from deeplab2.model import utils +from deeplab2.model.layers import activations +from deeplab2.model.layers import axial_layers +from deeplab2.model.layers import convolutions +from deeplab2.model.layers import squeeze_and_excite + + +class AxialBlock(tf.keras.layers.Layer): + """An AxialBlock as a building block for an Axial-ResNet model. + + We implement the Axial-Block proposed in [1] in a general way that also + includes convolutional residual blocks, such as the basic block and the + bottleneck block (w/ and w/o Switchable Atrous Convolution). + + A basic block consists of two 3x3 convolutions and a residual connection. It + is the main building block for wide-resnet variants. + + A bottleneck block consists of consecutive 1x1, 3x3, 1x1 convolutions and a + residual connection. It is the main building block for standard resnet + variants. + + An axial block consists of a 1x1 input convolution, a self-attention layer + (either axial-attention or global attention), a 1x1 output convolution, and a + residual connection. It is the main building block for axial-resnet variants. 
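+
+  For example, filters_list=[256, 256] builds a basic block, while
+  filters_list=[64, 64, 256] builds a bottleneck block; passing
+  attention_type='axial' or 'global' together with three filters builds an
+  axial block.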
+
+  Note: We apply the striding in the first spatial operation (i.e. 3x3
+  convolution or self-attention layer).
+  """
+
+  def __init__(self,
+               filters_list,
+               kernel_size=3,
+               strides=1,
+               atrous_rate=1,
+               use_squeeze_and_excite=False,
+               use_sac=False,
+               bn_layer=tf.keras.layers.BatchNormalization,
+               activation='relu',
+               name=None,
+               conv_kernel_weight_decay=0.0,
+               basic_block_second_conv_atrous_rate=None,
+               attention_type=None,
+               axial_layer_config=None):
+    """Initializes an AxialBlock.
+
+    Args:
+      filters_list: A list of filter numbers in the residual block. We
+        currently support filters_list with two or three elements. Two
+        elements specify the filters for two consecutive 3x3 convolutions,
+        while three elements specify the filters for three convolutions (1x1,
+        3x3, and 1x1).
+      kernel_size: The size of the convolution kernels (default: 3).
+      strides: The strides of the block (default: 1).
+      atrous_rate: The atrous rate of the 3x3 convolutions (default: 1). If
+        this residual block is a basic block, it is recommended to specify a
+        correct basic_block_second_conv_atrous_rate for the second 3x3
+        convolution. Otherwise, the second conv will also use atrous_rate,
+        which might cause atrous inconsistency with different output strides,
+        as tested in axial_block_groups_test.test_atrous_consistency_basic_block.
+      use_squeeze_and_excite: A boolean flag indicating whether
+        squeeze-and-excite (SE) is used.
+      use_sac: A boolean, whether to use Switchable Atrous Convolution (SAC).
+      bn_layer: A tf.keras.layers.Layer that computes the normalization
+        (default: tf.keras.layers.BatchNormalization).
+      activation: A string specifying the activation function to apply.
+      name: A string specifying the name of the layer (default: None).
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+      basic_block_second_conv_atrous_rate: An integer, the atrous rate for the
+        second convolution of a basic block. This is necessary to ensure
+        atrous consistency with different output_strides. Defaults to
+        atrous_rate.
+      attention_type: A string, the type of attention to apply. Supports
+        'axial' and 'global'.
+      axial_layer_config: A dict, an argument dictionary for the axial layer.
+
+    Raises:
+      ValueError: If filters_list does not have two or three elements.
+      ValueError: If attention_type is not supported.
+      ValueError: If double_global_attention is True in axial_layer_config.
+    """
+    super(AxialBlock, self).__init__(name=name)
+
+    self._filters_list = filters_list
+    self._strides = strides
+    self._use_squeeze_and_excite = use_squeeze_and_excite
+    self._bn_layer = bn_layer
+    self._activate_fn = activations.get_activation(activation)
+    self._attention_type = attention_type
+
+    if axial_layer_config is None:
+      axial_layer_config = {}
+
+    if basic_block_second_conv_atrous_rate is None:
+      basic_block_second_conv_atrous_rate = atrous_rate
+
+    if len(filters_list) == 3:
+      # Three consecutive convolutions: 1x1, 3x3, and 1x1.
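+      # (The first 1x1 reduces the channels before the expensive spatial
+      # operation, and the last 1x1 restores the output channels.)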
+      self._conv1_bn_act = convolutions.Conv2DSame(
+          filters_list[0], 1, 'conv1_bn_act',
+          use_bias=False,
+          use_bn=True,
+          bn_layer=bn_layer,
+          activation=activation,
+          conv_kernel_weight_decay=conv_kernel_weight_decay)
+
+      if attention_type is None or attention_type.lower() == 'none':
+        self._conv2_bn_act = convolutions.Conv2DSame(
+            filters_list[1], kernel_size, 'conv2_bn_act',
+            strides=strides,
+            atrous_rate=atrous_rate,
+            use_bias=False,
+            use_bn=True,
+            bn_layer=bn_layer,
+            activation=activation,
+            use_switchable_atrous_conv=use_sac,
+            # We default to using global context in SAC if use_sac is True.
+            # This setting is experimentally found effective.
+            use_global_context_in_sac=use_sac,
+            conv_kernel_weight_decay=conv_kernel_weight_decay)
+      elif attention_type == 'axial':
+        if 'double_global_attention' in axial_layer_config:
+          if axial_layer_config['double_global_attention']:
+            raise ValueError('Double_global_attention has no effect in '
+                             'AxialAttention2D.')
+          del axial_layer_config['double_global_attention']
+        self._attention = axial_layers.AxialAttention2D(
+            strides=strides,
+            filters=filters_list[1],
+            name='attention',
+            bn_layer=bn_layer,
+            conv_kernel_weight_decay=conv_kernel_weight_decay,
+            **axial_layer_config)
+      elif attention_type == 'global':
+        self._attention = axial_layers.GlobalAttention2D(
+            strides=strides,
+            filters=filters_list[1],
+            name='attention',
+            bn_layer=bn_layer,
+            conv_kernel_weight_decay=conv_kernel_weight_decay,
+            **axial_layer_config)
+      else:
+        raise ValueError(attention_type + ' is not supported.')
+
+      # Here we apply a batch norm with gamma initialized at zero. This
+      # ensures that at random initialization of the model, the skip
+      # connections dominate all residual blocks. In this way, all the skip
+      # connections construct an identity mapping that passes the gradients
+      # (without any distortion from the randomly initialized blocks) to all
+      # residual blocks. This trick helps training at early epochs.
+      # Reference: "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour".
+      # https://arxiv.org/abs/1706.02677
+      self._conv3_bn = convolutions.Conv2DSame(
+          filters_list[2], 1, 'conv3_bn',
+          use_bias=False,
+          use_bn=True,
+          bn_layer=bn_layer,
+          bn_gamma_initializer='zeros',
+          activation='none',
+          conv_kernel_weight_decay=conv_kernel_weight_decay)
+    elif len(filters_list) == 2:
+      # Two consecutive convolutions: 3x3 and 3x3.
+      self._conv1_bn_act = convolutions.Conv2DSame(
+          filters_list[0], kernel_size, 'conv1_bn_act',
+          strides=strides,
+          atrous_rate=atrous_rate,
+          use_bias=False,
+          use_bn=True,
+          bn_layer=bn_layer,
+          activation=activation,
+          use_switchable_atrous_conv=use_sac,
+          use_global_context_in_sac=use_sac,
+          conv_kernel_weight_decay=conv_kernel_weight_decay)
+      # Here we apply a batch norm with gamma initialized at zero. This
+      # ensures that at random initialization of the model, the skip
+      # connections dominate all residual blocks. In this way, all the skip
+      # connections construct an identity mapping that passes the gradients
+      # (without any distortion from the randomly initialized blocks) to all
+      # residual blocks. This trick helps training at early epochs.
+      # Reference: "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour".
+      # https://arxiv.org/abs/1706.02677
+      self._conv2_bn = convolutions.Conv2DSame(
+          filters_list[1], kernel_size, 'conv2_bn',
+          strides=1,
+          atrous_rate=basic_block_second_conv_atrous_rate,
+          use_bias=False,
+          use_bn=True,
+          bn_layer=bn_layer,
+          bn_gamma_initializer='zeros',
+          activation='none',
+          use_switchable_atrous_conv=use_sac,
+          use_global_context_in_sac=use_sac,
+          conv_kernel_weight_decay=conv_kernel_weight_decay)
+    else:
+      raise ValueError('Expect filters_list to have length 2 or 3; got %d' %
+                       len(filters_list))
+
+    if self._use_squeeze_and_excite:
+      self._squeeze_and_excite = squeeze_and_excite.SimplifiedSqueezeAndExcite(
+          filters_list[-1])
+    self._conv_kernel_weight_decay = conv_kernel_weight_decay
+
+  def build(self, input_shape_list):
+    input_tensor_shape = input_shape_list[0]
+    self._shortcut = None
+    if input_tensor_shape[3] != self._filters_list[-1]:
+      self._shortcut = convolutions.Conv2DSame(
+          self._filters_list[-1], 1, 'shortcut',
+          strides=self._strides,
+          use_bias=False,
+          use_bn=True,
+          bn_layer=self._bn_layer,
+          activation='none',
+          conv_kernel_weight_decay=self._conv_kernel_weight_decay)
+
+  def call(self, inputs):
+    """Performs a forward pass.
+
+    We have to define drop_path_random_mask outside the layer call and pass it
+    into the layer, because recompute_grad (gradient checkpointing) does not
+    allow any randomness within the function call. In addition, recompute_grad
+    only supports float tensors as inputs. For this reason, the training flag
+    should also be passed as a float tensor. For the same reason, we cannot
+    support passing drop_path_random_mask as None. Instead, we ask the users
+    to pass only the first two tensors when drop path is not used.
+
+    Args:
+      inputs: A tuple of 2 or 3 tensors, containing:
+        input_tensor: An input tensor of type tf.Tensor with shape [batch,
+          height, width, channels].
+        float_tensor_training: A float tensor of 0.0 or 1.0, indicating
+          whether the model is in training mode.
+        drop_path_random_mask: (optional) A drop path random mask of type
+          tf.Tensor with shape [batch, 1, 1, 1].
+
+    Returns:
+      outputs: Two tensors. The first tensor does not use the last activation
+        function. The second tensor uses the activation. We return the
+        non-activated output to support MaX-DeepLab, which uses non-activated
+        features for the stacked decoders.
+
+    Raises:
+      ValueError: If the length of inputs is not 2 or 3.
+    """
+    if len(inputs) not in (2, 3):
+      raise ValueError('The length of inputs should be either 2 or 3.')
+
+    # Unpack the inputs.
+    input_tensor, float_tensor_training, drop_path_random_mask = (
+        utils.pad_sequence_with_none(inputs, target_length=3))
+
+    # Recompute_grad takes only float tensors as inputs. It does not allow
+    # bools or boolean tensors. For this reason, we cast training to a float
+    # tensor outside this call, and now we cast it back to a boolean tensor.
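+    # (tf.cast maps 0.0 to False and any non-zero float to True.)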
+ training = tf.cast(float_tensor_training, tf.bool) + + shortcut = input_tensor + if self._shortcut is not None: + shortcut = self._shortcut(shortcut, training=training) + elif self._strides != 1: + shortcut = shortcut[:, ::self._strides, ::self._strides, :] + + if len(self._filters_list) == 3: + x = self._conv1_bn_act(input_tensor, training=training) + if (self._attention_type is None or + self._attention_type.lower() == 'none'): + x = self._conv2_bn_act(x, training=training) + else: + x = self._attention(x, training=training) + x = self._activate_fn(x) + x = self._conv3_bn(x, training=training) + if len(self._filters_list) == 2: + x = self._conv1_bn_act(input_tensor, training=training) + x = self._conv2_bn(x, training=training) + + if self._use_squeeze_and_excite: + x = self._squeeze_and_excite(x) + + if drop_path_random_mask is not None: + x = x * drop_path_random_mask + x = x + shortcut + return x, self._activate_fn(x) diff --git a/model/layers/axial_blocks_test.py b/model/layers/axial_blocks_test.py new file mode 100644 index 0000000000000000000000000000000000000000..3dad90a38b9587358d898e63fa5d47796e17b1fc --- /dev/null +++ b/model/layers/axial_blocks_test.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for axial_blocks.""" + +import tensorflow as tf + +from deeplab2.model.layers import axial_blocks + + +class AxialBlocksTest(tf.test.TestCase): + + def test_conv_basic_block_correct_output_shape(self): + layer = axial_blocks.AxialBlock( + filters_list=[256, 256], + strides=2) + float_training_tensor = tf.constant(0.0, dtype=tf.float32) + output = layer((tf.zeros([2, 65, 65, 32]), + float_training_tensor))[1] + self.assertListEqual(output.get_shape().as_list(), [2, 33, 33, 256]) + + def test_conv_bottleneck_block_correct_output_shape(self): + layer = axial_blocks.AxialBlock( + filters_list=[64, 64, 256], + strides=1) + float_training_tensor = tf.constant(0.0, dtype=tf.float32) + output = layer((tf.zeros([2, 65, 65, 32]), + float_training_tensor))[0] + self.assertListEqual(output.get_shape().as_list(), [2, 65, 65, 256]) + + def test_axial_block_correct_output_shape(self): + layer = axial_blocks.AxialBlock( + filters_list=[128, 64, 256], + strides=2, + attention_type='axial') + float_training_tensor = tf.constant(0.0, dtype=tf.float32) + output = layer((tf.zeros([2, 65, 65, 32]), + float_training_tensor))[1] + self.assertListEqual(output.get_shape().as_list(), [2, 33, 33, 256]) + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/axial_layers.py b/model/layers/axial_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..48e2f8651c1f3ea1b8eeafc987ffbf6bae753161 --- /dev/null +++ b/model/layers/axial_layers.py @@ -0,0 +1,523 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Implements Axial-Attention layers proposed in Axial-DeepLab.
+
+Axial-Attention factorizes 2D self-attention into two 1D self-attentions, so
+that it can be applied on large inputs. Axial-Attention is typically used to
+replace 3x3 convolutions in a bottleneck residual block.
+
+[1] Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation,
+    ECCV 2020 Spotlight.
+    Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
+    Liang-Chieh Chen.
+"""
+
+import numpy as np
+import tensorflow as tf
+
+from deeplab2.model import utils
+from deeplab2.model.layers import activations
+from deeplab2.model.layers import positional_encodings
+
+
+class AxialAttention(tf.keras.layers.Layer):
+  """An axial-attention layer."""
+
+  def __init__(self,
+               query_shape=129,
+               memory_flange=32,
+               total_key_depth=512,
+               total_value_depth=1024,
+               num_heads=8,
+               name='axial_attention',
+               use_query_rpe_similarity=True,
+               use_key_rpe_similarity=True,
+               use_content_similarity=True,
+               retrieve_value_rpe=True,
+               retrieve_value_content=True,
+               initialization_std_for_query_key_rpe=1.0,
+               initialization_std_for_value_rpe=1.0,
+               self_attention_activation='softmax',
+               bn_layer=tf.keras.layers.BatchNormalization,
+               conv_kernel_weight_decay=0.0):
+    """Initializes an axial-attention layer.
+
+    This function is designed to support both global and local axial-attention
+    in a unified way. If query_shape is larger than the length of the input,
+    global attention is applied. If query_shape is smaller than the length of
+    the input, local attention is applied. In this case, the input is divided
+    into blocks of length query_shape, padded by memory_flange on both sides.
+    Then, local attention is applied within each query block. The choice of
+    query_shape does not affect the output value but affects computation
+    efficiency and memory usage. In general, use global attention (large
+    query_shape) if possible. Local axial-attention is not supported yet.
+
+    Args:
+      query_shape: An integer, the block size for local axial attention.
+        Defaults to 129 since 129 is usually the largest feature map where we
+        do global attention (1025 with stride 8, or 2049 with stride 16).
+      memory_flange: An integer, the memory flange padded to each query block
+        in local attention. It has no effect in global attention. Defaults to
+        32, which is equivalent to a span of 65 in the Axial-DeepLab paper --
+        a pixel can see 32 pixels on its left and 32 pixels on its right.
+      total_key_depth: An integer, the total depth of keys, which is also the
+        depth of queries and the depth of key (query) positional encodings.
+      total_value_depth: An integer, the total depth of the values, which is
+        also the depth of value positional encodings.
+      num_heads: An integer, the number of heads in multi-head attention.
+      name: A string, the name of this axial attention layer.
+      use_query_rpe_similarity: A boolean, whether to use the attention
+        similarity between the queries and the relative positional encodings.
+      use_key_rpe_similarity: A boolean, whether to use the attention
+        similarity between the keys and the relative positional encodings.
+ use_content_similarity: A boolean, whether to use the content similarity + between the queries and the keys. + retrieve_value_rpe: A boolean, whether to retrieve the relative positional + encodings of the values. + retrieve_value_content: A boolean, whether to retrieve the content of the + values. + initialization_std_for_query_key_rpe: A float, the initialization std for + the relative positional encodings of the queries and keys. + initialization_std_for_value_rpe: A float, the initialization std for the + relative positional encodings of the values. + self_attention_activation: A string, type of activation function for + self-attention. Support 'sigmoid' and 'softmax'. + bn_layer: A tf.keras.layers.Layer that computes the normalization + (default: tf.keras.layers.BatchNormalization). + conv_kernel_weight_decay: A float, the weight decay for convolution + kernels. + + Returns: + output: A [batch, length, total_value_depth] tensor. + + Raises: + ValueError: If none of the three similarities (use_query_rpe_similarity, + use_key_rpe_similarity, use_content_similarity) is used. + ValueError: If neither of value content or value rpe is retrieved. + ValueError: If self_attention_activation is not supported. + ValueError: If total_key_depth is not divisible by num_heads. + ValueError: If total_value_depth is not divisible by num_heads. + """ + # Validate the attention similarity choices. + if not any([ + use_content_similarity, use_key_rpe_similarity, use_query_rpe_similarity + ]): + raise ValueError( + 'Should use at least one similarity to compute attention.') + + # Validate the retrieve value choices. + if not retrieve_value_content and not retrieve_value_rpe: + raise ValueError('Should retrieve at least one of content or rpe.') + + if total_key_depth % num_heads: + raise ValueError('Total_key_depth should be divisible by num_heads.') + + if total_value_depth % num_heads: + raise ValueError('Total_value_depth should be divisible by num_heads.') + + super(AxialAttention, self).__init__(name=name) + self._query_shape = query_shape + self._memory_flange = memory_flange + self._total_key_depth = total_key_depth + self._total_value_depth = total_value_depth + self._num_heads = num_heads + self._use_query_rpe_similarity = use_query_rpe_similarity + self._use_key_rpe_similarity = use_key_rpe_similarity + self._use_content_similarity = use_content_similarity + self._retrieve_value_rpe = retrieve_value_rpe + self._retrieve_value_content = retrieve_value_content + self._initialization_std_for_query_key_rpe = ( + initialization_std_for_query_key_rpe) + self._initialization_std_for_value_rpe = initialization_std_for_value_rpe + self._self_attention_activation = self_attention_activation + self._conv_kernel_weight_decay = conv_kernel_weight_decay + + self._batch_norm_qkv = bn_layer(axis=-1, name='batch_norm_qkv') + self._batch_norm_similarity = bn_layer( + axis=[0, 2], name='batch_norm_similarity') + self._batch_norm_retrieved_output = bn_layer( + axis=[0, 2, 4], name='batch_norm_retrieved_output') + + self._key_depth_per_head = total_key_depth // num_heads + self._attention_activate_fn = activations.get_activation( + self_attention_activation) + + def build(self, input_shape): + """Builds axial-attention layer weights. + + Args: + input_shape: An integer list of length 3, the shape of the input tensor. + + Raises: + NotImplementedError: Local axial-attention has not been implemented. It is + triggered if query_shape is less than input_shape. 
+ """ + + # Apply global attention if query_shape is larger than input_shape[1]. + if self._query_shape >= input_shape[1]: + self._query_shape = input_shape[1] + self._memory_flange = 0 + else: + raise NotImplementedError('Local axial attention has not been ' + 'implemented yet.') + self._memory_shape = self._query_shape + 2 * self._memory_flange + + # Compute query key value with one convolution and an optional batch norm. + # The initialization std is standard transformer initialization (without + # batch norm), as used in SASA and ViT. In our case, we use batch norm by + # default, so it does not require careful tuning. If one wants to remove + # all batch norms in axial attention, this standard initialization should + # still be good, but a more careful initialization is encouraged. + self.qkv_kernel = self.add_weight( + name='qkv_kernel', + shape=[input_shape[-1], + self._total_key_depth * 2 + self._total_value_depth], + initializer=tf.keras.initializers.TruncatedNormal( + stddev=input_shape[-1]**-0.5), + regularizer=tf.keras.regularizers.l2(self._conv_kernel_weight_decay)) + + if self._use_query_rpe_similarity: + self._query_rpe = positional_encodings.RelativePositionalEncoding( + self._query_shape, + self._memory_shape, + self._key_depth_per_head, + self._num_heads, + 'query_rpe', + initialization_std=self._initialization_std_for_query_key_rpe, + conv_kernel_weight_decay=self._conv_kernel_weight_decay) + + if self._use_key_rpe_similarity: + self._key_rpe = positional_encodings.RelativePositionalEncoding( + self._query_shape, + self._memory_shape, + self._key_depth_per_head, + self._num_heads, + 'key_rpe', + initialization_std=self._initialization_std_for_query_key_rpe, + conv_kernel_weight_decay=self._conv_kernel_weight_decay) + + if self._retrieve_value_rpe: + self._value_rpe = positional_encodings.RelativePositionalEncoding( + self._query_shape, + self._memory_shape, + self._total_value_depth // self._num_heads, + self._num_heads, + 'value_rpe', + initialization_std=self._initialization_std_for_value_rpe, + conv_kernel_weight_decay=self._conv_kernel_weight_decay) + + def call(self, input_tensor, training=False): + """Performs a forward pass. + + Args: + input_tensor: An input [batch, length, channel] tensor. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + output: An output [batch, length, total_value_depth] tensor. + """ + # Alternatively, the einsum can be implemented as a 1x1 convolution. + # However, it is not obvious which implementation is more efficient (without + # careful benchmarking), so we use einsum for its flexibility and + # consistency with other parts of the function. + query_key_value = tf.einsum( + 'nlc,cd->nld', input_tensor, self.qkv_kernel, name='compute_qkv') + query_key_value = self._batch_norm_qkv(query_key_value, training=training) + + # Split query key value. + query, key, value = tf.split( + query_key_value, + [self._total_key_depth, self._total_key_depth, self._total_value_depth], + axis=-1) + + # Reshape the query, key, and value. 
+    query = tf.reshape(query, [-1, self._query_shape, self._num_heads,
+                               self._key_depth_per_head])
+    query = tf.transpose(a=query, perm=[0, 2, 1, 3])
+    key = tf.reshape(key, [-1, np.prod(self._memory_shape), self._num_heads,
+                           self._key_depth_per_head])
+    key = tf.transpose(a=key, perm=[0, 2, 1, 3])
+    value = tf.reshape(value, [-1, np.prod(self._memory_shape),
+                               self._num_heads,
+                               self._total_value_depth // self._num_heads])
+
+    # Gather all similarity logits into a list.
+    similarity_logits = []
+
+    # Compute the content similarity term: q * k.
+    if self._use_content_similarity:
+      content_similarity = tf.einsum(
+          'bhld,bhmd->bhlm', query, key, name='content_similarity')
+      similarity_logits.append(content_similarity)
+
+    # Compute the query rpe similarity term: q * rpe.
+    if self._use_query_rpe_similarity:
+      query_rpe = self._query_rpe(None)
+      query_rpe_similarity = tf.einsum(
+          'bhld,hlmd->bhlm', query, query_rpe, name='query_rpe_similarity')
+      similarity_logits.append(query_rpe_similarity)
+
+    # Compute the key rpe similarity term: k * rpe.
+    if self._use_key_rpe_similarity:
+      key_rpe = self._key_rpe(None)
+      key_rpe_similarity = tf.einsum(
+          'bhmd,hlmd->bhlm', key, key_rpe, name='key_rpe_similarity')
+      similarity_logits.append(key_rpe_similarity)
+
+    # Apply an optional batch norm to the similarities and sum them.
+    similarity_logits = tf.stack(similarity_logits)
+    similarity_logits = self._batch_norm_similarity(similarity_logits,
+                                                    training=training)
+    similarity_logits = tf.reduce_sum(input_tensor=similarity_logits, axis=0)
+
+    # Apply an attention activation function, e.g. softmax.
+    weights = self._attention_activate_fn(similarity_logits)
+
+    # Gather retrieved values or rpes into a list.
+    retrieve_list = []
+
+    # Retrieve the content of the attended value.
+    if self._retrieve_value_content:
+      retrieved_content = tf.einsum(
+          'bhlm,bmhd->bhld', weights, value, name='retrieve_value_content')
+      retrieve_list.append(retrieved_content)
+
+    # Retrieve the relative position of the attended value.
+    if self._retrieve_value_rpe:
+      value_rpe = self._value_rpe(None)
+      retrieved_rpe = tf.einsum(
+          'bhlm,hlmd->bhld', weights, value_rpe, name='retrieve_value_rpe')
+      retrieve_list.append(retrieved_rpe)
+
+    # Apply batch norms to retrieved contents and rpes respectively.
+    retrieved_output = tf.stack(retrieve_list)
+    retrieved_output = self._batch_norm_retrieved_output(retrieved_output,
+                                                         training=training)
+    # Sum the retrieved contents and rpes.
+    retrieved_output = tf.reduce_sum(input_tensor=retrieved_output, axis=0)
+
+    # Combine the heads by transposing and reshaping the tensor.
+    retrieved_output = utils.transpose_and_reshape_for_attention_operation(
+        retrieved_output)
+
+    return retrieved_output
+
+
+class AxialAttention2D(tf.keras.layers.Layer):
+  """Sequentially applies height-axis and width-axis axial-attention."""
+
+  def __init__(self,
+               strides=1,
+               filters=512,
+               name='attention',
+               key_expansion=1,
+               value_expansion=2,
+               query_shape=(129, 129),
+               memory_flange=(32, 32),
+               **kwargs):
+    """Initializes an AxialAttention2D layer.
+
+    Args:
+      strides: An integer, the stride for the output, usually 1 or 2.
+      filters: An integer, the base number of channels for the layer.
+      name: A string, the name of the attention layer.
+      key_expansion: A float, the channel expansion ratio for keys.
+      value_expansion: A float, the channel expansion ratio for values.
+      query_shape: An integer list of length 2, the maximum query shape for
+        the height axis and the width axis.
+      memory_flange: An integer list of length 2.
+        The memory flange for the height axis and the width axis.
+      **kwargs: A dictionary of keyword arguments passed to height-axis,
+        width-axis, and 2D global AxialAttention.
+
+    Returns:
+      output: A [batch, strided height, strided width, output_channels]
+        tensor.
+    """
+    super(AxialAttention2D, self).__init__(name=name)
+    total_key_depth = int(round(filters * key_expansion))
+    total_value_depth = int(round(filters * value_expansion))
+    self._strides = strides
+    self._total_key_depth = total_key_depth
+    self._total_value_depth = total_value_depth
+    self._height_axis = AxialAttention(
+        total_key_depth=total_key_depth,
+        total_value_depth=total_value_depth,
+        query_shape=query_shape[0],
+        memory_flange=memory_flange[0],
+        name='height_axis',
+        **kwargs)
+    self._width_axis = AxialAttention(
+        total_key_depth=total_key_depth,
+        total_value_depth=total_value_depth,
+        query_shape=query_shape[1],
+        memory_flange=memory_flange[1],
+        name='width_axis',
+        **kwargs)
+
+  def call(self, inputs, training=False):
+    """Performs a forward pass.
+
+    Args:
+      inputs: An input [batch, height, width, channel] tensor.
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      output: An output [batch, strided_height, strided_width,
+        filters * value_expansion] tensor.
+    """
+    _, height, width, channel = inputs.get_shape().as_list()
+
+    # Transpose and reshape the width axis to the batch dimension.
+    x = tf.transpose(a=inputs, perm=[0, 2, 1, 3])
+    x = tf.reshape(x, [-1, height, channel])
+    x = self._height_axis(x, training=training)
+    # Reshape and transpose back to a 4D tensor.
+    x = tf.reshape(x, [-1, width, height, self._total_value_depth])
+    x = tf.transpose(a=x, perm=[0, 2, 1, 3])
+    # Height axis striding.
+    if self._strides > 1:
+      x = x[:, ::self._strides, :, :]
+
+    # Reshape the height axis to the batch dimension.
+    _, strided_height, _, _ = x.get_shape().as_list()
+    x = tf.reshape(x, [-1, width, self._total_value_depth])
+    x = self._width_axis(x, training=training)
+    # Reshape back to a 4D tensor.
+    x = tf.reshape(x, [-1, strided_height, width, self._total_value_depth])
+    # Width axis striding.
+    if self._strides > 1:
+      x = x[:, :, ::self._strides, :]
+
+    return x
+
+
+class GlobalAttention2D(tf.keras.layers.Layer):
+  """A 2D global attention layer."""
+
+  def __init__(self,
+               strides=1,
+               filters=512,
+               name='attention',
+               key_expansion=1,
+               value_expansion=2,
+               query_shape=(129, 129),
+               memory_flange=(32, 32),
+               double_global_attention=False,
+               **kwargs):
+    """Initializes a GlobalAttention2D layer.
+
+    Args:
+      strides: An integer, the stride for the output, usually 1 or 2.
+      filters: An integer, the base number of channels for the layer.
+      name: A string, the name of the attention layer.
+      key_expansion: A float, the channel expansion ratio for keys.
+      value_expansion: A float, the channel expansion ratio for values.
+      query_shape: An integer list of length 2, the maximum query shape for
+        the height axis and the width axis.
+      memory_flange: An integer list of length 2. The memory flange for the
+        height axis and the width axis.
+      double_global_attention: A boolean, whether to use two global attention
+        layers. Two global attention layers match the parameter count to a
+        sequentially applied height and width axial attention layer.
+      **kwargs: A dictionary of keyword arguments passed to height-axis,
+        width-axis, and 2D global AxialAttention.
+
+    Returns:
+      output: A [batch, strided height, strided width, output_channels]
+        tensor.
+ + Raises: + ValueError: If relative positional encoding is enforced in kwargs. + """ + if any([kwargs.get('use_query_rpe_similarity', False), + kwargs.get('use_key_rpe_similarity', False), + kwargs.get('retrieve_value_rpe', False)]): + raise ValueError('GlobalAttention2D does not support relative positional ' + 'encodings.') + + super(GlobalAttention2D, self).__init__(name=name) + total_key_depth = int(round(filters * key_expansion)) + total_value_depth = int(round(filters * value_expansion)) + self._strides = strides + self._double_global_attention = double_global_attention + self._total_key_depth = total_key_depth + self._total_value_depth = total_value_depth + + # Global attention does not support relative positional encodings. + kwargs['use_query_rpe_similarity'] = False + kwargs['use_key_rpe_similarity'] = False + kwargs['retrieve_value_rpe'] = False + self._kwargs = kwargs + + def build(self, input_shape): + """Builds global attention layers according to the 4D input_shape.""" + _, height, width, _ = input_shape + # Implement 2D global attention as 1D axial-attention by flattening the 2D + # inputs into 1D. We also disable the relative positional encodings in + # axial attention, so that only content-based attention is used. The query + # shape is set to height * width, so that the axial attention is global. + self._global = AxialAttention( + total_key_depth=self._total_key_depth, + total_value_depth=self._total_value_depth, + query_shape=height*width, + memory_flange=0, + name='global', + **self._kwargs) + + # Use two global attention layers in one residual block. This option + # ensures that global attention models have similar number of layers and + # parameters as axial-attention models. + if self._double_global_attention: + self._global2 = AxialAttention( + total_key_depth=self._total_key_depth, + total_value_depth=self._total_value_depth, + query_shape=height*width, + memory_flange=0, + name='global2', + **self._kwargs) + + def call(self, inputs, training=False): + """Performs a forward pass. + + Args: + inputs: An input [batch, height, width, channel] tensor. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + output: An output [batch, strided_height, strided_width, + filters * value_expansion] tensor. + """ + _, height, width, channel = inputs.get_shape().as_list() + + # Reshape the inputs so that the attention is global 2D. + x = tf.reshape(inputs, [-1, height * width, channel]) + + # Implement 2D global attention as 1D axial-attention by flattening the 2D + # inputs into 1D. We also disable the relative positional encodings in + # axial attention, so that only content-based attention is used. + x = self._global(x, training=training) + + # Use two global attention layers in one residual block. This option + # ensures that global attention models have the same number of layers and + # parameters as axial-attention models. + if self._double_global_attention: + x = self._global2(x, training=training) + x = tf.reshape(x, [-1, height, width, self._total_value_depth]) + if self._strides > 1: + x = x[:, ::self._strides, ::self._strides, :] + + return x diff --git a/model/layers/axial_layers_test.py b/model/layers/axial_layers_test.py new file mode 100644 index 0000000000000000000000000000000000000000..2fb8accdd470e25ec3ad896ba16bff2739a0dbbc --- /dev/null +++ b/model/layers/axial_layers_test.py @@ -0,0 +1,56 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for axial_layers.""" + +import tensorflow as tf + +from deeplab2.model.layers import axial_layers + + +class AxialLayersTest(tf.test.TestCase): + + def test_default_axial_attention_layer_output_shape(self): + layer = axial_layers.AxialAttention() + output = layer(tf.zeros([10, 5, 32])) + self.assertListEqual(output.get_shape().as_list(), [10, 5, 1024]) + + def test_axial_attention_2d_layer_output_shape(self): + layer = axial_layers.AxialAttention2D() + output = layer(tf.zeros([2, 5, 5, 32])) + self.assertListEqual(output.get_shape().as_list(), [2, 5, 5, 1024]) + + def test_change_filters_output_shape(self): + layer = axial_layers.AxialAttention2D(filters=32) + output = layer(tf.zeros([2, 5, 5, 32])) + self.assertListEqual(output.get_shape().as_list(), [2, 5, 5, 64]) + + def test_value_expansion_output_shape(self): + layer = axial_layers.AxialAttention2D(value_expansion=1) + output = layer(tf.zeros([2, 5, 5, 32])) + self.assertListEqual(output.get_shape().as_list(), [2, 5, 5, 512]) + + def test_global_attention_output_shape(self): + layer = axial_layers.GlobalAttention2D() + output = layer(tf.zeros([2, 5, 5, 32])) + self.assertListEqual(output.get_shape().as_list(), [2, 5, 5, 1024]) + + def test_stride_two_output_shape(self): + layer = axial_layers.AxialAttention2D(strides=2) + output = layer(tf.zeros([2, 5, 5, 32])) + self.assertListEqual(output.get_shape().as_list(), [2, 3, 3, 1024]) + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/blocks.py b/model/layers/blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..3e46651aeaacf1e416ffa19b43de433f2031cc31 --- /dev/null +++ b/model/layers/blocks.py @@ -0,0 +1,211 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implements building blocks for neural networks.""" +from typing import Optional + +from absl import logging + +import tensorflow as tf + +from deeplab2.model import utils +from deeplab2.model.layers import convolutions +from deeplab2.model.layers import squeeze_and_excite + +backend = tf.keras.backend +layers = tf.keras.layers + + +class InvertedBottleneckBlock(tf.keras.layers.Layer): + """An inverted bottleneck block. + + Reference: + Sandler, M., Howard, A., et al. Mobilenetv2: Inverted residuals and linear + bottlenecks. In CVPR, 2018 + Howard, A., Sandler, M., et al. Searching for mobilenetv3. 
+      In ICCV, 2019
+  """
+
+  def __init__(self,
+               in_filters: int,
+               out_filters: int,
+               expand_ratio: int,
+               strides: int,
+               kernel_size: int = 3,
+               se_ratio: Optional[float] = None,
+               activation: str = 'relu',
+               se_inner_activation: str = 'relu',
+               se_gating_activation: str = 'sigmoid',
+               depthwise_activation: Optional[str] = None,
+               expand_se_in_filters: bool = False,
+               atrous_rate: int = 1,
+               divisible_by: int = 1,
+               bn_layer: layers.Layer = tf.keras.layers.BatchNormalization,
+               conv_kernel_weight_decay: float = 0.0,
+               regularize_depthwise: bool = False,
+               use_depthwise: bool = True,
+               use_residual: bool = True,
+               name: Optional[str] = None):
+    """Initializes an inverted bottleneck block with BN after convolutions.
+
+    Args:
+      in_filters: The number of filters of the input tensor.
+      out_filters: The number of filters of the output tensor.
+      expand_ratio: The expansion ratio for an inverted bottleneck block. If
+        expand_ratio is <= 1, the expansion convolution is skipped in the
+        forward pass.
+      strides: The stride of the block. If greater than 1, this block will
+        ultimately downsample the input.
+      kernel_size: The kernel size of the depthwise conv layer.
+      se_ratio: If not None, the SE ratio for the squeeze-and-excitation
+        layer.
+      activation: The name of the activation function.
+      se_inner_activation: The name of squeeze-excitation inner activation.
+      se_gating_activation: The name of squeeze-excitation gating activation.
+      depthwise_activation: The name of the activation function for depthwise
+        only.
+      expand_se_in_filters: Whether or not to expand in_filters in the
+        squeeze-and-excitation layer.
+      atrous_rate: The atrous dilation rate to use.
+      divisible_by: A number that all inner dimensions are divisible by.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      conv_kernel_weight_decay: The weight decay for convolution kernels.
+      regularize_depthwise: Whether or not to apply regularization on the
+        depthwise convolution.
+      use_depthwise: Whether to use a depthwise convolution. If False, a
+        standard convolution with the full kernel size and stride is used
+        instead.
+      use_residual: Whether to include a residual connection between the
+        input and the output.
+      name: Name for the block.
+    """
+    super(InvertedBottleneckBlock, self).__init__(name=name)
+
+    self._in_filters = in_filters
+    self._out_filters = out_filters
+    self._expand_ratio = expand_ratio
+    self._strides = strides
+    self._kernel_size = kernel_size
+    self._se_ratio = se_ratio
+    self._divisible_by = divisible_by
+    self._atrous_rate = atrous_rate
+    self._regularize_depthwise = regularize_depthwise
+    self._use_depthwise = use_depthwise
+    self._use_residual = use_residual
+    self._activation = activation
+    self._se_inner_activation = se_inner_activation
+    self._se_gating_activation = se_gating_activation
+    self._depthwise_activation = depthwise_activation
+    self._expand_se_in_filters = expand_se_in_filters
+
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+
+    if depthwise_activation is None:
+      self._depthwise_activation = activation
+
+    if regularize_depthwise:
+      depthwise_kernel_weight_decay = conv_kernel_weight_decay
+    else:
+      depthwise_kernel_weight_decay = 0.0
+
+    if self._expand_ratio <= 1 and not self._use_depthwise:
+      raise ValueError(
+          'Undefined behavior if expand_ratio <= 1 and not use_depthwise')
+
+    expand_filters = self._in_filters
+    if self._expand_ratio > 1:
+      # First 1x1 conv for channel expansion.
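+      # (For example, in_filters=4 with expand_ratio=2 expands to 8 channels,
+      # rounded so that the result is divisible by divisible_by.)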
+ expand_filters = utils.make_divisible( + self._in_filters * self._expand_ratio, self._divisible_by) + + expand_kernel = 1 if self._use_depthwise else self._kernel_size + expand_stride = 1 if self._use_depthwise else self._strides + + self._conv1_bn_act = convolutions.Conv2DSame( + output_channels=expand_filters, + kernel_size=expand_kernel, + strides=expand_stride, + atrous_rate=1, + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation=self._activation, + conv_kernel_weight_decay=conv_kernel_weight_decay, + name='expand_conv') + + if self._use_depthwise: + # Depthwise conv. + self._conv2_bn_act = convolutions.DepthwiseConv2DSame( + kernel_size=self._kernel_size, + strides=self._strides, + atrous_rate=self._atrous_rate, + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation=self._depthwise_activation, + name='depthwise_conv') + + # Squeeze and excitation. + if self._se_ratio is not None and self._se_ratio > 0: + if self._expand_se_in_filters: + in_filters = expand_filters + else: + in_filters = self._in_filters + self._squeeze_excitation = squeeze_and_excite.SqueezeAndExcite( + in_filters=in_filters, + out_filters=expand_filters, + se_ratio=self._se_ratio, + divisible_by=self._divisible_by, + kernel_initializer='he_normal', + kernel_regularizer=tf.keras.regularizers.l2(conv_kernel_weight_decay), + activation=self._se_inner_activation, + gating_activation=self._se_gating_activation, + name=name + '_se') + else: + logging.info( + 'Squeeze and Excitation is skipped due to undefined se_ratio') + self._squeeze_excitation = None + + # Last 1x1 conv. + self._conv3_bn = convolutions.Conv2DSame( + output_channels=self._out_filters, + kernel_size=1, + strides=1, + atrous_rate=1, + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation=None, + conv_kernel_weight_decay=conv_kernel_weight_decay, + name='project_conv') + + def call(self, inputs, training=None): + shortcut = inputs + if self._expand_ratio > 1: + x = self._conv1_bn_act(inputs, training=training) + else: + x = inputs + + if self._use_depthwise: + x = self._conv2_bn_act(x, training=training) + + if self._squeeze_excitation is not None: + x = self._squeeze_excitation(x) + + x = self._conv3_bn(x, training=training) + + if (self._use_residual and + self._in_filters == self._out_filters): + x = tf.add(x, shortcut) + + return x diff --git a/model/layers/blocks_test.py b/model/layers/blocks_test.py new file mode 100644 index 0000000000000000000000000000000000000000..0be9e6365d2c0b80cfeb3e78453d64b5f7eaac64 --- /dev/null +++ b/model/layers/blocks_test.py @@ -0,0 +1,72 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for blocks.py.""" +import tensorflow as tf + +from deeplab2.model.layers import blocks + + +class BlocksTest(tf.test.TestCase): + + def test_inverted_bottleneck_block_output_shape(self): + batch, height, width, input_channels = 2, 17, 17, 4 + output_channels = 6 + input_tensor = tf.random.uniform( + shape=(batch, height, width, input_channels)) + ivb_block = blocks.InvertedBottleneckBlock( + in_filters=input_channels, + out_filters=output_channels, + expand_ratio=2, + strides=1, + name='inverted_bottleneck', + ) + output_tensor = ivb_block(input_tensor) + self.assertListEqual(output_tensor.get_shape().as_list(), + [batch, height, width, output_channels]) + + def test_inverted_bottleneck_block_feature_map_alignment(self): + batch, height, width, input_channels = 2, 17, 17, 128 + output_channels = 256 + input_tensor = tf.random.uniform( + shape=(batch, height, width, input_channels)) + ivb_block1 = blocks.InvertedBottleneckBlock( + in_filters=input_channels, + out_filters=output_channels, + expand_ratio=2, + strides=2, + name='inverted_bottleneck1', + ) + ivb_block1(input_tensor, False) + weights = ivb_block1.get_weights() + output_tensor = ivb_block1(input_tensor, False) + + ivb_block2 = blocks.InvertedBottleneckBlock( + in_filters=input_channels, + out_filters=output_channels, + expand_ratio=2, + strides=1, + name='inverted_bottleneck2', + ) + ivb_block2(input_tensor, False) + ivb_block2.set_weights(weights) + expected = ivb_block2(input_tensor, False)[:, ::2, ::2, :] + + self.assertAllClose(ivb_block1.get_weights(), ivb_block2.get_weights(), + atol=1e-4, rtol=1e-4) + self.assertAllClose(output_tensor, expected, atol=1e-4, rtol=1e-4) + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/convolutions.py b/model/layers/convolutions.py new file mode 100644 index 0000000000000000000000000000000000000000..b24ab892c82e249d27f0ab870939756c6c78af68 --- /dev/null +++ b/model/layers/convolutions.py @@ -0,0 +1,666 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains wrapper classes for convolution layers of tf.keras and Switchable Atrous Convolution. + +Switchable Atrous Convolution (SAC) is convolution with a switchable atrous +rate. It also has optional pre- and post-global context layers. +[1] Siyuan Qiao, Liang-Chieh Chen, Alan Yuille. DetectoRS: Detecting Objects + with Recursive Feature Pyramid and Switchable Atrous Convolution. + arXiv:2006.02334 +""" +import functools +from typing import Optional + +from absl import logging +import tensorflow as tf + +from deeplab2.model import utils +from deeplab2.model.layers import activations + + +def _compute_padding_size(kernel_size, atrous_rate): + kernel_size_effective = kernel_size + (kernel_size - 1) * (atrous_rate - 1) + pad_total = kernel_size_effective - 1 + pad_begin = pad_total // 2 + pad_end = pad_total - pad_begin + if pad_begin != pad_end: + logging.warn('Convolution requires one more padding to the ' + 'bottom-right pixel. 
This may cause misalignment.') + return (pad_begin, pad_end) + + +class GlobalContext(tf.keras.layers.Layer): + """Class for the global context modules in Switchable Atrous Convolution.""" + + def build(self, input_shape): + super().build(input_shape) + input_shape = tf.TensorShape(input_shape) + input_channel = self._get_input_channel(input_shape) + self.global_average_pooling = tf.keras.layers.GlobalAveragePooling2D() + self.convolution = tf.keras.layers.Conv2D( + input_channel, 1, strides=1, padding='same', name=self.name + '_conv', + kernel_initializer='zeros', bias_initializer='zeros') + + def call(self, inputs, *args, **kwargs): + outputs = self.global_average_pooling(inputs) + outputs = tf.expand_dims(outputs, axis=1) + outputs = tf.expand_dims(outputs, axis=1) + outputs = self.convolution(outputs) + return inputs + outputs + + def _get_input_channel(self, input_shape): + # Reference: tf.keras.layers.convolutional.Conv. + if input_shape.dims[-1].value is None: + raise ValueError('The channel dimension of the inputs ' + 'should be defined. Found `None`.') + return int(input_shape[-1]) + + +class SwitchableAtrousConvolution(tf.keras.layers.Conv2D): + """Class for the Switchable Atrous Convolution.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._average_pool = tf.keras.layers.AveragePooling2D( + pool_size=(5, 5), strides=1, padding='same') + self._switch = tf.keras.layers.Conv2D( + 1, + kernel_size=1, + strides=self.strides, + padding='same', + dilation_rate=1, + name='switch', + kernel_initializer='zeros', + bias_initializer='zeros') + + def build(self, input_shape): + super().build(input_shape) + if self.padding == 'causal': + tf_padding = 'VALID' + elif isinstance(self.padding, str): + tf_padding = self.padding.upper() + else: + tf_padding = self.padding + large_dilation_rate = list(self.dilation_rate) + large_dilation_rate = [r * 3 for r in large_dilation_rate] + self._large_convolution_op = functools.partial( + tf.nn.convolution, + strides=list(self.strides), + padding=tf_padding, + dilations=large_dilation_rate, + data_format=self._tf_data_format, + name=self.__class__.__name__ + '_large') + + def call(self, inputs): + # Reference: tf.keras.layers.convolutional.Conv. + input_shape = inputs.shape + switches = self._switch(self._average_pool(inputs)) + + if self._is_causal: # Apply causal padding to inputs for Conv1D. + inputs = tf.compat.v1.pad(inputs, self._compute_causal_padding(inputs)) + + outputs = self._convolution_op(inputs, self.kernel) + outputs_large = self._large_convolution_op(inputs, self.kernel) + + outputs = switches * outputs_large + (1 - switches) * outputs + + if self.use_bias: + outputs = tf.nn.bias_add( + outputs, self.bias, data_format=self._tf_data_format) + + if not tf.executing_eagerly(): + # Infer the static output shape: + out_shape = self.compute_output_shape(input_shape) + outputs.set_shape(out_shape) + + if self.activation is not None: + return self.activation(outputs) + return outputs + + def squeeze_batch_dims(self, inp, op, inner_rank): + # Reference: tf.keras.utils.conv_utils.squeeze_batch_dims. 
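+    # (This folds all leading batch dimensions into a single one, applies
+    # `op`, and then restores the original batch shape on the result.)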
+    with tf.name_scope('squeeze_batch_dims'):
+      shape = inp.shape
+
+      inner_shape = shape[-inner_rank:]
+      if not inner_shape.is_fully_defined():
+        inner_shape = tf.compat.v1.shape(inp)[-inner_rank:]
+
+      batch_shape = shape[:-inner_rank]
+      if not batch_shape.is_fully_defined():
+        batch_shape = tf.compat.v1.shape(inp)[:-inner_rank]
+
+      if isinstance(inner_shape, tf.TensorShape):
+        inp_reshaped = tf.reshape(inp, [-1] + inner_shape.as_list())
+      else:
+        inp_reshaped = tf.reshape(
+            inp, tf.concat(([-1], inner_shape), axis=-1))
+
+      out_reshaped = op(inp_reshaped)
+
+      out_inner_shape = out_reshaped.shape[-inner_rank:]
+      if not out_inner_shape.is_fully_defined():
+        out_inner_shape = tf.compat.v1.shape(out_reshaped)[-inner_rank:]
+
+      out = tf.reshape(
+          out_reshaped, tf.concat((batch_shape, out_inner_shape), axis=-1))
+
+      out.set_shape(inp.shape[:-inner_rank] + out.shape[-inner_rank:])
+      return out
+
+
+class Conv2DSame(tf.keras.layers.Layer):
+  """A wrapper class for a 2D convolution with 'same' padding.
+
+  In contrast to tf.keras.layers.Conv2D, this layer aligns the kernel with the
+  top-left corner rather than the bottom-right corner. Optionally, a batch
+  normalization and an activation can be added on top.
+  """
+
+  def __init__(
+      self,
+      output_channels: int,
+      kernel_size: int,
+      name: str,
+      strides: int = 1,
+      atrous_rate: int = 1,
+      use_bias: bool = True,
+      use_bn: bool = False,
+      bn_layer: tf.keras.layers.Layer = tf.keras.layers.BatchNormalization,
+      bn_gamma_initializer: str = 'ones',
+      activation: Optional[str] = None,
+      use_switchable_atrous_conv: bool = False,
+      use_global_context_in_sac: bool = False,
+      conv_kernel_weight_decay: float = 0.0):
+    """Initializes convolution with zero padding aligned to the top-left corner.
+
+    DeepLab aligns zero padding differently from tf.keras 'same' padding.
+    Considering a convolution with a 7x7 kernel, a stride of 2, and an even
+    input size, tf.keras 'same' padding adds 2 pixels of zero padding to the
+    top-left and 3 pixels to the bottom-right. However, for consistent feature
+    alignment, DeepLab requires an equal padding of 3 in all directions. This
+    behavior is consistent with e.g. the ResNet 'stem' block.
+
+    Args:
+      output_channels: An integer specifying the number of filters of the
+        convolution.
+      kernel_size: An integer specifying the size of the convolution kernel.
+      name: A string specifying the name of this layer.
+      strides: An optional integer or tuple of integers specifying the size of
+        the strides (default: 1).
+      atrous_rate: An optional integer or tuple of integers specifying the
+        atrous rate of the convolution (default: 1).
+      use_bias: An optional flag specifying whether bias should be added for the
+        convolution.
+      use_bn: An optional flag specifying whether batch normalization should be
+        added after the convolution (default: False).
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      bn_gamma_initializer: An initializer for the batch norm gamma weight.
+      activation: An optional flag specifying an activation function to be added
+        after the convolution.
+      use_switchable_atrous_conv: Boolean, whether the layer uses switchable
+        atrous convolution.
+      use_global_context_in_sac: Boolean, whether the switchable atrous
+        convolution (SAC) uses pre- and post-global context.
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+
+    Raises:
+      ValueError: If both use_bias and use_bn are used in the convolution.
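+
+    As an illustration of the padding behavior (a sketch of what this layer
+    does internally for strided convolutions, not additional API),
+    _compute_padding_size(kernel_size=7, atrous_rate=1) returns (3, 3), so the
+    layer explicitly zero-pads 3 pixels on every side and then applies a
+    'valid' convolution:
+
+      pad = _compute_padding_size(7, 1)  # (3, 3)
+      x = tf.keras.layers.ZeroPadding2D(padding=(pad, pad))(x)
+      x = tf.keras.layers.Conv2D(64, 7, strides=2, padding='valid')(x)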
+ """ + super(Conv2DSame, self).__init__(name=name) + + if use_bn and use_bias: + raise ValueError('Conv2DSame is using convolution bias with batch_norm.') + + if use_global_context_in_sac: + self._pre_global_context = GlobalContext(name='pre_global_context') + + convolution_op = tf.keras.layers.Conv2D + convolution_padding = 'same' + if strides == 1 or strides == (1, 1): + if use_switchable_atrous_conv: + convolution_op = SwitchableAtrousConvolution + else: + padding = _compute_padding_size(kernel_size, atrous_rate) + self._zeropad = tf.keras.layers.ZeroPadding2D( + padding=(padding, padding), name='zeropad') + convolution_padding = 'valid' + self._conv = convolution_op( + output_channels, + kernel_size, + strides=strides, + padding=convolution_padding, + use_bias=use_bias, + dilation_rate=atrous_rate, + name='conv', + kernel_initializer='he_normal', + kernel_regularizer=tf.keras.regularizers.l2( + conv_kernel_weight_decay)) + + if use_global_context_in_sac: + self._post_global_context = GlobalContext(name='post_global_context') + + if use_bn: + self._batch_norm = bn_layer(axis=3, name='batch_norm', + gamma_initializer=bn_gamma_initializer) + + self._activation_fn = None + if activation is not None: + self._activation_fn = activations.get_activation(activation) + + self._use_global_context_in_sac = use_global_context_in_sac + self._strides = strides + self._use_bn = use_bn + + def call(self, input_tensor, training=False): + """Performs a forward pass. + + Args: + input_tensor: An input tensor of type tf.Tensor with shape [batch, height, + width, channels]. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + The output tensor. + """ + x = input_tensor + if self._use_global_context_in_sac: + x = self._pre_global_context(x) + + if not (self._strides == 1 or self._strides == (1, 1)): + x = self._zeropad(x) + x = self._conv(x) + + if self._use_global_context_in_sac: + x = self._post_global_context(x) + + if self._use_bn: + x = self._batch_norm(x, training=training) + + if self._activation_fn is not None: + x = self._activation_fn(x) + return x + + +class DepthwiseConv2DSame(tf.keras.layers.Layer): + """A wrapper class for a 2D depthwise convolution. + + In contrast to convolutions in tf.keras.layers.DepthwiseConv2D, this layers + aligns the kernel with the top-left corner rather than the bottom-right + corner. Optionally, a batch normalization and an activation can be added. + """ + + def __init__(self, + kernel_size: int, + name: str, + strides: int = 1, + atrous_rate: int = 1, + use_bias: bool = True, + use_bn: bool = False, + bn_layer=tf.keras.layers.BatchNormalization, + activation: Optional[str] = None): + """Initializes a 2D depthwise convolution. + + Args: + kernel_size: An integer specifying the size of the convolution kernel. + name: A string specifying the name of this layer. + strides: An optional integer or tuple of integers specifying the size of + the strides (default: 1). + atrous_rate: An optional integer or tuple of integers specifying the + atrous rate of the convolution (default: 1). + use_bias: An optional flag specifying whether bias should be added for the + convolution. + use_bn: An optional flag specifying whether batch normalization should be + added after the convolution (default: False). + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). 
+      activation: An optional flag specifying an activation function to be added
+        after the convolution.
+
+    Raises:
+      ValueError: If both use_bias and use_bn are used in the convolution.
+    """
+    super(DepthwiseConv2DSame, self).__init__(name=name)
+
+    if use_bn and use_bias:
+      raise ValueError(
+          'DepthwiseConv2DSame is using convolution bias with batch_norm.')
+
+    if strides == 1 or strides == (1, 1):
+      convolution_padding = 'same'
+    else:
+      padding = _compute_padding_size(kernel_size, atrous_rate)
+      self._zeropad = tf.keras.layers.ZeroPadding2D(
+          padding=(padding, padding), name='zeropad')
+      convolution_padding = 'valid'
+    self._depthwise_conv = tf.keras.layers.DepthwiseConv2D(
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=convolution_padding,
+        use_bias=use_bias,
+        dilation_rate=atrous_rate,
+        name='depthwise_conv')
+    if use_bn:
+      self._batch_norm = bn_layer(axis=3, name='batch_norm')
+
+    self._activation_fn = None
+    if activation is not None:
+      self._activation_fn = activations.get_activation(activation)
+
+    self._strides = strides
+    self._use_bn = use_bn
+
+  def call(self, input_tensor, training=False):
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: An input tensor of type tf.Tensor with shape [batch, height,
+        width, channels].
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      The output tensor.
+    """
+    x = input_tensor
+    if not (self._strides == 1 or self._strides == (1, 1)):
+      x = self._zeropad(x)
+    x = self._depthwise_conv(x)
+    if self._use_bn:
+      x = self._batch_norm(x, training=training)
+    if self._activation_fn is not None:
+      x = self._activation_fn(x)
+    return x
+
+
+class SeparableConv2DSame(tf.keras.layers.Layer):
+  """A wrapper class for a 2D separable convolution.
+
+  In contrast to convolutions in tf.keras.layers.SeparableConv2D, this layer
+  aligns the kernel with the top-left corner rather than the bottom-right
+  corner. Optionally, a batch normalization and an activation can be added.
+  """
+
+  def __init__(
+      self,
+      output_channels: int,
+      kernel_size: int,
+      name: str,
+      strides: int = 1,
+      atrous_rate: int = 1,
+      use_bias: bool = True,
+      use_bn: bool = False,
+      bn_layer: tf.keras.layers.Layer = tf.keras.layers.BatchNormalization,
+      activation: Optional[str] = None):
+    """Initializes a 2D separable convolution.
+
+    Args:
+      output_channels: An integer specifying the number of filters of the
+        convolution output.
+      kernel_size: An integer specifying the size of the convolution kernel.
+      name: A string specifying the name of this layer.
+      strides: An optional integer or tuple of integers specifying the size of
+        the strides (default: 1).
+      atrous_rate: An optional integer or tuple of integers specifying the
+        atrous rate of the convolution (default: 1).
+      use_bias: An optional flag specifying whether bias should be added for the
+        convolution.
+      use_bn: An optional flag specifying whether batch normalization should be
+        added after the convolution (default: False).
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      activation: An optional flag specifying an activation function to be added
+        after the convolution.
+
+    Raises:
+      ValueError: If both use_bias and use_bn are used in the convolution.
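+
+    (As a design note: the layer below simply composes a DepthwiseConv2DSame
+    with a 1x1 pointwise Conv2DSame, so the optional batch norm and activation
+    are applied after both the depthwise and the pointwise convolutions.)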
+ """ + super(SeparableConv2DSame, self).__init__(name=name) + if use_bn and use_bias: + raise ValueError( + 'SeparableConv2DSame is using convolution bias with batch_norm.') + + self._depthwise = DepthwiseConv2DSame( + kernel_size=kernel_size, + name='depthwise', + strides=strides, + atrous_rate=atrous_rate, + use_bias=use_bias, + use_bn=use_bn, + bn_layer=bn_layer, + activation=activation) + self._pointwise = Conv2DSame( + output_channels=output_channels, + kernel_size=1, + name='pointwise', + strides=1, + atrous_rate=1, + use_bias=use_bias, + use_bn=use_bn, + bn_layer=bn_layer, + activation=activation) + + def call(self, input_tensor, training=False): + """Performs a forward pass. + + Args: + input_tensor: An input tensor of type tf.Tensor with shape [batch, height, + width, channels]. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + The output tensor. + """ + x = self._depthwise(input_tensor, training=training) + return self._pointwise(x, training=training) + + +class StackedConv2DSame(tf.keras.layers.Layer): + """Stacked Conv2DSame or SeparableConv2DSame. + + This class sequentially stacks a given number of Conv2DSame layers or + SeparableConv2DSame layers. + """ + + def __init__( + self, + num_layers: int, + conv_type: str, + output_channels: int, + kernel_size: int, + name: str, + strides: int = 1, + atrous_rate: int = 1, + use_bias: bool = True, + use_bn: bool = False, + bn_layer: tf.keras.layers.Layer = tf.keras.layers.BatchNormalization, + activation: Optional[str] = None): + """Initializes a stack of convolutions. + + Args: + num_layers: The number of convolutions to create. + conv_type: A string specifying the convolution type used in each block. + Must be one of 'standard_conv' or 'depthwise_separable_conv'. + output_channels: An integer specifying the number of filters of the + convolution output. + kernel_size: An integer specifying the size of the convolution kernel. + name: A string specifying the name of this layer. + strides: An optional integer or tuple of integers specifying the size of + the strides (default: 1). + atrous_rate: An optional integer or tuple of integers specifying the + atrous rate of the convolution (default: 1). + use_bias: An optional flag specifying whether bias should be added for the + convolution. + use_bn: An optional flag specifying whether batch normalization should be + added after the convolution (default: False). + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). + activation: An optional flag specifying an activation function to be added + after the convolution. + + Raises: + ValueError: An error occurs when conv_type is neither 'standard_conv' + nor 'depthwise_separable_conv'. + """ + super(StackedConv2DSame, self).__init__(name=name) + if conv_type == 'standard_conv': + convolution_op = Conv2DSame + elif conv_type == 'depthwise_separable_conv': + convolution_op = SeparableConv2DSame + else: + raise ValueError('Convolution %s not supported.' 
% conv_type)
+
+    for index in range(num_layers):
+      current_name = utils.get_conv_bn_act_current_name(index, use_bn,
+                                                        activation)
+      utils.safe_setattr(self, current_name, convolution_op(
+          output_channels=output_channels,
+          kernel_size=kernel_size,
+          name=utils.get_layer_name(current_name),
+          strides=strides,
+          atrous_rate=atrous_rate,
+          use_bias=use_bias,
+          use_bn=use_bn,
+          bn_layer=bn_layer,
+          activation=activation))
+    self._num_layers = num_layers
+    self._use_bn = use_bn
+    self._activation = activation
+
+  def call(self, input_tensor, training=False):
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: An input tensor of type tf.Tensor with shape [batch, height,
+        width, channels].
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      The output tensor.
+    """
+    x = input_tensor
+    for index in range(self._num_layers):
+      current_name = utils.get_conv_bn_act_current_name(index, self._use_bn,
+                                                        self._activation)
+      x = getattr(self, current_name)(x, training=training)
+    return x
+
+
+class Conv1D(tf.keras.layers.Layer):
+  """A wrapper class for a 1D convolution with batch norm and activation.
+
+  Conv1D creates a convolution kernel that is convolved with the layer input
+  over a single spatial (or temporal) dimension to produce a tensor of outputs.
+  The input should always be 3D with shape [batch, length, channel], so
+  accordingly, the optional batch norm is done on axis=2.
+
+  In DeepLab, we use Conv1D only with kernel_size = 1 for dual path transformer
+  layers in MaX-DeepLab [1] architectures.
+
+  Reference:
+    [1] MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers,
+        CVPR 2021.
+          Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
+  """
+
+  def __init__(
+      self,
+      output_channels: int,
+      name: str,
+      use_bias: bool = True,
+      use_bn: bool = False,
+      bn_layer: tf.keras.layers.Layer = tf.keras.layers.BatchNormalization,
+      bn_gamma_initializer: str = 'ones',
+      activation: Optional[str] = None,
+      conv_kernel_weight_decay: float = 0.0,
+      kernel_initializer='he_normal',
+      kernel_size: int = 1,
+      padding: str = 'valid'):
+    """Initializes a Conv1D.
+
+    Args:
+      output_channels: An integer specifying the number of filters of the
+        convolution.
+      name: A string specifying the name of this layer.
+      use_bias: An optional flag specifying whether bias should be added for the
+        convolution.
+      use_bn: An optional flag specifying whether batch normalization should be
+        added after the convolution (default: False).
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      bn_gamma_initializer: An initializer for the batch norm gamma weight.
+      activation: An optional flag specifying an activation function to be added
+        after the convolution.
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+      kernel_initializer: An initializer for the convolution kernel.
+      kernel_size: An integer specifying the size of the convolution kernel.
+      padding: An optional string specifying the padding to use. Must be either
+        'same' or 'valid' (default: 'valid').
+
+    Raises:
+      ValueError: If both use_bias and use_bn are used in the convolution.
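+
+    A minimal usage sketch (mirroring test_conv1d_shape in
+    convolutions_test.py): with the default kernel_size=1, Conv1D acts as a
+    per-position fully connected layer over a [batch, length, channels] input:
+
+      conv = Conv1D(output_channels=64, name='conv', use_bias=False,
+                    use_bn=True)
+      output = conv(tf.random.uniform(shape=(2, 180, 3)))  # [2, 180, 64]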
+ """ + super(Conv1D, self).__init__(name=name) + + if use_bn and use_bias: + raise ValueError('Conv1D is using convlution bias with batch_norm.') + + self._conv = tf.keras.layers.Conv1D( + output_channels, + kernel_size=kernel_size, + strides=1, + padding=padding, + use_bias=use_bias, + name='conv', + kernel_initializer=kernel_initializer, + kernel_regularizer=tf.keras.regularizers.l2( + conv_kernel_weight_decay)) + + self._batch_norm = None + if use_bn: + # Batch norm uses axis=2 because the input is 3D with channel being the + # last dimension. + self._batch_norm = bn_layer(axis=2, name='batch_norm', + gamma_initializer=bn_gamma_initializer) + + self._activation_fn = None + if activation is not None: + self._activation_fn = activations.get_activation(activation) + + def call(self, input_tensor, training=False): + """Performs a forward pass. + + Args: + input_tensor: An input tensor of type tf.Tensor with shape [batch, length, + channels]. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + The output tensor. + """ + x = self._conv(input_tensor) + if self._batch_norm is not None: + x = self._batch_norm(x, training=training) + if self._activation_fn is not None: + x = self._activation_fn(x) + return x diff --git a/model/layers/convolutions_test.py b/model/layers/convolutions_test.py new file mode 100644 index 0000000000000000000000000000000000000000..676135cba31b82a582ae8f04c424e55b839dbcff --- /dev/null +++ b/model/layers/convolutions_test.py @@ -0,0 +1,290 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for convolutions.""" + +import numpy as np +import tensorflow as tf + +from deeplab2.model.layers import convolutions +from deeplab2.utils import test_utils + + +class ConvolutionsTest(tf.test.TestCase): + + def test_conv2dsame_logging(self): + with self.assertLogs(level='WARN'): + _ = convolutions.Conv2DSame( + output_channels=1, + kernel_size=8, + strides=2, + name='conv', + use_bn=False, + activation=None) + + def test_conv2dsame_conv(self): + conv = convolutions.Conv2DSame( + output_channels=1, + kernel_size=1, + name='conv', + use_bn=False, + activation=None) + input_tensor = tf.random.uniform(shape=(2, 180, 180, 5)) + + predicted_tensor = conv(input_tensor) + expected_tensor = np.dot(input_tensor.numpy(), + conv._conv.get_weights()[0])[..., 0, 0] + + # Compare only up to 5 decimal digits to account for numerical accuracy. 
+ np.testing.assert_almost_equal( + predicted_tensor.numpy(), expected_tensor, decimal=5) + + def test_conv2dsame_relu(self): + conv = convolutions.Conv2DSame( + output_channels=1, + kernel_size=1, + name='conv', + activation='relu', + use_bn=False) + input_tensor = tf.random.uniform(shape=(2, 180, 180, 5)) + + predicted_tensor = conv(input_tensor) + expected_tensor = np.dot(input_tensor.numpy(), + conv._conv.get_weights()[0])[..., 0, 0] + expected_tensor[expected_tensor < 0.0] = 0.0 + + # Compare only up to 5 decimal digits to account for numerical accuracy. + np.testing.assert_almost_equal( + predicted_tensor.numpy(), expected_tensor, decimal=5) + + def test_conv2dsame_relu6(self): + conv = convolutions.Conv2DSame( + output_channels=1, + kernel_size=1, + name='conv', + activation='relu6', + use_bn=False) + input_tensor = tf.random.uniform(shape=(2, 180, 180, 5)) * 10. + + predicted_tensor = conv(input_tensor) + expected_tensor = np.dot(input_tensor.numpy(), + conv._conv.get_weights()[0])[..., 0, 0] + expected_tensor[expected_tensor < 0.0] = 0.0 + expected_tensor[expected_tensor > 6.0] = 6.0 + + # Compare only up to 5 decimal digits to account for numerical accuracy. + np.testing.assert_almost_equal( + predicted_tensor.numpy(), expected_tensor, decimal=5) + + def test_conv2dsame_shape(self): + conv = convolutions.Conv2DSame( + output_channels=64, + kernel_size=7, + strides=2, + name='conv', + use_bias=False, + use_bn=True) + input_tensor = tf.random.uniform(shape=(2, 180, 180, 3)) + + predicted_tensor = conv(input_tensor) + expected_shape = [2, 90, 90, 64] + + self.assertListEqual(predicted_tensor.shape.as_list(), expected_shape) + + @test_utils.test_all_strategies + def test_conv2d_sync_bn(self, strategy): + input_tensor = tf.random.uniform(shape=(2, 180, 180, 3)) + + for bn_layer in test_utils.NORMALIZATION_LAYERS: + with strategy.scope(): + conv = convolutions.Conv2DSame( + output_channels=64, + kernel_size=7, + strides=2, + name='conv', + use_bias=False, + use_bn=True, + bn_layer=bn_layer) + conv(input_tensor) + + def test_depthwise_conv(self): + conv = convolutions.DepthwiseConv2DSame( + kernel_size=1, use_bn=False, use_bias=True, activation=None, + name='conv') + input_tensor = tf.random.uniform(shape=(2, 180, 180, 5)) + + predicted_tensor = conv(input_tensor) + expected_tensor = ( + input_tensor.numpy() * conv._depthwise_conv.get_weights()[0][..., 0]) + + np.testing.assert_equal(predicted_tensor.numpy(), expected_tensor) + + def test_depthwise_relu(self): + conv = convolutions.DepthwiseConv2DSame( + kernel_size=1, use_bn=False, activation='relu', name='conv') + input_tensor = tf.random.uniform(shape=(2, 180, 180, 5)) + + predicted_tensor = conv(input_tensor) + expected_tensor = ( + input_tensor.numpy() * conv._depthwise_conv.get_weights()[0][..., 0]) + expected_tensor[expected_tensor < 0.0] = 0.0 + + np.testing.assert_equal(predicted_tensor.numpy(), expected_tensor) + + def test_depthwise_shape(self): + conv = convolutions.DepthwiseConv2DSame( + kernel_size=3, use_bn=True, use_bias=False, activation='relu', + name='conv') + input_shape = [2, 180, 180, 5] + input_tensor = tf.random.uniform(shape=input_shape) + + predicted_tensor = conv(input_tensor) + expected_shape = input_shape + + self.assertListEqual(predicted_tensor.shape.as_list(), expected_shape) + + def test_depthwise_shape_with_stride2(self): + conv = convolutions.DepthwiseConv2DSame( + kernel_size=3, use_bn=True, use_bias=False, activation='relu', + strides=2, name='conv') + input_shape = [2, 181, 181, 5] + input_tensor = 
tf.random.uniform(shape=input_shape) + + predicted_tensor = conv(input_tensor) + expected_shape = [2, 91, 91, 5] + + self.assertListEqual(predicted_tensor.shape.as_list(), expected_shape) + + @test_utils.test_all_strategies + def test_depthwise_sync_bn(self, strategy): + input_tensor = tf.random.uniform(shape=(2, 180, 180, 3)) + + for bn_layer in test_utils.NORMALIZATION_LAYERS: + with strategy.scope(): + conv = convolutions.DepthwiseConv2DSame( + kernel_size=7, + name='conv', + use_bn=True, + use_bias=False, + bn_layer=bn_layer, + activation='relu') + _ = conv(input_tensor) + + def test_global_context(self): + input_tensor = tf.random.uniform(shape=(2, 180, 180, 3)) + global_context = convolutions.GlobalContext(name='global_context') + output_tensor = global_context(input_tensor) + # global_context is supposed to not change any values before training. + np.testing.assert_array_almost_equal(input_tensor.numpy(), + output_tensor.numpy()) + + def test_switchable_atrous_conv_class(self): + # Tests Switchable Atrous Convolution by equations. + input_tensor = tf.random.uniform(shape=(3, 180, 180, 32)) + sac_layer = convolutions.SwitchableAtrousConvolution( + 64, + kernel_size=3, + padding='same', + name='sac_conv') + switch_conv = sac_layer._switch + _ = switch_conv(input_tensor) + switch_conv.kernel = tf.random.uniform( + switch_conv.kernel.shape, + minval=-1, + maxval=1, + dtype=switch_conv.kernel.dtype) + switch_conv.bias = tf.random.uniform( + switch_conv.bias.shape, + minval=-1, + maxval=1, + dtype=switch_conv.bias.dtype) + small_conv = tf.keras.layers.Conv2D( + 64, + kernel_size=3, + padding='same', + dilation_rate=1, + name='small_conv') + large_conv = tf.keras.layers.Conv2D( + 64, + kernel_size=3, + padding='same', + dilation_rate=3, + name='large_conv') + _ = small_conv(input_tensor) + _ = large_conv(input_tensor) + outputs = sac_layer(input_tensor) + small_conv.kernel = sac_layer.kernel + large_conv.kernel = sac_layer.kernel + # Compute the expected outputs. + switch_outputs = sac_layer._switch(sac_layer._average_pool(input_tensor)) + large_outputs = large_conv(input_tensor) + small_outputs = small_conv(input_tensor) + expected_outputs = (switch_outputs * large_outputs + + (1 - switch_outputs) * small_outputs) + np.testing.assert_array_almost_equal(expected_outputs.numpy(), + outputs.numpy()) + + def test_switchable_atrous_conv_in_conv2dsame(self): + # Tests Switchable Atrous Convolution in Conv2DSame. 
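+    # Note: Conv2DSame only routes to SwitchableAtrousConvolution when
+    # strides == 1 (see its __init__); strided convolutions fall back to the
+    # explicitly zero-padded standard convolution.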
+ input_tensor = tf.random.uniform(shape=(3, 180, 180, 32)) + layer = convolutions.Conv2DSame( + output_channels=64, + kernel_size=7, + strides=1, + name='conv', + use_bias=False, + use_bn=True, + use_switchable_atrous_conv=True, + use_global_context_in_sac=True) + output_tensor = layer(input_tensor) + np.testing.assert_array_almost_equal(output_tensor.shape.as_list(), + [3, 180, 180, 64]) + + def test_conv1d_shape(self): + conv = convolutions.Conv1D( + output_channels=64, + name='conv', + use_bias=False, + use_bn=True) + input_tensor = tf.random.uniform(shape=(2, 180, 3)) + predicted_tensor = conv(input_tensor) + expected_shape = [2, 180, 64] + self.assertListEqual(predicted_tensor.shape.as_list(), expected_shape) + + def test_separable_conv2d_same_output_shape(self): + conv = convolutions.SeparableConv2DSame( + output_channels=64, + kernel_size=3, + name='conv') + input_tensor = tf.random.uniform(shape=(2, 5, 5, 3)) + predicted_tensor = conv(input_tensor) + expected_shape = [2, 5, 5, 64] + self.assertListEqual(predicted_tensor.shape.as_list(), expected_shape) + + def test_stacked_conv2d_same_output_shape(self): + conv = convolutions.StackedConv2DSame( + num_layers=2, + conv_type='depthwise_separable_conv', + output_channels=64, + kernel_size=3, + name='conv') + input_tensor = tf.random.uniform(shape=(2, 5, 5, 3)) + predicted_tensor = conv(input_tensor) + expected_shape = [2, 5, 5, 64] + self.assertListEqual(predicted_tensor.shape.as_list(), expected_shape) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/drop_path.py b/model/layers/drop_path.py new file mode 100644 index 0000000000000000000000000000000000000000..b07f820b52451245725a664b52ea294fa77ebe69 --- /dev/null +++ b/model/layers/drop_path.py @@ -0,0 +1,133 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Drop path operation. + +This scripts implements the drop path operation, proposed in +Gao Huang, Yu Sun, Zhuang Liu, Daniel Sedra, Kilian Weinberger, +Deep Networks with Stochastic Depth. In ECCV, 2016. +""" +import tensorflow as tf + + +def get_drop_path_keep_prob(keep_prob_for_last_stage, schedule, + current_stage, num_stages): + """Gets drop path keep probability for current stage. + + Args: + keep_prob_for_last_stage: A float, the drop path keep probability for + last stage. This flag is used in conjunction with the flag `schedule`, as + they together determine drop path keep probability for the other stages. + schedule: A string, the drop path schedule. Currently, we support + 'constant': use the same drop path keep probability for all stages, and + 'linear': linearly decrease the drop path keep probability from 1.0 at + 0-th stage (or STEM) to `keep_prob_for_last_stage` at last stage. + current_stage: An integer, current stage number. + num_stages: An integer, the number of stages. + + Returns: + The drop path keep probability for the current stage. + + Raises: + ValueError: If schedule is not supported. 
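+
+  For example (matching test_linear_drop_path_schedule in drop_path_test.py),
+  with keep_prob_for_last_stage=0.8, schedule='linear', current_stage=1, and
+  num_stages=4, the returned keep probability is
+  1.0 - (1.0 - 0.8) * 1 / 4 = 0.95.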
+ """ + if schedule == 'constant': + return keep_prob_for_last_stage + elif schedule == 'linear': + return 1.0 - (1.0 - keep_prob_for_last_stage) * current_stage / num_stages + else: + raise ValueError('Unexpected schedule %s.' % schedule) + + +def generate_drop_path_random_mask(input_tensor, drop_path_keep_prob): + """Generates a random mask for drop path. + + This function generates a random mask for training models with drop path. Each + scalar in the output indicates whether the block or path will be kept. The + scalars are scaled with (1.0 / drop_path_keep_prob) so that the output will + have the same expectation no mather what the drop_path_keep_prob is. + + Reference: + "Deep Networks with Stochastic Depth" https://arxiv.org/pdf/1603.09382.pdf + + Args: + input_tensor: An input [batch_size, n_1, n_2, ..., n_k] tensor. + drop_path_keep_prob: A float, the keep probability for dropping path. + + Returns: + binary_tensor: A [batch_size, 1, 1, ..., 1] tensor with the same dtype as + the input_tensor. + """ + binary_tensor = None + if drop_path_keep_prob < 1.0: + input_shape = input_tensor.get_shape().as_list() + random_tensor_shape = [input_shape[0]] + [1] * (len(input_shape) - 1) + random_tensor = drop_path_keep_prob + random_tensor += tf.random.uniform( + random_tensor_shape, dtype=input_tensor.dtype) + binary_tensor = tf.math.divide(tf.floor(random_tensor), drop_path_keep_prob) + return binary_tensor + + +class DropPath(tf.keras.layers.Layer): + """Drop path layer. + + For details, please see the original paper listed below. + Gao Huang, Yu Sun, Zhuang Liu, Daniel Sedra, Kilian Weinberger, + Deep Networks with Stochastic Depth. In ECCV, 2016. + """ + + def __init__(self, drop_path_keep_prob=1.0, name=None): + """Initializes a drop path layer. + + Args: + drop_path_keep_prob: A float, the keep probability for dropping path. + name: An optional string specifying the operation name. + + Rasies: + ValueError: If drop_path_keep_prob is <= 0 or > 1. + """ + super(DropPath, self).__init__(name=name) + self._drop_path_keep_prob = drop_path_keep_prob + if self._drop_path_keep_prob <= 0 or self._drop_path_keep_prob > 1.0: + raise ValueError('drop_path_keep_prob not valid. Got %f.' % + self._drop_path_keep_prob) + + def call(self, input_tensor, training=False): + """Performs a forward pass. + + Args: + input_tensor: An input tensor of type tf.Tensor with shape [batch, height, + width, channels]. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + The output tensor. + """ + if self._drop_path_keep_prob == 1.0 or not training: + return input_tensor + drop_path_random_mask = generate_drop_path_random_mask( + input_tensor, self._drop_path_keep_prob) + if drop_path_random_mask is not None: + input_tensor = input_tensor * drop_path_random_mask + return input_tensor + + def get_config(self): + config = { + 'drop_path_keep_prob': self._drop_path_keep_prob, + } + base_config = super(DropPath, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/model/layers/drop_path_test.py b/model/layers/drop_path_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7d02f5fa9d2de935cdeb043bfbad81441e0b1b6f --- /dev/null +++ b/model/layers/drop_path_test.py @@ -0,0 +1,76 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test for drop_path.py.""" +import numpy as np +import tensorflow as tf + +from deeplab2.model.layers import drop_path + +# Set a fixed random seed. +tf.random.set_seed(1) + + +class DropPathTest(tf.test.TestCase): + + def test_drop_path_keep_prob_one(self): + # Test drop_path_keep_prob = 1, where output should be equal to input. + drop_path_keep_prob = 1.0 + input_tensor = tf.random.uniform(shape=(3, 65, 65, 32)) + layer_op = drop_path.DropPath(drop_path_keep_prob) + output_tensor = layer_op(input_tensor, training=True) + np.testing.assert_equal(input_tensor.numpy(), output_tensor.numpy()) + + def test_not_training_mode(self): + # Test not training mode, where output should be equal to input. + drop_path_keep_prob = 0.8 + input_tensor = tf.random.uniform(shape=(3, 65, 65, 32)) + layer_op = drop_path.DropPath(drop_path_keep_prob) + output_tensor = layer_op(input_tensor, training=False) + np.testing.assert_equal(input_tensor.numpy(), output_tensor.numpy()) + + def test_drop_path(self): + drop_path_keep_prob = 0.8 + input_tensor = tf.random.uniform(shape=(3, 65, 65, 32)) + layer_op = drop_path.DropPath(drop_path_keep_prob) + output_tensor = layer_op(input_tensor, training=True) + self.assertFalse(np.array_equal(input_tensor.numpy(), + output_tensor.numpy())) + + def test_constant_drop_path_schedule(self): + keep_prob_for_last_stage = 0.8 + current_stage_keep_prob = drop_path.get_drop_path_keep_prob( + keep_prob_for_last_stage, + schedule='constant', + current_stage=2, + num_stages=5) + self.assertEqual(current_stage_keep_prob, keep_prob_for_last_stage) + + def test_linear_drop_path_schedule(self): + keep_prob_for_last_stage = 0.8 + current_stage_keep_prob = drop_path.get_drop_path_keep_prob( + keep_prob_for_last_stage, + schedule='linear', + current_stage=1, + num_stages=4) + self.assertEqual(current_stage_keep_prob, 0.95) + + def test_unknown_drop_path_schedule(self): + with self.assertRaises(ValueError): + _ = drop_path.get_drop_path_keep_prob(0.8, 'unknown', 1, 4) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/dual_path_transformer.py b/model/layers/dual_path_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..806db522ac2ece7304d7d4fb481d85274614e580 --- /dev/null +++ b/model/layers/dual_path_transformer.py @@ -0,0 +1,488 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implements dual path transformer layers proposed in MaX-DeepLab [1]. 
+ +Dual-path transformer introduces a global memory path in addition to a CNN path, +allowing bi-directional communication with any CNN layers. + +[1] MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers, + CVPR 2021. + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. +""" + +import tensorflow as tf + +from deeplab2.model import utils +from deeplab2.model.layers import activations +from deeplab2.model.layers import convolutions + + +class AttentionOperation(tf.keras.layers.Layer): + """Computes standard 1D multi-head attention with query, key, and value.""" + + def __init__(self, + name, + activation, + transformer_activation, + bn_layer=tf.keras.layers.BatchNormalization): + """Initializes an AttentionOperation layer. + + Args: + name: A string, the name of this layer. + activation: A string, type of activation function to apply. + transformer_activation: A string, type of activation function for + self-attention. Support 'sigmoid' and 'softmax'. + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). + """ + super(AttentionOperation, self).__init__(name=name) + # batch_norm_similarity has shape [batch, num_heads, num_query, num_key], + # where num_query and num_key usually equals to height or width or length, + # i.e., spatial dimensions, so batch norm is applied to axis=1 only. + self._batch_norm_similarity = bn_layer(axis=1, name='batch_norm_similarity') + # batch_norm_retrieved_value is done on shape [batch, num_heads, length, + # value_channels], which will be reshaped to the output shape [batch, + # length, value_channels * num_heads], so we apply batch norm on the + # effective channel dimension -- value_channels * num_heads. + self._batch_norm_retrieved_value = bn_layer( + axis=[1, 3], name='batch_norm_retrieved_value') + self._activation_fn = activations.get_activation(activation) + self._transformer_activation_fn = activations.get_activation( + transformer_activation) + + def call(self, inputs, training=False): + """Performs an AttentionOperation. + + Args: + inputs: A tuple of (query, key, value), where query is [batch, num_head, + query_length, channels] tensor, key is a [batch, num_head, key_length, + channels] tensor, and value is a [batch, key_length, num_head, + value_channels] tensor. + training: A boolean, whether the model is in training mode. + + Returns: + output: A [batch, query_length, num_head * value_channels] tensor, the + retrieved value. + """ + # Decode query, key, and value from inputs. + query, key, value = inputs + # Compute attention similarity. + similarity_logits = tf.einsum('bhld,bhmd->bhlm', query, key) + similarity_logits = self._batch_norm_similarity( + similarity_logits, training=training) + # Apply a transformer attention activation function, e.g. softmax. + attention_weights = self._transformer_activation_fn(similarity_logits) + # Retrieve the value content. + retrieved_value = tf.einsum( + 'bhlm,bmhd->bhld', attention_weights, value) + retrieved_value = self._batch_norm_retrieved_value( + retrieved_value, training=training) + retrieved_value = self._activation_fn(retrieved_value) + # Reshape the output. + return utils.transpose_and_reshape_for_attention_operation( + retrieved_value) + + +class DualPathTransformerLayer(tf.keras.layers.Layer): + """Applies a dual path transformer layer, as proposed in MaX-DeepLab [1]. 
+ + Dual-path transformer layer takes a pixel space input and a memory space + input, and performs memory2pixel attention, pixel2memory attention, and + memory2memory self-attention. Note that the pixel2pixel self-attention or + convolution in the pixel space is implemented in axial_layers.py and + axial_blocks.py. Thus, the pixel2pixel operation is not included in this + DualPathTransformerLayer implementation. Please use this class together with + a residual block with axial-attention, global-attention, or convolution in + order to construct the full dual path transformer in the paper. + + [1] MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers, + CVPR 2021. + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. + """ + + def __init__(self, + name='dual_path_transformer_layer', + activation='relu', + filters=128, + num_heads=8, + bottleneck_expansion=2, + key_expansion=1, + value_expansion=2, + feed_forward_network_channels=2048, + use_memory_self_attention=True, + use_pixel2memory_feedback_attention=True, + transformer_activation='softmax', + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0): + """Initializes a DualPathTransformerLayer. + + This function implements a dual path transformer layer between a pixel space + and a memory space, as described in the MaX-DeepLab paper. In this dual path + transformer, the memory2pixel cross attention and the memory self-attention + share a single activation, e.g. softmax. + + Reference: + MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. + + Args: + name: A string, the name of this dual path transformer layer. + activation: A string, type of activation function to apply. + filters: An integer, the base number of channels for the layer. + num_heads: An integer, the number of heads in multi-head attention. + bottleneck_expansion: A float, the channel expansion ratio for the + bottleneck. + key_expansion: A float, the channel expansion ratio for keys. + value_expansion: A float, the channel expansion ratio for values. + feed_forward_network_channels: An integer, the number of channels for the + feed_forward_network. Zero means no feed_forward_network will be + applied. + use_memory_self_attention: A boolean, whether to apply the memory space + self-attention. + use_pixel2memory_feedback_attention: A boolean, whether to apply the + pixel2memory feedback attention. + transformer_activation: A string, type of activation function for + self-attention. Support 'sigmoid' and 'softmax'. + bn_layer: A tf.keras.layers.Layer that computes the normalization + (default: tf.keras.layers.BatchNormalization). + conv_kernel_weight_decay: A float, the weight decay for convolution + kernels. + + Raises: + ValueError: If filters * key_expansion is not divisible by num_heads. + ValueError: If filters * value_expansion is not divisible by num_heads. 
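+
+    A minimal usage sketch (mirroring dual_path_transformer_test.py): with the
+    default arguments, the layer consumes a pixel-space tensor, a memory-space
+    tensor, and a float training flag, and returns three tensors:
+
+      layer = DualPathTransformerLayer()
+      pixel, activated_pixel, memory = layer(
+          (tf.zeros([2, 4225, 126]), tf.zeros([2, 127, 128]),
+           tf.constant(0.0, dtype=tf.float32)))
+      # pixel and activated_pixel: [2, 4225, 126]; memory: [2, 127, 128].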
+ """ + super(DualPathTransformerLayer, self).__init__(name=name) + + bottleneck_channels = int(round(filters * bottleneck_expansion)) + total_key_depth = int(round(filters * key_expansion)) + total_value_depth = int(round(filters * value_expansion)) + + if total_key_depth % num_heads: + raise ValueError('Total_key_depth should be divisible by num_heads.') + + if total_value_depth % num_heads: + raise ValueError('Total_value_depth should be divisible by num_heads.') + + # Compute query key value with one convolution and a batch norm layer. The + # initialization std is standard transformer initialization (without batch + # norm), as used in SASA and ViT. In our case, we use batch norm by default, + # so it does not require careful tuning. If one wants to remove all batch + # norms in axial attention, this standard initialization should still be + # good, but a more careful initialization is encouraged. + initialization_std = bottleneck_channels ** -0.5 + + self._memory_conv1_bn_act = convolutions.Conv1D( + bottleneck_channels, 'memory_conv1_bn_act', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation=activation, + conv_kernel_weight_decay=conv_kernel_weight_decay) + + self._pixel_conv1_bn_act = convolutions.Conv1D( + bottleneck_channels, 'pixel_conv1_bn_act', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation=activation, + conv_kernel_weight_decay=conv_kernel_weight_decay) + + # We always compute the query for memory space, since it gathers information + # from the pixel space and thus cannot be removed. We compute the key and + # value for memory space only when they are necessary (i.e. either + # use_memory_self_attention or use_pixel2memory_feedback_attention). + if use_memory_self_attention or use_pixel2memory_feedback_attention: + self._memory_qkv_conv_bn = convolutions.Conv1D( + total_key_depth * 2 + total_value_depth, 'memory_qkv_conv_bn', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='none', + conv_kernel_weight_decay=conv_kernel_weight_decay, + kernel_initializer=tf.keras.initializers.TruncatedNormal( + stddev=initialization_std)) + else: + # Compute memory query only if memory key and value are not used. + self._memory_query_conv_bn = convolutions.Conv1D( + total_key_depth, 'memory_query_conv_bn', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='none', + conv_kernel_weight_decay=conv_kernel_weight_decay, + kernel_initializer=tf.keras.initializers.TruncatedNormal( + stddev=initialization_std)) + + # For the pixel space, we always compute the key and value, since they + # provide information for the memory space and thus cannot be removed. We + # compute the query for pixel space only when it is necessary (i.e. + # use_pixel2memory_feedback_attention is True). 
+ if use_pixel2memory_feedback_attention: + self._pixel_qkv_conv_bn = convolutions.Conv1D( + total_key_depth * 2 + total_value_depth, 'pixel_qkv_conv_bn', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='none', + conv_kernel_weight_decay=conv_kernel_weight_decay, + kernel_initializer=tf.keras.initializers.TruncatedNormal( + stddev=initialization_std)) + else: + self._pixel_kv_conv_bn = convolutions.Conv1D( + total_key_depth + total_value_depth, 'pixel_kv_conv_bn', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='none', + conv_kernel_weight_decay=conv_kernel_weight_decay, + kernel_initializer=tf.keras.initializers.TruncatedNormal( + stddev=initialization_std)) + self._memory_attention = AttentionOperation( + 'memory_attention', activation, transformer_activation, + bn_layer=bn_layer) + if use_pixel2memory_feedback_attention: + self._pixel_attention = AttentionOperation( + 'pixel_attention', activation, transformer_activation, + bn_layer=bn_layer) + + self._use_memory_self_attention = use_memory_self_attention + self._use_pixel2memory_feedback_attention = ( + use_pixel2memory_feedback_attention) + self._total_key_depth = total_key_depth + self._total_value_depth = total_value_depth + self._num_heads = num_heads + self._bn_layer = bn_layer + self._conv_kernel_weight_decay = conv_kernel_weight_decay + self._activation = activation + self._activation_fn = activations.get_activation(activation) + self._feed_forward_network_channels = feed_forward_network_channels + + def build(self, input_shape_list): + pixel_shape, memory_shape = input_shape_list[:2] + # Here we follow ResNet bottleneck blocks: we apply a batch norm with gamma + # initialized at zero, followed by drop path and an activation function. + # Initializing this gamma at zero ensures that at random initialization of + # the model, the skip connections dominate all residual blocks. In this way, + # all the skip connections construct an identity mapping that passes the + # gradients (without any distortion from the randomly initialized blocks) to + # all residual blocks. This helps training at early epochs. + # Reference: "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour". + # https://arxiv.org/abs/1706.02677 + self._memory_conv3_bn = convolutions.Conv1D( + memory_shape[-1], 'memory_conv3_bn', + use_bias=False, + use_bn=True, + bn_layer=self._bn_layer, + bn_gamma_initializer='zeros', + activation='none', + conv_kernel_weight_decay=self._conv_kernel_weight_decay) + + if self._feed_forward_network_channels > 0: + self._memory_ffn_conv1_bn_act = convolutions.Conv1D( + self._feed_forward_network_channels, 'memory_ffn_conv1_bn_act', + use_bias=False, + use_bn=True, + bn_layer=self._bn_layer, + activation=self._activation, + conv_kernel_weight_decay=self._conv_kernel_weight_decay) + # Again, we follow ResNet bottleneck blocks: we apply a batch norm with + # gamma initialized at zero, followed by drop path and an activation + # function. 
+ self._memory_ffn_conv2_bn = convolutions.Conv1D( + memory_shape[-1], 'memory_ffn_conv2_bn', + use_bias=False, + use_bn=True, + bn_layer=self._bn_layer, + bn_gamma_initializer='zeros', + activation='none', + conv_kernel_weight_decay=self._conv_kernel_weight_decay) + if self._use_pixel2memory_feedback_attention: + self._pixel_conv3_bn = convolutions.Conv1D( + pixel_shape[-1], 'pixel_conv3_bn', + use_bias=False, + use_bn=True, + bn_layer=self._bn_layer, + bn_gamma_initializer='zeros', + activation='none', + conv_kernel_weight_decay=self._conv_kernel_weight_decay) + + def call(self, inputs): + """Performs a forward pass. + + We have to define drop_path_masks outside the layer call and pass it into + the layer call, because recompute_grad (gradient checkpointing) does not + allow any randomness within the function call. In addition, recompute_grad + only supports float tensors as inputs. For this reason, the training flag + should be also passed as a float tensor. For the same reason, we cannot + support passing drop_path_random_mask as None. Instead, we ask the users to + pass only the first two tensors when drop path is not used. + + Args: + inputs: A tuple of 3 or 6 tensors, containing + pixel_space_input should be a [batch, num_pixel, pixel_space_channels] + tensor. + memory_space_input should be a [batch, num_memory, + memory_space_channels] tensor. + float_tensor_training should be a float tensor of 0.0 or 1.0, whether + the model is in training mode. + (optional) pixel_space_drop_path_mask is a drop path mask tensor of + shape [batch, 1, 1] for the pixel space. + (optional) memory_space_attention_drop_path_mask is a drop path mask + tensor of shape [batch, 1, 1] for the memory space. + (optional) memory_space_feed_forward_network_drop_path_mask is a drop + path mask tensor of shape [batch, 1, 1] for the memory space feed + forward network. + + Returns: + pixel_space_output: A [batch, num_pixel, pixel_space_channels] tensor. + activated_pixel_space_output: A [batch, num_pixel, pixel_space_channels] + tensor, activated pixel_space_output. + memory_space_output: A [batch, num_memory, memory_space_channels] + tensor. + + Raises: + ValueError: If the length of inputs is not 3 or 6. + """ + if len(inputs) not in (3, 6): + raise ValueError('The length of inputs should be either 3 or 6.') + + # Unpack the inputs. + (pixel_space_input, memory_space_input, float_tensor_training, + pixel_space_drop_path_mask, memory_space_attention_drop_path_mask, + memory_space_feed_forward_network_drop_path_mask) = ( + utils.pad_sequence_with_none(inputs, target_length=6)) + + # Recompute_grad takes only float tensors as inputs. It does not allow + # bools or boolean tensors. For this reason, we cast training to a float + # tensor outside this call, and now we cast it back to a boolean tensor. + training = tf.cast(float_tensor_training, tf.bool) + + # Decode the inputs shapes. + pixel_shape = pixel_space_input.get_shape().as_list() + memory_shape = memory_space_input.get_shape().as_list() + + # Similar to the ResNet bottleneck design, we do an input down projection + # in both the pixel space and the memory space. + memory_space = self._memory_conv1_bn_act(memory_space_input, + training=training) + + # Pixel space input is not activated. 
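+    # (The residual branch carries non-activated features, so the activation
+    # is applied here, right before the pixel-space input down projection.)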
+ pixel_space = self._pixel_conv1_bn_act( + self._activation_fn(pixel_space_input), training=training) + + if (self._use_memory_self_attention or + self._use_pixel2memory_feedback_attention): + memory_space_qkv = self._memory_qkv_conv_bn(memory_space, + training=training) + # Split, reshape, and transpose the query, key, and value. + memory_query, memory_key, memory_value = ( + tf.split(memory_space_qkv, [ + self._total_key_depth, self._total_key_depth, + self._total_value_depth], axis=-1)) + memory_key = utils.reshape_and_transpose_for_attention_operation( + memory_key, self._num_heads) + memory_value = tf.reshape(memory_value, [ + -1, memory_shape[1], self._num_heads, + self._total_value_depth // self._num_heads]) + else: + # Compute memory query only if memory key and value are not used. + memory_query = self._memory_query_conv_bn(memory_space, + training=training) + # Reshape and transpose the query. + memory_query = utils.reshape_and_transpose_for_attention_operation( + memory_query, self._num_heads) + + if self._use_pixel2memory_feedback_attention: + pixel_space_qkv = self._pixel_qkv_conv_bn(pixel_space, + training=training) + # Split the query, key, and value. + pixel_query, pixel_key, pixel_value = tf.split( + pixel_space_qkv, [ + self._total_key_depth, self._total_key_depth, + self._total_value_depth], axis=-1) + pixel_query = utils.reshape_and_transpose_for_attention_operation( + pixel_query, self._num_heads) + else: + pixel_space_kv = self._pixel_kv_conv_bn(pixel_space, training=training) + # Split the key and the value. + pixel_key, pixel_value = tf.split(pixel_space_kv, [ + self._total_key_depth, self._total_value_depth], axis=-1) + # Reshape and transpose the key and the value. + pixel_key = utils.reshape_and_transpose_for_attention_operation( + pixel_key, self._num_heads) + pixel_value = tf.reshape(pixel_value, [ + -1, pixel_shape[1], self._num_heads, + self._total_value_depth // self._num_heads]) + + # Compute memory space attention. + if not self._use_memory_self_attention: + # If memory self attention is not used, then only memory2pixel cross + # attention is used for the memory space. In this case, the key and the + # value are simply pixel_key and pixel_value. + memory_attention_key = pixel_key + memory_attention_value = pixel_value + else: + # If we also use memory self attention, the key and the value are the + # concatenation of keys and values in both the pixel space and the + # memory space. + memory_attention_key = tf.concat([pixel_key, memory_key], axis=2) + memory_attention_value = tf.concat([pixel_value, memory_value], axis=1) + + memory_space = self._memory_attention( + (memory_query, memory_attention_key, memory_attention_value), + training=training) + memory_space = self._memory_conv3_bn(memory_space, training=training) + + if memory_space_attention_drop_path_mask is not None: + memory_space = memory_space * memory_space_attention_drop_path_mask + memory_space_output = self._activation_fn( + memory_space_input + memory_space) + + # Apply an optional feed-forward network to the memory space. 
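+    # The feed-forward network below is conv1d -> activation -> conv1d with a
+    # zero-initialized batch norm gamma on the second conv1d, so at
+    # initialization the residual connection dominates this block.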
+    if self._feed_forward_network_channels > 0:
+      memory_space = self._memory_ffn_conv1_bn_act(memory_space_output,
+                                                   training=training)
+      memory_space = self._memory_ffn_conv2_bn(memory_space,
+                                               training=training)
+      if memory_space_feed_forward_network_drop_path_mask is not None:
+        memory_space = (memory_space *
+                        memory_space_feed_forward_network_drop_path_mask)
+      memory_space_output = self._activation_fn(
+          memory_space_output + memory_space)
+
+    # Compute pixel space attention and the output projection only when
+    # pixel2memory_feedback_attention is used.
+    if self._use_pixel2memory_feedback_attention:
+      pixel_space = self._pixel_attention(
+          (pixel_query, memory_key, memory_value), training=training)
+      pixel_space = self._pixel_conv3_bn(pixel_space, training=training)
+      if pixel_space_drop_path_mask is not None:
+        pixel_space = pixel_space * pixel_space_drop_path_mask
+      pixel_space_output = pixel_space_input + pixel_space
+    else:
+      # If pixel2memory_feedback_attention is not used, the pixel_space_input
+      # is not changed.
+      pixel_space_output = pixel_space_input
+    activated_pixel_space_output = self._activation_fn(pixel_space_output)
+
+    # Return the pixel space output and memory space output. Note that we
+    # return the pixel space output with and without the activation function,
+    # because our decoder might use non-activated features.
+    return (pixel_space_output,
+            activated_pixel_space_output,
+            memory_space_output)
diff --git a/model/layers/dual_path_transformer_test.py b/model/layers/dual_path_transformer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b2fc42c992188af73bd2974f8198b86ecc6da93
--- /dev/null
+++ b/model/layers/dual_path_transformer_test.py
@@ -0,0 +1,67 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for dual_path_transformer."""
+
+import tensorflow as tf
+
+from deeplab2.model.layers import dual_path_transformer
+
+
+class TransformerLayersTest(tf.test.TestCase):
+
+  def test_default_attention_operation_output_shape(self):
+    layer = dual_path_transformer.AttentionOperation(
+        'attention', 'relu', 'softmax')
+    output = layer((tf.zeros([2, 8, 4225, 127]),
+                    tf.zeros([2, 8, 422, 127]),
+                    tf.zeros([2, 422, 8, 128])))
+    self.assertListEqual(output.get_shape().as_list(), [2, 4225, 1024])
+
+  def test_default_transformer_layer_output_shape(self):
+    layer = dual_path_transformer.DualPathTransformerLayer()
+    float_training_tensor = tf.constant(0.0, dtype=tf.float32)
+    output = layer((tf.zeros([2, 4225, 126]),
+                    tf.zeros([2, 127, 128]),
+                    float_training_tensor))
+    self.assertListEqual(output[0].get_shape().as_list(), [2, 4225, 126])
+    self.assertListEqual(output[1].get_shape().as_list(), [2, 4225, 126])
+    self.assertListEqual(output[2].get_shape().as_list(), [2, 127, 128])
+
+  def test_zero_feed_forward_network_output_shape(self):
+    layer = dual_path_transformer.DualPathTransformerLayer(
+        feed_forward_network_channels=0)
+    float_training_tensor = tf.constant(0.0, dtype=tf.float32)
+    output = layer((tf.zeros([2, 4225, 128]),
+                    tf.zeros([2, 128, 128]),
+                    float_training_tensor))
+    self.assertListEqual(output[0].get_shape().as_list(), [2, 4225, 128])
+    self.assertListEqual(output[1].get_shape().as_list(), [2, 4225, 128])
+    self.assertListEqual(output[2].get_shape().as_list(), [2, 128, 128])
+
+  def test_attention_types_output_shape(self):
+    layer = dual_path_transformer.DualPathTransformerLayer(
+        use_memory_self_attention=False,
+        use_pixel2memory_feedback_attention=False)
+    float_training_tensor = tf.constant(0.0, dtype=tf.float32)
+    output = layer((tf.zeros([2, 4225, 128]),
+                    tf.zeros([2, 128, 128]),
+                    float_training_tensor))
+    self.assertListEqual(output[0].get_shape().as_list(), [2, 4225, 128])
+    self.assertListEqual(output[1].get_shape().as_list(), [2, 4225, 128])
+    self.assertListEqual(output[2].get_shape().as_list(), [2, 128, 128])
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/layers/positional_encodings.py b/model/layers/positional_encodings.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1db2a784dfaa6c4b9b64a7dfde6c8273f927a31
--- /dev/null
+++ b/model/layers/positional_encodings.py
@@ -0,0 +1,243 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Implements relative [1, 2, 3] and global [3, 4, 5] positional encodings.
+
+Our Axial-Deeplab [1] proposes position-sensitive self-attention which uses
+relative positional encodings for query, key, and value.
+
+[1] Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation,
+    ECCV 2020 Spotlight.
+    Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
+    Liang-Chieh Chen.
+[2] Self-Attention with Relative Position Representations, NAACL 2018.
+    Peter Shaw, Jakob Uszkoreit, Ashish Vaswani.
+[3] Tensor2Tensor for Neural Machine Translation, arXiv 2018,
+    http://arxiv.org/abs/1803.07416.
+    Ashish Vaswani, Samy Bengio, Eugene Brevdo, Francois Chollet,
+    Aidan N. Gomez, Stephan Gouws, Llion Jones, Łukasz Kaiser,
+    Nal Kalchbrenner, Niki Parmar, Ryan Sepassi, Noam Shazeer,
+    Jakob Uszkoreit.
+[4] Attention Is All You Need, NeurIPS 2017.
+    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
+    Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin.
+[5] An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale,
+    ICLR 2021.
+    Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn,
+    Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer,
+    Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
+"""
+
+from absl import logging
+import tensorflow as tf
+
+# MAX_SPAN defines the maximum shape of positional encoding. It is set as a
+# large constant so that we can easily load and use models with global or
+# different local spans, but it should not be too large so that it takes a
+# reasonable amount of memory. The value 255 is larger than almost all span
+# choices (e.g. 65 for local attention, 129, 193, etc.) so 255 is large enough.
+# 257 would be a good choice for GPU, but 255 is more efficient on TPU, which
+# pads tensors to multiples of 128.
+MAX_SPAN = 255
+
+
+def _compute_relative_distance_matrix(query_length, key_length):
+  """Computes a relative distance matrix between queries and keys.
+
+  We assume that the queries and the keys are centered, i.e.,
+  key_length = memory_flange + query_length + memory_flange.
+
+  The function is based on the _generate_relative_positions_matrix function in
+  common_attention.py of tensor2tensor codebase:
+  https://github.com/tensorflow/tensor2tensor/blob/5623deb79cfcd28f8f8c5463b58b5bd76a81fd0d/tensor2tensor/layers/common_attention.py#L1670
+
+  Args:
+    query_length: An integer, the length of queries.
+    key_length: An integer, the length of keys.
+
+  Returns:
+    distance_matrix: A [query_length, key_length] tensor.
+
+  Raises:
+    ValueError: If (key_length - query_length) is odd, i.e., the assumption
+      does not hold.
+  """
+  if (key_length - query_length) % 2:
+    raise ValueError('Key_length should be query_length + 2 * memory_flange.')
+  key_index = tf.range(key_length)
+  query_index = tf.range(query_length) + (key_length - query_length) // 2
+  distance_matrix = key_index[None, :] - query_index[:, None]
+  # Shift the distance_matrix so that it is >= 0. Each entry of the shifted
+  # distance_matrix will index a relative positional embedding.
+  distance_matrix = distance_matrix + MAX_SPAN - 1
+  if query_length + (key_length - query_length) // 2 > MAX_SPAN:
+    logging.warn('Axial attention span is larger than MAX_SPAN. In this '
+                 'case, we use a single shared embedding for all positions '
+                 'beyond this relative distance. Please make sure this '
+                 'behavior is intended.')
+    distance_matrix = tf.clip_by_value(distance_matrix, 0, MAX_SPAN * 2 - 2)
+  return distance_matrix
+
+
+class RelativePositionalEncoding(tf.keras.layers.Layer):
+  """Generates relative positional encoding.
+
+  The function is based on the _generate_relative_positions_embeddings function
+  in common_attention.py of tensor2tensor codebase:
+  https://github.com/tensorflow/tensor2tensor/blob/5623deb79cfcd28f8f8c5463b58b5bd76a81fd0d/tensor2tensor/layers/common_attention.py#L1691
+  """
+
+  def __init__(self, query_length, key_length, depth, num_heads, name,
+               initialization_std=1.0, conv_kernel_weight_decay=0.0):
+    """Initializes a relative position encoding layer.
+
+    Args:
+      query_length: An integer, the length of queries.
+      key_length: An integer, the length of keys.
+      depth: An integer, the number of embedding channels per head.
+      num_heads: An integer, the number of heads in multi-head attention.
+      name: A string, the name of the embedding.
+      initialization_std: A float, the initialization std for the embedding.
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+    """
+    super(RelativePositionalEncoding, self).__init__(name=name)
+    self._initializer = tf.keras.initializers.TruncatedNormal(
+        stddev=initialization_std)
+    self._regularizer = tf.keras.regularizers.l2(conv_kernel_weight_decay)
+
+    self._relative_distance_matrix = _compute_relative_distance_matrix(
+        query_length, key_length)
+    self._num_heads = num_heads
+    self._embedding_shape = (MAX_SPAN * 2 - 1, depth)
+
+  def build(self, input_shape):
+    """Builds the embedding weight."""
+    del input_shape
+    self._embeddings = self.add_weight(
+        shape=self._embedding_shape,
+        initializer=self._initializer, trainable=True,
+        name='embeddings',
+        regularizer=self._regularizer)
+
+  def call(self, inputs):
+    """A forward pass that gathers the relative positional encoding.
+
+    Args:
+      inputs: Unused, present to conform to the tf.keras.layers.Layer API.
+
+    Returns:
+      output: A [num_heads, query_length, key_length, depth] tensor, the
+        relative positional encodings for each head and each query-key pair.
+    """
+    del inputs
+    # Gather the embeddings according to the relative distances.
+    embeddings = tf.gather(self._embeddings, self._relative_distance_matrix)
+    return tf.tile(tf.expand_dims(embeddings, axis=0),
+                   [self._num_heads, 1, 1, 1])
+
+
+class AddAbsolutePositionalEncoding(tf.keras.layers.Layer):
+  """Adds a learnable absolute positional encoding to the input feature.
+
+  Supports both 1D and 2D versions of the positional encoding: (1) 1D
+  positional encoding represents each row index with an embedding, and
+  represents each column index with another embedding. This results in a total
+  of (height + width) learnable embedding vectors. (2) 2D positional encoding
+  adds independent embeddings to each input grid position. This choice uses a
+  total of (height * width) learnable embedding vectors.
+  """
+
+  def __init__(self, name, positional_encoding_type=None,
+               bn_layer=tf.keras.layers.BatchNormalization,
+               conv_kernel_weight_decay=0.0):
+    """Initializes an AddAbsolutePositionalEncoding layer.
+
+    Args:
+      name: A string specifying the name of the layer.
+      positional_encoding_type: A string, type of the positional encoding.
+        Supports '2D', '1D', 'none', and None. The feature is returned as is
+        if positional_encoding_type is 'none' or None.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+
+    Raises:
+      ValueError: If positional_encoding_type is not one of '1D', '2D', 'none',
+        and None.
+    """
+    super(AddAbsolutePositionalEncoding, self).__init__(name=name)
+    # Normalize None to 'none' so that the .lower() comparisons below and in
+    # build() do not raise an AttributeError when None is passed.
+    if positional_encoding_type is None:
+      positional_encoding_type = 'none'
+    if positional_encoding_type.lower() not in ('none', '2d', '1d'):
+      raise ValueError(positional_encoding_type + ' is not supported.')
+    self._positional_encoding_type = positional_encoding_type
+    # This initialization std is tuned for global attention, but it does not
+    # seem to be a sensitive hyper-parameter, since we use batch norm on the
+    # positional encodings.
+    self._initializer = tf.keras.initializers.TruncatedNormal(stddev=0.2)
+    self._kernel_regularizer = tf.keras.regularizers.l2(
+        conv_kernel_weight_decay)
+    self._bn_layer = bn_layer
+
+  def build(self, input_shape):
+    """Builds the layer weights whose shape depends on the 4D input shape."""
+    _, height, width, channel = input_shape
+    if self._positional_encoding_type.lower() == '2d':
+      self._embeddings = self.add_weight(
+          shape=(1, height, width, channel),
+          initializer=self._initializer, trainable=True,
+          name='embeddings',
+          regularizer=self._kernel_regularizer)
+      self._batch_norm = self._bn_layer(axis=-1, name='batch_norm')
+    elif self._positional_encoding_type.lower() == '1d':
+      # Generate separable positional encodings for the height axis and the
+      # width axis.
+      self._height_axis_embeddings = self.add_weight(
+          shape=(1, height, 1, channel),
+          initializer=self._initializer, trainable=True,
+          name='height_axis_embeddings',
+          regularizer=self._kernel_regularizer)
+      self._height_axis_batch_norm = self._bn_layer(
+          axis=-1, name='height_axis_batch_norm')
+      # The width axis embeddings span the width dimension, i.e., shape
+      # (1, 1, width, channel), rather than the height dimension.
+      self._width_axis_embeddings = self.add_weight(
+          shape=(1, 1, width, channel),
+          initializer=self._initializer, trainable=True,
+          name='width_axis_embeddings',
+          regularizer=self._kernel_regularizer)
+      self._width_axis_batch_norm = self._bn_layer(
+          axis=-1, name='width_axis_batch_norm')
+
+  def call(self, features, training=False):
+    """Performs a forward pass.
+
+    Args:
+      features: An input [batch, height, width, channels] tensor.
+      training: A boolean, whether the model is in training mode.
+
+    Returns:
+      output: The sum of the input feature and learnable positional encodings.
+    """
+    if (self._positional_encoding_type is None or
+        self._positional_encoding_type.lower() == 'none'):
+      return features
+    elif self._positional_encoding_type.lower() == '2d':
+      positional_encoding = self._batch_norm(self._embeddings,
+                                             training=training)
+    elif self._positional_encoding_type.lower() == '1d':
+      height_axis_positional_encoding = self._height_axis_batch_norm(
+          self._height_axis_embeddings, training=training)
+      width_axis_positional_encoding = self._width_axis_batch_norm(
+          self._width_axis_embeddings, training=training)
+      positional_encoding = (height_axis_positional_encoding +
+                             width_axis_positional_encoding)
+    return features + positional_encoding
diff --git a/model/layers/positional_encodings_test.py b/model/layers/positional_encodings_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..05d78b55e42a2acab5dccdd49f00664d9aecf4cb
--- /dev/null
+++ b/model/layers/positional_encodings_test.py
@@ -0,0 +1,60 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for positional_encodings.""" + +import tensorflow as tf + +from deeplab2.model.layers import positional_encodings + + +class PositionalEncodingsTest(tf.test.TestCase): + + def test_compute_relative_distance_matrix_output_shape(self): + output = positional_encodings._compute_relative_distance_matrix(33, 97) + self.assertListEqual(output.get_shape().as_list(), [33, 97]) + + def test_relative_positional_encoding_output_shape(self): + layer = positional_encodings.RelativePositionalEncoding( + 33, 97, 32, 8, 'rpe') + output = layer(None) + self.assertListEqual(output.get_shape().as_list(), [8, 33, 97, 32]) + + def test_add_absolute_positional_encoding_1d_output_shape(self): + layer = positional_encodings.AddAbsolutePositionalEncoding( + 'ape1d', positional_encoding_type='1d') + shape = [2, 5, 5, 3] + output = layer(tf.zeros(shape)) + self.assertEqual(len(layer.get_weights()), 10) + self.assertListEqual(output.get_shape().as_list(), shape) + + def test_add_absolute_positional_encoding_2d_output_shape(self): + layer = positional_encodings.AddAbsolutePositionalEncoding( + 'ape2d', positional_encoding_type='2d') + shape = [2, 5, 5, 3] + output = layer(tf.zeros(shape)) + self.assertEqual(len(layer.get_weights()), 5) + self.assertListEqual(output.get_shape().as_list(), shape) + + def test_add_absolute_positional_encoding_none_output_shape(self): + layer = positional_encodings.AddAbsolutePositionalEncoding( + 'none', positional_encoding_type='none') + shape = [2, 5, 5, 3] + output = layer(tf.zeros(shape)) + self.assertEqual(len(layer.get_weights()), 0) + self.assertListEqual(output.get_shape().as_list(), shape) + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/recompute_grad.py b/model/layers/recompute_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..8bf0e2ad66595e794b187cb7564669ce2ee6c19a --- /dev/null +++ b/model/layers/recompute_grad.py @@ -0,0 +1,289 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Library for rematerialization. + +Incubates a version of tf.recompute_grad that is XLA compatible. + +This file is based on the recompute_grad.py in the bigbird codebase [1]: +https://github.com/google-research/bigbird/blob/db06498ec8804c6438111938d8654b66ddaccd5d/bigbird/core/recompute_grad.py + +[1] Big Bird: Transformers for Longer Sequences, NeurIPS 2020. + Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris + Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li + Yang, Amr Ahmed. 
+""" +import collections +import os +import threading +from typing import Deque, List, NamedTuple, Optional, Sequence + +from absl import logging +import tensorflow.compat.v2 as tf + +# pylint: disable=g-direct-tensorflow-import +from tensorflow.python.framework import ops +from tensorflow.python.ops import custom_gradient + + +# Remove when https://github.com/tensorflow/tensorflow/pull/45298 +# gets merged +def get_variable_by_name(var_name): + """Retrieves tf.Variable from name in MirroredStrategy (multi-gpu).""" + + # Get all variables, but it will have copies from different replicas + all_global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + + def _replica_filter(var): + """Filter out variables from different context.""" + try: + return var_name == var.op.name + except AttributeError: + return False + candidate_vars = list(filter(_replica_filter, all_global_vars)) + + if len(candidate_vars) >= 1: + # Filter out non-trainable variables. + candidate_vars = [v for v in candidate_vars if v.trainable] + else: + raise ValueError('Unsuccessful at finding variable {}.'.format(var_name)) + + if len(candidate_vars) == 1: + return candidate_vars[0] + elif len(candidate_vars) > 1: + raise ValueError( + 'Unsuccessful at finding trainable variable {}. ' + 'Number of candidates: {}. ' + 'Candidates: {}'.format(var_name, len(candidate_vars), candidate_vars)) + else: + # The variable is not trainable. + return None +custom_gradient.get_variable_by_name = get_variable_by_name + + +class RecomputeContext( + NamedTuple('RecomputeContext', [ + ('is_recomputing', bool), + ('seed', tf.Tensor), + ('children', Deque['RecomputeContext']), + ])): + """Context for recomputation. + + Attributes: + is_recomputing: Whether we are in a recomputation phase. + seed: Scalar integer tensor that should be used with stateless random ops + for deterministic behavior and correct computation of the gradient. + children: Nested `RecomputeContext` instances. Used internally by + `recompute_grad` to track nested instances of `RecomputeContext`. + """ + + def __enter__(self): + return _context_stack.push(self) + + def __exit__(self, exc_type, exc_value, traceback): + _context_stack.pop(self) + + +# Simplified version of `_DefaultStack` in +# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/framework/ops.py. +class _ContextStack(threading.local): + """A thread-local stack for providing implicit recompute contexts.""" + + def __init__(self): + super(_ContextStack, self).__init__() + self._stack = [] + + def top(self) -> Optional[RecomputeContext]: + return self._stack[-1] if self._stack else None + + def push(self, context: RecomputeContext): + self._stack.append(context) + return context + + def pop(self, context: RecomputeContext): + if self._stack[-1] is not context: + raise AssertionError('Nesting violated for RecomputeContext.') + self._stack.pop() + + +_context_stack = _ContextStack() + + +def get_recompute_context() -> Optional[RecomputeContext]: + """Returns the current recomputing context if it exists.""" + return _context_stack.top() + + +# Adapted from +# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/control_flow_util.py. 
+def _get_containing_xla_context(graph: tf.Graph) -> Optional[object]:
+  """Returns the first ancestor `XLAControlFlowContext` in the `graph`."""
+  ctxt = graph._get_control_flow_context()  # pylint: disable=protected-access
+  while ctxt:
+    if ctxt.IsXLAContext():
+      return ctxt
+    ctxt = ctxt.outer_context
+  return None
+
+
+def _in_xla_context(graph: Optional[tf.Graph] = None) -> bool:
+  """Detects whether we are in an XLA context."""
+  if '--tf_xla_auto_jit=2' in os.environ.get('TF_XLA_FLAGS', ''):
+    return True
+  graph = tf.compat.v1.get_default_graph() if graph is None else graph
+  while True:
+    if _get_containing_xla_context(graph) is not None:
+      return True
+    try:
+      graph = graph.outer_graph
+    except AttributeError:
+      return False
+
+
+def _force_data_dependency(
+    first_compute: Sequence[tf.Tensor],
+    then_compute: Sequence[tf.Tensor]) -> List[tf.Tensor]:
+  """Forces all of `then_compute` to depend on all of `first_compute`.
+
+  Uses a dummy data dependency, which is useful when running on TPUs because
+  XLA ignores control dependencies. Only supports float arguments.
+
+  Args:
+    first_compute: Sequence of `Tensor`s to be executed before `then_compute`.
+    then_compute: Sequence of `Tensor`s to be executed after `first_compute`.
+
+  Returns:
+    Sequence of `Tensor`s with the same length as `then_compute`.
+
+  Raises:
+    ValueError: if ranks are unknown or types are not floating.
+  """
+
+  def _first_element(x):
+    if x.shape.ndims is None:
+      raise ValueError('Rank of Tensor %s must be known' % x)
+    ndims = x.shape.ndims
+    begin = tf.zeros(ndims, dtype=tf.int32)
+    size = tf.ones(ndims, dtype=tf.int32)
+    return tf.reshape(tf.slice(x, begin, size), [])
+
+  first_compute_sum = tf.add_n(
+      [_first_element(x) for x in first_compute if x is not None])
+  dtype = first_compute_sum.dtype
+  if not dtype.is_floating:
+    raise ValueError('_force_data_dependency only supports floating dtypes.')
+  zero = tf.cast(0.0, dtype) * first_compute_sum
+  then_compute_sequence = [
+      x + tf.cast(zero, x.dtype) if x is not None else None
+      for x in tf.nest.flatten(then_compute)
+  ]
+  return tf.nest.pack_sequence_as(then_compute, then_compute_sequence)
+
+
+def _make_seed_if_none(seed: Optional[tf.Tensor]) -> tf.Tensor:
+  """Uses the global generator to make a seed if necessary."""
+  if seed is not None:
+    return seed
+  generator = tf.random.experimental.get_global_generator()
+  # The two seeds for stateless random ops don't have individual semantics and
+  # are scrambled together, so providing one seed is fine. This makes it easier
+  # for users to provide a local seed without worrying about integer overflow.
+  # See `make_seeds` in
+  # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/stateful_random_ops.py.
+  try:
+    return generator.uniform_full_int([], tf.int32, name='recompute_grad_seed')
+  except (RuntimeError, TypeError, ValueError, tf.errors.NotFoundError) as e:
+    # For a number of reasons, the above operation can fail, e.g., when using
+    # multiple graphs or toggling between eager and graph modes. Reset the
+    # generator and retry.
+    logging.warn('Resetting the generator. %s: %s', type(e), e)
+    tf.random.experimental.set_global_generator(None)
+    generator = tf.random.experimental.get_global_generator()
+    return generator.uniform_full_int([], tf.int32, name='recompute_grad_seed')
+
+
+def recompute_grad(f, seed=None):
+  """An eager-compatible version of recompute_grad.
+
+  For f(*args, **kwargs), this supports gradients with respect to args, or to
+  any variables residing in the kwarg 'variables'. Note that for keras layer
+  and model objects, this is handled automatically.
+
+  Warning: If `f` was originally a tf.keras Model or Layer object, `g` will not
+  be able to access the member variables of that object, because `g` returns
+  through the wrapper function `inner`. When recomputing gradients through
+  objects that inherit from keras, we suggest keeping a reference to the
+  underlying object around for the purpose of accessing these variables.
+
+  Args:
+    f: function `f(*x)` that returns a `Tensor` or sequence of `Tensor`
+      outputs.
+    seed: Optional seed for random ops. `seed` should be an integer scalar
+      `Tensor`. When compiling to XLA, `seed` must have dtype `tf.int32`. If
+      `seed` is not provided, one will be generated.
+
+  Returns:
+    A function `g` that wraps `f`, but which recomputes `f` on the backwards
+    pass of a gradient call.
+  """
+
+  @tf.custom_gradient
+  def inner(*args, **kwargs):
+    """Inner function closure for calculating gradients."""
+    # Detect when we're nested and in the backwards pass, so we don't generate
+    # an additional seed.
+    parent_context = get_recompute_context()
+    if parent_context is not None and parent_context.is_recomputing:
+      # Use the cached context in the recomputation phase.
+      with parent_context.children.popleft()._replace(
+          is_recomputing=True) as context:
+        result = f(*args, **kwargs)
+    else:
+      with RecomputeContext(
+          is_recomputing=False,
+          seed=_make_seed_if_none(seed),
+          children=collections.deque()) as context:
+        result = f(*args, **kwargs)
+      # In the forward pass, build up a tree of recomputation contexts.
+      if parent_context is not None and not parent_context.is_recomputing:
+        parent_context.children.append(context)
+
+    def grad(*dresult, **grad_kwargs):
+      """Gradient function calculation for inner function."""
+      variables = grad_kwargs.pop('variables', None)
+      if grad_kwargs:
+        raise ValueError('Found unexpected kwargs for `grad`: ',
+                         list(grad_kwargs.keys()))
+      inputs, seed = list(args), context.seed
+      if _in_xla_context():
+        inputs = _force_data_dependency(
+            tf.nest.flatten(dresult), inputs + [seed])
+        seed = inputs.pop()
+      # Recompute the forward pass under the cached context, watching the
+      # inputs (and variables) so the local gradients can be computed.
+      with tf.GradientTape() as tape:
+        tape.watch(inputs)
+        if variables is not None:
+          tape.watch(variables)
+        with tf.control_dependencies(dresult):
+          with context._replace(is_recomputing=True, seed=seed):
+            result = f(*inputs, **kwargs)
+      kw_vars = []
+      if variables is not None:
+        kw_vars = list(variables)
+      grads = tape.gradient(
+          result, list(inputs) + kw_vars, output_gradients=dresult)
+      return grads[:len(inputs)], grads[len(inputs):]
+
+    return result, grad
+
+  return inner
diff --git a/model/layers/recompute_grad_test.py b/model/layers/recompute_grad_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d488b2e900ff29454b3aa342ea71bb6ccc8e3c84
--- /dev/null
+++ b/model/layers/recompute_grad_test.py
@@ -0,0 +1,254 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for recompute_grad. + +This file is based on the recompute_grad_test.py in the etcmodel codebase [1]: +https://github.com/google-research/google-research/blob/ae9d07f22d31b36069bb8321e9d015e46dd8e8bb/etcmodel/layers/recompute_grad_test.py + +[1] ETC: Encoding Long and Structured Inputs in Transformers, EMNLP 2020. + Joshua Ainslie, Santiago Ontanon, Chris Alberti, Vaclav Cvicek, Zachary + Fisher, Philip Pham, Anirudh Ravula, Sumit Sanghai, Qifan Wang, Li Yang. +""" +from typing import Sequence +import tensorflow as tf +from deeplab2.model import test_utils +from deeplab2.model.encoder import axial_resnet_instances +from deeplab2.model.layers import recompute_grad as recompute_grad_lib + + +def _compute_deeplab_gradients(inputs, model, training): + """Returns an output and all the gradients.""" + variables = model.trainable_weights[::-1] + [inputs] + with tf.GradientTape(persistent=True) as tape: + tape.watch(variables) + out = model(inputs, training=training)['transformer_mask_feature'] + + grads = tape.gradient(out, variables) + return out, grads + + +class RecomputeGradTest(tf.test.TestCase): + + def test_real_deeplab_models(self): + resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') + tf.config.experimental_connect_to_cluster(resolver) + tf.tpu.experimental.initialize_tpu_system(resolver) + strategy = tf.distribute.TPUStrategy(resolver) + + with strategy.scope(): + # Test max_deeplab_s since it involves all three types of operations: + # convolution, axial-attention, and transformer. + model_name = 'max_deeplab_s' + kwargs = {'num_blocks': [1, 1, 1, 1], + 'backbone_layer_multiplier': 1, + 'width_multiplier': 1.0, + 'bn_layer': tf.keras.layers.experimental.SyncBatchNormalization, + 'conv_kernel_weight_decay': 0.0, + 'block_group_config': { + 'drop_path_keep_prob': 1.0, # Disable the randomness. + 'conv_use_recompute_grad': False, + 'axial_use_recompute_grad': False, + 'recompute_within_stride': 0, + 'transformer_use_recompute_grad': False}} + # Build test input. + tensor = test_utils.create_test_input(1, 33, 33, 3) + test_input = tf.Variable(tensor) + test_input_recompute = tf.Variable(tensor) + + # Build a model. + model = axial_resnet_instances.get_model(model_name, **kwargs) + model(test_input, training=True) + + # Set the batch norm gamma as non-zero so that the bottleneck computation + # affects the output. + for weight in model.trainable_weights: + if '/gamma:0' in weight.name: + weight.assign(tf.ones_like(weight) * 0.1) + + # Activate all recompute_grad for the recomputed model. + kwargs['block_group_config'] = { + 'drop_path_keep_prob': 1.0, + 'conv_use_recompute_grad': True, + 'axial_use_recompute_grad': True, + 'recompute_within_stride': 0, + 'transformer_use_recompute_grad': True} + + # Build the same model but with recompute_grad. 
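+      # The recomputed model uses the same architecture and, via set_weights
+      # below, the same parameters as the baseline model, so outputs and
+      # gradients can be compared directly.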
+      model_recompute = axial_resnet_instances.get_model(model_name, **kwargs)
+      model_recompute(test_input_recompute, training=True)
+      model_recompute.set_weights(model.get_weights())
+
+      @tf.function
+      def function():
+        outs_recompute, grads_recompute = _compute_deeplab_gradients(
+            test_input_recompute, model_recompute, True)
+        outs, grads = _compute_deeplab_gradients(
+            test_input, model, True)
+        return grads_recompute, grads, outs_recompute, outs
+
+      grads_recompute, grads, outs_recompute, outs = strategy.run(function)
+
+      # Similar outputs.
+      self.assertAllClose(outs.values[0], outs_recompute.values[0],
+                          rtol=1e-4, atol=1e-4)
+
+      # Similar gradients.
+      for grad, grad_recompute in zip(grads, grads_recompute):
+        if grad is None or grad_recompute is None:
+          continue
+        grad = grad.values[0]
+        grad_recompute = grad_recompute.values[0]
+        if (isinstance(grad, tf.IndexedSlices) and
+            isinstance(grad_recompute, tf.IndexedSlices)):
+          continue
+        self.assertAllClose(grad, grad_recompute, rtol=1e-1, atol=1e-1)
+
+
+def _compute_gradients(model, x):
+  with tf.GradientTape() as tape:
+    y = model(x)
+  return tape.gradient(
+      y, model.trainable_variables
+      if hasattr(model, 'trainable_variables') else tape.watched_variables())
+
+
+def _make_gradients_op(model, x):
+  f = lambda x: _compute_gradients(model, x)
+  return (tf.function(experimental_compile=True)(lambda: f(x))
+          if tf.executing_eagerly() else tf.compat.v1.tpu.rewrite(f, (x,)))
+
+
+class RecomputeDense(tf.keras.layers.Layer):
+  """Dense layer that recomputes the forward pass during backpropagation."""
+
+  def __init__(self, units: Sequence[int], **kwargs):
+    super(RecomputeDense, self).__init__(**kwargs)
+    self._units = tf.nest.flatten(units)
+
+  def build(self, input_shape: tf.TensorShape):
+    units = input_shape[-1:] + self._units
+    kernels = []
+    biases = []
+    for i in range(1, len(units)):
+      kernels.append(
+          self.add_weight('kernel_{}'.format(i), (units[i - 1], units[i])))
+      biases.append(self.add_weight('bias_{}'.format(i), (units[i],)))
+    self._kernels = kernels
+    self._biases = biases
+    super(RecomputeDense, self).build(input_shape)
+
+  def call(self, inputs: tf.Tensor, **kwargs):
+
+    @recompute_grad_lib.recompute_grad
+    def f(x):
+      for kernel, bias in zip(self._kernels, self._biases):
+        x = tf.nn.tanh(tf.matmul(x, kernel) + bias)
+      return x
+
+    return f(inputs)
+
+
+class RecomputeDense2Args(RecomputeDense):
+  """Extension of `RecomputeDense` that takes and returns 2 arguments."""

+  def build(self, input_shape: Sequence[tf.TensorShape]):
+    super(RecomputeDense2Args, self).build(input_shape[0])
+
+  def call(self, inputs: Sequence[tf.Tensor], **kwargs):
+
+    @recompute_grad_lib.recompute_grad
+    def f(x1, x2):
+      for kernel, bias in zip(self._kernels, self._biases):
+        x1 = tf.nn.tanh(tf.matmul(x1, kernel) + bias)
+      for kernel, bias in zip(self._kernels, self._biases):
+        x2 = tf.nn.tanh(tf.matmul(x2, kernel) + bias)
+      return x1, x2
+
+    return f(*inputs)
+
+
+class RecomputeGradXlaTest(tf.test.TestCase):
+  """Tests for recompute_grad_lib.recompute_grad with XLA."""
+
+  @property
+  def device(self):
+    if tf.config.list_logical_devices('TPU'):
+      return sorted(tf.config.list_logical_devices('TPU'))[0]
+    elif tf.config.list_logical_devices('GPU'):
+      return sorted(tf.config.list_logical_devices('GPU'))[0]
+    else:
+      return sorted(tf.config.list_logical_devices('CPU'))[0]
+
+  def test_xla_model_correctness(self):
+    """Tests correctness of the gradient calculation."""
+
+    def _make_model(input_size):
+      inputs = tf.keras.Input((input_size,))
+      x = inputs
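+      # Two RecomputeDense blocks with two 16-unit tanh layers each mirror the
+      # four Dense(16, tanh) layers of the control model built below.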
+      for _ in range(2):
+        x = RecomputeDense([16] * 2)(x)
+      outputs = tf.keras.layers.Dense(1)(x)
+      return tf.keras.Model(inputs, outputs)
+
+    with tf.device(self.device):
+      recompute_model = _make_model(4)
+      control_model = tf.keras.Sequential([
+          tf.keras.layers.Dense(16, activation='tanh', input_shape=(4,)),
+          tf.keras.layers.Dense(16, activation='tanh'),
+          tf.keras.layers.Dense(16, activation='tanh'),
+          tf.keras.layers.Dense(16, activation='tanh'),
+          tf.keras.layers.Dense(1),
+      ])
+      if not tf.executing_eagerly():
+        self.evaluate(tf.compat.v1.tpu.initialize_system())
+        self.evaluate(tf.compat.v1.initializers.global_variables())
+      for source, target in zip(control_model.trainable_variables,
+                                recompute_model.trainable_variables):
+        self.evaluate(target.assign(source))
+      x = tf.ones((32, 4))
+      actual_gradients = self.evaluate(_make_gradients_op(recompute_model, x))
+      expected_gradients = self.evaluate(_make_gradients_op(control_model, x))
+      for actual, expected in zip(actual_gradients, expected_gradients):
+        self.assertAllClose(actual, expected)
+
+  def test_xla_model_2_argument_case(self):
+    """Tests for a recomputed function that takes and returns multiple args.
+
+    We don't test correctness of the gradients here; we're just making sure
+    `recompute_grad` runs without error in this case.
+    """
+
+    def _make_model(input_size):
+      input1 = tf.keras.Input((input_size,))
+      input2 = tf.keras.Input((input_size,))
+      x = (input1, input2)
+      for _ in range(2):
+        x = RecomputeDense2Args([16] * 2)(x)
+      outputs = tf.keras.layers.Dense(1)(x[0] + x[1])
+      return tf.keras.Model((input1, input2), outputs)
+
+    with tf.device(self.device):
+      recompute_model = _make_model(4)
+      if not tf.executing_eagerly():
+        self.evaluate(tf.compat.v1.tpu.initialize_system())
+        self.evaluate(tf.compat.v1.initializers.global_variables())
+      x1 = tf.ones((32, 4))
+      x2 = 2 * tf.ones((32, 4))
+      _ = self.evaluate(_make_gradients_op(recompute_model, (x1, x2)))
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/layers/resized_fuse.py b/model/layers/resized_fuse.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb68fdbf985b2c12600067f8f96d722032914012
--- /dev/null
+++ b/model/layers/resized_fuse.py
@@ -0,0 +1,165 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Implements a resized feature fuser for stacked decoders in MaX-DeepLab.
+
+Reference:
+  MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers,
+  CVPR 2021. https://arxiv.org/abs/2012.00759
+  Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
+"""
+
+import tensorflow as tf
+
+from deeplab2.model import utils
+from deeplab2.model.layers import activations
+from deeplab2.model.layers import convolutions
+
+
+class ResizedFuse(tf.keras.layers.Layer):
+  """Fuses features by resizing and 1x1 convolutions.
+
+  This layer fuses all input features to a desired shape, by projecting the
+  features to the desired number of channels, bilinearly resizing the outputs
+  (either upsampling or downsampling), and finally adding the outputs. If the
+  number of input channels equals the desired number of output channels, the
+  1x1 convolutional projection is skipped. If the projection and the bilinear
+  resizing can be fused into a stride 2 convolution, we use this faster
+  implementation. Other strides are also supported with the bilinear resizing,
+  but are probably slower than strided convolutions.
+
+  Reference:
+    MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers,
+    CVPR 2021. https://arxiv.org/abs/2012.00759
+    Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
+  """
+
+  def __init__(self,
+               name,
+               height,
+               width,
+               num_channels,
+               activation='relu',
+               bn_layer=tf.keras.layers.BatchNormalization,
+               conv_kernel_weight_decay=0.0):
+    """Initializes a ResizedFuse layer.
+
+    Args:
+      name: A string, the name of this layer.
+      height: An integer, the desired height of the output.
+      width: An integer, the desired width of the output.
+      num_channels: An integer, the number of output channels.
+      activation: A string, type of activation function to apply.
+      bn_layer: A tf.keras.layers.Layer that computes the normalization
+        (default: tf.keras.layers.BatchNormalization).
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+    """
+    super(ResizedFuse, self).__init__(name=name)
+    self._height = height
+    self._width = width
+    self._num_channels = num_channels
+    self._activation_fn = activations.get_activation(activation)
+    self._bn_layer = bn_layer
+    self._conv_kernel_weight_decay = conv_kernel_weight_decay
+
+  def build(self, input_shapes):
+    for index, feature_shape in enumerate(input_shapes):
+      _, feature_height, feature_width, feature_channels = feature_shape
+      if feature_channels == self._num_channels:
+        continue
+      elif ((feature_height + 1) // 2 == self._height and
+            (feature_width + 1) // 2 == self._width):
+        # Use a stride 2 convolution to accelerate the operation if it
+        # generates the desired spatial shape. Otherwise, the more general 1x1
+        # convolution and bilinear resizing are applied.

+        # In a stacked decoder, we follow relu-conv-bn because we do the
+        # feature summation before relu and after bn (following the ResNet
+        # bottleneck design). This ordering makes it easier to implement.
+        # Besides, it avoids using many 1x1 convolutions when the input has a
+        # correct shape.
+        current_name = '_strided_conv_bn{}'.format(index + 1)
+        utils.safe_setattr(
+            self, current_name, convolutions.Conv2DSame(
+                self._num_channels, 1, current_name[1:],
+                strides=2,
+                use_bias=False,
+                use_bn=True,
+                bn_layer=self._bn_layer,
+                activation='none',
+                conv_kernel_weight_decay=self._conv_kernel_weight_decay))
+      else:
+        # If the input channels do not match the output channels, and the
+        # operation cannot be accelerated by a stride 2 convolution, then we
+        # perform a flexible operation as follows. We first project the feature
+        # to the desired number of channels, and then bilinearly resize the
+        # output to the desired spatial resolution.
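+        # For example, fusing a [batch, 23, 23, 3] feature into a target of
+        # [batch, 11, 11, 6] takes this path, since (23 + 1) // 2 != 11: a 1x1
+        # convolution projects to 6 channels, and the output is bilinearly
+        # resized to 11x11 (see resized_fuse_test.py).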
+        current_name = '_resized_conv_bn{}'.format(index + 1)
+        utils.safe_setattr(
+            self, current_name, convolutions.Conv2DSame(
+                self._num_channels, 1, current_name[1:],
+                use_bias=False,
+                use_bn=True,
+                bn_layer=self._bn_layer,
+                activation='none',
+                conv_kernel_weight_decay=self._conv_kernel_weight_decay))
+
+  def call(self, inputs, training=False):
+    """Performs a forward pass.
+
+    Args:
+      inputs: A list of input [batch, input_height, input_width,
+        input_channels] tensors to fuse, where each input tensor may have
+        different spatial resolutions and numbers of channels.
+      training: A boolean, whether the model is in training mode.
+
+    Returns:
+      output: A fused feature [batch, height, width, num_channels] tensor.
+    """
+
+    output_features = []
+    for index, feature in enumerate(inputs):
+      _, feature_height, feature_width, feature_channels = (
+          feature.get_shape().as_list())
+      if feature_channels == self._num_channels:
+        # Resize the input feature if its number of channels equals that of
+        # the output. We do not use a 1x1 convolution for this case because
+        # the previous operation and the next operation are usually also 1x1
+        # convolutions. Besides, in a stacked decoder, a feature can be reused
+        # many times, so it saves parameters to avoid those many 1x1
+        # convolutions.
+        output_features.append(utils.resize_bilinear(
+            feature, [self._height, self._width],
+            align_corners=True))
+      elif ((feature_height + 1) // 2 == self._height and
+            (feature_width + 1) // 2 == self._width):
+        current_name = '_strided_conv_bn{}'.format(index + 1)
+        feature = self._activation_fn(feature)
+        feature = getattr(self, current_name)(feature, training=training)
+        output_features.append(feature)
+      else:
+        current_name = '_resized_conv_bn{}'.format(index + 1)
+        feature = self._activation_fn(feature)
+        feature = getattr(self, current_name)(feature, training=training)
+        output_features.append(utils.resize_bilinear(
+            feature, [self._height, self._width],
+            align_corners=True))
+      # Set the spatial shape of each output feature if possible.
+      output_features[-1].set_shape(
+          [None,
+           self._height,
+           self._width,
+           self._num_channels])
+    output = tf.add_n(output_features)
+    return self._activation_fn(output)
diff --git a/model/layers/resized_fuse_test.py b/model/layers/resized_fuse_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ba8431462e4bb5b4e714834bef2dbb97facdc46
--- /dev/null
+++ b/model/layers/resized_fuse_test.py
@@ -0,0 +1,56 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for resized_fuse."""
+
+import tensorflow as tf
+
+from deeplab2.model.layers import resized_fuse
+
+
+class ResizedFuseTest(tf.test.TestCase):
+
+  def test_resize_and_fuse_features(self):
+    batch, height, width, channels = 2, 11, 11, 6
+    smaller_height, smaller_width, smaller_channels = 6, 6, 3
+    larger_height1, larger_width1 = 21, 21  # Stride 2 conv.
+    larger_height2, larger_width2 = 22, 22  # Stride 2 conv.
+    larger_height3, larger_width3 = 23, 23  # Conv and resize.
+
+    feature_list = []
+    feature_list.append(tf.zeros([batch, smaller_height, smaller_width,
+                                  smaller_channels]))
+    feature_list.append(tf.zeros([batch, smaller_height, smaller_width,
+                                  channels]))
+    feature_list.append(tf.zeros([batch, height, width, smaller_channels]))
+    feature_list.append(tf.zeros([batch, height, width, channels]))
+    feature_list.append(tf.zeros([batch, larger_height1, larger_width1,
+                                  channels]))
+    feature_list.append(tf.zeros([batch, larger_height1, larger_width1,
+                                  smaller_channels]))
+    feature_list.append(tf.zeros([batch, larger_height2, larger_width2,
+                                  smaller_channels]))
+    feature_list.append(tf.zeros([batch, larger_height3, larger_width3,
+                                  smaller_channels]))
+    layer = resized_fuse.ResizedFuse(name='fuse',
+                                     height=height,
+                                     width=width,
+                                     num_channels=channels)
+    output = layer(feature_list)
+    self.assertEqual(output.get_shape().as_list(), [batch, height, width,
+                                                    channels])
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/layers/squeeze_and_excite.py b/model/layers/squeeze_and_excite.py
new file mode 100644
index 0000000000000000000000000000000000000000..d77d73b66dacd3faa47f59106a69d4da1bc6cc10
--- /dev/null
+++ b/model/layers/squeeze_and_excite.py
@@ -0,0 +1,186 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Squeeze and excite layer.
+
+This script implements the squeeze-and-excite (SE) operation, proposed in
+- Squeeze-and-Excitation Networks, Jie Hu, Li Shen, Samuel Albanie,
+Gang Sun, Enhua Wu. In CVPR 2018.
+
+Recently, this SE operation was further simplified with a single fully
+connected layer, referred to as simplified_squeeze_and_excite in our
+implementation. For details, please see
+- Lee and Park proposed to use only one fully connected layer in SE.
+CenterMask: Real-Time Anchor-Free Instance Segmentation.
+Youngwan Lee and Jongyoul Park. In CVPR 2020.
+"""
+from typing import Optional
+
+from absl import logging
+import tensorflow as tf
+
+from deeplab2.model import utils
+from deeplab2.model.layers import activations
+
+layers = tf.keras.layers
+
+
+class SimplifiedSqueezeAndExcite(tf.keras.layers.Layer):
+  """A simplified squeeze-and-excite layer.
+
+  The original squeeze-and-excitation (SE) operation is proposed in
+  Squeeze-and-Excitation Networks, Jie Hu, Li Shen, Samuel Albanie,
+  Gang Sun, Enhua Wu. In CVPR 2018.
+
+  Lee and Park proposed to use only one fully connected layer in SE.
+  CenterMask: Real-Time Anchor-Free Instance Segmentation.
+  Youngwan Lee and Jongyoul Park. In CVPR 2020.
+
+  In this class, we implement the simplified version of SE.
+
+  Additionally, we follow MobileNetV3 to use the hard sigmoid function.
+  """
+
+  def __init__(self, squeeze_channels, name=None):
+    """Initializes a simplified squeeze-and-excite layer.
+
+    Args:
+      squeeze_channels: Integer, channels for the squeezed features.
+      name: An optional string specifying the operation name.
+    """
+    super(SimplifiedSqueezeAndExcite, self).__init__(name=name)
+    self._squeeze_channels = squeeze_channels
+
+    self._se_conv = layers.Conv2D(self._squeeze_channels,
+                                  1,
+                                  name='squeeze_and_excite',
+                                  use_bias=True,
+                                  kernel_initializer='VarianceScaling')
+    self._hard_sigmoid = activations.get_activation('hard_sigmoid')
+
+  def call(self, input_tensor):
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: An input tensor of type tf.Tensor with shape [batch,
+        height, width, channels].
+
+    Returns:
+      The output tensor.
+    """
+    pooled = tf.reduce_mean(input_tensor, [1, 2], keepdims=True)
+    squeezed = self._se_conv(pooled)
+    excited = self._hard_sigmoid(squeezed) * input_tensor
+    return excited
+
+  def get_config(self):
+    config = {
+        'squeeze_channels': self._squeeze_channels,
+    }
+    base_config = super(SimplifiedSqueezeAndExcite, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class SqueezeAndExcite(tf.keras.layers.Layer):
+  """Creates a squeeze and excitation layer.
+
+  Reference: Squeeze-and-Excitation Networks, Jie Hu, Li Shen, Samuel Albanie,
+  Gang Sun, Enhua Wu. In CVPR 2018.
+  This implementation follows the original SE and differs from the above
+  simplified version.
+  """
+
+  def __init__(
+      self,
+      in_filters: int,
+      out_filters: int,
+      se_ratio: float,
+      divisible_by: int = 1,
+      kernel_initializer: str = 'VarianceScaling',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      activation: str = 'relu',
+      gating_activation: str = 'sigmoid',
+      name: Optional[str] = None):
+    """Initializes a squeeze and excitation layer.
+
+    Args:
+      in_filters: The number of filters that se_ratio should be applied to.
+      out_filters: The number of filters of the output tensor.
+      se_ratio: The SE ratio for the squeeze and excitation layer.
+      divisible_by: An `int` that ensures all inner dimensions are divisible
+        by this number.
+      kernel_initializer: The kernel_initializer for convolutional layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Defaults to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Defaults to None.
+      activation: The name of the activation function.
+      gating_activation: The name of the activation function for the final
+        gating function.
+      name: The layer name.
+    """
+    super(SqueezeAndExcite, self).__init__(name=name)
+
+    self._in_filters = in_filters
+    self._out_filters = out_filters
+    self._se_ratio = se_ratio
+    self._divisible_by = divisible_by
+    self._activation = activation
+    self._gating_activation = gating_activation
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._spatial_axis = [1, 2]
+    else:
+      self._spatial_axis = [2, 3]
+    self._activation_fn = activations.get_activation(activation)
+    self._gating_activation_fn = activations.get_activation(gating_activation)
+
+    num_reduced_filters = utils.make_divisible(
+        max(1, int(self._in_filters * self._se_ratio)),
+        divisor=self._divisible_by)
+    if self._se_ratio > 1.0:
+      logging.warn('Squeezing ratio %f is larger than 1.0.', self._se_ratio)
+
+    self._se_reduce = tf.keras.layers.Conv2D(
+        filters=num_reduced_filters,
+        kernel_size=1,
+        strides=1,
+        padding='same',
+        use_bias=True,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer,
+        name=name + '_reduce')
+
+    self._se_expand = tf.keras.layers.Conv2D(
+        filters=self._out_filters,
+        kernel_size=1,
+        strides=1,
+        padding='same',
+        use_bias=True,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer,
+        name=name + '_expand')
+
+  def call(self, inputs):
+    x = tf.reduce_mean(inputs, self._spatial_axis, keepdims=True)
+    x = self._activation_fn(self._se_reduce(x))
+    x = self._gating_activation_fn(self._se_expand(x))
+    return x * inputs
diff --git a/model/layers/squeeze_and_excite_test.py b/model/layers/squeeze_and_excite_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ae4c864378d731fb259a76e3f23259298eba6a9
--- /dev/null
+++ b/model/layers/squeeze_and_excite_test.py
@@ -0,0 +1,50 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for squeeze_and_excite.py."""
+
+import tensorflow as tf
+
+from deeplab2.model.layers import squeeze_and_excite
+
+
+class SqueezeAndExciteTest(tf.test.TestCase):
+
+  def test_simplified_squeeze_and_excite_input_output_shape(self):
+    # Test the shape of input and output of SimplifiedSqueezeAndExcite.
+    channels = 32
+    input_tensor = tf.random.uniform(shape=(3, 65, 65, channels))
+    layer_op = squeeze_and_excite.SimplifiedSqueezeAndExcite(
+        channels)
+    output_tensor = layer_op(input_tensor)
+    self.assertListEqual(input_tensor.get_shape().as_list(),
+                         output_tensor.get_shape().as_list())
+
+  def test_squeeze_and_excite_input_output_shape(self):
+    # Test the shape of input and output of SqueezeAndExcite.
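+    # With in_filters=32 and se_ratio=8, the squeeze convolution uses
+    # make_divisible(int(32 * 8)) = 256 reduced filters, and the ratio > 1.0
+    # warning in SqueezeAndExcite.__init__ fires.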
+    channels = 32
+    input_tensor = tf.random.uniform(shape=(3, 65, 65, channels))
+    layer_op = squeeze_and_excite.SqueezeAndExcite(
+        in_filters=channels,
+        out_filters=channels,
+        se_ratio=8,
+        name='se')
+    output_tensor = layer_op(input_tensor)
+    self.assertListEqual(input_tensor.get_shape().as_list(),
+                         output_tensor.get_shape().as_list())
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/layers/stems.py b/model/layers/stems.py
new file mode 100644
index 0000000000000000000000000000000000000000..03e315cbb89afc268a8b370c2a031fda181dfdcf
--- /dev/null
+++ b/model/layers/stems.py
@@ -0,0 +1,113 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This script contains STEMs for neural networks.
+
+The `STEM` is defined as the first few convolutions that process the input
+image to a spatially smaller feature map (e.g., output stride = 2).
+
+Reference code:
+https://github.com/tensorflow/models/blob/master/research/deeplab/core/resnet_v1_beta.py
+"""
+import tensorflow as tf
+
+from deeplab2.model.layers import convolutions
+
+layers = tf.keras.layers
+
+
+class InceptionSTEM(tf.keras.layers.Layer):
+  """An InceptionSTEM layer.
+
+  This class builds an InceptionSTEM layer which can be used as the first few
+  layers in a neural network. In particular, InceptionSTEM contains three
+  consecutive 3x3 convolutions.
+
+  Reference:
+  - Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, and Alexander Alemi.
+    "Inception-v4, inception-resnet and the impact of residual connections on
+    learning." In AAAI, 2017.
+  """
+
+  def __init__(self,
+               bn_layer=tf.keras.layers.BatchNormalization,
+               width_multiplier=1.0,
+               conv_kernel_weight_decay=0.0,
+               activation='relu'):
+    """Creates the InceptionSTEM layer.
+
+    Args:
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      width_multiplier: A float multiplier, controlling the value of
+        convolution output channels.
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+      activation: A string specifying an activation function to be used in
+        this stem.
+    """
+    super(InceptionSTEM, self).__init__(name='stem')
+
+    self._conv1_bn_act = convolutions.Conv2DSame(
+        output_channels=int(64 * width_multiplier),
+        kernel_size=3,
+        name='conv1_bn_act',
+        strides=2,
+        use_bias=False,
+        use_bn=True,
+        bn_layer=bn_layer,
+        activation=activation,
+        conv_kernel_weight_decay=conv_kernel_weight_decay)
+
+    self._conv2_bn_act = convolutions.Conv2DSame(
+        output_channels=int(64 * width_multiplier),
+        kernel_size=3,
+        name='conv2_bn_act',
+        strides=1,
+        use_bias=False,
+        use_bn=True,
+        bn_layer=bn_layer,
+        activation=activation,
+        conv_kernel_weight_decay=conv_kernel_weight_decay)
+
+    self._conv3_bn = convolutions.Conv2DSame(
+        output_channels=int(128 * width_multiplier),
+        kernel_size=3,
+        strides=1,
+        use_bias=False,
+        use_bn=True,
+        bn_layer=bn_layer,
+        activation='none',
+        name='conv3_bn',
+        conv_kernel_weight_decay=conv_kernel_weight_decay)
+
+  def call(self, input_tensor, training=False):
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: An input tensor of type tf.Tensor with shape [batch,
+        height, width, channels].
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      The output tensor of the stem, which is not activated: the last
+      convolution is followed by batch norm only.
+    """
+    x = self._conv1_bn_act(input_tensor, training=training)
+    x = self._conv2_bn_act(x, training=training)
+    x = self._conv3_bn(x, training=training)
+    return x
diff --git a/model/layers/stems_test.py b/model/layers/stems_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bac14055be6b1cf8f100e1a18cdeb59834471cad
--- /dev/null
+++ b/model/layers/stems_test.py
@@ -0,0 +1,40 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for stems."""
+import tensorflow as tf
+
+from deeplab2.model.layers import stems
+from deeplab2.utils import test_utils
+
+
+class StemsTest(tf.test.TestCase):
+
+  def test_inception_stem_output_shape(self):
+    batch = 2
+    height, width = 65, 65
+    input_tensor = test_utils.create_test_input(batch, height, width, 3)
+    model = stems.InceptionSTEM()
+    output_tensor = model(input_tensor)
+    expected_height = (height - 1) // 2 + 1
+    expected_width = (width - 1) // 2 + 1
+    expected_channels = 128
+    self.assertListEqual(
+        output_tensor.get_shape().as_list(),
+        [batch, expected_height, expected_width, expected_channels])
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/loss/base_loss.py b/model/loss/base_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4614c8b03e8a7a456eb73d7b8d0a685f86757cf
--- /dev/null
+++ b/model/loss/base_loss.py
@@ -0,0 +1,559 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains basic loss classes used in the DeepLab model."""
+
+from typing import Text, Dict, Callable, Optional
+
+import tensorflow as tf
+from deeplab2.model import utils
+
+
+def compute_average_top_k_loss(loss: tf.Tensor,
+                               top_k_percentage: float) -> tf.Tensor:
+  """Computes the average top-k loss per sample.
+
+  Args:
+    loss: A tf.Tensor with 2 or more dimensions of shape [batch, ...].
+    top_k_percentage: A float representing the percentage of pixels that
+      should be used for calculating the loss.
+
+  Returns:
+    A tensor of shape [batch] containing the mean top-k loss per sample. Since
+    reduction depends on the tf.distribute.Strategy in use, we return the loss
+    per sample and require explicit averaging by the user.
+  """
+  loss = tf.reshape(loss, shape=(tf.shape(loss)[0], -1))
+
+  if top_k_percentage != 1.0:
+    num_elements_per_sample = tf.shape(loss)[1]
+    top_k_pixels = tf.cast(
+        tf.math.round(top_k_percentage *
+                      tf.cast(num_elements_per_sample, tf.float32)), tf.int32)
+
+    def top_k_1d(inputs):
+      return tf.math.top_k(inputs, top_k_pixels, sorted=False)[0]
+    loss = tf.map_fn(fn=top_k_1d, elems=loss)
+
+  # Compute mean loss over spatial dimension.
+  num_non_zero = tf.reduce_sum(tf.cast(tf.not_equal(loss, 0.0), tf.float32), 1)
+  loss_sum_per_sample = tf.reduce_sum(loss, 1)
+  return tf.math.divide_no_nan(loss_sum_per_sample, num_non_zero)
+
+
+def compute_mask_dice_loss(y_true: tf.Tensor,
+                           y_pred: tf.Tensor,
+                           prediction_activation='softmax') -> tf.Tensor:
+  """Computes the Mask Dice loss between y_true and y_pred masks.
+
+  Reference:
+    [1] Milletari, F., Navab, N., Ahmadi, S.A.: V-net: Fully convolutional
+      neural networks for volumetric medical image segmentation. In: 3DV (2016)
+      https://arxiv.org/abs/1606.04797
+
+  Args:
+    y_true: A tf.Tensor of shape [batch, height, width, channels] (or [batch,
+      length, channels]) containing the ground-truth. The channel dimension
+      indicates the mask ID in MaX-DeepLab, instead of a "class" dimension in
+      the V-net paper. Each slice [batch, height, width, :] (or [batch,
+      length, :]) should contain one-hot encodings only, with valid pixels
+      having one and only one 1.0, and with void pixels being all 0.0. The
+      valid pixels of the masks do not and should not overlap because of the
+      non-overlapping definition of panoptic segmentation. The output loss is
+      computed and normalized by valid (not void) pixels.
+    y_pred: A tf.Tensor of shape [batch, height, width, channels] (or [batch,
+      length, channels]) containing the prediction.
+    prediction_activation: A String indicating activation function of the
+      prediction. It should be either 'sigmoid' or 'softmax'.
+
+  Returns:
+    A tf.Tensor of shape [batch, channels] with the computed dice loss value.
+
+  Raises:
+    ValueError: An error occurs when prediction_activation is not either
+      'sigmoid' or 'softmax'.
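+
+  An illustrative example (not part of the V-net reference): for a single
+  valid pixel whose one-hot ground truth is [1, 0] and whose softmax
+  prediction is also [1, 0], channel 0 yields
+  1 - (2 * 1 + 1) / (1 + 1 + 1) = 0, i.e., a perfect match has zero loss.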
+ """ + tf.debugging.assert_rank_in( + y_pred, [3, 4], message='Input tensors y_pred must have rank 3 or 4.') + tf.debugging.assert_rank_in( + y_true, [3, 4], message='Input tensors y_true must have rank 3 or 4.') + + shape_list = y_true.shape.as_list() + batch, channels = shape_list[0], shape_list[-1] + if prediction_activation == 'sigmoid': + y_pred = tf.math.sigmoid(y_pred) + elif prediction_activation == 'softmax': + y_pred = tf.nn.softmax(y_pred, axis=-1) + else: + raise ValueError( + "prediction_activation should be either 'sigmoid' or 'softmax'") + + y_true_flat = tf.reshape(y_true, [batch, -1, channels]) + # valid_flat indicates labeled pixels in the groudtruth. y_true is one-hot + # encodings only, with valid pixels having one and only one 1.0, and with + # invalid pixels having 0.0 values in all the channels. The valid pixels of + # the masks do not overlap because of the non-overlapping definition of + # panoptic segmentation. + valid_flat = tf.reduce_sum(y_true_flat, axis=-1, keepdims=True) + y_pred_flat = tf.reshape( + y_pred, [batch, -1, channels]) * valid_flat + # Use smooth = 1 to avoid division by zero when both y_pred and y_true are + # zeros. + smooth = 1.0 + intersection = 2 * tf.reduce_sum(y_pred_flat * y_true_flat, axis=1) + smooth + denominator = (tf.reduce_sum(y_pred_flat, axis=1) + + tf.reduce_sum(y_true_flat, axis=1) + smooth) + loss = 1. - tf.math.divide_no_nan(intersection, denominator) + return loss + + +def mean_absolute_error(y_true: tf.Tensor, + y_pred: tf.Tensor, + force_keep_dims=False) -> tf.Tensor: + """Computes the per-pixel mean absolute error for 3D and 4D tensors. + + Default reduction behavior: If a 3D tensor is used, no reduction is applied. + In case of a 4D tensor, reduction is applied. This behavior can be overridden + by force_keep_dims. + Note: tf.keras.losses.mean_absolute_error always reduces the output by one + dimension. + + Args: + y_true: A tf.Tensor of shape [batch, height, width] or [batch, height, + width, channels] containing the ground-truth. + y_pred: A tf.Tensor of shape [batch, height, width] or [batch, height, + width, channels] containing the prediction. + force_keep_dims: A boolean flag specifying whether no reduction should be + applied. + + Returns: + A tf.Tensor with the mean absolute error. + """ + tf.debugging.assert_rank_in( + y_pred, [3, 4], message='Input tensors must have rank 3 or 4.') + if len(y_pred.shape.as_list()) == 3 or force_keep_dims: + return tf.abs(y_true - y_pred) + else: + return tf.reduce_mean(tf.abs(y_true - y_pred), axis=[3]) + + +def mean_squared_error(y_true: tf.Tensor, + y_pred: tf.Tensor, + force_keep_dims=False) -> tf.Tensor: + """Computes the per-pixel mean squared error for 3D and 4D tensors. + + Default reduction behavior: If a 3D tensor is used, no reduction is applied. + In case of a 4D tensor, reduction is applied. This behavior can be overridden + by force_keep_dims. + Note: tf.keras.losses.mean_squared_error always reduces the output by one + dimension. + + Args: + y_true: A tf.Tensor of shape [batch, height, width] or [batch, height, + width, channels] containing the ground-truth. + y_pred: A tf.Tensor of shape [batch, height, width] or [batch, height, + width, channels] containing the prediction. + force_keep_dims: A boolean flag specifying whether no reduction should be + applied. + + Returns: + A tf.Tensor with the mean squared error. 
+ """ + tf.debugging.assert_rank_in( + y_pred, [3, 4], message='Input tensors must have rank 3 or 4.') + if len(y_pred.shape.as_list()) == 3 or force_keep_dims: + return tf.square(y_true - y_pred) + else: + return tf.reduce_mean(tf.square(y_true - y_pred), axis=[3]) + + +def encode_one_hot(gt: tf.Tensor, + num_classes: int, + weights: tf.Tensor, + ignore_label: Optional[int]): + """Helper function for one-hot encoding of integer labels. + + Args: + gt: A tf.Tensor providing ground-truth information. Integer type label. + num_classes: An integer indicating the number of classes considered in the + ground-truth. It is used as 'depth' in tf.one_hot(). + weights: A tf.Tensor containing weights information. + ignore_label: An integer specifying the ignore label or None. + + Returns: + gt: A tf.Tensor of one-hot encoded gt labels. + weights: A tf.Tensor with ignore_label considered. + """ + if ignore_label is not None: + keep_mask = tf.cast(tf.not_equal(gt, ignore_label), dtype=tf.float32) + else: + keep_mask = tf.ones_like(gt, dtype=tf.float32) + gt = tf.stop_gradient(tf.one_hot(gt, num_classes)) + weights = tf.multiply(weights, keep_mask) + return gt, weights + + +def is_one_hot(gt: tf.Tensor, pred: tf.Tensor): + """Helper function for checking if gt tensor is one-hot encoded or not. + + Args: + gt: A tf.Tensor providing ground-truth information. + pred: A tf.Tensor providing prediction information. + + Returns: + A boolean indicating whether the gt is one-hot encoded (True) or + in integer type (False). + """ + gt_shape = gt.get_shape().as_list() + pred_shape = pred.get_shape().as_list() + # If the ground truth is one-hot encoded, the rank of the ground truth should + # match that of the prediction. In addition, we check that the first + # dimension, batch_size, and the last dimension, channels, should also match + # the prediction. However, we still allow spatial dimensions, e.g., height and + # width, to be different since we will downsample the ground truth if needed. + return (len(gt_shape) == len(pred_shape) and + gt_shape[0] == pred_shape[0] and gt_shape[-1] == pred_shape[-1]) + + +def _ensure_topk_value_is_percentage(top_k_percentage: float): + """Checks if top_k_percentage is between 0.0 and 1.0. + + Args: + top_k_percentage: The floating point value to check. + """ + if top_k_percentage < 0.0 or top_k_percentage > 1.0: + raise ValueError('The top-k percentage parameter must lie within 0.0 and ' + '1.0, but %f was given' % top_k_percentage) + + +class TopKGeneralLoss(tf.keras.losses.Loss): + """This class contains code to compute the top-k loss.""" + + def __init__(self, + loss_function: Callable[[tf.Tensor, tf.Tensor], tf.Tensor], + gt_key: Text, + pred_key: Text, + weight_key: Text, + top_k_percent_pixels: float = 1.0): + """Initializes a top-k L1 loss. + + Args: + loss_function: A callable loss function. + gt_key: A key to extract the ground-truth tensor. + pred_key: A key to extract the prediction tensor. + weight_key: A key to extract the weight tensor. + top_k_percent_pixels: An optional float specifying the percentage of + pixels used to compute the loss. The value must lie within [0.0, 1.0]. + """ + # Implicit reduction might mess with tf.distribute.Strategy, hence we + # explicitly reduce the loss. 
+    super(TopKGeneralLoss,
+          self).__init__(reduction=tf.keras.losses.Reduction.NONE)
+
+    _ensure_topk_value_is_percentage(top_k_percent_pixels)
+
+    self._loss_function = loss_function
+    self._top_k_percent_pixels = top_k_percent_pixels
+    self._gt_key = gt_key
+    self._pred_key = pred_key
+    self._weight_key = weight_key
+
+  def call(self, y_true: Dict[Text, tf.Tensor],
+           y_pred: Dict[Text, tf.Tensor]) -> tf.Tensor:
+    """Computes the top-k loss.
+
+    Args:
+      y_true: A dict of tensors providing ground-truth information.
+      y_pred: A dict of tensors providing predictions.
+
+    Returns:
+      A tensor of shape [batch] containing the loss per sample.
+    """
+    gt = y_true[self._gt_key]
+    pred = y_pred[self._pred_key]
+    weights = y_true[self._weight_key]
+
+    per_pixel_loss = self._loss_function(gt, pred)
+    per_pixel_loss = tf.multiply(per_pixel_loss, weights)
+
+    return compute_average_top_k_loss(per_pixel_loss,
+                                      self._top_k_percent_pixels)
+
+
+class TopKCrossEntropyLoss(tf.keras.losses.Loss):
+  """This class contains code for top-k cross-entropy."""
+
+  def __init__(self,
+               gt_key: Text,
+               pred_key: Text,
+               weight_key: Text,
+               num_classes: Optional[int],
+               ignore_label: Optional[int],
+               top_k_percent_pixels: float = 1.0,
+               dynamic_weight: bool = False):
+    """Initializes a top-k cross entropy loss.
+
+    Args:
+      gt_key: A key to extract the ground-truth tensor.
+      pred_key: A key to extract the prediction tensor.
+      weight_key: A key to extract the weight tensor.
+      num_classes: An integer specifying the number of classes in the dataset.
+      ignore_label: An optional integer specifying the ignore label or None.
+      top_k_percent_pixels: An optional float specifying the percentage of
+        pixels used to compute the loss. The value must lie within [0.0, 1.0].
+      dynamic_weight: A boolean indicating whether the weights are determined
+        dynamically w.r.t. the class confidence of each predicted mask.
+
+    Raises:
+      ValueError: An error occurs when top_k_percent_pixels is not between 0.0
+        and 1.0.
+    """
+    # Implicit reduction might mess with tf.distribute.Strategy, hence we
+    # explicitly reduce the loss.
+    super(TopKCrossEntropyLoss,
+          self).__init__(reduction=tf.keras.losses.Reduction.NONE)
+
+    _ensure_topk_value_is_percentage(top_k_percent_pixels)
+
+    self._num_classes = num_classes
+    self._ignore_label = ignore_label
+    self._top_k_percent_pixels = top_k_percent_pixels
+    self._gt_key = gt_key
+    self._pred_key = pred_key
+    self._weight_key = weight_key
+    self._dynamic_weight = dynamic_weight
+
+  def call(self, y_true: Dict[Text, tf.Tensor],
+           y_pred: Dict[Text, tf.Tensor]) -> tf.Tensor:
+    """Computes the top-k cross-entropy loss.
+
+    Args:
+      y_true: A dict of tensors providing ground-truth information. The tensors
+        can be either integer type or one-hot encoded. When integer type, the
+        shape can be either [batch, num_elements] or [batch, height, width].
+        When one-hot encoded, the shape can be [batch, num_elements, channels]
+        or [batch, height, width, channels].
+      y_pred: A dict of tensors providing predictions. The tensors are of shape
+        [batch, num_elements, channels] or [batch, height, width, channels]. If
+        the prediction is 2D (with height and width), we allow the spatial
+        dimension to be strided_height and strided_width. In this case, we
+        downsample the ground truth accordingly.
+
+    Returns:
+      A tensor of shape [batch] containing the loss per image.
+
+    Raises:
+      ValueError: If the prediction is 1D (with the length dimension) but its
+        length does not match that of the ground truth.
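+
+    For example (an illustrative sketch), an integer ground truth of shape
+    [2, 65, 65] may be paired with logits of shape [2, 33, 33, channels]; the
+    ground truth and weights are then strided-downsampled to 33x33.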
+ """ + gt = y_true[self._gt_key] + pred = y_pred[self._pred_key] + gt_shape = gt.get_shape().as_list() + pred_shape = pred.get_shape().as_list() + if self._dynamic_weight: + weights = y_pred[self._weight_key] + else: + weights = y_true[self._weight_key] + + # Downsample the ground truth for 2D prediction cases. + if len(pred_shape) == 4 and gt_shape[1:3] != pred_shape[1:3]: + gt = utils.strided_downsample(gt, pred_shape[1:3]) + weights = utils.strided_downsample(weights, pred_shape[1:3]) + elif len(pred_shape) == 3 and gt_shape[1] != pred_shape[1]: + # We don't support downsampling for 1D predictions. + raise ValueError('The shape of gt does not match the shape of pred.') + + if is_one_hot(gt, pred): + gt = tf.cast(gt, tf.float32) + else: + gt = tf.cast(gt, tf.int32) + gt, weights = encode_one_hot(gt, self._num_classes, weights, + self._ignore_label) + pixel_losses = tf.keras.backend.categorical_crossentropy( + gt, pred, from_logits=True) + weighted_pixel_losses = tf.multiply(pixel_losses, weights) + + return compute_average_top_k_loss(weighted_pixel_losses, + self._top_k_percent_pixels) + + +class FocalCrossEntropyLoss(tf.keras.losses.Loss): + """This class contains code for focal cross-entropy.""" + + def __init__(self, + gt_key: Text, + pred_key: Text, + weight_key: Text, + num_classes: Optional[int], + ignore_label: Optional[int], + focal_loss_alpha: float = 0.75, + focal_loss_gamma: float = 0.0, + background_channel_index: int = -1, + dynamic_weight: bool = True): + """Initializes a focal cross entropy loss. + + FocalCrossEntropyLoss supports focal-loss mode with integer + or one-hot ground-truth labels. + Reference: + [1] Lin, T. Y., Goyal, P., Girshick, R., He, K., & Dollár, P. Focal loss for + dense object detection. In Proceedings of the IEEE International + Conference on Computer Vision (ICCV). (2017) + https://arxiv.org/abs/1708.02002 + + Args: + gt_key: A key to extract the ground-truth tensor. + pred_key: A key to extract the prediction tensor. + weight_key: A key to extract the weight tensor. + num_classes: An integer specifying the number of classes in the dataset. + ignore_label: An optional integer specifying the ignore label or None. + Only effective when ground truth labels are in integer mode. + focal_loss_alpha: An optional float specifying the coefficient that + weights between positive (matched) and negative (unmatched) masks in + focal loss. The positives are weighted by alpha, while the negatives + are weighted by (1. - alpha). Default to 0.75. + focal_loss_gamma: An optional float specifying the coefficient that + weights probability (pt) term in focal loss. Focal loss = - ((1 - pt) ^ + gamma) * log(pt). Default to 0.0. + background_channel_index: The index for background channel. When alpha + is used, we assume the last channel is background and others are + foreground. Default to -1. + dynamic_weight: A boolean indicating whether the weights are determined + dynamically w.r.t. the class confidence of each predicted mask. + """ + # Implicit reduction might mess with tf.distribute.Strategy, hence we + # explicitly reduce the loss. 
+    super(FocalCrossEntropyLoss,
+          self).__init__(reduction=tf.keras.losses.Reduction.NONE)
+
+    self._num_classes = num_classes
+    self._ignore_label = ignore_label
+    self._focal_loss_alpha = focal_loss_alpha
+    self._focal_loss_gamma = focal_loss_gamma
+    self._background_channel_index = background_channel_index
+    self._gt_key = gt_key
+    self._pred_key = pred_key
+    self._weight_key = weight_key
+    self._dynamic_weight = dynamic_weight
+
+  def call(self, y_true: Dict[Text, tf.Tensor],
+           y_pred: Dict[Text, tf.Tensor]) -> tf.Tensor:
+    """Computes the focal cross-entropy loss.
+
+    Args:
+      y_true: A dict of tensors providing ground-truth information. The tensors
+        can be either integer type or one-hot encoded. When integer type, the
+        shape can be either [batch, num_elements] or [batch, height, width].
+        When one-hot encoded, the shape can be [batch, num_elements, channels]
+        or [batch, height, width, channels].
+      y_pred: A dict of tensors providing predictions. The tensors are of shape
+        [batch, num_elements, channels] or [batch, height, width, channels].
+
+    Returns:
+      A tensor of shape [batch] containing the loss per image.
+    """
+    gt = y_true[self._gt_key]
+    pred = y_pred[self._pred_key]
+    if self._dynamic_weight:
+      # Dynamic weights w.r.t. the class confidence of each predicted mask.
+      weights = y_pred[self._weight_key]
+    else:
+      weights = y_true[self._weight_key]
+
+    if is_one_hot(gt, pred):
+      gt = tf.cast(gt, tf.float32)
+    else:
+      gt = tf.cast(gt, tf.int32)
+      gt, weights = encode_one_hot(gt, self._num_classes, weights,
+                                   self._ignore_label)
+    pixel_losses = tf.nn.softmax_cross_entropy_with_logits(gt, pred)
+    # Focal loss: scale the cross entropy by (1 - pt) ^ gamma.
+    if self._focal_loss_gamma == 0.0:
+      pixel_focal_losses = pixel_losses
+    else:
+      predictions = tf.nn.softmax(pred, axis=-1)
+      pt = tf.reduce_sum(predictions * gt, axis=-1)
+      pixel_focal_losses = tf.multiply(
+          tf.pow(1.0 - pt, self._focal_loss_gamma), pixel_losses)
+
+    if self._focal_loss_alpha >= 0:
+      # alpha_weights = alpha * positive masks + (1 - alpha) * negative masks.
+      alpha = self._focal_loss_alpha
+      alpha_weights = (
+          alpha * (1.0 - gt[..., self._background_channel_index]) +
+          (1 - alpha) * gt[..., self._background_channel_index])
+      pixel_focal_losses = alpha_weights * pixel_focal_losses
+    weighted_pixel_losses = tf.multiply(pixel_focal_losses, weights)
+    weighted_pixel_losses = tf.reshape(
+        weighted_pixel_losses, shape=(tf.shape(weighted_pixel_losses)[0], -1))
+    # Compute mean loss over spatial dimension.
+    num_non_zero = tf.reduce_sum(
+        tf.cast(tf.not_equal(weighted_pixel_losses, 0.0), tf.float32), 1)
+    loss_sum_per_sample = tf.reduce_sum(weighted_pixel_losses, 1)
+    return tf.math.divide_no_nan(loss_sum_per_sample, num_non_zero)
+
+
+class MaskDiceLoss(tf.keras.losses.Loss):
+  """This class contains code to compute Mask Dice loss.
+
+  The channel dimension in Mask Dice loss indicates the mask ID in MaX-DeepLab,
+  instead of a "class" dimension in the original Dice loss.
+  """
+
+  def __init__(self,
+               gt_key: Text,
+               pred_key: Text,
+               weight_key: Text,
+               prediction_activation='softmax'):
+    """Initializes a Mask Dice loss.
+
+    Args:
+      gt_key: A key to extract the ground-truth tensor.
+      pred_key: A key to extract the prediction tensor.
+      weight_key: A key to extract the weight tensor.
+      prediction_activation: A String indicating activation function of the
+        prediction. It should be either 'sigmoid' or 'softmax'.
+    """
+    # Implicit reduction might mess with tf.distribute.Strategy, hence we
+    # explicitly reduce the loss.
+ super(MaskDiceLoss, self).__init__(reduction=tf.keras.losses.Reduction.NONE) + + self._gt_key = gt_key + self._pred_key = pred_key + self._weight_key = weight_key + self._prediction_activation = prediction_activation + + def call(self, y_true: Dict[Text, tf.Tensor], + y_pred: Dict[Text, tf.Tensor]) -> tf.Tensor: + """Computes the Mask Dice loss. + + Args: + y_true: A dict of tensors providing ground-truth information. + y_pred: A dict of tensors providing predictions. + + Returns: + A tensor of shape [batch] containing the loss per sample. + """ + gt = y_true[self._gt_key] + pred = y_pred[self._pred_key] + # Dynamic weights w.r.t. the class confidence of each predicted mask. + weights = y_pred[self._weight_key] + weighted_dice_losses = tf.multiply( + compute_mask_dice_loss(gt, pred, self._prediction_activation), + weights) + # Reduce_sum over the channels (i.e., number of masks). + return tf.reduce_sum(weighted_dice_losses, axis=-1) diff --git a/model/loss/base_loss_test.py b/model/loss/base_loss_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c6855eefa1a6d16a02e247e8bc3e9169b1ebdb17 --- /dev/null +++ b/model/loss/base_loss_test.py @@ -0,0 +1,279 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for base_loss.py.""" + +import numpy as np +import tensorflow as tf + +from deeplab2.model.loss import base_loss as loss + + +class BaseLossTest(tf.test.TestCase): + + def test_general_loss(self): + y_true = { + 'gt': tf.ones([2, 33, 33]) * 2, + 'weight': tf.ones([2, 33, 33]) + } + y_pred = {'pred': tf.zeros([2, 33, 33])} + + with self.subTest('L1'): + loss_layer = loss.TopKGeneralLoss( + loss.mean_absolute_error, + 'gt', + 'pred', + 'weight') + expected_loss = tf.ones([2]) * 2 + with self.subTest('MSE'): + loss_layer = loss.TopKGeneralLoss( + loss.mean_squared_error, + 'gt', + 'pred', + 'weight') + expected_loss = tf.ones([2]) * 4 + loss_result = loss_layer(y_true, y_pred) + np.testing.assert_almost_equal( + loss_result.numpy(), expected_loss.numpy(), decimal=5) + + def test_general_loss_weights(self): + weights = np.zeros((2, 33, 33)) + weights[:, 17:29, 15:23] = 1 + + gt = np.ones([2, 33, 33]) * 1.5 + gt[:, 17:29, 15:23] = 2 + + y_true = { + 'gt': tf.convert_to_tensor(gt, dtype=tf.float32), + 'weight': tf.convert_to_tensor(weights, dtype=tf.float32) + } + y_pred = {'pred': tf.zeros([2, 33, 33])} + loss_layer = loss.TopKGeneralLoss( + loss.mean_absolute_error, + 'gt', + 'pred', + 'weight') + + expected_loss = tf.ones([2]) * 2 + loss_result = loss_layer(y_true, y_pred) + + np.testing.assert_almost_equal( + loss_result.numpy(), expected_loss.numpy(), decimal=5) + + def test_topk_ce_loss_ignore(self): + num_classes = 19 + ignore_label = 255 + loss_layer = loss.TopKCrossEntropyLoss( + gt_key='gt', + pred_key='pred', + weight_key='weight', + num_classes=num_classes, + ignore_label=ignore_label) + + gt_tensor = np.ones(shape=[2, 33, 33], dtype=np.int32) * ignore_label + gt_tensor[:, 17:29, 15:23] = 1 + logits = tf.random.uniform(shape=[2, 33, 33, num_classes]) + + y_true = { + 'gt': tf.convert_to_tensor(gt_tensor), + 'weight': tf.ones([2, 33, 33]) + } + y_pred = {'pred': logits} + + expected_result = tf.nn.softmax_cross_entropy_with_logits( + tf.one_hot(np.squeeze(gt_tensor[:, 17:29, 15:23]), num_classes), + logits[:, 17:29, 15:23, :]) + expected_result = tf.reduce_mean(expected_result, axis=[1, 2]) + + per_sample_loss = loss_layer(y_true, y_pred) + + np.testing.assert_almost_equal( + per_sample_loss.numpy(), expected_result.numpy(), decimal=5) + + def test_topk_ce_loss_global_weight(self): + num_classes = 19 + weight = 3.145 + loss_layer = loss.TopKCrossEntropyLoss( + gt_key='gt', + pred_key='pred', + weight_key='weight', + num_classes=num_classes, + ignore_label=255) + logits = tf.random.uniform(shape=[2, 33, 33, num_classes]) + + y_true = { + 'gt': tf.ones([2, 33, 33], tf.int32), + 'weight': tf.ones([2, 33, 33]) + } + y_pred = {'pred': logits} + + expected_result = tf.nn.softmax_cross_entropy_with_logits( + tf.one_hot(y_true['gt'], num_classes), logits) + expected_result = tf.reduce_mean(expected_result, axis=[1, 2]) + expected_result *= weight + + per_sample_loss = loss_layer(y_true, y_pred, weight) + + np.testing.assert_almost_equal( + per_sample_loss.numpy(), expected_result.numpy(), decimal=5) + + def test_topk_ce_loss_topk(self): + num_classes = 19 + top_k = 0.5 + loss_layer = loss.TopKCrossEntropyLoss( + gt_key='gt', + pred_key='pred', + weight_key='weight', + num_classes=num_classes, + top_k_percent_pixels=top_k, + ignore_label=255) + + logits = tf.random.uniform(shape=[2, 33, 33, num_classes]) + y_true = { + 'gt': tf.ones([2, 33, 33], tf.int32), + 'weight': tf.ones([2, 33, 33]) + } + y_pred = {'pred': logits} + + expected_result = tf.nn.softmax_cross_entropy_with_logits( 
+        tf.one_hot(y_true['gt'], num_classes), logits)
+    expected_result, _ = tf.math.top_k(
+        tf.reshape(expected_result, shape=[2, -1]),
+        tf.cast((top_k * tf.size(y_true['gt'], tf.float32) / 2), tf.int32))
+    expected_result = tf.reduce_mean(expected_result, axis=[1])
+
+    per_sample_loss = loss_layer(y_true, y_pred)
+
+    np.testing.assert_almost_equal(
+        per_sample_loss.numpy(), expected_result.numpy(), decimal=5)
+
+  def test_is_one_hot(self):
+    num_classes = 19
+    gt_list = [
+        tf.ones([2, 33, 33], tf.int32),
+        tf.ones([2, 33], tf.int32),
+        tf.one_hot(tf.ones([2, 33, 33], tf.int32), num_classes),
+        tf.one_hot(tf.ones([2, 33], tf.int32), num_classes),
+    ]
+    pred_list = [
+        tf.random.uniform(shape=[2, 33, 33, num_classes]),
+        tf.random.uniform(shape=[2, 33, num_classes]),
+        tf.random.uniform(shape=[2, 33, 33, num_classes]),
+        tf.random.uniform(shape=[2, 33, num_classes]),
+    ]
+    expected_result_list = [False, False, True, True]
+    output_list = []
+    for gt, pred in zip(gt_list, pred_list):
+      output_list.append(loss.is_one_hot(gt, pred))
+    np.testing.assert_equal(output_list, expected_result_list)
+
+  def test_focal_ce_loss_integer_or_one_hot(self):
+    num_classes = 19
+    gamma = 0.5
+    alpha = 0.75
+    loss_layer = loss.FocalCrossEntropyLoss(
+        gt_key='gt',
+        pred_key='pred',
+        weight_key='weight',
+        num_classes=num_classes,
+        focal_loss_alpha=alpha,
+        focal_loss_gamma=gamma,
+        ignore_label=255)
+
+    logits = tf.random.uniform(shape=[2, 33 * 33, num_classes])
+    gt = tf.ones([2, 33 * 33], tf.int32)
+    use_one_hot_encode_list = [False, True]
+    for use_one_hot_encode in use_one_hot_encode_list:
+      if use_one_hot_encode:
+        gt = tf.one_hot(gt, num_classes)
+      y_true = {'gt': gt}
+      y_pred = {'pred': logits,
+                'weight': tf.ones([2, 33 * 33])}
+      predictions = tf.nn.softmax(logits, axis=-1)
+      if use_one_hot_encode:
+        pt = tf.reduce_sum(predictions * gt, axis=-1)
+        expected_result = tf.nn.softmax_cross_entropy_with_logits(gt, logits)
+      else:
+        pt = tf.reduce_sum(predictions * tf.one_hot(gt, num_classes), axis=-1)
+        expected_result = tf.nn.softmax_cross_entropy_with_logits(
+            tf.one_hot(gt, num_classes), logits)
+      expected_result = tf.multiply(tf.pow(1.0 - pt, gamma), expected_result)
+      expected_result = tf.reshape(expected_result, shape=[2, -1])
+      # Since the labels never contain the background channel (the last
+      # channel) in this example, the focal loss is simply scaled by alpha.
+      expected_result = tf.reduce_mean(expected_result, axis=[1]) * alpha
+      per_sample_loss = loss_layer(y_true, y_pred)
+
+      np.testing.assert_almost_equal(
+          per_sample_loss.numpy(), expected_result.numpy(), decimal=5)
+
+  def test_mask_dice_loss(self):
+    gt = [
+        [
+            [1., 1., 1.],
+            [0., 0., 0.],
+            [0., 0., 0.],
+        ],
+        [
+            [0., 0., 0.],
+            [1., 1., 1.],
+            [1., 1., 1.],
+        ],
+    ]
+    gt = tf.constant(gt, dtype=tf.float32)
+    gt = tf.expand_dims(gt, -1)
+    gt = tf.transpose(gt, perm=[3, 1, 2, 0])
+
+    y_true = {'gt': gt}
+
+    pred = [
+        [
+            [1., 1., 0.],
+            [1., 1., 0.],
+            [1., 1., 0.],
+        ],
+        [
+            [0., 0., 1.],
+            [0., 0., 1.],
+            [0., 0., 1.],
+        ],
+    ]
+    # Multiply by 100 so that the softmax output is effectively binary
+    # (0 or 1 values).
+    pred = tf.constant(pred, dtype=tf.float32) * 100.
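+    # As with gt above, the expand_dims/transpose below rearrange pred into
+    # shape [batch=1, height=3, width=3, channels=2], one mask per channel.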
+    pred = tf.expand_dims(pred, -1)
+    pred = tf.transpose(pred, perm=[3, 1, 2, 0])
+    y_pred = {
+        'pred': pred,
+        'weight': tf.ones([1]) * 0.5
+    }
+
+    loss_layer = loss.MaskDiceLoss(
+        gt_key='gt',
+        pred_key='pred',
+        weight_key='weight',
+        prediction_activation='softmax')
+    dice_loss = loss_layer(y_true, y_pred)
+    loss_result = dice_loss.numpy()
+    # For each channel,
+    # dice coefficient = (2 * intersection(=2) + smooth(=1)) /
+    #                    (denominator(=9) + smooth(=1)) = 5 / 10,
+    # so the per-channel dice loss is 1 - 5/10 = 0.5.
+    # Channel-wise sum: 0.5 + 0.5 = 1.0.
+    # Weighted result: 1.0 * weight(=0.5) = 0.5.
+    expected_result = np.array([0.5])
+    np.testing.assert_almost_equal(loss_result, expected_result)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/loss/loss_builder.py b/model/loss/loss_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..89a1606c614e9aaa3666e09ad0e58c4ddf680cc8
--- /dev/null
+++ b/model/loss/loss_builder.py
@@ -0,0 +1,220 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains loss builder classes used in the DeepLab model."""
+
+import collections
+from typing import Any, Dict, Text, Tuple, Optional
+
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2 import config_pb2
+from deeplab2.model.loss import base_loss
+from deeplab2.model.loss import max_deeplab_loss
+
+
+def _create_loss_and_weight(
+    loss_options: config_pb2.LossOptions.SingleLossOptions, gt_key: Text,
+    pred_key: Text, weight_key: Text,
+    **kwargs: Any) -> Tuple[tf.keras.losses.Loss, float]:
+  """Creates a loss and its weight from loss options.
+
+  Args:
+    loss_options: Loss options as defined by
+      config_pb2.LossOptions.SingleLossOptions or None.
+    gt_key: A key to extract the ground-truth from a dictionary.
+    pred_key: A key to extract the prediction from a dictionary.
+    weight_key: A key to extract the per-pixel weights from a dictionary.
+    **kwargs: Additional parameters to initialize the loss.
+
+  Returns:
+    A tuple of an instance of tf.keras.losses.Loss and its corresponding
+    weight as a float.
+
+  Raises:
+    ValueError: An error occurs when the loss name is not a valid loss.
+  """
+  if loss_options is None:
+    return None, 0
+  if loss_options.name == 'softmax_cross_entropy':
+    return base_loss.TopKCrossEntropyLoss(
+        gt_key,
+        pred_key,
+        weight_key,
+        top_k_percent_pixels=loss_options.top_k_percent,
+        **kwargs), loss_options.weight
+  elif loss_options.name == 'l1':
+    return base_loss.TopKGeneralLoss(
+        base_loss.mean_absolute_error,
+        gt_key,
+        pred_key,
+        weight_key,
+        top_k_percent_pixels=loss_options.top_k_percent), loss_options.weight
+  elif loss_options.name == 'mse':
+    return base_loss.TopKGeneralLoss(
+        base_loss.mean_squared_error,
+        gt_key,
+        pred_key,
+        weight_key,
+        top_k_percent_pixels=loss_options.top_k_percent), loss_options.weight
+
+  raise ValueError('Loss %s is not a valid loss.'
+                   % loss_options.name)
+
+
+class DeepLabFamilyLoss(tf.keras.layers.Layer):
+  """This class builds and calls losses for the DeepLab model family."""
+
+  def __init__(
+      self,
+      loss_options: config_pb2.LossOptions,
+      num_classes: Optional[int],
+      ignore_label: Optional[int],
+      thing_class_ids: Tuple[int, ...]):
+    """Initializes the losses for the DeepLab model family.
+
+    Args:
+      loss_options: Loss options as defined by config_pb2.LossOptions.
+      num_classes: An integer specifying the number of classes in the dataset.
+      ignore_label: An optional integer specifying the ignore label or None.
+      thing_class_ids: A tuple of length [N] containing N thing indices.
+    """
+    super(DeepLabFamilyLoss, self).__init__(name='DeepLabFamilyLoss')
+
+    # Single-term losses are losses that have only one loss term and thus each
+    # loss function directly returns a single tensor as the loss value, as
+    # opposed to multi-term losses that involve multiple terms and return a
+    # dictionary of loss values.
+    self._single_term_loss_func_and_weight_dict = collections.OrderedDict()
+    self._extra_loss_names = [common.TOTAL_LOSS]
+
+    if loss_options.HasField(common.SEMANTIC_LOSS):
+      self._single_term_loss_func_and_weight_dict[
+          common.SEMANTIC_LOSS] = _create_loss_and_weight(
+              loss_options.semantic_loss,
+              common.GT_SEMANTIC_KEY,
+              common.PRED_SEMANTIC_LOGITS_KEY,
+              common.SEMANTIC_LOSS_WEIGHT_KEY,
+              num_classes=num_classes,
+              ignore_label=ignore_label)
+
+    if loss_options.HasField(common.CENTER_LOSS):
+      self._single_term_loss_func_and_weight_dict[
+          common.CENTER_LOSS] = _create_loss_and_weight(
+              loss_options.center_loss, common.GT_INSTANCE_CENTER_KEY,
+              common.PRED_CENTER_HEATMAP_KEY, common.CENTER_LOSS_WEIGHT_KEY)
+
+    if loss_options.HasField(common.REGRESSION_LOSS):
+      self._single_term_loss_func_and_weight_dict[
+          common.REGRESSION_LOSS] = _create_loss_and_weight(
+              loss_options.regression_loss, common.GT_INSTANCE_REGRESSION_KEY,
+              common.PRED_OFFSET_MAP_KEY, common.REGRESSION_LOSS_WEIGHT_KEY)
+
+    # Currently, only used for Motion-DeepLab.
+    if loss_options.HasField(common.MOTION_LOSS):
+      self._single_term_loss_func_and_weight_dict[
+          common.MOTION_LOSS] = _create_loss_and_weight(
+              loss_options.motion_loss, common.GT_FRAME_OFFSET_KEY,
+              common.PRED_FRAME_OFFSET_MAP_KEY,
+              common.FRAME_REGRESSION_LOSS_WEIGHT_KEY)
+
+    # Next-frame regression loss used in ViP-DeepLab.
+    if loss_options.HasField(common.NEXT_REGRESSION_LOSS):
+      self._single_term_loss_func_and_weight_dict[
+          common.NEXT_REGRESSION_LOSS] = _create_loss_and_weight(
+              loss_options.next_regression_loss,
+              common.GT_NEXT_INSTANCE_REGRESSION_KEY,
+              common.PRED_NEXT_OFFSET_MAP_KEY,
+              common.NEXT_REGRESSION_LOSS_WEIGHT_KEY)
+
+    # Multi-term losses that return dictionaries of loss terms.
+    self._multi_term_losses = []
+
+    # MaXDeepLabLoss optionally returns four loss terms in total:
+    # - common.PQ_STYLE_LOSS_CLASS_TERM
+    # - common.PQ_STYLE_LOSS_MASK_DICE_TERM
+    # - common.MASK_ID_CROSS_ENTROPY_LOSS
+    # - common.INSTANCE_DISCRIMINATION_LOSS
+    if any([loss_options.HasField('pq_style_loss'),
+            loss_options.HasField('mask_id_cross_entropy_loss'),
+            loss_options.HasField('instance_discrimination_loss')]):
+      self._multi_term_losses.append(max_deeplab_loss.MaXDeepLabLoss(
+          loss_options, ignore_label, thing_class_ids))
+
+    for multi_term_loss in self._multi_term_losses:
+      self._extra_loss_names += multi_term_loss.loss_terms
+
+  def get_loss_names(self):
+    # Keep track of all the keys that will be returned in self.call().
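+    # This covers every single-term loss, all terms of the multi-term losses,
+    # and common.TOTAL_LOSS.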
+    loss_names = list(self._single_term_loss_func_and_weight_dict.keys())
+    return loss_names + self._extra_loss_names
+
+  def call(self, y_true: Dict[Text, tf.Tensor],
+           y_pred: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
+    """Performs the loss computations given ground-truth and predictions.
+
+    The loss is computed for each sample separately. Currently, smoothed
+    ground-truth labels are not supported.
+
+    Args:
+      y_true: A dictionary of tf.Tensor containing all ground-truth data to
+        compute the loss. Depending on the configuration, the dict has to
+        contain common.GT_SEMANTIC_KEY, and optionally
+        common.GT_INSTANCE_CENTER_KEY, common.GT_INSTANCE_REGRESSION_KEY, and
+        common.GT_FRAME_OFFSET_KEY.
+      y_pred: A dictionary of tf.Tensor containing all predictions to compute
+        the loss. Depending on the configuration, the dict has to contain
+        common.PRED_SEMANTIC_LOGITS_KEY, and optionally
+        common.PRED_CENTER_HEATMAP_KEY, common.PRED_OFFSET_MAP_KEY, and
+        common.PRED_FRAME_OFFSET_MAP_KEY.
+
+    Returns:
+      The loss as a dict of tf.Tensor, optionally containing the following:
+      - common.SEMANTIC_LOSS: [batch].
+      - common.CENTER_LOSS: [batch].
+      - common.REGRESSION_LOSS: [batch].
+      - common.MOTION_LOSS: [batch], the frame offset regression loss.
+      - common.NEXT_REGRESSION_LOSS: [batch], the next regression loss.
+
+    Raises:
+      AssertionError: If the keys of the resulting_dict do not match
+        self.get_loss_names().
+      AssertionError: If the keys of the resulting_dict overlap with the keys
+        of the loss_dict.
+    """
+    resulting_dict = collections.OrderedDict()
+
+    # Single-term losses.
+    for loss_name, func_and_weight in (
+        self._single_term_loss_func_and_weight_dict.items()):
+      loss_func, loss_weight = func_and_weight
+      loss_value = loss_func(y_true, y_pred)
+      resulting_dict[loss_name] = loss_value * loss_weight
+
+    # Multi-term losses return a dictionary of loss terms, so we handle them
+    # differently.
+    for multi_term_loss in self._multi_term_losses:
+      loss_dict = multi_term_loss((y_true, y_pred))
+      if not set(loss_dict).isdisjoint(resulting_dict):
+        raise AssertionError('The keys of the resulting_dict overlap with the '
+                             'keys of the loss_dict.')
+      resulting_dict.update(loss_dict)
+
+    # Also include the total loss in the resulting_dict.
+    total_loss = tf.math.accumulate_n(list(resulting_dict.values()))
+    resulting_dict[common.TOTAL_LOSS] = total_loss
+
+    if sorted(resulting_dict.keys()) != sorted(self.get_loss_names()):
+      raise AssertionError(
+          'The keys of the resulting_dict should match self.get_loss_names().')
+    return resulting_dict
diff --git a/model/loss/loss_builder_test.py b/model/loss/loss_builder_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0abcddd15bdbf6aa1434c8a5a5edf1bc91899c51
--- /dev/null
+++ b/model/loss/loss_builder_test.py
@@ -0,0 +1,224 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +"""Tests for loss_builder.py.""" + +import numpy as np +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import trainer_pb2 +from deeplab2.model.loss import loss_builder as loss + + +class LossTest(tf.test.TestCase): + + def test_panoptic_deeplab_loss(self): + ignore_label = 255 + num_classes = 19 + semantic_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='softmax_cross_entropy') + center_loss_options = trainer_pb2.LossOptions.SingleLossOptions(name='mse') + regression_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='l1') + motion_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='l1') + loss_options = trainer_pb2.LossOptions( + semantic_loss=semantic_loss_options, + center_loss=center_loss_options, + regression_loss=regression_loss_options, + motion_loss=motion_loss_options) + + loss_layer = loss.DeepLabFamilyLoss( + loss_options, + num_classes=num_classes, + ignore_label=ignore_label, + thing_class_ids=tuple(range(11, 19))) + + pred_dict = { + common.PRED_SEMANTIC_LOGITS_KEY: + tf.random.uniform(shape=[2, 33, 33, num_classes]), + common.PRED_CENTER_HEATMAP_KEY: + tf.zeros(shape=[2, 33, 33]), + common.PRED_OFFSET_MAP_KEY: + tf.zeros(shape=[2, 33, 33, 2]), + common.PRED_FRAME_OFFSET_MAP_KEY: + tf.zeros(shape=[2, 33, 33, 2]), + } + + with self.subTest('Test center loss.'): + gt_dict = { + common.GT_SEMANTIC_KEY: + tf.ones(shape=[2, 33, 33]) * ignore_label, + common.GT_INSTANCE_CENTER_KEY: + tf.ones(shape=[2, 33, 33]) * 2, + common.GT_INSTANCE_REGRESSION_KEY: + tf.zeros(shape=[2, 33, 33, 2]), + common.GT_FRAME_OFFSET_KEY: + tf.zeros(shape=[2, 33, 33, 2]), + common.SEMANTIC_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.CENTER_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.REGRESSION_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.FRAME_REGRESSION_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + } + # expected_result = square(2 - 0). 
+ expected_result = tf.ones(shape=[2]) * 4 + loss_result = loss_layer(gt_dict, pred_dict)[common.TOTAL_LOSS] + + np.testing.assert_equal(loss_result.numpy(), expected_result.numpy()) + + with self.subTest('Test regression loss.'): + gt_dict = { + common.GT_SEMANTIC_KEY: + tf.ones(shape=[2, 33, 33]) * ignore_label, + common.GT_INSTANCE_CENTER_KEY: + tf.zeros(shape=[2, 33, 33]), + common.GT_INSTANCE_REGRESSION_KEY: + tf.ones(shape=[2, 33, 33, 2]) * 2, + common.GT_FRAME_OFFSET_KEY: + tf.ones(shape=[2, 33, 33, 2]) * 2, + common.SEMANTIC_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.CENTER_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.REGRESSION_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.FRAME_REGRESSION_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + } + expected_result = tf.ones(shape=[2]) * 4 + loss_result = loss_layer(gt_dict, pred_dict)[common.TOTAL_LOSS] + + np.testing.assert_equal(loss_result.numpy(), expected_result.numpy()) + + with self.subTest('Test instances losses.'): + gt_dict = { + common.GT_SEMANTIC_KEY: + tf.ones(shape=[2, 33, 33]) * ignore_label, + common.GT_INSTANCE_CENTER_KEY: + tf.ones(shape=[2, 33, 33]) * 2, + common.GT_INSTANCE_REGRESSION_KEY: + tf.ones(shape=[2, 33, 33, 2]) * 2, + common.GT_FRAME_OFFSET_KEY: + tf.ones(shape=[2, 33, 33, 2]) * 2, + common.SEMANTIC_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.CENTER_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.REGRESSION_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.FRAME_REGRESSION_LOSS_WEIGHT_KEY: + tf.zeros(shape=[2, 33, 33]), + } + expected_result = tf.ones(shape=[2]) * 6 + loss_result = loss_layer(gt_dict, pred_dict)[common.TOTAL_LOSS] + + np.testing.assert_equal(loss_result.numpy(), expected_result.numpy()) + + with self.subTest('Test all losses.'): + gt_dict = { + common.GT_SEMANTIC_KEY: + tf.ones(shape=[2, 33, 33], dtype=tf.int32), + common.GT_INSTANCE_CENTER_KEY: + tf.ones(shape=[2, 33, 33]) * 2, + common.GT_INSTANCE_REGRESSION_KEY: + tf.ones(shape=[2, 33, 33, 2]) * 2, + common.GT_FRAME_OFFSET_KEY: + tf.ones(shape=[2, 33, 33, 2]) * 2, + common.SEMANTIC_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.CENTER_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.REGRESSION_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.FRAME_REGRESSION_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + } + expected_result = tf.nn.softmax_cross_entropy_with_logits( + tf.one_hot(gt_dict[common.GT_SEMANTIC_KEY], num_classes), + pred_dict[common.PRED_SEMANTIC_LOGITS_KEY]) + expected_result = tf.reduce_mean(expected_result, axis=[1, 2]) + # Add center and regression loss. 
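+      # center (MSE): (2 - 0)^2 = 4; regression (L1): |2 - 0| = 2;
+      # motion (L1): |2 - 0| = 2; total extra = 8.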
+ expected_result += tf.ones(shape=[2]) * 8 + + loss_result = loss_layer(gt_dict, pred_dict)[common.TOTAL_LOSS] + + np.testing.assert_equal(loss_result.numpy(), expected_result.numpy()) + + def test_panoptic_deeplab_semantic_loss_only(self): + ignore_label = 255 + num_classes = 19 + semantic_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='softmax_cross_entropy') + loss_options = trainer_pb2.LossOptions( + semantic_loss=semantic_loss_options) + + loss_layer = loss.DeepLabFamilyLoss( + loss_options, + num_classes=num_classes, + ignore_label=ignore_label, + thing_class_ids=tuple(range(11, 19))) + + pred_dict = { + common.PRED_SEMANTIC_LOGITS_KEY: + tf.random.uniform(shape=[2, 33, 33, num_classes]), + } + gt_dict = { + common.GT_SEMANTIC_KEY: tf.ones(shape=[2, 33, 33], dtype=tf.int32), + common.SEMANTIC_LOSS_WEIGHT_KEY: tf.ones(shape=[2, 33, 33]), + } + + expected_result = tf.nn.softmax_cross_entropy_with_logits( + tf.one_hot(gt_dict[common.GT_SEMANTIC_KEY], num_classes), + pred_dict[common.PRED_SEMANTIC_LOGITS_KEY]) + expected_result = tf.reduce_mean(expected_result, axis=[1, 2]) + + loss_dict = loss_layer(gt_dict, pred_dict) + self.assertIn(common.SEMANTIC_LOSS, loss_dict) + self.assertNotIn(common.CENTER_LOSS, loss_dict) + self.assertNotIn(common.REGRESSION_LOSS, loss_dict) + self.assertNotIn(common.MOTION_LOSS, loss_dict) + loss_result = loss_dict[common.SEMANTIC_LOSS] + + np.testing.assert_equal(loss_result.numpy(), expected_result.numpy()) + + def test_panoptic_deeplab_loss_error(self): + semantic_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='softmax_cross_entropy') + center_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='not_a_loss', weight=1.0) + regression_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='l1', weight=1.0) + motion_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='l1', weight=1.0) + loss_options = trainer_pb2.LossOptions( + semantic_loss=semantic_loss_options, + center_loss=center_loss_options, + regression_loss=regression_loss_options, + motion_loss=motion_loss_options) + + with self.assertRaises(ValueError): + _ = loss.DeepLabFamilyLoss(loss_options, + num_classes=19, + ignore_label=255, + thing_class_ids=tuple(range(11, 19))) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/loss/matchers_ops.py b/model/loss/matchers_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..4273e94476d7d7859797c8dfc1ace1ffe100f892 --- /dev/null +++ b/model/loss/matchers_ops.py @@ -0,0 +1,495 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tensorflow implementation to solve the Linear Sum Assignment problem. + +The Linear Sum Assignment problem involves determining the minimum weight +matching for bipartite graphs. For example, this problem can be defined by +a 2D matrix C, where each element i,j determines the cost of matching worker i +with job j. 
The solution to the problem is a complete assignment of jobs to
+workers, such that no job is assigned to more than one worker and no worker is
+assigned more than one job, with minimum cost.
+
+This implementation is designed to be used with tf.compat.v2 to be compatible
+with the rest of the DeepLab2 library. It builds off of the Hungarian Matching
+Algorithm (https://www.cse.ust.hk/~golin/COMP572/Notes/Matching.pdf), the
+original Lingvo tensorflow implementation by Jiquan Ngiam, and the modified TF1
+version by Amil Merchant.
+"""
+
+import tensorflow as tf
+
+
+def _prepare(weights):
+  """Prepares the cost matrix.
+
+  To improve the computational efficiency of the algorithm, all weights are
+  shifted to be non-negative. Each element is reduced by the row / column
+  minimum. Note that neither operation will affect the resulting solution but
+  will provide a better starting point for the greedy assignment. Note this
+  corresponds to the pre-processing and step 1 of the Hungarian algorithm from
+  Wikipedia.
+
+  Args:
+    weights: A float32 [batch_size, num_elems, num_elems] tensor, where each
+      inner matrix represents weights to be used for matching.
+
+  Returns:
+    A prepared weights tensor of the same shape and dtype.
+  """
+  # Since every worker needs a job and every job needs a worker, we can
+  # subtract the minimum from each.
+  weights -= tf.reduce_min(weights, axis=2, keepdims=True)
+  weights -= tf.reduce_min(weights, axis=1, keepdims=True)
+  return weights
+
+
+def _greedy_assignment(adj_matrix):
+  """Greedily assigns workers to jobs based on an adjacency matrix.
+
+  Starting with an adjacency matrix representing the available connections
+  in the bipartite graph, this function greedily chooses elements such
+  that each worker is matched to at most one job (or each job is assigned to
+  at most one worker). Note, if the adjacency matrix has no available values
+  for a particular row/column, the corresponding job/worker may go unassigned.
+
+  Args:
+    adj_matrix: A bool [batch_size, num_elems, num_elems] tensor, where each
+      element of the inner matrix represents whether the worker (row) can be
+      matched to the job (column).
+
+  Returns:
+    A bool [batch_size, num_elems, num_elems] tensor, where each element of the
+    inner matrix represents whether the worker has been matched to the job.
+    Each row and column can have at most one true element. Some of the rows
+    and columns may not be matched.
+  """
+  _, num_elems, _ = get_shape_list(adj_matrix, expected_rank=3)
+  adj_matrix = tf.transpose(adj_matrix, [1, 0, 2])
+
+  # Create a dynamic TensorArray containing the assignments for each
+  # worker/job.
+  assignment = tf.TensorArray(tf.bool, num_elems)
+
+  # Store the elements assigned to each column to update each iteration.
+  col_assigned = tf.zeros_like(adj_matrix[0, ...], dtype=tf.bool)
+
+  # Iteratively assign each row using tf.foldl. Intuitively, this is a loop
+  # over rows, where we incrementally assign each row.
+  def _assign_row(accumulator, row_adj):
+    # The accumulator tracks the row assignment index.
+    idx, assignment, col_assigned = accumulator
+
+    # Viable candidates cannot already be assigned to another job.
+    candidates = row_adj & (~col_assigned)
+
+    # Deterministically assign each row to its first viable candidate; ties
+    # are broken toward the lowest column index by tf.argmax.
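+    # For example, if candidates = [[False, True, True]], tf.argmax over the
+    # int32 cast returns index 1, matching the row to the first open column.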
+    max_candidate_idx = tf.argmax(
+        tf.cast(candidates, tf.int32), axis=1, output_type=tf.int32)
+
+    candidates_indicator = tf.one_hot(
+        max_candidate_idx,
+        num_elems,
+        on_value=True,
+        off_value=False,
+        dtype=tf.bool)
+    candidates_indicator &= candidates
+
+    # Make assignment to the column.
+    col_assigned |= candidates_indicator
+    assignment = assignment.write(idx, candidates_indicator)
+
+    return idx + 1, assignment, col_assigned
+
+  _, assignment, _ = tf.foldl(
+      _assign_row, adj_matrix, (0, assignment, col_assigned), back_prop=False)
+
+  assignment = assignment.stack()
+  assignment = tf.transpose(assignment, [1, 0, 2])
+  return assignment
+
+
+def _find_augmenting_path(assignment, adj_matrix):
+  """Finds an augmenting path given an assignment and an adjacency matrix.
+
+  The augmenting path search starts from the unassigned workers, then goes on
+  to find jobs (via an unassigned pairing), then back again to workers (via an
+  existing pairing), and so on. The path alternates between unassigned and
+  existing pairings. Returns the state after the search.
+
+  Note: In the state, the worker and job indices are 1-indexed so that we can
+  use 0 to represent unreachable nodes. State contains the following keys:
+
+  - jobs: A [batch_size, 1, num_elems] tensor containing the highest index
+    unassigned worker that can reach this job through a path.
+  - jobs_from_worker: A [batch_size, num_elems] tensor containing the worker
+    reached immediately before this job.
+  - workers: A [batch_size, num_elems, 1] tensor containing the highest index
+    unassigned worker that can reach this worker through a path.
+  - workers_from_job: A [batch_size, num_elems] tensor containing the job
+    reached immediately before this worker.
+  - new_jobs: A bool [batch_size, num_elems] tensor containing True if the
+    unassigned job can be reached via a path.
+
+  State can be used to recover the path via backtracking.
+
+  Args:
+    assignment: A bool [batch_size, num_elems, num_elems] tensor, where each
+      element of the inner matrix represents whether the worker has been
+      matched to the job. This may be a partial assignment.
+    adj_matrix: A bool [batch_size, num_elems, num_elems] tensor, where each
+      element of the inner matrix represents whether the worker (row) can be
+      matched to the job (column).
+
+  Returns:
+    A state dict, which represents the outcome of running an augmenting
+    path search on the graph given the assignment.
+  """
+  batch_size, num_elems, _ = get_shape_list(assignment, expected_rank=3)
+  unassigned_workers = ~tf.reduce_any(assignment, axis=2, keepdims=True)
+  unassigned_jobs = ~tf.reduce_any(assignment, axis=1, keepdims=True)
+
+  unassigned_pairings = tf.cast(adj_matrix & ~assignment, tf.int32)
+  existing_pairings = tf.cast(assignment, tf.int32)
+
+  # Initialize unassigned workers to have non-zero ids; assigned workers will
+  # have ids = 0.
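+  # For example, with num_elems = 3 and only worker 1 (0-indexed) unassigned,
+  # init_workers for that sample is [[0], [2], [0]].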
+  worker_indices = tf.range(1, num_elems + 1, dtype=tf.int32)
+  init_workers = tf.tile(worker_indices[tf.newaxis, :, tf.newaxis],
+                         [batch_size, 1, 1])
+  init_workers *= tf.cast(unassigned_workers, tf.int32)
+
+  state = {
+      "jobs": tf.zeros((batch_size, 1, num_elems), dtype=tf.int32),
+      "jobs_from_worker": tf.zeros((batch_size, num_elems), dtype=tf.int32),
+      "workers": init_workers,
+      "workers_from_job": tf.zeros((batch_size, num_elems), dtype=tf.int32)
+  }
+
+  def _has_active_workers(state, curr_workers):
+    """Checks if there are still active workers."""
+    del state
+    return tf.reduce_sum(curr_workers) > 0
+
+  def _augment_step(state, curr_workers):
+    """Performs one search step."""
+
+    # Note: These steps could be potentially much faster if sparse matrices are
+    # supported. The unassigned_pairings and existing_pairings matrices can be
+    # very sparse.
+
+    # Find potential jobs using current workers.
+    potential_jobs = curr_workers * unassigned_pairings
+    curr_jobs = tf.reduce_max(potential_jobs, axis=1, keepdims=True)
+    curr_jobs_from_worker = 1 + tf.argmax(
+        potential_jobs, axis=1, output_type=tf.int32)
+
+    # Remove already accessible jobs from curr_jobs.
+    default_jobs = tf.zeros_like(state["jobs"], dtype=state["jobs"].dtype)
+    curr_jobs = tf.where(state["jobs"] > 0, default_jobs, curr_jobs)
+    curr_jobs_from_worker *= tf.cast(curr_jobs > 0, tf.int32)[:, 0, :]
+
+    # Find potential workers from current jobs.
+    potential_workers = curr_jobs * existing_pairings
+    curr_workers = tf.reduce_max(potential_workers, axis=2, keepdims=True)
+    curr_workers_from_job = 1 + tf.argmax(
+        potential_workers, axis=2, output_type=tf.int32)
+
+    # Remove already accessible workers from curr_workers.
+    default_workers = tf.zeros_like(state["workers"])
+    curr_workers = tf.where(
+        state["workers"] > 0, default_workers, curr_workers)
+    curr_workers_from_job *= tf.cast(curr_workers > 0, tf.int32)[:, :, 0]
+
+    # Update state so that we can backtrack later.
+    state = state.copy()
+    state["jobs"] = tf.maximum(state["jobs"], curr_jobs)
+    state["jobs_from_worker"] = tf.maximum(state["jobs_from_worker"],
+                                           curr_jobs_from_worker)
+    state["workers"] = tf.maximum(state["workers"], curr_workers)
+    state["workers_from_job"] = tf.maximum(state["workers_from_job"],
+                                           curr_workers_from_job)
+
+    return state, curr_workers
+
+  with tf.name_scope("find_augmenting_path"):
+    state, _ = tf.while_loop(
+        _has_active_workers,
+        _augment_step, (state, init_workers),
+        back_prop=False)
+
+  # Compute new jobs; this is useful for determining termination of the
+  # maximum bipartite matching and initialization for backtracking.
+  new_jobs = (state["jobs"] > 0) & unassigned_jobs
+  state["new_jobs"] = new_jobs[:, 0, :]
+  return state
+
+
+def _improve_assignment(assignment, state):
+  """Improves an assignment by backtracking the augmented path using state.
+
+  Args:
+    assignment: A bool [batch_size, num_elems, num_elems] tensor, where each
+      element of the inner matrix represents whether the worker has been
+      matched to the job. This may be a partial assignment.
+    state: A dict, which represents the outcome of running an augmenting path
+      search on the graph given the assignment.
+
+  Returns:
+    A new assignment matrix of the same shape and type as assignment, where the
+    assignment has been updated using the augmented path found.
+  """
+  batch_size, num_elems, _ = get_shape_list(assignment, 3)
+
+  # We store the current job id and iteratively backtrack using
+  # jobs_from_worker and workers_from_job until we reach an unassigned worker.
+  # We flip all the assignments on this path to discover a better overall
+  # assignment.
+
+  # Note: The indices in state are 1-indexed, where 0 represents that the
+  # worker / job cannot be reached.
+
+  # Obtain initial job indices based on new_jobs.
+  curr_job_idx = tf.argmax(
+      tf.cast(state["new_jobs"], tf.int32), axis=1, output_type=tf.int32)
+
+  # Track whether an example is actively being backtracked. Since we are
+  # operating on a batch, not all examples in the batch may be active.
+  active = tf.gather(state["new_jobs"], curr_job_idx, batch_dims=1)
+  batch_range = tf.range(0, batch_size, dtype=tf.int32)
+
+  # Flip matrix tracks which assignments we need to flip - corresponding to the
+  # augmenting path taken. We use an integer tensor here so that we can use
+  # tensor_scatter_nd_add to update the tensor, and then cast it back to bool
+  # after the loop.
+  flip_matrix = tf.zeros((batch_size, num_elems, num_elems), dtype=tf.int32)
+
+  def _has_active_backtracks(flip_matrix, active, curr_job_idx):
+    """Checks if there are still examples being actively backtracked."""
+    del flip_matrix, curr_job_idx
+    return tf.reduce_any(active)
+
+  def _backtrack_one_step(flip_matrix, active, curr_job_idx):
+    """Takes one step in backtracking."""
+    # Discover the worker that the job originated from, note that this worker
+    # must exist by construction.
+    curr_worker_idx = tf.gather(
+        state["jobs_from_worker"], curr_job_idx, batch_dims=1) - 1
+    curr_worker_idx = tf.maximum(curr_worker_idx, 0)
+    update_indices = tf.stack([batch_range, curr_worker_idx, curr_job_idx],
+                              axis=1)
+    update_indices = tf.maximum(update_indices, 0)
+    flip_matrix = tf.tensor_scatter_nd_add(flip_matrix, update_indices,
+                                           tf.cast(active, tf.int32))
+
+    # Discover the (potential) job that the worker originated from.
+    curr_job_idx = tf.gather(
+        state["workers_from_job"], curr_worker_idx, batch_dims=1) - 1
+    # Note that jobs may not be active, and we track that here (before
+    # adjusting indices so that they are all >= 0 for gather).
+    active &= curr_job_idx >= 0
+    curr_job_idx = tf.maximum(curr_job_idx, 0)
+    update_indices = tf.stack([batch_range, curr_worker_idx, curr_job_idx],
+                              axis=1)
+    update_indices = tf.maximum(update_indices, 0)
+    flip_matrix = tf.tensor_scatter_nd_add(flip_matrix, update_indices,
+                                           tf.cast(active, tf.int32))
+
+    return flip_matrix, active, curr_job_idx
+
+  with tf.name_scope("improve_assignment"):
+    flip_matrix, _, _ = tf.while_loop(
+        _has_active_backtracks,
+        _backtrack_one_step, (flip_matrix, active, curr_job_idx),
+        back_prop=False)
+
+  flip_matrix = tf.cast(flip_matrix, tf.bool)
+  assignment = tf.math.logical_xor(assignment, flip_matrix)
+
+  return assignment
+
+
+def _maximum_bipartite_matching(adj_matrix, assignment=None):
+  """Performs maximum bipartite matching using augmented paths.
+
+  Args:
+    adj_matrix: A bool [batch_size, num_elems, num_elems] tensor, where each
+      element of the inner matrix represents whether the worker (row) can be
+      matched to the job (column).
+    assignment: An optional bool [batch_size, num_elems, num_elems] tensor,
+      where each element of the inner matrix represents whether the worker has
+      been matched to the job. This may be a partial assignment. If specified,
+      this assignment will be used to seed the iterative algorithm.
+
+  Returns:
+    A state dict representing the final augmenting path state search, and
+    a maximum bipartite matching assignment tensor. Note that the state outcome
+    can be used to compute a minimum vertex cover for the bipartite graph.
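+    By König's theorem, the size of such a minimum vertex cover equals the
+    size of the maximum matching, which is what hungarian_matching below
+    relies on to detect when a perfect matching has been found.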
+ """ + + if assignment is None: + assignment = _greedy_assignment(adj_matrix) + + state = _find_augmenting_path(assignment, adj_matrix) + + def _has_new_jobs(state, assignment): + del assignment + return tf.reduce_any(state["new_jobs"]) + + def _improve_assignment_and_find_new_path(state, assignment): + assignment = _improve_assignment(assignment, state) + state = _find_augmenting_path(assignment, adj_matrix) + return state, assignment + + with tf.name_scope("maximum_bipartite_matching"): + state, assignment = tf.while_loop( + _has_new_jobs, + _improve_assignment_and_find_new_path, (state, assignment), + back_prop=False) + + return state, assignment + + +def _compute_cover(state, assignment): + """Computes a cover for the bipartite graph. + + We compute a cover using the construction provided at + https://en.wikipedia.org/wiki/K%C5%91nig%27s_theorem_(graph_theory)#Proof + which uses the outcome from the alternating path search. + + Args: + state: A state dict, which represents the outcome of running an augmenting + path search on the graph given the assignment. + assignment: An optional bool [batch_size, num_elems, num_elems] tensor, + where each element of the inner matrix represents whether the worker has + been matched to the job. This may be a partial assignment. If specified, + this assignment will be used to seed the iterative algorithm. + + Returns: + A tuple of (workers_cover, jobs_cover) corresponding to row and column + covers for the bipartite graph. workers_cover is a boolean tensor of shape + [batch_size, num_elems, 1] and jobs_cover is a boolean tensor of shape + [batch_size, 1, num_elems]. + """ + assigned_workers = tf.reduce_any(assignment, axis=2, keepdims=True) + assigned_jobs = tf.reduce_any(assignment, axis=1, keepdims=True) + + reachable_workers = state["workers"] > 0 + reachable_jobs = state["jobs"] > 0 + + workers_cover = assigned_workers & (~reachable_workers) + jobs_cover = assigned_jobs & reachable_jobs + + return workers_cover, jobs_cover + + +def _update_weights_using_cover(workers_cover, jobs_cover, weights): + """Updates weights for hungarian matching using a cover. + + We first find the minimum uncovered weight. Then, we subtract this from all + the uncovered weights, and add it to all the doubly covered weights. + + Args: + workers_cover: A boolean tensor of shape [batch_size, num_elems, 1]. + jobs_cover: A boolean tensor of shape [batch_size, 1, num_elems]. + weights: A float32 [batch_size, num_elems, num_elems] tensor, where each + inner matrix represents weights to be use for matching. + + Returns: + A new weight matrix with elements adjusted by the cover. + """ + max_value = tf.reduce_max(weights) + + covered = workers_cover | jobs_cover + double_covered = workers_cover & jobs_cover + + uncovered_weights = tf.where(covered, + tf.ones_like(weights) * max_value, weights) + min_weight = tf.reduce_min(uncovered_weights, axis=[-2, -1], keepdims=True) + + add_weight = tf.where(double_covered, + tf.ones_like(weights) * min_weight, + tf.zeros_like(weights)) + sub_weight = tf.where(covered, tf.zeros_like(weights), + tf.ones_like(weights) * min_weight) + + return weights + add_weight - sub_weight + + +def get_shape_list(tensor, expected_rank=None): + """Returns a list of the shape of tensor. + + Args: + tensor: A tf.Tensor object to find the shape of + expected_rank: An (optional) int with the expected rank of the inputted + tensor. + + Returns: + A list representing the shape of the tesnor. 
+
+  Raises:
+    ValueError: If the expected rank does not match the actual rank of the
+      input tensor.
+  """
+  actual_rank = tensor.shape.ndims
+
+  if expected_rank and actual_rank != expected_rank:
+    raise ValueError("The tensor has rank %d which is not equal to the "
+                     "expected rank %d" % (actual_rank, expected_rank))
+
+  shape = tensor.shape.as_list()
+  dynamic = tf.shape(tensor)
+  output = [dim if dim else dynamic[ind] for ind, dim in enumerate(shape)]
+  return output
+
+
+def hungarian_matching(weights):
+  """Computes the minimum linear sum assignment using the Hungarian algorithm.
+
+  Args:
+    weights: A float32 [batch_size, num_elems, num_elems] tensor, where each
+      inner matrix represents weights to be used for matching.
+
+  Returns:
+    A bool [batch_size, num_elems, num_elems] tensor, where each element of
+    the inner matrix represents whether the worker has been matched to the
+    job. The returned matching will always be a perfect match.
+  """
+  batch_size, num_elems, _ = get_shape_list(weights, 3)
+
+  weights = _prepare(weights)
+  adj_matrix = tf.equal(weights, 0.)
+  state, assignment = _maximum_bipartite_matching(adj_matrix)
+  workers_cover, jobs_cover = _compute_cover(state, assignment)
+
+  def _cover_incomplete(workers_cover, jobs_cover, *args):
+    del args
+    cover_sum = (
+        tf.reduce_sum(tf.cast(workers_cover, tf.int32)) +
+        tf.reduce_sum(tf.cast(jobs_cover, tf.int32)))
+    return tf.less(cover_sum, batch_size * num_elems)
+
+  def _update_weights_and_match(workers_cover, jobs_cover, weights, assignment):
+    weights = _update_weights_using_cover(workers_cover, jobs_cover, weights)
+    adj_matrix = tf.equal(weights, 0.)
+    state, assignment = _maximum_bipartite_matching(adj_matrix, assignment)
+    workers_cover, jobs_cover = _compute_cover(state, assignment)
+    return workers_cover, jobs_cover, weights, assignment
+
+  with tf.name_scope("hungarian_matching"):
+    workers_cover, jobs_cover, weights, assignment = tf.while_loop(
+        _cover_incomplete,
+        _update_weights_and_match,
+        (workers_cover, jobs_cover, weights, assignment),
+        back_prop=False)
+
+  return assignment
diff --git a/model/loss/matchers_ops_test.py b/model/loss/matchers_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e453a12329a9ac79b9f24399fa8f7e2e047e29c
--- /dev/null
+++ b/model/loss/matchers_ops_test.py
@@ -0,0 +1,136 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +"""Tests for matchers_ops.""" + +import numpy as np +from scipy import optimize +import tensorflow as tf + +from deeplab2.model.loss import matchers_ops + + +class MatchersOpsTest(tf.test.TestCase): + + def hungarian_matching_tpu(self, cost_matrix): + resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') + tf.config.experimental_connect_to_cluster(resolver) + tf.tpu.experimental.initialize_tpu_system(resolver) + strategy = tf.distribute.TPUStrategy(resolver) + + @tf.function + def function(): + costs = tf.constant(cost_matrix, cost_matrix.dtype, cost_matrix.shape) + return matchers_ops.hungarian_matching(costs) + # Get the first replica output. + return strategy.run(function).values[0].numpy() + + def testLinearSumAssignment(self): + """Check a simple 2D test case of the Linear Sum Assignment problem. + + Ensures that the implementation of the matching algorithm is correct + and functional on TPUs. + """ + cost_matrix = np.array([[[4, 1, 3], [2, 0, 5], [3, 2, 2]]], + dtype=np.float32) + adjacency_output = self.hungarian_matching_tpu(cost_matrix) + + correct_output = np.array([ + [0, 1, 0], + [1, 0, 0], + [0, 0, 1], + ], dtype=bool) + self.assertAllEqual(adjacency_output[0], correct_output) + + def testBatchedLinearSumAssignment(self): + """Check a batched case of the Linear Sum Assignment Problem. + + Ensures that a correct solution is found for all inputted problems within + a batch. + """ + cost_matrix = np.array([ + [[4, 1, 3], [2, 0, 5], [3, 2, 2]], + [[1, 4, 3], [0, 2, 5], [2, 3, 2]], + [[1, 3, 4], [0, 5, 2], [2, 2, 3]], + ], + dtype=np.float32) + + adjacency_output = self.hungarian_matching_tpu(cost_matrix) + + # Hand solved correct output for the linear sum assignment problem + correct_output = np.array([ + [[0, 1, 0], [1, 0, 0], [0, 0, 1]], + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], + [[1, 0, 0], [0, 0, 1], [0, 1, 0]], + ], + dtype=bool) + self.assertAllClose(adjacency_output, correct_output) + + def testMaximumBipartiteMatching(self): + """Check that the maximum bipartite match assigns the correct numbers.""" + adj_matrix = tf.cast([[ + [1, 0, 0, 0, 1], + [0, 1, 0, 1, 0], + [0, 0, 1, 0, 0], + [0, 1, 0, 0, 0], + [1, 0, 0, 0, 0], + ]], tf.bool) # pyformat: disable + _, assignment = matchers_ops._maximum_bipartite_matching(adj_matrix) + self.assertEqual(np.sum(assignment), 5) + + def testAssignmentMatchesScipy(self): + """Check that the Linear Sum Assignment matches the Scipy implementation.""" + batch_size, num_elems = 2, 25 + weights = tf.random.uniform((batch_size, num_elems, num_elems), + minval=0., + maxval=1.) + assignment = matchers_ops.hungarian_matching(weights) + actual_weights = weights.numpy() + actual_assignment = assignment.numpy() + + for idx in range(batch_size): + _, scipy_assignment = optimize.linear_sum_assignment(actual_weights[idx]) + hungarian_assignment = np.where(actual_assignment[idx])[1] + + self.assertAllEqual(hungarian_assignment, scipy_assignment) + + def testAssignmentRunsOnTPU(self): + """Check that a batch of assignments matches Scipy.""" + batch_size, num_elems = 4, 100 + cost_matrix = np.random.rand(batch_size, num_elems, num_elems) + + actual_assignment = self.hungarian_matching_tpu(cost_matrix) + + for idx in range(batch_size): + _, scipy_assignment = optimize.linear_sum_assignment(cost_matrix[idx]) + hungarian_assignment = np.where(actual_assignment[idx])[1] + self.assertAllEqual(hungarian_assignment, scipy_assignment) + + def testLargeBatch(self): + """Check large-batch performance of Hungarian matcher. 
+ + Useful for testing efficiency of the proposed solution and regression + testing. Current solution is thought to be quadratic in nature, yielding + significant slowdowns when the number of queries is increased. + """ + batch_size, num_elems = 64, 100 + cost_matrix = np.abs( + np.random.normal(size=(batch_size, num_elems, num_elems))) + + _ = self.hungarian_matching_tpu(cost_matrix) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/loss/max_deeplab_loss.py b/model/loss/max_deeplab_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..67368c5a69b4c9b6e871fb4ccded7cb7a502a762 --- /dev/null +++ b/model/loss/max_deeplab_loss.py @@ -0,0 +1,721 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains the loss functions for MaX-DeepLab models. + +Reference: + MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. +""" +from typing import Text, Dict, Tuple, List + +import tensorflow as tf +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.model import utils +from deeplab2.model.loss import base_loss +from deeplab2.model.loss import matchers_ops + +# Positive and negative constants that are used to pad or mask hungarian +# matching weights. +_MATCHING_NEGATIVE_CONSTANT = -999.0 +_MATCHING_POSITIVE_CONSTANT = 999.0 +# A large negative constant applied before softmax. This will make the softmax +# ignore the masked logits. +_SOFTMAX_MASKING_CONSTANT = -99999.0 + +_GT_KEY = 'gt_key' +_PRED_KEY = 'pred_key' +_WEIGHT_KEY = 'weight_key' + + +def _generate_mask_slot_semantic_one_hot( + matched_mask_slot_indices: tf.Tensor, + mask_gt_semantic_map: tf.Tensor, + num_mask_slots: int, + thing_stuff_class_ids: List[int]): + """Generates the ground truth for transformer_class_logits. + + This function generates a pseudo ground truth that we will use to train the + transformer class head logits. The input tensors, matched_mask_slot_indices + and mask_gt_semantic_map, are obtained by (hungarian) matching the ground + truth masks with the predicted masks. Note that this function generates the + positive one hot encodings only, i.e., the void class is not included in the + output tensor but will be generated outside the function. + + Args: + matched_mask_slot_indices: An int32 tf.Tensor of shape [batch_size, + num_ground_truth_masks] that encodes the matched mask slot id for each + ground truth mask. + mask_gt_semantic_map: An int32 tf.Tensor of shape [batch_size, + num_ground_truth_masks] that encodes the semantic label for each ground + truth mask. A padded mask (or void, or no object) will have the label -1. + num_mask_slots: An integer, the number of mask slots for the MaX-DeepLab + model. + thing_stuff_class_ids: A list of integers of length [num_thing_classes + + num_stuff_classes] that encodes the class IDs for all thing and stuff + classes. 
It is a concatenation of the thing_class_ids list and the
+      stuff_class_ids list.
+
+  Returns:
+    mask_slot_semantic_one_hot: An output tf.Tensor with shape [batch_size,
+      num_mask_slots, num_thing_classes + num_stuff_classes].
+  """
+  semantic_map_shape = mask_gt_semantic_map.get_shape().as_list()
+  batch_size = semantic_map_shape[0]
+  num_ground_truth_masks = semantic_map_shape[-1]
+
+  # Concatenate the indices in each dimension of the ground truth one hot
+  # output.
+  batch_indices = tf.expand_dims(tf.range(batch_size), axis=-1)
+  batch_indices = tf.tile(batch_indices, [1, num_ground_truth_masks])
+  batch_indices = tf.reshape(batch_indices, [-1, 1])
+  matched_mask_slot_indices = tf.reshape(matched_mask_slot_indices, [-1, 1])
+  # We shift the semantic map by one so that void labels (-1) will be a valid
+  # index too. Otherwise, tf.scatter_nd raises an error if it runs on CPU.
+  semantic_indices = tf.reshape(mask_gt_semantic_map, [-1, 1]) + 1
+  indices = tf.concat([batch_indices,
+                       matched_mask_slot_indices,
+                       semantic_indices], axis=-1)
+
+  # Generate mask_slot_semantic_one_hot by scattering constant ones onto a
+  # constant zero tensor.
+  updates = tf.ones([batch_size * num_ground_truth_masks], dtype=tf.float32)
+  mask_slot_semantic_one_hot = tf.scatter_nd(
+      indices, updates,
+      shape=[batch_size, num_mask_slots, max(thing_stuff_class_ids) + 2])
+
+  # Gather the wanted classes in the desired (thing + stuff) order.
+  thing_stuff_tensor = tf.cast(thing_stuff_class_ids, tf.int32)
+  # We also shift the thing_stuff_tensor index by one in order to revert the
+  # semantic map shifting above.
+  mask_slot_semantic_one_hot = tf.gather(mask_slot_semantic_one_hot,
+                                         thing_stuff_tensor + 1, axis=2)
+  return mask_slot_semantic_one_hot
+
+
+def nonsquare_hungarian_matching(
+    weights: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+  """Hungarian matching with arbitrary shape.
+
+  The matchers_ops.hungarian_matching supports only square weight matrices.
+  This function generalizes the hungarian matching to nonsquare cases by
+  padding the weights to a square matrix and running the square-version
+  matching. The property of hungarian matching ensures that the solutions are
+  equivalent for the padded square problem and the original nonsquare
+  problem.
+
+  Args:
+    weights: A [batch, shape1, shape2] float32 tf.Tensor.
+
+  Returns:
+    square_permutation: A [batch, max(shape1, shape2), max(shape1, shape2)]
+      float32 tf.Tensor that is the permutation matrix that achieves the
+      minimum total weight. Note that a permutation matrix contains only
+      values 0.0 and 1.0, with each row and each column summing to 1.0.
+    nonsquare_permutation: A [batch, shape1, shape2] float32 tf.Tensor. The
+      nonsquare part of the permutation matrix.
+  """
+  _, height, width = weights.get_shape().as_list()
+  max_height_width = max(height, width)
+  # Padding a constant on one axis does not affect matching results.
+  weights = tf.pad(weights,
+                   [[0, 0],  # Do not pad the batch dimension.
+                    [0, max_height_width - height],
+                    [0, max_height_width - width]],
+                   constant_values=_MATCHING_NEGATIVE_CONSTANT)
+  square_permutation = matchers_ops.hungarian_matching(weights)
+
+  square_permutation = tf.cast(square_permutation, tf.float32)
+  return square_permutation, square_permutation[:, :height, :width]
+
+
+def _mask_similarity(gt_mask: tf.Tensor, pred_mask: tf.Tensor,
+                     metric: str = 'dice') -> tf.Tensor:
+  """Computes mask similarity between gt_masks and pred_masks.
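+
+  For the default 'dice' metric, for example, this computes
+  intersection / ((|gt| + |pred|) / 2 + 1e-5) for every (ground truth,
+  prediction) mask pair, i.e., the dice coefficient
+  2 * |gt * pred| / (|gt| + |pred|) up to the stabilizing epsilon in the
+  denominator.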
+
+  Args:
+    gt_mask: A [batch, height * width, num_gt_masks] float32 tf.Tensor that
+      contains only values 0.0 and 1.0. Each 1.0 indicates that the pixel
+      belongs to the ground truth mask. Note that panoptic segmentation
+      enforces that ground truth masks do not overlap.
+    pred_mask: A [batch, height * width, num_pred_masks] float32 tf.Tensor
+      with positive values. For each batch_id and pixel_id, the
+      [num_pred_masks] vector encodes whether each pixel belongs to each
+      mask. The sum of each vector is less than or equal to one.
+    metric: A string, the mask similarity metric that we will compute.
+      Supports 'dice' (default), 'iou', 'intersection_over_ground_truth', and
+      'intersection_over_prediction'.
+
+  Returns:
+    mask_similarity: A float32 [batch, num_gt_masks, num_pred_masks]
+      tf.Tensor that contains the mask similarity between all ground truth
+      masks and all predicted masks.
+
+  Raises:
+    ValueError: If the mask similarity metric is not one of 'dice', 'iou',
+      'intersection_over_ground_truth', or 'intersection_over_prediction'.
+  """
+  denominator_epsilon = 1e-5
+  intersection = tf.einsum('bpi,bpj->bij', gt_mask, pred_mask)
+  if metric.lower() == 'dice':
+    denominator = (tf.expand_dims(tf.reduce_sum(gt_mask, axis=1), axis=2) +
+                   tf.reduce_sum(pred_mask, axis=1, keepdims=True)) / 2
+  elif metric.lower() == 'iou':
+    denominator = (tf.expand_dims(tf.reduce_sum(gt_mask, axis=1), axis=2) +
+                   tf.reduce_sum(pred_mask, axis=1, keepdims=True) -
+                   intersection)
+  elif metric.lower() == 'intersection_over_ground_truth':
+    denominator = tf.expand_dims(tf.reduce_sum(gt_mask, axis=1), axis=2)
+  elif metric.lower() == 'intersection_over_prediction':
+    denominator = tf.reduce_sum(pred_mask, axis=1, keepdims=True)
+  else:
+    raise ValueError('The mask similarity metric is not supported.')
+  return intersection / (denominator + denominator_epsilon)
+
+
+class MaXDeepLabLoss(tf.keras.layers.Layer):
+  """This class contains code for MaX-DeepLab losses."""
+
+  def __init__(self,
+               loss_options: config_pb2.LossOptions,
+               ignore_label: int,
+               thing_class_ids: Tuple[int],
+               focal_loss_alpha: float = 0.75,
+               instance_discrimination_temperature: float = 0.3):
+    """Initializes a MaX-DeepLab loss.
+
+    This class supports PQ-style loss, mask id cross entropy loss, and
+    instance discrimination loss, proposed in MaX-DeepLab. The PQ-style loss
+    can be further decomposed into a classification term and a mask dice
+    term.
+
+    Reference:
+      MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers",
+      CVPR 2021. https://arxiv.org/abs/2012.00759
+        Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
+
+    Args:
+      loss_options: Loss options as defined by config_pb2.LossOptions.
+      ignore_label: An integer specifying the ignore label.
+      thing_class_ids: A tuple of length [N] containing N thing indices.
+      focal_loss_alpha: An optional float specifying the coefficient that
+        weights between positive (matched) and negative (unmatched) masks in
+        focal loss. The positives are weighted by alpha, while the negatives
+        are weighted by (1. - alpha). Note that we do not use a focal loss
+        gamma here, i.e., the gamma is set to zero, which is equivalent to
+        the normal cross-entropy loss, except for the alpha weighting.
+        Defaults to 0.75.
+      instance_discrimination_temperature: An optional float specifying the
+        temperature for the instance discrimination loss.
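+
+    Example (a minimal construction sketch; the ignore label and thing class
+    IDs below are illustrative placeholders, not a recommended setting):
+
+      loss_options = config_pb2.LossOptions(
+          pq_style_loss=config_pb2.LossOptions.SingleLossOptions())
+      loss_layer = MaXDeepLabLoss(
+          loss_options, ignore_label=0, thing_class_ids=(1, 2, 3))
+      loss_dict = loss_layer((y_true, y_pred))  # Keyed by common.* constants.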
+ """ + super(MaXDeepLabLoss, self).__init__(name='MaXDeepLabLoss') + # The loss_terms will optionally include + # - common.PQ_STYLE_LOSS_CLASS_TERM + # - common.PQ_STYLE_LOSS_MASK_DICE_TERM + # - common.MASK_ID_CROSS_ENTROPY_LOSS + # - common.INSTANCE_DISCRIMINATION_LOSS + # These loss terms will be accessed by loss_builder.py and will be used to + # build loss metrics. + self.loss_terms = [] + + # The PQ-style loss includes two terms. + self._pq_style_loss_weight = 0.0 + if loss_options.HasField(common.PQ_STYLE_LOSS): + self._pq_style_loss_weight = loss_options.pq_style_loss.weight + self.loss_terms.append(common.PQ_STYLE_LOSS_CLASS_TERM) + self.loss_terms.append(common.PQ_STYLE_LOSS_MASK_DICE_TERM) + + # Mask-ID cross entropy loss. + self._mask_id_cross_entropy_loss_weight = 0.0 + if loss_options.HasField(common.MASK_ID_CROSS_ENTROPY_LOSS): + self._mask_id_cross_entropy_loss_weight = ( + loss_options.mask_id_cross_entropy_loss.weight) + self.loss_terms.append(common.MASK_ID_CROSS_ENTROPY_LOSS) + + # Instance discrimination loss. + self._instance_discrimination_loss_weight = 0.0 + if loss_options.HasField(common.INSTANCE_DISCRIMINATION_LOSS): + self._instance_discrimination_loss_weight = ( + loss_options.instance_discrimination_loss.weight) + self.loss_terms.append(common.INSTANCE_DISCRIMINATION_LOSS) + + self._ignore_label = ignore_label + self._thing_class_ids = list(thing_class_ids) + self._focal_loss_alpha = focal_loss_alpha + self._instance_discrimination_temperature = ( + instance_discrimination_temperature) + + # Build the base loss functions. + self._pq_style_loss_class_term = base_loss.FocalCrossEntropyLoss( + gt_key=_GT_KEY, pred_key=_PRED_KEY, weight_key=_WEIGHT_KEY, + # Num_classes and ignore_label are not necessary since the inputs will + # be one hot encoded already. + num_classes=None, ignore_label=None, + focal_loss_alpha=focal_loss_alpha, + focal_loss_gamma=0.0, background_channel_index=-1, + dynamic_weight=True) + self._pq_style_loss_mask_dice_term = base_loss.MaskDiceLoss( + gt_key=_GT_KEY, pred_key=_PRED_KEY, weight_key=_WEIGHT_KEY, + prediction_activation='softmax') + self._mask_id_cross_entropy_loss = base_loss.TopKCrossEntropyLoss( + gt_key=_GT_KEY, pred_key=_PRED_KEY, weight_key=_WEIGHT_KEY, + # Num_classes and ignore_label are not necessary since the inputs will + # be one hot encoded already. + num_classes=None, ignore_label=None, + top_k_percent_pixels=1.0, dynamic_weight=True) + self._instance_discrimination_loss = base_loss.TopKCrossEntropyLoss( + gt_key=_GT_KEY, pred_key=_PRED_KEY, weight_key=_WEIGHT_KEY, + # Num_classes and ignore_label are not necessary since the inputs will + # be one hot encoded already. + num_classes=None, ignore_label=None, + top_k_percent_pixels=1.0, dynamic_weight=True) + + def build(self, + input_shapes: Tuple[Dict[Text, tf.Tensor], Dict[Text, tf.Tensor]]): + """Extracts useful constants that depend on the input shapes.""" + y_true_shapes = input_shapes[0] + self._max_thing_id = int(y_true_shapes[common.GT_THING_ID_CLASS_KEY][-1]) + y_pred_shapes = input_shapes[1] + transformer_class_logits_shape = y_pred_shapes[ + common.PRED_TRANSFORMER_CLASS_LOGITS_KEY] + self._num_mask_slots = int(transformer_class_logits_shape[1]) + # The transformer_class_logits contain thing classes, stuff classes, and the + # void class, so num_thing_stuff_classes should be the total number of + # classes minus one. 
+    self._num_thing_stuff_classes = int(transformer_class_logits_shape[2]) - 1
+    # Since we implement the PQ-style loss with the class term plus the mask
+    # dice term (Equation 10 of the paper), we need to balance the two terms
+    # to have the same weight and normalizing constants. The focal loss alpha
+    # is a weight on the positive class term, so we apply it to the mask dice
+    # term too. The class loss is also normalized by the number of mask
+    # slots, so we do the same normalization for the mask dice term.
+    self._mask_dice_term_modifier = (
+        self._focal_loss_alpha / self._num_mask_slots)
+
+    self._stuff_class_ids = utils.get_stuff_class_ids(
+        self._num_thing_stuff_classes,
+        self._thing_class_ids,
+        self._ignore_label)
+    self._num_stuff_classes = len(self._stuff_class_ids)
+    self._thing_stuff_class_ids = self._thing_class_ids + self._stuff_class_ids
+    self._pixel_gt_num_mask_id = self._max_thing_id + self._num_stuff_classes
+
+  def _pre_process_ground_truth(
+      self, y_true: Dict[Text, tf.Tensor], output_height: int, output_width: int
+  ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor,
+             tf.Tensor]:
+    """Pre-processes the ground truth before we compute the losses.
+
+    This function generates tensors that do not depend on the prediction of
+    the model, but are useful to the calculation of the losses. The function
+    mainly downsamples the pixel space ground truth to the model output
+    resolution, and combines (or concatenates) the thing masks and the stuff
+    masks. The output has pixel_gt_num_mask_id = max_thing_id +
+    num_stuff_classes masks, which means the output masks contain both thing
+    masks and stuff masks.
+
+    Args:
+      y_true: A dict of tensors providing ground-truth information, containing
+        - common.GT_SEMANTIC_KEY: A [batch, height, width] int32 tf.Tensor,
+          the semantic label map.
+        - common.GT_THING_ID_MASK_KEY: A [batch, height, width] int32
+          tf.Tensor. It assigns each non-crowd thing instance a unique
+          mask-ID label, starting from 0. Unassigned pixels are set to -1.
+        - common.GT_THING_ID_CLASS_KEY: A [batch, max_thing_id] int32
+          tf.Tensor. It contains the semantic ID of each instance assigned to
+          thing_id_mask. The remaining (max_thing_id - num_things) elements
+          are set to -1.
+      output_height: An integer, the height of the model output.
+      output_width: An integer, the width of the model output.
+
+    Returns:
+      pixel_gt_thing_mask: A [batch, output_height * output_width] float32
+        tensor, with values 0.0 and 1.0 only, indicating whether a pixel
+        belongs to a 'thing' class.
+      pixel_gt_non_void_mask: A [batch, output_height * output_width] float32
+        tensor, with values 0.0 and 1.0 only, indicating if a pixel does not
+        belong to the void class.
+      pixel_gt_mask_id_one_hot: A [batch, output_height * output_width,
+        pixel_gt_num_mask_id] float32 tensor, with values 0.0 and 1.0 only,
+        indicating the mask id each pixel belongs to.
+      mask_gt_semantic_map: A [batch, pixel_gt_num_mask_id] int32 tensor, the
+        semantic class of each ground truth mask.
+      mask_gt_non_void_mask: A [batch, pixel_gt_num_mask_id] float32 tensor,
+        with values 0.0 and 1.0 only, indicating if the ground truth mask is
+        a valid mask, not a padded mask. The masks are padded because TPU
+        does not support dynamic shapes except in the batch axis. We pad all
+        ground truth thing masks to a large enough constant max_thing_id.
+        Similarly, stuff classes that are not present in the current image
+        will be set to void masks too.
+      mask_gt_semantic_one_hot: A [batch, pixel_gt_num_mask_id,
+        num_thing_stuff_classes] float32 tensor, with values 0.0 and 1.0
+        only, containing the one hot encodings of the ground truth mask
+        classes. The last dimension contains concatenated thing classes and
+        stuff classes, which is different from the dataset class IDs in
+        mask_gt_semantic_map.
+      mask_gt_area: A [batch, pixel_gt_num_mask_id] float32 tensor, the area
+        of each ground truth mask. Padded masks have an area of 0.0.
+    """
+    # The depth of one hot encoding should be the largest id plus one. For
+    # example, if we want to one-hot encode a class ID of 133 (the largest ID
+    # for the COCO dataset), we will need a one-hot encoding of length 134.
+    one_hot_depth = max(self._thing_stuff_class_ids) + 1
+    batch_size = y_true[common.GT_SEMANTIC_KEY].get_shape().as_list()[0]
+
+    # Compute pixel_gt_semantic_map (downsampling and reshaping to the 1D
+    # representation that will be mainly used in this loss function).
+    pixel_gt_semantic_map = utils.strided_downsample(
+        y_true[common.GT_SEMANTIC_KEY],
+        target_size=[output_height, output_width])
+    pixel_gt_semantic_map = tf.reshape(
+        pixel_gt_semantic_map,
+        [batch_size, output_height * output_width])
+
+    # Compute pixel_gt_non_void_mask.
+    pixel_gt_non_void_mask = tf.cast(
+        tf.not_equal(pixel_gt_semantic_map, self._ignore_label), tf.float32)
+    pixel_gt_non_void_mask = tf.ensure_shape(
+        pixel_gt_non_void_mask,
+        [batch_size, output_height * output_width])
+
+    # Compute pixel_gt_semantic_one_hot from pixel_gt_semantic_map in order
+    # to gather pixel_gt_stuff_id_one_hot from pixel_gt_semantic_one_hot.
+    pixel_gt_semantic_one_hot = tf.one_hot(pixel_gt_semantic_map, one_hot_depth)
+    # Convert the one hot encoding from the dataset id order to the (thing,
+    # stuff) order used in MaX-DeepLab.
+    pixel_gt_stuff_id_one_hot = tf.gather(pixel_gt_semantic_one_hot,
+                                          self._stuff_class_ids, axis=-1)
+    pixel_gt_stuff_id_one_hot = tf.ensure_shape(
+        pixel_gt_stuff_id_one_hot,
+        [batch_size, output_height * output_width, self._num_stuff_classes])
+
+    # Compute pixel_gt_thing_id_one_hot for thing masks.
+    pixel_gt_thing_id_map = utils.strided_downsample(
+        y_true[common.GT_THING_ID_MASK_KEY],
+        target_size=[output_height, output_width])
+    pixel_gt_thing_id_map = tf.reshape(
+        pixel_gt_thing_id_map, shape=[batch_size, output_height * output_width])
+    # Note that common.GT_THING_ID_MASK_KEY uses -1 for void masks, and IDs 0
+    # to (max_thing_id - 1) for the thing instances.
+    pixel_gt_thing_mask = tf.cast(
+        tf.not_equal(pixel_gt_thing_id_map, -1), tf.float32)
+    pixel_gt_thing_id_one_hot = tf.one_hot(pixel_gt_thing_id_map,
+                                           self._max_thing_id)
+    # Compute pixel_gt_mask_id_one_hot by concatenating thing masks with
+    # stuff masks.
+    pixel_gt_mask_id_one_hot = tf.concat([pixel_gt_thing_id_one_hot,
+                                          pixel_gt_stuff_id_one_hot], axis=-1)
+    pixel_gt_mask_id_one_hot = tf.ensure_shape(
+        pixel_gt_mask_id_one_hot,
+        [batch_size, output_height * output_width, self._pixel_gt_num_mask_id])
+
+    # Compute mask_gt_area by summing the one hot encodings spatially.
+    mask_gt_area = tf.expand_dims(
+        tf.reduce_sum(pixel_gt_mask_id_one_hot, axis=1), axis=-1)
+    # Generate a binary mask for ground truth masks, indicating whether each
+    # ground truth mask exists in the pixel space with a non-zero area. Note
+    # that a mask that exists in the original input resolution will be
+    # removed if its area is zero in the output resolution, due to
+    # downsampling.
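+    # For example, a thin object that is only one or two pixels wide at the
+    # input resolution can vanish entirely under strided downsampling; its
+    # mask_gt_area is then 0.0 and the mask is treated as void below.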
+ mask_gt_area_mask = tf.reshape(mask_gt_area > 0.5, + [batch_size, self._pixel_gt_num_mask_id]) + + # Compute mask_gt_semantic_map and mask_gt_semantic_one_hot. + thing_id_gt_semantic_map = tf.reshape( + tf.cast(y_true[common.GT_THING_ID_CLASS_KEY], tf.int32), + [batch_size, self._max_thing_id]) + # The stuff ground truth semantic map is just the stuff class IDs. + stuff_id_gt_semantic_map = tf.tile( + tf.reshape( + tf.cast(self._stuff_class_ids, tf.int32), + [1, self._num_stuff_classes]), [batch_size, 1]) + mask_gt_semantic_map = tf.concat( + [thing_id_gt_semantic_map, stuff_id_gt_semantic_map], axis=-1) + # Set masks with zero area to void (-1), which is consistent with the void + # label used in common.GT_THING_ID_CLASS_KEY but is different from the + # ignore_labels of the datasets. + mask_gt_semantic_map = ( + (mask_gt_semantic_map + 1) * tf.cast(mask_gt_area_mask, tf.int32) - 1) + # Void (-1) classes will automatically be ignored by tf.one_hot. + mask_gt_semantic_one_hot = tf.one_hot(mask_gt_semantic_map, one_hot_depth) + mask_gt_semantic_one_hot = tf.gather( + mask_gt_semantic_one_hot, self._thing_stuff_class_ids, axis=-1) + + # Compute mask_gt_non_void_mask. Again, a mask that exists in the original + # input resolution is set to void if its area is zero in the output + # resolution, due to downsampling. + mask_gt_non_void_mask = tf.cast(mask_gt_semantic_map > -1, tf.float32) + mask_gt_non_void_mask = tf.ensure_shape( + mask_gt_non_void_mask, [batch_size, self._pixel_gt_num_mask_id]) + + return (pixel_gt_thing_mask, pixel_gt_non_void_mask, + pixel_gt_mask_id_one_hot, mask_gt_semantic_map, + mask_gt_non_void_mask, mask_gt_semantic_one_hot, mask_gt_area) + + def call( + self, inputs: Tuple[Dict[Text, tf.Tensor], Dict[Text, tf.Tensor]] + ) -> Dict[Text, tf.Tensor]: + """Computes the MaX-DeepLab losses. + + Args: + inputs: A tuple of two dicts (y_true, y_pred): + - y_true: A dict of tensors providing ground-truth information, containing + - common.GT_SEMANTIC_KEY: A [batch, height, width] int32 tf.Tensor, the + semantic label map. + - common.GT_THING_ID_MASK_KEY: A [batch, height, width] int32 + tf.Tensor. It assigns each non-crowd thing instance a unique mask-ID + label, starting from 0. Unassigned pixels are set to -1. + - common.GT_THING_ID_CLASS_KEY: A [batch, max_thing_id] int32 + tf.Tensor. It contains semantic ID of each instance assigned to + thing_id_mask. The remaining (max_thing_id - num_things) elements are + set to -1. + - y_pred: A dict of tensors providing predictions. + - common.PRED_PIXEL_SPACE_NORMALIZED_FEATURE_KEY: A [batch_size, + output_height, output_width, channels] float32 tensor. + - common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY: A [batch_size, + output_height, output_width, num_mask_slots] float32 tensor, the + logits that a pixel belongs to a mask slot. + - common.PRED_TRANSFORMER_CLASS_LOGITS_KEY: A [batch_size, + num_mask_slots, num_thing_stuff_classes + 1] float32 tensor, the + logits that a mask belongs to a semantic class (including thing, + stuff, and void) + + Returns: + The loss as a dict of tf.Tensor, optionally containing the following: + - common.PQ_STYLE_LOSS_CLASS_TERM: [batch]. + - common.PQ_STYLE_LOSS_MASK_DICE_TERM: [batch]. + - common.MASK_ID_CROSS_ENTROPY_LOSS: [batch]. + - common.INSTANCE_DISCRIMINATION_LOSS: [batch]. 
+ """ + y_true, y_pred = inputs + resulting_dict = {} + + pixel_feature = y_pred[common.PRED_PIXEL_SPACE_NORMALIZED_FEATURE_KEY] + batch_size, output_height, output_width, _ = ( + pixel_feature.get_shape().as_list()) + + # Pre-process the ground truth. + (pixel_gt_thing_mask, pixel_gt_non_void_mask, pixel_gt_mask_id_one_hot, + mask_gt_semantic_map, mask_gt_non_void_mask, mask_gt_semantic_one_hot, + mask_gt_area) = self._pre_process_ground_truth(y_true, + output_height, output_width) + pixel_gt_non_void_mask_expanded = tf.expand_dims( + pixel_gt_non_void_mask, axis=-1) + + # Compute mask_average_feature by averaging the feature of each mask. + pixel_feature = tf.reshape( + pixel_feature, [batch_size, output_height * output_width, -1]) + mask_average_feature = tf.einsum( + 'bpd,bpi->bid', + pixel_feature, + pixel_gt_mask_id_one_hot) / tf.maximum(mask_gt_area, 1.0) + # Normalize the mask feature as the pixel space output feature is usually + # normalized too. + mask_average_feature = tf.math.l2_normalize(mask_average_feature, axis=-1) + + # Compute instance_discrimination_similarity, scaled by a constant + # temperature. + instance_discrimination_similarity = tf.einsum( + 'bpd,bid->bpi', pixel_feature, mask_average_feature) + instance_discrimination_similarity /= ( + self._instance_discrimination_temperature) + mask_gt_non_void_mask_expanded_1 = tf.expand_dims( + mask_gt_non_void_mask, axis=1) + # Mask void masks by setting them to a large negative value, so that they + # will be ignored by the softmax in the loss. + instance_discrimination_similarity = ( + mask_gt_non_void_mask_expanded_1 * instance_discrimination_similarity + + (1.0 - mask_gt_non_void_mask_expanded_1) * _SOFTMAX_MASKING_CONSTANT) + + # Auxiliary instance_discrimination_loss. + if self._instance_discrimination_loss_weight > 0.0: + resulting_dict[common.INSTANCE_DISCRIMINATION_LOSS] = ( + self._instance_discrimination_loss( + {_GT_KEY: pixel_gt_mask_id_one_hot}, + {_PRED_KEY: instance_discrimination_similarity, + _WEIGHT_KEY: pixel_gt_thing_mask}) * + self._instance_discrimination_loss_weight) + + # Extract pixel_space_mask_logits and pixel_space_mask_probs. + pixel_space_mask_logits = y_pred[common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY] + pixel_space_mask_logits = tf.reshape( + pixel_space_mask_logits, + [batch_size, output_height * output_width, self._num_mask_slots]) + pixel_space_mask_probs = tf.nn.softmax(pixel_space_mask_logits, axis=-1) + + # Compute the mask similarity between all ground truth masks and all + # predicted masks. + mask_similarity = _mask_similarity( + pixel_gt_mask_id_one_hot, + pixel_space_mask_probs * pixel_gt_non_void_mask_expanded, + metric='dice') + + # Compute the class similarity by multiplying the ground truth one hot + # encoding with the predicted probability distribution. This is done between + # all ground truth masks and all predicted masks. + transformer_class_logits = y_pred[common.PRED_TRANSFORMER_CLASS_LOGITS_KEY] + transformer_class_probs = tf.nn.softmax( + transformer_class_logits, axis=-1)[:, :, :-1] + class_similarity = tf.einsum( + 'bij,bkj->bik', mask_gt_semantic_one_hot, transformer_class_probs) + + # Compute hungarian matching weights. We take the negative here since the + # hungarian matching algorithm looks for the matching with the least total + # weight. 
+    hungarian_weights = -mask_similarity * class_similarity
+    mask_gt_non_void_mask_expanded_2 = tf.expand_dims(
+        mask_gt_non_void_mask, axis=2)
+
+    # Mask the void ground truth masks (in the rows) so that they do not
+    # affect the result of the hungarian matching.
+    if self._num_mask_slots >= self._pixel_gt_num_mask_id:
+      # If the number of mask slots (number of columns) is larger than the
+      # constant number of ground truth masks (number of rows), the
+      # nonsquare_hungarian_matching will pad the rows with
+      # _MATCHING_NEGATIVE_CONSTANT. In this case, we can fill in the void
+      # mask rows with _MATCHING_NEGATIVE_CONSTANT too, so that the void mask
+      # rows are ignored as well, according to the hungarian matching
+      # property.
+      hungarian_weights = (
+          hungarian_weights * mask_gt_non_void_mask_expanded_2 +
+          (1 - mask_gt_non_void_mask_expanded_2) * _MATCHING_NEGATIVE_CONSTANT)
+    else:
+      # If the number of mask slots (number of columns) is smaller than the
+      # constant number of ground truth masks (number of rows), the
+      # nonsquare_hungarian_matching will pad the columns with
+      # _MATCHING_NEGATIVE_CONSTANT. In this case, we should fill in the void
+      # mask rows with _MATCHING_POSITIVE_CONSTANT here, so that the void
+      # mask rows have a huge cost compared with the existing non-void mask
+      # rows; the predicted masks will then prefer matching with existing
+      # non-void masks rather than the padded void masks, according to the
+      # hungarian matching property.
+      hungarian_weights = (
+          hungarian_weights * mask_gt_non_void_mask_expanded_2 +
+          (1 - mask_gt_non_void_mask_expanded_2) * _MATCHING_POSITIVE_CONSTANT)
+
+    # Perform the hungarian matching algorithm.
+    full_permutation, nonsquare_permutation = (
+        nonsquare_hungarian_matching(hungarian_weights))
+
+    # Extract the permutation (matching) between all existing non-void ground
+    # truth masks and the matched predicted masks.
+    matched_permutation = (
+        nonsquare_permutation * mask_gt_non_void_mask_expanded_2)
+    # The matched mask dice scores for each mask slot. The scores will be
+    # used as a loss weight for the PQ-style loss class term after the
+    # stop_gradient.
+    matched_mask_dice = tf.reduce_max(
+        mask_similarity * matched_permutation, axis=-2)
+    matched_mask_dice = tf.stop_gradient(matched_mask_dice)
+
+    # The matched class probabilities for each ground truth mask. The
+    # probabilities will be used as a loss weight for the PQ-style loss mask
+    # dice term after the stop_gradient.
+    matched_class_prob = tf.reduce_max(
+        class_similarity * matched_permutation, axis=-1)
+    matched_class_prob = tf.stop_gradient(matched_class_prob)
+
+    # Extract the index of the matched mask slot for each ground truth mask.
+    matched_mask_slot_indices = tf.math.argmax(
+        nonsquare_permutation, axis=-1, output_type=tf.dtypes.int32)
+
+    full_num_mask_slots = full_permutation.get_shape().as_list()[-1]
+    # Pad the pixel_space_mask_logits so that it is compatible with the
+    # permutation matrix.
+    full_pixel_space_mask_logits = tf.pad(
+        pixel_space_mask_logits,
+        [[0, 0], [0, 0], [0, full_num_mask_slots - self._num_mask_slots]],
+        constant_values=_SOFTMAX_MASKING_CONSTANT)
+
+    # Permute the pixel space mask logits with the permutation matrix, which
+    # converts the mask slot indices to the ground truth indices.
+    permuted_full_pixel_space_mask_logits = tf.einsum(
+        'bpi,bji->bpj', full_pixel_space_mask_logits, full_permutation)
+
+    # Pad the class probabilities too.
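+    # (matched_class_prob has pixel_gt_num_mask_id entries per example, while
+    # the square permutation uses full_num_mask_slots, so we zero-pad the
+    # class probabilities to the padded size.)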
+    full_matched_class_prob = tf.pad(
+        matched_class_prob,
+        [[0, 0], [0, full_num_mask_slots - self._pixel_gt_num_mask_id]])
+    # We only compute the dice loss term on non-void ground truth masks.
+    mask_dice_term_loss_weight = tf.pad(
+        mask_gt_non_void_mask,
+        [[0, 0], [0, full_num_mask_slots - self._pixel_gt_num_mask_id]])
+    # Use the class probabilities as the loss weight for the mask dice term.
+    # In addition, we set a lower bound, 1e-5, for the mask dice term loss
+    # weight. Otherwise, if a loss weight is accidentally zero, the dice loss
+    # will treat it as void and use an incorrect denominator or normalizing
+    # constant for the loss.
+    mask_dice_term_loss_weight *= tf.maximum(full_matched_class_prob, 1e-5)
+
+    # Pad the one hot encoding too.
+    full_pixel_gt_mask_id_one_hot = tf.pad(
+        pixel_gt_mask_id_one_hot,
+        [[0, 0], [0, 0], [0, full_num_mask_slots - self._pixel_gt_num_mask_id]])
+
+    if self._pq_style_loss_weight > 0.0:
+      # Mask_dice_term_modifier balances the mask_dice_term and the
+      # class_term of the PQ-style loss to have the same weight and
+      # normalizing constant.
+      resulting_dict[common.PQ_STYLE_LOSS_MASK_DICE_TERM] = (
+          self._pq_style_loss_mask_dice_term(
+              {_GT_KEY: full_pixel_gt_mask_id_one_hot},
+              {_PRED_KEY: permuted_full_pixel_space_mask_logits,
+               _WEIGHT_KEY: mask_dice_term_loss_weight}) *
+          (self._pq_style_loss_weight * self._mask_dice_term_modifier))
+
+    # The Mask-ID cross entropy loss shares the same ground truth and logits
+    # as the dice loss term, but with different weights.
+    if self._mask_id_cross_entropy_loss_weight > 0.0:
+      resulting_dict[common.MASK_ID_CROSS_ENTROPY_LOSS] = (
+          self._mask_id_cross_entropy_loss(
+              {_GT_KEY: full_pixel_gt_mask_id_one_hot},
+              {_PRED_KEY: permuted_full_pixel_space_mask_logits,
+               _WEIGHT_KEY: pixel_gt_non_void_mask}) *
+          self._mask_id_cross_entropy_loss_weight)
+
+    # Generate a pseudo ground truth for transformer_class_logits.
+    mask_slot_semantic_one_hot = _generate_mask_slot_semantic_one_hot(
+        matched_mask_slot_indices, mask_gt_semantic_map,
+        self._num_mask_slots, self._thing_stuff_class_ids)
+
+    # Compute the positive mask and the negative mask.
+    mask_slot_positive_mask = tf.cast(tf.equal(tf.reduce_max(
+        mask_slot_semantic_one_hot, axis=-1), 1.0), tf.float32)
+    mask_slot_negative_mask = 1.0 - mask_slot_positive_mask
+
+    # Compute the overlap ratio between all predicted masks and the void
+    # region. This void ratio will be used as a weight for the negative class
+    # term.
+    mask_void_ratio = tf.stop_gradient(_mask_similarity(
+        1.0 - pixel_gt_non_void_mask_expanded,
+        pixel_space_mask_probs,
+        'intersection_over_prediction'))
+    mask_void_ratio = tf.squeeze(mask_void_ratio, axis=1)
+
+    # Use the matched mask dice scores as the weights for the positive class
+    # terms. For the negative class terms, we reduce the penalty for a mask
+    # slot class term if the mask prediction overlaps a lot with void regions.
+    transformer_class_loss_weight = (
+        mask_slot_positive_mask * tf.maximum(matched_mask_dice, 1e-5) +
+        mask_slot_negative_mask * tf.maximum(mask_void_ratio, 1e-5))
+
+    # Concatenate the void mask as the last channel, constructing the final
+    # ground truth one hot label with (thing + stuff + void) channels.
+    transformer_class_one_hot = tf.concat(
+        [mask_slot_semantic_one_hot,
+         tf.expand_dims(mask_slot_negative_mask, axis=-1)], axis=-1)
+
+    # Apply the PQ-style loss class term.
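+    # Positive (matched) mask slots are weighted by their matched mask dice
+    # scores, and negative slots by their overlap with the void region, as
+    # constructed in transformer_class_loss_weight above.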
+ if self._pq_style_loss_weight > 0.0: + resulting_dict[common.PQ_STYLE_LOSS_CLASS_TERM] = ( + self._pq_style_loss_class_term( + {_GT_KEY: transformer_class_one_hot}, + {_PRED_KEY: transformer_class_logits, + _WEIGHT_KEY: transformer_class_loss_weight}) * + self._pq_style_loss_weight) + + return resulting_dict diff --git a/model/loss/max_deeplab_loss_test.py b/model/loss/max_deeplab_loss_test.py new file mode 100644 index 0000000000000000000000000000000000000000..895bca5a2246a35eef90ae54273499f6684772cd --- /dev/null +++ b/model/loss/max_deeplab_loss_test.py @@ -0,0 +1,103 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for max_deeplab_loss.py.""" + +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import trainer_pb2 +from deeplab2.data import dataset +from deeplab2.model.loss import max_deeplab_loss + + +class MaXDeepLabLossTest(tf.test.TestCase): + + def test_max_deeplab_loss(self): + # Build the loss layer. + dataset_info = dataset.COCO_PANOPTIC_INFORMATION + semantic_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='softmax_cross_entropy') + pq_style_loss_options = trainer_pb2.LossOptions.SingleLossOptions() + mask_id_cross_entropy_loss_options = ( + trainer_pb2.LossOptions.SingleLossOptions()) + instance_discrimination_loss_options = ( + trainer_pb2.LossOptions.SingleLossOptions()) + loss_options_1 = trainer_pb2.LossOptions( + semantic_loss=semantic_loss_options, + pq_style_loss=pq_style_loss_options, + mask_id_cross_entropy_loss=mask_id_cross_entropy_loss_options, + instance_discrimination_loss=instance_discrimination_loss_options) + loss_layer_1 = max_deeplab_loss.MaXDeepLabLoss( + loss_options_1, + ignore_label=dataset_info.ignore_label, + thing_class_ids=dataset_info.class_has_instances_list) + loss_options_2 = trainer_pb2.LossOptions( + pq_style_loss=pq_style_loss_options) + loss_layer_2 = max_deeplab_loss.MaXDeepLabLoss( + loss_options_2, + ignore_label=dataset_info.ignore_label, + thing_class_ids=dataset_info.class_has_instances_list) + + # Build the inputs. + pred_dict = { + common.PRED_PIXEL_SPACE_NORMALIZED_FEATURE_KEY: + tf.random.uniform(shape=[2, 9, 9, 8]), + common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY: + tf.random.uniform(shape=[2, 9, 9, 128]), + common.PRED_TRANSFORMER_CLASS_LOGITS_KEY: + tf.random.uniform(shape=[2, 128, 134]), + } + gt_dict = { + common.GT_SEMANTIC_KEY: tf.ones(shape=[2, 33, 33], dtype=tf.int32), + common.GT_THING_ID_MASK_KEY: tf.ones(shape=[2, 33, 33], + dtype=tf.int32), + common.GT_THING_ID_CLASS_KEY: tf.concat( + # An image with ten people (class_id = 1) and 118 void masks. 
+ [tf.ones(shape=[2, 10], dtype=tf.int32), + -tf.ones(shape=[2, 118], dtype=tf.int32)], axis=-1), + } + loss_dict_1 = loss_layer_1((gt_dict, pred_dict)) + + self.assertIn(common.PQ_STYLE_LOSS_CLASS_TERM, loss_dict_1) + self.assertIn(common.PQ_STYLE_LOSS_MASK_DICE_TERM, loss_dict_1) + self.assertIn(common.MASK_ID_CROSS_ENTROPY_LOSS, loss_dict_1) + self.assertIn(common.INSTANCE_DISCRIMINATION_LOSS, loss_dict_1) + self.assertNotIn(common.PQ_STYLE_LOSS, loss_dict_1) + + self.assertIn(common.PQ_STYLE_LOSS_CLASS_TERM, loss_layer_1.loss_terms) + self.assertIn(common.PQ_STYLE_LOSS_MASK_DICE_TERM, loss_layer_1.loss_terms) + self.assertIn(common.MASK_ID_CROSS_ENTROPY_LOSS, loss_layer_1.loss_terms) + self.assertIn(common.INSTANCE_DISCRIMINATION_LOSS, loss_layer_1.loss_terms) + self.assertNotIn(common.PQ_STYLE_LOSS, loss_layer_1.loss_terms) + + loss_dict_2 = loss_layer_2((gt_dict, pred_dict)) + + self.assertIn(common.PQ_STYLE_LOSS_CLASS_TERM, loss_dict_2) + self.assertIn(common.PQ_STYLE_LOSS_MASK_DICE_TERM, loss_dict_2) + self.assertNotIn(common.MASK_ID_CROSS_ENTROPY_LOSS, loss_dict_2) + self.assertNotIn(common.INSTANCE_DISCRIMINATION_LOSS, loss_dict_2) + self.assertNotIn(common.PQ_STYLE_LOSS, loss_dict_2) + + self.assertIn(common.PQ_STYLE_LOSS_CLASS_TERM, loss_layer_2.loss_terms) + self.assertIn(common.PQ_STYLE_LOSS_MASK_DICE_TERM, loss_layer_2.loss_terms) + self.assertNotIn(common.MASK_ID_CROSS_ENTROPY_LOSS, loss_layer_2.loss_terms) + self.assertNotIn(common.INSTANCE_DISCRIMINATION_LOSS, + loss_layer_2.loss_terms) + self.assertNotIn(common.PQ_STYLE_LOSS, loss_layer_2.loss_terms) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/post_processor/__init__.py b/model/post_processor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03 --- /dev/null +++ b/model/post_processor/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/model/post_processor/max_deeplab.py b/model/post_processor/max_deeplab.py new file mode 100644 index 0000000000000000000000000000000000000000..ab809c2dc9cdfb4ca308bbdd051f08508726d78d --- /dev/null +++ b/model/post_processor/max_deeplab.py @@ -0,0 +1,464 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""This file contains functions to post-process MaX-DeepLab results.""" + +import functools +from typing import List, Tuple, Dict, Text + +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.data import dataset +from deeplab2.model import utils + + +def _get_transformer_class_prediction( + transformer_class_probs: tf.Tensor, + transformer_class_confidence_threshold: float + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + """Computes the transformer class prediction and confidence score. + + Args: + transformer_class_probs: A tf.Tensor of shape [num_mask_slots, + num_thing_stuff_classes + 1]. It is a pixel level logit scores where the + num_mask_slots is the number of mask slots (for both thing classes and + stuff classes) in MaX-DeepLab. The last channel indicates a `void` class. + transformer_class_confidence_threshold: A float for thresholding the + confidence of the transformer_class_probs. The panoptic mask slots with + class confidence less than the threshold are filtered and not used for + panoptic prediction. Only masks whose confidence is larger than the + threshold are counted in num_detections. + + Returns: + A tuple of: + - the detected mask class prediction as float32 tf.Tensor of shape + [num_detections]. + - the detected mask indices as tf.Tensor of shape [num_detections]. + - the number of detections as tf.Tensor of shape [1]. + """ + transformer_class_pred = tf.cast( + tf.argmax(transformer_class_probs, axis=-1), tf.float32) + transformer_class_confidence = tf.reduce_max( + transformer_class_probs, axis=-1, keepdims=False) + # Filter mask IDs with class confidence less than the threshold. + thresholded_mask = tf.cast( + tf.greater_equal(transformer_class_confidence, + transformer_class_confidence_threshold), tf.float32) + transformer_class_confidence = (transformer_class_confidence + * thresholded_mask) + + detected_mask_indices = tf.where(tf.greater(thresholded_mask, 0.5))[:, 0] + detected_mask_class_pred = tf.gather( + transformer_class_pred, detected_mask_indices) + num_detections = tf.shape(detected_mask_indices)[0] + return detected_mask_class_pred, detected_mask_indices, num_detections + + +def _get_mask_id_and_semantic_maps( + thing_class_ids: List[int], + stuff_class_ids: List[int], + pixel_space_mask_logits: tf.Tensor, + transformer_class_probs: tf.Tensor, + image_shape: List[int], + pixel_confidence_threshold=0.4, + transformer_class_confidence_threshold=0.7, + pieces=1) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]: + """Computes the pixel-level mask ID map and semantic map per image. + + Args: + thing_class_ids: A List of integers of shape [num_thing_classes] containing + thing class indices. + stuff_class_ids: A List of integers of shape [num_thing_classes] containing + stuff class indices. + pixel_space_mask_logits: A tf.Tensor of shape [height, width, + num_mask_slots]. It is a pixel level logit scores where the + num_mask_slots is the number of mask slots (for both thing classes + and stuff classes) in MaX-DeepLab. + transformer_class_probs: A tf.Tensor of shape [num_mask_slots, + num_thing_stuff_classes + 1]. It is a pixel level logit scores where the + num_mask_slots is the number of mask slots (for both thing classes and + stuff classes) in MaX-DeepLab. The last channel indicates a `void` class. + image_shape: A list of integers specifying the [height, width] of input + image. 
+    pixel_confidence_threshold: A float indicating a threshold for the
+      pixel-level softmax probability confidence of the transformer mask
+      logits. Pixel locations with confidence below the threshold get value
+      `0` in the `confident_regions` output and represent `void` (ignore)
+      regions.
+    transformer_class_confidence_threshold: A float for thresholding the
+      confidence of the transformer_class_probs. The panoptic mask slots with
+      class confidence less than the threshold are filtered and not used for
+      panoptic prediction.
+    pieces: An integer indicating the number of pieces in the piece-wise
+      operation. When computing the panoptic prediction and confident
+      regions, the mask logits are divided width-wise into multiple pieces
+      and processed piece-wise due to the GPU memory limit. Then, the
+      piece-wise outputs are concatenated along the width into the original
+      mask shape. Defaults to 1.
+
+  Returns:
+    A tuple of:
+    - the mask ID prediction as tf.Tensor with shape [height, width].
+    - the semantic prediction as tf.Tensor with shape [height, width].
+    - the thing region mask as tf.Tensor with shape [height, width].
+    - the stuff region mask as tf.Tensor with shape [height, width].
+
+  Raises:
+    ValueError: When the input image's `width - 1` is not divisible by
+      `pieces`.
+  """
+  # The last channel indicates the `void` class and thus is not included.
+  transformer_class_probs = transformer_class_probs[..., :-1]
+  # Generate the mapping from mask IDs to the dataset's thing and stuff
+  # semantic IDs.
+  thing_stuff_class_ids = thing_class_ids + stuff_class_ids
+
+  detected_mask_class_pred, detected_mask_indices, num_detections = (
+      _get_transformer_class_prediction(transformer_class_probs,
+                                        transformer_class_confidence_threshold))
+  # If num_detections = 0, return empty result maps.
+  def _return_empty_mask_id_and_semantic_maps():
+    return (
+        tf.ones([image_shape[0], image_shape[1]], dtype=tf.int32),
+        tf.zeros([image_shape[0], image_shape[1]], dtype=tf.int32),
+        tf.zeros([image_shape[0], image_shape[1]], dtype=tf.float32),
+        tf.zeros([image_shape[0], image_shape[1]], dtype=tf.float32))
+
+  # If num_detections > 0:
+  def _generate_mask_id_and_semantic_maps():
+    output_mask_id_map = []
+    output_confident_region = []
+    logits_width = pixel_space_mask_logits.get_shape().as_list()[1]
+    output_width = image_shape[1]
+
+    if (output_width - 1) % pieces > 0:
+      raise ValueError('`output_width - 1` must be divisible by `pieces`.')
+    # We use an input width that is a multiple of the feature stride, plus
+    # one, so that left- and right-alignment are preserved.
+    piece_output_width = (output_width - 1) // pieces + 1
+
+    for piece_id in range(pieces):
+      piece_begin = (logits_width - 1) // pieces * piece_id
+      # We use an input width that is a multiple of the feature stride, plus
+      # one, so that left- and right-alignment are preserved.
+      piece_end = (logits_width - 1) // pieces * (piece_id + 1) + 1
+      piece_pixel_mask_logits = (
+          pixel_space_mask_logits[:, piece_begin:piece_end, :])
+      piece_pixel_mask_logits = tf.compat.v1.image.resize_bilinear(
+          tf.expand_dims(piece_pixel_mask_logits, 0),
+          (image_shape[0], piece_output_width),
+          align_corners=True)
+      piece_pixel_mask_logits = tf.squeeze(piece_pixel_mask_logits, axis=0)
+      piece_detected_pixel_mask_logits = tf.gather(
+          piece_pixel_mask_logits, detected_mask_indices, axis=-1)
+      # Filter the pixels which are assigned to a mask ID that does not
+      # survive.
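+      # A pixel is kept only if its highest mask logit is attained by one of
+      # the detected (class-confidence thresholded) slots; otherwise the
+      # pixel falls outside every surviving mask and is marked unconfident.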
+      piece_max_logits = tf.reduce_max(piece_pixel_mask_logits, axis=-1)
+      piece_detected_max_logits = tf.reduce_max(
+          piece_detected_pixel_mask_logits, axis=-1)
+      piece_detected_mask = tf.cast(tf.math.equal(
+          piece_max_logits, piece_detected_max_logits), tf.float32)
+      # Filter with pixel mask threshold.
+      piece_pixel_confidence_map = tf.reduce_max(
+          tf.nn.softmax(piece_detected_pixel_mask_logits, axis=-1), axis=-1)
+      piece_confident_region = tf.cast(
+          piece_pixel_confidence_map > pixel_confidence_threshold, tf.float32)
+      piece_confident_region = piece_confident_region * piece_detected_mask
+      piece_mask_id_map = tf.cast(
+          tf.argmax(piece_detected_pixel_mask_logits, axis=-1), tf.int32)
+      if piece_id == pieces - 1:
+        output_mask_id_map.append(piece_mask_id_map)
+        output_confident_region.append(piece_confident_region)
+      else:
+        output_mask_id_map.append(piece_mask_id_map[:, :-1])
+        output_confident_region.append(piece_confident_region[:, :-1])
+
+    mask_id_map = tf.concat(output_mask_id_map, axis=1)
+    confident_region = tf.concat(output_confident_region, axis=1)
+    mask_id_map_flat = tf.reshape(mask_id_map, [-1])
+    mask_id_semantic_map_flat = tf.gather(
+        detected_mask_class_pred, mask_id_map_flat)
+    mask_id_semantic_map = tf.reshape(
+        mask_id_semantic_map_flat, [image_shape[0], image_shape[1]])
+    # Generate thing and stuff masks (with 1/0 indicating presence/absence).
+    thing_mask = tf.cast(mask_id_semantic_map < len(thing_class_ids),
+                         tf.float32) * confident_region
+    stuff_mask = tf.cast(mask_id_semantic_map >= len(thing_class_ids),
+                         tf.float32) * confident_region
+    # Generate semantic_map.
+    semantic_map = tf.gather(
+        tf.convert_to_tensor(thing_stuff_class_ids),
+        tf.cast(tf.round(mask_id_semantic_map_flat), tf.int32))
+    semantic_map = tf.reshape(semantic_map, [image_shape[0], image_shape[1]])
+    # Add 1 because mask ID 0 is reserved for the unconfident region.
+    mask_id_map_plus_one = mask_id_map + 1
+    semantic_map = tf.cast(tf.round(semantic_map), tf.int32)
+    return (mask_id_map_plus_one, semantic_map, thing_mask, stuff_mask)
+
+  mask_id_map_plus_one, semantic_map, thing_mask, stuff_mask = tf.cond(
+      tf.cast(num_detections, tf.float32) < tf.cast(0.5, tf.float32),
+      _return_empty_mask_id_and_semantic_maps,
+      _generate_mask_id_and_semantic_maps)
+
+  return (mask_id_map_plus_one, semantic_map, thing_mask, stuff_mask)
+
+
+def _filter_by_count(input_index_map: tf.Tensor,
+                     area_limit: int) -> Tuple[tf.Tensor, tf.Tensor]:
+  """Filters input index map by area limit threshold per index.
+
+  Args:
+    input_index_map: A float32 tf.Tensor of shape [batch, height, width].
+    area_limit: An integer specifying the minimum number of pixels that each
+      index region needs to have; regions under the limit are masked (zeroed)
+      out.
+
+  Returns:
+    masked input_index_map: A tf.Tensor with shape [batch, height, width],
+      masked by the area_limit threshold.
+    mask: A tf.Tensor with shape [batch, height, width]. It is a pixel-level
+      mask with 1. indicating the regions over the area limit, and 0. otherwise.
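+
+  For example, with area_limit = 3, every index that covers fewer than three
+  pixels is zeroed out in both returned tensors; see test_filter_by_count in
+  max_deeplab_test.py for a worked case.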
+  """
+  batch_size = tf.shape(input_index_map)[0]
+  index_map = tf.cast(tf.round(input_index_map), tf.int32)
+  index_map_flat = tf.reshape(index_map, [batch_size, -1])
+  counts = tf.math.bincount(index_map_flat, axis=-1)
+  counts_map = tf.gather(counts, index_map_flat, batch_dims=1)
+  counts_map = tf.reshape(counts_map, tf.shape(index_map))
+
+  mask = tf.cast(
+      tf.cast(counts_map, tf.float32) > tf.cast(area_limit - 0.5, tf.float32),
+      input_index_map.dtype)
+  return input_index_map * mask, mask
+
+
+def _merge_mask_id_and_semantic_maps(
+    mask_id_maps_plus_one: tf.Tensor,
+    semantic_maps: tf.Tensor,
+    thing_masks: tf.Tensor,
+    stuff_masks: tf.Tensor,
+    void_label: int,
+    label_divisor: int,
+    thing_area_limit: int,
+    stuff_area_limit: int) -> tf.Tensor:
+  """Merges mask_id maps and semantic_maps to obtain panoptic segmentation.
+
+  Args:
+    mask_id_maps_plus_one: A tf.Tensor of shape [batch, height, width].
+    semantic_maps: A tf.Tensor of shape [batch, height, width].
+    thing_masks: A float32 tf.Tensor of shape [batch, height, width] containing
+      masks with 1. at thing regions, 0. otherwise.
+    stuff_masks: A float32 tf.Tensor of shape [batch, height, width] containing
+      masks with 1. at stuff regions, 0. otherwise.
+    void_label: An integer specifying the void label.
+    label_divisor: An integer specifying the label divisor of the dataset.
+    thing_area_limit: An integer specifying the number of pixels that thing
+      regions need to have at least. The thing region will be included in the
+      panoptic prediction, only if its area is larger than the limit; otherwise,
+      it will be re-assigned as void_label.
+    stuff_area_limit: An integer specifying the number of pixels that stuff
+      regions need to have at least. The stuff region will be included in the
+      panoptic prediction, only if its area is larger than the limit; otherwise,
+      it will be re-assigned as void_label.
+
+  Returns:
+    panoptic_maps: A tf.Tensor with shape [batch, height, width].
+  """
+  thing_mask_id_maps_plus_one = (tf.cast(mask_id_maps_plus_one, tf.float32)
+                                 * thing_masks)
+  # We increase semantic_maps by 1 before masking (zeroing) by thing_masks and
+  # stuff_masks, to ensure all valid semantic IDs are greater than 0 and thus
+  # not masked out.
+  semantic_maps_plus_one = semantic_maps + 1
+  tf.debugging.assert_less(
+      tf.reduce_sum(thing_masks * stuff_masks), 0.5,
+      message='thing_masks and stuff_masks must be mutually exclusive.')
+
+  thing_semantic_maps = (tf.cast(semantic_maps_plus_one, tf.float32)
+                         * thing_masks)
+  stuff_semantic_maps = (tf.cast(semantic_maps_plus_one, tf.float32)
+                         * stuff_masks)
+
+  # Filter stuff_semantic_maps by stuff_area_limit.
+  stuff_semantic_maps, _ = _filter_by_count(
+      stuff_semantic_maps, stuff_area_limit)
+  # Filter thing_mask_id_map and thing_semantic_map by thing_area_limit.
+  thing_mask_id_maps_plus_one, mask_id_count_filter_mask = _filter_by_count(
+      thing_mask_id_maps_plus_one, thing_area_limit)
+  thing_semantic_maps = thing_semantic_maps * mask_id_count_filter_mask
+
+  # Filtered, unconfident regions will be replaced with `void_label`. The
+  # "plus one" offset is then reverted: the unconfident region (0) becomes -1,
+  # so we add (void_label + 1) to map it back to void_label.
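+  # A purely illustrative example (hypothetical values): with void_label = 255
+  # and label_divisor = 256, a confident thing pixel of semantic ID 7 held in
+  # mask slot 3 (stored as 4 after the "plus one") receives panoptic label
+  # 7 * 256 + 4 = 1796, while a filtered pixel receives 255 * 256 + 0 = 65280.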
+  semantic_maps_new = thing_semantic_maps + stuff_semantic_maps - 1.0
+  semantic_maps_new = (tf.cast(semantic_maps_new < -0.5, tf.float32)
+                       * tf.cast(void_label + 1, tf.float32)
+                       + semantic_maps_new)
+  panoptic_maps = (semantic_maps_new * label_divisor
+                   + thing_mask_id_maps_plus_one)
+  panoptic_maps = tf.cast(tf.round(panoptic_maps), tf.int32)
+  return panoptic_maps
+
+
+def _get_panoptic_predictions(
+    pixel_space_mask_logits: tf.Tensor,
+    transformer_class_logits: tf.Tensor,
+    thing_class_ids: List[int],
+    void_label: int,
+    label_divisor: int,
+    thing_area_limit: int,
+    stuff_area_limit: int,
+    image_shape: List[int],
+    pixel_confidence_threshold=0.4,
+    transformer_class_confidence_threshold=0.7,
+    pieces=1) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+  """Computes the pixel-level panoptic, mask ID, and semantic maps.
+
+  Args:
+    pixel_space_mask_logits: A tf.Tensor of shape [batch, strided_height,
+      strided_width, num_mask_slots]. It contains the pixel-level logit
+      scores, where num_mask_slots is the number of mask slots (for both
+      thing and stuff classes) in MaX-DeepLab.
+    transformer_class_logits: A tf.Tensor of shape [batch, num_mask_slots,
+      num_thing_stuff_classes + 1]. It contains the class logits of each mask
+      slot, where num_mask_slots is the number of mask slots (for both thing
+      and stuff classes) in MaX-DeepLab. The last channel indicates a `void`
+      class.
+    thing_class_ids: A List of integers of shape [num_thing_classes] containing
+      thing class indices.
+    void_label: An integer specifying the void label.
+    label_divisor: An integer specifying the label divisor of the dataset.
+    thing_area_limit: An integer specifying the number of pixels that thing
+      regions need to have at least. The thing region will be included in the
+      panoptic prediction, only if its area is larger than the limit; otherwise,
+      it will be re-assigned as void_label.
+    stuff_area_limit: An integer specifying the number of pixels that stuff
+      regions need to have at least. The stuff region will be included in the
+      panoptic prediction, only if its area is larger than the limit; otherwise,
+      it will be re-assigned as void_label.
+    image_shape: A list of integers specifying the [height, width] of input
+      image.
+    pixel_confidence_threshold: A float indicating a threshold for the pixel
+      level softmax probability confidence of transformer mask logits. If less
+      than the threshold, the pixel locations have confidence `0` in
+      `confident_regions` output, and represent `void` (ignore) regions.
+    transformer_class_confidence_threshold: A float for thresholding the
+      confidence of the transformer_class_probs. The panoptic mask slots with
+      class confidence less than the threshold are filtered and not used for
+      panoptic prediction.
+    pieces: An integer indicating the number of pieces in the piece-wise
+      operation in `_get_mask_id_and_semantic_maps`. When computing panoptic
+      prediction and confident regions, the mask logits are divided width-wise
+      into multiple pieces and processed piece-wise due to the GPU memory limit.
+      Then, the piece-wise outputs are concatenated along the width into the
+      original mask shape. Defaults to 1.
+
+  Returns:
+    A tuple of:
+    - the panoptic prediction as tf.Tensor with shape [batch, height, width].
+    - the mask ID prediction as tf.Tensor with shape [batch, height, width].
+    - the semantic prediction as tf.Tensor with shape [batch, height, width].
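+
+  For example, with label_divisor = 10 as in test_get_panoptic_predictions in
+  max_deeplab_test.py, panoptic label 23 encodes semantic ID 2 and mask slot
+  (instance) ID 3, since 23 = 2 * 10 + 3.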
+ """ + transformer_class_probs = tf.nn.softmax(transformer_class_logits, axis=-1) + batch_size = tf.shape(transformer_class_logits)[0] + # num_thing_stuff_classes does not include `void` class, so we decrease by 1. + num_thing_stuff_classes = ( + transformer_class_logits.get_shape().as_list()[-1] - 1) + # Generate thing and stuff class ids + stuff_class_ids = utils.get_stuff_class_ids( + num_thing_stuff_classes, thing_class_ids, void_label) + + mask_id_map_plus_one_lists = tf.TensorArray( + tf.int32, size=batch_size, dynamic_size=False) + semantic_map_lists = tf.TensorArray( + tf.int32, size=batch_size, dynamic_size=False) + thing_mask_lists = tf.TensorArray( + tf.float32, size=batch_size, dynamic_size=False) + stuff_mask_lists = tf.TensorArray( + tf.float32, size=batch_size, dynamic_size=False) + for i in tf.range(batch_size): + mask_id_map_plus_one, semantic_map, thing_mask, stuff_mask = ( + _get_mask_id_and_semantic_maps( + thing_class_ids, stuff_class_ids, + pixel_space_mask_logits[i, ...], transformer_class_probs[i, ...], + image_shape, pixel_confidence_threshold, + transformer_class_confidence_threshold, pieces) + ) + mask_id_map_plus_one_lists = mask_id_map_plus_one_lists.write( + i, mask_id_map_plus_one) + semantic_map_lists = semantic_map_lists.write(i, semantic_map) + thing_mask_lists = thing_mask_lists.write(i, thing_mask) + stuff_mask_lists = stuff_mask_lists.write(i, stuff_mask) + # This does not work with unknown shapes. + mask_id_maps_plus_one = mask_id_map_plus_one_lists.stack() + semantic_maps = semantic_map_lists.stack() + thing_masks = thing_mask_lists.stack() + stuff_masks = stuff_mask_lists.stack() + + panoptic_maps = _merge_mask_id_and_semantic_maps( + mask_id_maps_plus_one, semantic_maps, thing_masks, stuff_masks, + void_label, label_divisor, thing_area_limit, stuff_area_limit) + return panoptic_maps, mask_id_maps_plus_one, semantic_maps + + +class PostProcessor(tf.keras.layers.Layer): + """This class contains code of a MaX-DeepLab post-processor.""" + + def __init__( + self, + config: config_pb2.ExperimentOptions, + dataset_descriptor: dataset.DatasetDescriptor): + """Initializes a MaX-DeepLab post-processor. + + Args: + config: A config_pb2.ExperimentOptions configuration. + dataset_descriptor: A dataset.DatasetDescriptor. + """ + super(PostProcessor, self).__init__(name='PostProcessor') + self._post_processor = functools.partial( + _get_panoptic_predictions, + thing_class_ids=list(dataset_descriptor.class_has_instances_list), + void_label=dataset_descriptor.ignore_label, + label_divisor=dataset_descriptor.panoptic_label_divisor, + thing_area_limit=config.evaluator_options.thing_area_limit, + stuff_area_limit=config.evaluator_options.stuff_area_limit, + image_shape=list(config.eval_dataset_options.crop_size), + transformer_class_confidence_threshold=config.evaluator_options + .transformer_class_confidence_threshold, + pixel_confidence_threshold=config.evaluator_options + .pixel_confidence_threshold, + pieces=1) + + def call(self, result_dict: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]: + """Performs the post-processing given model predicted results. + + Args: + result_dict: A dictionary of tf.Tensor containing model results. 
The dict + has to contain + - common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY, + - common.PRED_TRANSFORMER_CLASS_LOGITS_KEY, + + Returns: + The post-processed dict of tf.Tensor, containing the following: + - common.PRED_SEMANTIC_KEY, + - common.PRED_INSTANCE_KEY, + - common.PRED_PANOPTIC_KEY, + """ + processed_dict = {} + (processed_dict[common.PRED_PANOPTIC_KEY], + processed_dict[common.PRED_INSTANCE_KEY], + processed_dict[common.PRED_SEMANTIC_KEY] + ) = self._post_processor( + result_dict[common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY], + result_dict[common.PRED_TRANSFORMER_CLASS_LOGITS_KEY]) + return processed_dict diff --git a/model/post_processor/max_deeplab_test.py b/model/post_processor/max_deeplab_test.py new file mode 100644 index 0000000000000000000000000000000000000000..639e3c515c20ce9e498bd23c39965951b7514823 --- /dev/null +++ b/model/post_processor/max_deeplab_test.py @@ -0,0 +1,231 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test for max_deeplab.py.""" +import numpy as np +import tensorflow as tf + +from deeplab2.model.post_processor import max_deeplab + + +class PostProcessingTest(tf.test.TestCase): + + def test_filter_by_count(self): + input_index_map = tf.convert_to_tensor( + [[[1, 1, 1, 1], + [1, 2, 2, 1], + [3, 3, 3, 3], + [4, 5, 5, 5]], + [[4, 5, 5, 5], + [3, 3, 3, 3], + [1, 2, 2, 1], + [1, 1, 1, 1]]], dtype=tf.float32) + area_limit = 3 + filtered_index_map, mask = max_deeplab._filter_by_count( + input_index_map, area_limit) + + expected_filtered_index_map = tf.convert_to_tensor( + [[[1, 1, 1, 1], + [1, 0, 0, 1], + [3, 3, 3, 3], + [0, 5, 5, 5]], + [[0, 5, 5, 5], + [3, 3, 3, 3], + [1, 0, 0, 1], + [1, 1, 1, 1]]], dtype=tf.float32) + np.testing.assert_equal(filtered_index_map.numpy(), + expected_filtered_index_map.numpy()) + expected_mask = tf.convert_to_tensor( + [[[1, 1, 1, 1], + [1, 0, 0, 1], + [1, 1, 1, 1], + [0, 1, 1, 1]], + [[0, 1, 1, 1], + [1, 1, 1, 1], + [1, 0, 0, 1], + [1, 1, 1, 1]]], dtype=tf.float32) + np.testing.assert_equal(mask.numpy(), expected_mask.numpy()) + + def test_get_mask_id_and_semantic_maps(self): + height = 21 + width = 21 + num_mask_slots = 5 + num_thing_stuff_classes = 19 + thing_class_ids = list(range(11, 19)) + stuff_class_ids = list(range(0, 11)) + pixel_space_mask_logits = tf.random.uniform( + (height, width, num_mask_slots), minval=-10, maxval=10) + # Class scores are normalized beforehand (softmax-ed beforehand). 
+    transformer_class_probs = tf.random.uniform(
+        (num_mask_slots, num_thing_stuff_classes + 1), minval=0, maxval=1)
+    input_shape = [41, 41]
+    pixel_confidence_threshold = 0.4
+    transformer_class_confidence_threshold = 0.7
+    pieces = 2
+
+    mask_id_map, semantic_map, thing_mask, stuff_mask = (
+        max_deeplab._get_mask_id_and_semantic_maps(
+            thing_class_ids, stuff_class_ids, pixel_space_mask_logits,
+            transformer_class_probs, input_shape, pixel_confidence_threshold,
+            transformer_class_confidence_threshold, pieces)
+        )
+    self.assertListEqual(mask_id_map.get_shape().as_list(), input_shape)
+    self.assertListEqual(semantic_map.get_shape().as_list(), input_shape)
+    self.assertListEqual(thing_mask.get_shape().as_list(), input_shape)
+    self.assertListEqual(stuff_mask.get_shape().as_list(), input_shape)
+
+  def test_merge_mask_id_and_semantic_maps(self):
+    mask_id_maps = tf.convert_to_tensor(
+        [[[1, 1, 1, 1],
+          [1, 2, 2, 1],
+          [3, 3, 4, 4],
+          [5, 5, 6, 6]]], dtype=tf.int32)
+    semantic_maps = tf.convert_to_tensor(
+        [[[0, 0, 0, 0],
+          [0, 1, 1, 0],
+          [2, 2, 2, 2],
+          [2, 2, 3, 3]]], dtype=tf.int32)
+    thing_masks = tf.convert_to_tensor(
+        [[[0, 0, 0, 0],
+          [0, 0, 0, 0],
+          [1, 1, 1, 1],
+          [1, 0, 1, 1]]], dtype=tf.float32)  # thing_class_ids = [2, 3]
+    stuff_masks = tf.convert_to_tensor(
+        [[[1, 1, 1, 0],
+          [1, 1, 1, 1],
+          [0, 0, 0, 0],
+          [0, 0, 0, 0]]], dtype=tf.float32)  # stuff_class_ids = [0, 1]
+
+    batch_size = 3
+    mask_id_maps = tf.repeat(mask_id_maps, repeats=batch_size, axis=0)
+    semantic_maps = tf.repeat(semantic_maps, repeats=batch_size, axis=0)
+    thing_masks = tf.repeat(thing_masks, repeats=batch_size, axis=0)
+    stuff_masks = tf.repeat(stuff_masks, repeats=batch_size, axis=0)
+
+    label_divisor = 100
+    stuff_area_limit = 3
+    void_label = 255
+    thing_area_limit = 2
+    # The expected_panoptic_prediction is computed as follows.
+    # All uncertain regions will be labeled as `void_label * label_divisor`.
+    # For `thing` segmentation, instances 3, 4, and 6 are kept, but instance 5
+    # is re-labeled as `void_label * label_divisor` since its area is reduced
+    # by `confident_regions` and then filtered out by thing_area_limit.
+    # For `stuff` segmentation, the class-0 region is kept, while the class-1
+    # region is re-labeled as `void_label * label_divisor` since its area is
+    # smaller than stuff_area_limit.
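+    # For instance (reading off the inputs above): thing instance 3 has
+    # semantic ID 2, so its surviving pixels are encoded as
+    # 2 * label_divisor + 3 = 203 in the expected prediction below.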
+    expected_panoptic_prediction = tf.convert_to_tensor(
+        [[[0, 0, 0, void_label * label_divisor],
+          [0, void_label * label_divisor, void_label * label_divisor, 0],
+          [2 * label_divisor + 3, 2 * label_divisor + 3, 2 * label_divisor + 4,
+           2 * label_divisor + 4],
+          [void_label * label_divisor, void_label * label_divisor,
+           3 * label_divisor + 6, 3 * label_divisor + 6]]],
+        dtype=tf.int32)
+    expected_panoptic_prediction = tf.repeat(
+        expected_panoptic_prediction, repeats=batch_size, axis=0)
+    panoptic_prediction = (
+        max_deeplab._merge_mask_id_and_semantic_maps(
+            mask_id_maps, semantic_maps, thing_masks, stuff_masks, void_label,
+            label_divisor, thing_area_limit, stuff_area_limit))
+
+    np.testing.assert_equal(expected_panoptic_prediction.numpy(),
+                            panoptic_prediction.numpy())
+
+  def test_get_panoptic_predictions(self):
+    batch = 1
+    height = 5
+    width = 5
+    num_thing_stuff_classes = 2
+    thing_class_ids = list(range(1, num_thing_stuff_classes + 1))  # [1, 2]
+    label_divisor = 10
+    stuff_area_limit = 3
+    void_label = 0  # `class-0` is `void`
+
+    o, x = 10, -10
+    pixel_space_mask_logits = tf.convert_to_tensor(
+        [[[[o, o, o, o, o],  # instance-1 mask
+           [o, x, x, o, o],
+           [x, x, x, x, x],
+           [x, x, x, x, x],
+           [x, x, x, x, x]],
+
+          [[x, x, x, x, x],  # instance-2 mask
+           [x, o, o, x, x],
+           [x, o, o, x, x],
+           [x, o, o, x, x],
+           [x, x, x, x, x]],
+
+          [[x, x, x, x, x],  # instance-3 mask
+           [x, x, x, x, x],
+           [o, x, x, o, o],
+           [o, x, x, o, o],
+           [o, o, o, o, o]]]],
+        dtype=tf.float32)
+    pixel_space_mask_logits = tf.transpose(pixel_space_mask_logits,
+                                           perm=[0, 2, 3, 1])  # b, h, w, c
+    # The softmax applied inside _get_panoptic_predictions turns these logits
+    # into near-one-hot class scores; the third column (class-2) represents
+    # the `void` class scores.
+    transformer_class_logits = tf.convert_to_tensor(
+        [[
+            [o, x, x],  # instance-1 -- class-0
+            [o, x, x],  # instance-2 -- class-0
+            [x, o, x],  # instance-3 -- class-1
+        ]], dtype=tf.float32)
+
+    input_shape = [5, 5]
+    pixel_confidence_threshold = 0.4
+    transformer_class_confidence_threshold = 0.7
+    thing_area_limit = 3
+    pieces = 1  # No piece-wise operation used.
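+    # Note on the class mapping (derived from the setup above): the argmax of
+    # the transformer class logits gives mask-slot classes [0, 0, 1], which
+    # index into thing_stuff_class_ids = [1, 2], yielding semantic IDs 1, 1, 2
+    # for the three instances in expected_semantic_maps below.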
+ + panoptic_maps, mask_id_maps, semantic_maps = ( + max_deeplab._get_panoptic_predictions( + pixel_space_mask_logits, transformer_class_logits, thing_class_ids, + void_label, label_divisor, thing_area_limit, stuff_area_limit, + input_shape, pixel_confidence_threshold, + transformer_class_confidence_threshold, pieces) + ) + self.assertSequenceEqual(panoptic_maps.shape, (batch, height, width)) + self.assertSequenceEqual(semantic_maps.shape, (batch, height, width)) + self.assertSequenceEqual(mask_id_maps.shape, (batch, height, width)) + expected_panoptic_maps = [[ # label_divisor = 10 + [11, 11, 11, 11, 11], # 11: semantic_id=1, instance_id=1 + [11, 12, 12, 11, 11], # 12: semantic_id=1, instance_id=2 + [23, 12, 12, 23, 23], # 23: semantic_id=2, instance_id=3 + [23, 12, 12, 23, 23], + [23, 23, 23, 23, 23], + ]] + np.testing.assert_array_equal(panoptic_maps, expected_panoptic_maps) + expected_mask_id_maps = [[ + [1, 1, 1, 1, 1], + [1, 2, 2, 1, 1], + [3, 2, 2, 3, 3], + [3, 2, 2, 3, 3], + [3, 3, 3, 3, 3], + ]] + np.testing.assert_array_equal(mask_id_maps, expected_mask_id_maps) + expected_semantic_maps = [[ + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [2, 1, 1, 2, 2], + [2, 1, 1, 2, 2], + [2, 2, 2, 2, 2], + ]] + np.testing.assert_array_equal(semantic_maps, expected_semantic_maps) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/post_processor/motion_deeplab.py b/model/post_processor/motion_deeplab.py new file mode 100644 index 0000000000000000000000000000000000000000..afd637f6e4e2d1c57a9f3f7df9d1c98c617a8cc3 --- /dev/null +++ b/model/post_processor/motion_deeplab.py @@ -0,0 +1,257 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains functions to post-process Motion-DeepLab results.""" + +from typing import Tuple + +import tensorflow as tf + + +def assign_instances_to_previous_tracks( + prev_centers: tf.Tensor, + current_centers: tf.Tensor, + heatmap: tf.Tensor, + offsets: tf.Tensor, + panoptic_map: tf.Tensor, + next_id: tf.Tensor, + label_divisor: int, + sigma=7) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + """Greedy assignment of current centers to previous centers. + + Current centers are selected in decreasing order of confidence (heatmap + scores). These centers are transformed with the offsets and assigned to + previous centers. + + Args: + prev_centers: A tf.Tensor containing previous centers of shape [Np, 5]. This + tensor contains: + [0]: The x-coordinate. + [1]: The y-coordinate. + [2]: The panoptic ID. + [3]: The geometric mean of width and height of the instance mask. + [4]: The number of frames that no new masks got assigned to this center. + current_centers: A tf.Tensor containing centers of current frame of shape + [Nc, 5]. This tensor contains: + [0]: The x-coordinate. + [1]: The y-coordinate. + [2]: The panoptic ID. + [3]: The geometric mean of width and height of the instance mask. + [4]: The number of frames that no new masks got assigned to this center. 
+    heatmap: A tf.Tensor of shape [batch, height, width] containing the center
+      heatmap.
+    offsets: A tf.Tensor of shape [batch, height, width, 2] containing the
+      center offsets.
+    panoptic_map: A tf.Tensor of shape [batch, height, width] containing the
+      panoptic segmentation.
+    next_id: A tf.Tensor of shape [1] containing the next ID.
+    label_divisor: An integer specifying the label divisor for panoptic IDs.
+    sigma: An optional integer specifying the number of frames that unmatched
+      centers should be kept for (default: 7).
+
+  Returns:
+    A tuple of three tf.Tensor:
+      1. The updated panoptic segmentation map that contains track IDs.
+      2. The updated tensor containing all current centers (including unmatched
+         previous ones).
+      3. The updated next ID that can be used for new tracks.
+  """
+  # Switch x and y coordinates for indexing.
+  center_indices = tf.concat(
+      [tf.zeros([tf.shape(current_centers)[0], 1], dtype=tf.int32),
+       current_centers[:, 1:2], current_centers[:, 0:1]],
+      axis=1)
+  confidence_scores = tf.gather_nd(heatmap, center_indices)
+
+  scores = tf.argsort(confidence_scores, direction='DESCENDING')
+  cond = lambda i, *_: i < tf.shape(center_indices)[0]
+
+  def body(i, current_centers_loop, prev_centers_loop, new_panoptic_map_loop,
+           next_id_loop):
+    row_index = scores[i]
+    i = tf.add(i, 1)
+    center_id = current_centers_loop[row_index, 2]
+    center_location = current_centers_loop[row_index, :2]
+    center_offset_yx = offsets[0, center_location[1], center_location[0], :]
+    center_offset_xy = center_offset_yx[::-1]
+    center_location = center_offset_xy + tf.cast(center_location, tf.float32)
+    center_sem_id = center_id // label_divisor
+    center_mask = tf.equal(panoptic_map, center_id)
+    prev_centers_class = prev_centers_loop[:, 2] // label_divisor
+    prev_centers_with_same_class = tf.squeeze(
+        tf.cast(
+            tf.gather(
+                prev_centers_loop,
+                tf.where(tf.equal(prev_centers_class, center_sem_id)),
+                axis=0), tf.float32),
+        axis=1)
+
+    # Check if there are still unassigned previous centers of the same class.
+    if tf.shape(prev_centers_with_same_class)[0] > 0:
+      # For efficiency reasons, we do not take the sqrt when we compute the
+      # minimal distances. See render_panoptic_map_as_heatmap as well.
+      distances = tf.reduce_sum(
+          tf.square(prev_centers_with_same_class[:, :2] - center_location),
+          axis=1)
+      prev_center_index = tf.math.argmin(
+          distances, axis=0, output_type=tf.int32)
+      min_dist = distances[prev_center_index]
+
+      # If the previous center is within a certain range, continue the track.
+      if min_dist < prev_centers_with_same_class[prev_center_index, 3]:
+        new_center_id = tf.cast(
+            prev_centers_with_same_class[prev_center_index, 2], dtype=tf.int32)
+        shape = new_panoptic_map_loop.get_shape()
+        new_panoptic_map_loop = tf.where(center_mask, new_center_id,
+                                         new_panoptic_map_loop)
+        new_panoptic_map_loop.set_shape(shape)
+        current_centers_loop = tf.tensor_scatter_nd_update(
+            current_centers_loop, tf.expand_dims([row_index, 2], 0),
+            [new_center_id])
+        # Remove previous center.
+ prev_centers_loop = tf.squeeze( + tf.gather( + prev_centers_loop, + tf.where(tf.not_equal(prev_centers_loop[:, 2], new_center_id)), + axis=0), + axis=1) + return (i, current_centers_loop, prev_centers_loop, + new_panoptic_map_loop, next_id_loop) + else: + # Assign new track ID + new_center_id = center_sem_id * label_divisor + next_id_loop + shape = new_panoptic_map_loop.get_shape() + new_panoptic_map_loop = tf.where(center_mask, new_center_id, + new_panoptic_map_loop) + new_panoptic_map_loop.set_shape(shape) + current_centers_loop = tf.tensor_scatter_nd_update( + current_centers_loop, tf.expand_dims([row_index, 2], 0), + [new_center_id]) + next_id_loop += 1 + return (i, current_centers_loop, prev_centers_loop, + new_panoptic_map_loop, next_id_loop) + else: + # Assign new track ID + new_center_id = center_sem_id * label_divisor + next_id_loop + shape = new_panoptic_map_loop.get_shape() + new_panoptic_map_loop = tf.where(center_mask, new_center_id, + new_panoptic_map_loop) + new_panoptic_map_loop.set_shape(shape) + current_centers_loop = tf.tensor_scatter_nd_update( + current_centers_loop, tf.expand_dims([row_index, 2], 0), + [new_center_id]) + next_id_loop += 1 + return (i, current_centers_loop, prev_centers_loop, new_panoptic_map_loop, + next_id_loop) + + loop_start_index = tf.constant(0) + (_, current_centers, + unmatched_centers, new_panoptic_map, next_id) = tf.while_loop( + cond, body, + (loop_start_index, current_centers, prev_centers, panoptic_map, + next_id)) + + # Keep unmatched centers for sigma frames. + if tf.shape(unmatched_centers)[0] > 0: + current_centers = tf.concat([current_centers, unmatched_centers], axis=0) + + number_centers = tf.shape(current_centers)[0] + indices_row = tf.range(number_centers, dtype=tf.int32) + indices_column = tf.repeat([4], number_centers, axis=0) + indices = tf.stack([indices_row, indices_column], axis=1) + current_centers = tf.tensor_scatter_nd_add( + current_centers, indices, + tf.repeat([1], number_centers, axis=0)) + + # Remove centers after sigma frames. + current_centers = tf.squeeze( + tf.gather( + current_centers, + tf.where(tf.not_equal(current_centers[:, 4], sigma)), + axis=0), + axis=1) + + return new_panoptic_map, current_centers, next_id + + +def render_panoptic_map_as_heatmap( + panoptic_map: tf.Tensor, sigma: int, label_divisor: int, + void_label: int) -> Tuple[tf.Tensor, tf.Tensor]: + """Extracts centers from panoptic map and renders as heatmap.""" + gaussian_size = 6 * sigma + 3 + x = tf.range(gaussian_size, dtype=tf.float32) + y = tf.expand_dims(x, axis=1) + x0, y0 = 3 * sigma + 1, 3 * sigma + 1 + gaussian = tf.math.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)) + gaussian = tf.cast(tf.reshape(gaussian, [-1]), tf.float32) + + height = tf.shape(panoptic_map)[1] + width = tf.shape(panoptic_map)[2] + # Pad center to make boundary handling easier. + center_pad_begin = int(round(3 * sigma + 1)) + center_pad_end = int(round(3 * sigma + 2)) + center_pad = center_pad_begin + center_pad_end + + center = tf.zeros((height + center_pad, width + center_pad)) + unique_ids, _ = tf.unique(tf.reshape(panoptic_map, [-1])) + centers_and_ids = tf.TensorArray( + tf.int32, size=0, dynamic_size=True, clear_after_read=False) + counter = tf.zeros([], dtype=tf.int32) + + for panoptic_id in unique_ids: + semantic_id = panoptic_id // label_divisor + # Filter out IDs that should be ignored, are stuff classes or crowd. 
+    # Stuff classes and crowd regions both have IDs of the form panoptic_id =
+    # semantic_id * label_divisor.
+    if semantic_id == void_label or panoptic_id % label_divisor == 0:
+      continue
+
+    # Convert [[0, y0, x0], ...] to [[0, ...], [y0, ...], [x0, ...]].
+    mask_index = tf.cast(
+        tf.transpose(tf.where(panoptic_map == panoptic_id)), tf.float32)
+    mask_size = (
+        tf.reduce_max(mask_index, axis=1) - tf.reduce_min(mask_index, axis=1))
+    # The radius is defined as the geometric mean of width and height.
+    # For efficiency reasons, we do not take the sqrt when we compute the
+    # minimal distances. See assign_instances_to_previous_tracks as well.
+    mask_radius = tf.cast(tf.round(mask_size[1] * mask_size[2]), tf.int32)
+    centers = tf.reduce_mean(mask_index, axis=1)
+
+    center_x = tf.cast(tf.round(centers[2]), tf.int32)
+    center_y = tf.cast(tf.round(centers[1]), tf.int32)
+    centers_and_ids = centers_and_ids.write(
+        counter,
+        [center_x, center_y, tf.cast(panoptic_id, tf.int32), mask_radius, 0])
+    counter += 1
+
+    # Due to the padding with center_pad_begin in center, the computed center
+    # becomes the upper left corner in the center tensor.
+    upper_left = center_x, center_y
+    bottom_right = (upper_left[0] + gaussian_size,
+                    upper_left[1] + gaussian_size)
+
+    indices_x, indices_y = tf.meshgrid(
+        tf.range(upper_left[0], bottom_right[0]),
+        tf.range(upper_left[1], bottom_right[1]))
+    indices = tf.transpose(
+        tf.stack([tf.reshape(indices_y, [-1]),
+                  tf.reshape(indices_x, [-1])]))
+
+    center = tf.tensor_scatter_nd_max(
+        center, indices, gaussian, name='center_scatter')
+
+  center = center[center_pad_begin:(center_pad_begin + height),
+                  center_pad_begin:(center_pad_begin + width)]
+  return tf.expand_dims(center, axis=0), centers_and_ids.stack()
diff --git a/model/post_processor/panoptic_deeplab.py b/model/post_processor/panoptic_deeplab.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1d6aa31331c9035bc741f33494bd9df95fe676b
--- /dev/null
+++ b/model/post_processor/panoptic_deeplab.py
@@ -0,0 +1,463 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains functions to post-process Panoptic-DeepLab results."""
+
+import functools
+from typing import Tuple, Dict, Text
+
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2 import config_pb2
+from deeplab2.data import dataset
+from deeplab2.model import utils
+from deeplab2.tensorflow_ops.python.ops import merge_semantic_and_instance_maps_op as merge_ops
+
+
+def _get_semantic_predictions(semantic_logits: tf.Tensor) -> tf.Tensor:
+  """Computes the semantic classes from the predictions.
+
+  Args:
+    semantic_logits: A tf.Tensor of shape [batch, height, width, classes].
+
+  Returns:
+    A tf.Tensor containing the semantic class prediction of shape
+    [batch, height, width].
+ """ + return tf.argmax(semantic_logits, axis=-1, output_type=tf.int32) + + +def _get_instance_centers_from_heatmap( + center_heatmap: tf.Tensor, center_threshold: float, nms_kernel_size: int, + keep_k_centers: int) -> Tuple[tf.Tensor, tf.Tensor]: + """Computes a list of instance centers. + + Args: + center_heatmap: A tf.Tensor of shape [height, width, 1]. + center_threshold: A float setting the threshold for the center heatmap. + nms_kernel_size: An integer specifying the nms kernel size. + keep_k_centers: An integer specifying the number of centers to keep (K). + Non-positive values will keep all centers. + + Returns: + A tuple of + - tf.Tensor of shape [N, 2] containing N center coordinates (after + non-maximum suppression) in (y, x) order. + - tf.Tensor of shape [height, width] containing the center heatmap after + non-maximum suppression. + """ + # Threshold center map. + center_heatmap = tf.where( + tf.greater(center_heatmap, center_threshold), center_heatmap, 0.0) + + # Non-maximum suppression. + padded_map = utils.add_zero_padding(center_heatmap, nms_kernel_size, rank=3) + pooled_center_heatmap = tf.keras.backend.pool2d( + tf.expand_dims(padded_map, 0), + pool_size=(nms_kernel_size, nms_kernel_size), + strides=(1, 1), + padding='valid', + pool_mode='max') + center_heatmap = tf.where( + tf.equal(pooled_center_heatmap, center_heatmap), center_heatmap, 0.0) + center_heatmap = tf.squeeze(center_heatmap, axis=[0, 3]) + + # `centers` is of shape (N, 2) with (y, x) order of the second dimension. + centers = tf.where(tf.greater(center_heatmap, 0.0)) + + if keep_k_centers > 0 and tf.shape(centers)[0] > keep_k_centers: + topk_scores, _ = tf.math.top_k( + tf.reshape(center_heatmap, [-1]), keep_k_centers, sorted=False) + centers = tf.where(tf.greater(center_heatmap, topk_scores[-1])) + + return centers, center_heatmap + + +def _find_closest_center_per_pixel(centers: tf.Tensor, + center_offsets: tf.Tensor) -> tf.Tensor: + """Assigns all pixels to their closest center. + + Args: + centers: A tf.Tensor of shape [N, 2] containing N centers with coordinate + order (y, x). + center_offsets: A tf.Tensor of shape [height, width, 2]. + + Returns: + A tf.Tensor of shape [height, width] containing the index of the closest + center, per pixel. + """ + height = tf.shape(center_offsets)[0] + width = tf.shape(center_offsets)[1] + + x_coord, y_coord = tf.meshgrid(tf.range(width), tf.range(height)) + coord = tf.stack([y_coord, x_coord], axis=-1) + + center_per_pixel = tf.cast(coord, tf.float32) + center_offsets + + # centers: [N, 2] -> [N, 1, 2]. + # center_per_pixel: [H, W, 2] -> [1, H*W, 2]. + centers = tf.cast(tf.expand_dims(centers, 1), tf.float32) + center_per_pixel = tf.reshape(center_per_pixel, [height*width, 2]) + center_per_pixel = tf.expand_dims(center_per_pixel, 0) + + # distances: [N, H*W]. + distances = tf.norm(centers - center_per_pixel, axis=-1) + + return tf.reshape(tf.argmin(distances, axis=0), [height, width]) + + +def _get_instances_from_heatmap_and_offset( + semantic_segmentation: tf.Tensor, center_heatmap: tf.Tensor, + center_offsets: tf.Tensor, center_threshold: float, + thing_class_ids: tf.Tensor, nms_kernel_size: int, + keep_k_centers: int) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + """Computes the instance assignment per pixel. + + Args: + semantic_segmentation: A tf.Tensor containing the semantic labels of shape + [height, width]. + center_heatmap: A tf.Tensor of shape [height, width, 1]. + center_offsets: A tf.Tensor of shape [height, width, 2]. 
+ center_threshold: A float setting the threshold for the center heatmap. + thing_class_ids: A tf.Tensor of shape [N] containing N thing indices. + nms_kernel_size: An integer specifying the nms kernel size. + keep_k_centers: An integer specifying the number of centers to keep. + Negative values will keep all centers. + + Returns: + A tuple of: + - tf.Tensor containing the instance segmentation (filtered with the `thing` + segmentation from the semantic segmentation output) with shape + [height, width]. + - tf.Tensor containing the processed centermap with shape [height, width]. + - tf.Tensor containing instance scores (where higher "score" is a reasonable + signal of a higher confidence detection.) Will be of shape [height, width] + with the score for a pixel being the score of the instance it belongs to. + The scores will be zero for pixels in background/"stuff" regions. + """ + thing_segmentation = tf.zeros_like(semantic_segmentation) + for thing_id in thing_class_ids: + thing_segmentation = tf.where(tf.equal(semantic_segmentation, thing_id), + 1, + thing_segmentation) + + centers, processed_center_heatmap = _get_instance_centers_from_heatmap( + center_heatmap, center_threshold, nms_kernel_size, keep_k_centers) + if tf.shape(centers)[0] == 0: + return (tf.zeros_like(semantic_segmentation), processed_center_heatmap, + tf.zeros_like(processed_center_heatmap)) + + instance_center_index = _find_closest_center_per_pixel( + centers, center_offsets) + # Instance IDs should start with 1. So we use the index into the centers, but + # shifted by 1. + instance_segmentation = tf.cast(instance_center_index, tf.int32) + 1 + + # The value of the heatmap at an instance's center is used as the score + # for that instance. + instance_scores = tf.gather_nd(processed_center_heatmap, centers) + tf.debugging.assert_shapes([ + (centers, ('N', 2)), + (instance_scores, ('N',)), + ]) + # This will map the instance scores back to the image space: where each pixel + # has a value equal to the score of its instance. + flat_center_index = tf.reshape(instance_center_index, [-1]) + instance_score_map = tf.gather(instance_scores, flat_center_index) + instance_score_map = tf.reshape(instance_score_map, + tf.shape(instance_segmentation)) + instance_score_map *= tf.cast(thing_segmentation, tf.float32) + + return (thing_segmentation * instance_segmentation, processed_center_heatmap, + instance_score_map) + + +@tf.function +def _get_panoptic_predictions( + semantic_logits: tf.Tensor, center_heatmap: tf.Tensor, + center_offsets: tf.Tensor, center_threshold: float, + thing_class_ids: tf.Tensor, label_divisor: int, stuff_area_limit: int, + void_label: int, nms_kernel_size: int, keep_k_centers: int, + merge_semantic_and_instance_with_tf_op: bool +) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]: + """Computes the semantic class and instance ID per pixel. + + Args: + semantic_logits: A tf.Tensor of shape [batch, height, width, classes]. + center_heatmap: A tf.Tensor of shape [batch, height, width, 1]. + center_offsets: A tf.Tensor of shape [batch, height, width, 2]. + center_threshold: A float setting the threshold for the center heatmap. + thing_class_ids: A tf.Tensor of shape [N] containing N thing indices. + label_divisor: An integer specifying the label divisor of the dataset. + stuff_area_limit: An integer specifying the number of pixels that stuff + regions need to have at least. 
The stuff region will be included in the
+      panoptic prediction, only if its area is larger than the limit; otherwise,
+      it will be re-assigned as void_label.
+    void_label: An integer specifying the void label.
+    nms_kernel_size: An integer specifying the nms kernel size.
+    keep_k_centers: An integer specifying the number of centers to keep.
+      Negative values will keep all centers.
+    merge_semantic_and_instance_with_tf_op: Boolean, specifying whether the
+      merging operation uses the TensorFlow (CUDA kernel) implementation
+      (True) or the tf.py_function implementation (False). Note the
+      tf.py_function implementation is simply used as a backup solution when
+      you could not successfully compile the provided TensorFlow
+      implementation. To reproduce our results, please use the provided
+      TensorFlow implementation `merge_ops` (i.e., set to True).
+
+  Returns:
+    A tuple of:
+    - the panoptic prediction as tf.Tensor with shape [batch, height, width].
+    - the semantic prediction as tf.Tensor with shape [batch, height, width].
+    - the instance prediction as tf.Tensor with shape [batch, height, width].
+    - the centermap prediction as tf.Tensor with shape [batch, height, width].
+    - the instance score maps as tf.Tensor with shape [batch, height, width].
+  """
+  semantic_prediction = _get_semantic_predictions(semantic_logits)
+  batch_size = tf.shape(semantic_logits)[0]
+
+  instance_map_lists = tf.TensorArray(
+      tf.int32, size=batch_size, dynamic_size=False)
+  center_map_lists = tf.TensorArray(
+      tf.float32, size=batch_size, dynamic_size=False)
+  instance_score_map_lists = tf.TensorArray(
+      tf.float32, size=batch_size, dynamic_size=False)
+
+  for i in tf.range(batch_size):
+    (instance_map, center_map,
+     instance_score_map) = _get_instances_from_heatmap_and_offset(
+         semantic_prediction[i, ...], center_heatmap[i, ...],
+         center_offsets[i, ...], center_threshold, thing_class_ids,
+         nms_kernel_size, keep_k_centers)
+    instance_map_lists = instance_map_lists.write(i, instance_map)
+    center_map_lists = center_map_lists.write(i, center_map)
+    instance_score_map_lists = instance_score_map_lists.write(
+        i, instance_score_map)
+
+  # This does not work with unknown shapes.
+  instance_maps = instance_map_lists.stack()
+  center_maps = center_map_lists.stack()
+  instance_score_maps = instance_score_map_lists.stack()
+
+  if merge_semantic_and_instance_with_tf_op:
+    panoptic_prediction = merge_ops.merge_semantic_and_instance_maps(
+        semantic_prediction, instance_maps, thing_class_ids, label_divisor,
+        stuff_area_limit, void_label)
+  else:
+    panoptic_prediction = _merge_semantic_and_instance_maps(
+        semantic_prediction, instance_maps, thing_class_ids, label_divisor,
+        stuff_area_limit, void_label)
+  return (panoptic_prediction, semantic_prediction, instance_maps, center_maps,
+          instance_score_maps)
+
+
+@tf.function
+def _merge_semantic_and_instance_maps(
+    semantic_prediction: tf.Tensor,
+    instance_maps: tf.Tensor,
+    thing_class_ids: tf.Tensor,
+    label_divisor: int,
+    stuff_area_limit: int,
+    void_label: int) -> tf.Tensor:
+  """Merges semantic and instance maps to obtain panoptic segmentation.
+
+  This function merges the semantic segmentation and class-agnostic
+  instance segmentation to form the panoptic segmentation. In particular,
+  the class label of each instance mask is inferred from the majority
+  votes from the corresponding pixels in the semantic segmentation. This
+  operation was first proposed in the DeeperLab paper and adopted by
+  Panoptic-DeepLab.
+
+  - DeeperLab: Single-Shot Image Parser, T-J Yang, et al.
arXiv:1902.05093. + - Panoptic-DeepLab, B. Cheng, et al. In CVPR, 2020. + + Note that this function only supports batch = 1 for simplicity. Additionally, + this function has a slightly different implementation from the provided + TensorFlow implementation `merge_ops` but with a similar performance. This + function is mainly used as a backup solution when you could not successfully + compile the provided TensorFlow implementation. To reproduce our results, + please use the provided TensorFlow implementation (i.e., not use this + function, but the `merge_ops.merge_semantic_and_instance_maps`). + + Args: + semantic_prediction: A tf.Tensor of shape [batch, height, width]. + instance_maps: A tf.Tensor of shape [batch, height, width]. + thing_class_ids: A tf.Tensor of shape [N] containing N thing indices. + label_divisor: An integer specifying the label divisor of the dataset. + stuff_area_limit: An integer specifying the number of pixels that stuff + regions need to have at least. The stuff region will be included in the + panoptic prediction, only if its area is larger than the limit; otherwise, + it will be re-assigned as void_label. + void_label: An integer specifying the void label. + + Returns: + panoptic_prediction: A tf.Tensor with shape [batch, height, width]. + """ + prediction_shape = semantic_prediction.get_shape().as_list() + # This implementation only supports batch size of 1. Since model construction + # might lose batch size information (and leave it to None), override it here. + prediction_shape[0] = 1 + semantic_prediction = tf.ensure_shape(semantic_prediction, prediction_shape) + instance_maps = tf.ensure_shape(instance_maps, prediction_shape) + + # Default panoptic_prediction to have semantic label = void_label. + panoptic_prediction = tf.ones_like( + semantic_prediction) * void_label * label_divisor + + # Start to paste predicted `thing` regions to panoptic_prediction. + # Infer `thing` segmentation regions from semantic prediction. + semantic_thing_segmentation = tf.zeros_like(semantic_prediction, + dtype=tf.bool) + for thing_class in thing_class_ids: + semantic_thing_segmentation = tf.math.logical_or( + semantic_thing_segmentation, + semantic_prediction == thing_class) + # Keep track of how many instances for each semantic label. + num_instance_per_semantic_label = tf.TensorArray( + tf.int32, size=0, dynamic_size=True, clear_after_read=False) + instance_ids, _ = tf.unique(tf.reshape(instance_maps, [-1])) + for instance_id in instance_ids: + # Instance ID 0 is reserved for crowd region. + if instance_id == 0: + continue + thing_mask = tf.math.logical_and(instance_maps == instance_id, + semantic_thing_segmentation) + if tf.reduce_sum(tf.cast(thing_mask, tf.int32)) == 0: + continue + semantic_bin_counts = tf.math.bincount( + tf.boolean_mask(semantic_prediction, thing_mask)) + semantic_majority = tf.cast( + tf.math.argmax(semantic_bin_counts), tf.int32) + + while num_instance_per_semantic_label.size() <= semantic_majority: + num_instance_per_semantic_label = num_instance_per_semantic_label.write( + num_instance_per_semantic_label.size(), 0) + + new_instance_id = ( + num_instance_per_semantic_label.read(semantic_majority) + 1) + num_instance_per_semantic_label = num_instance_per_semantic_label.write( + semantic_majority, new_instance_id) + panoptic_prediction = tf.where( + thing_mask, + tf.ones_like(panoptic_prediction) * semantic_majority * label_divisor + + new_instance_id, + panoptic_prediction) + + # Done with `num_instance_per_semantic_label` tensor array. 
+ num_instance_per_semantic_label.close() + + # Start to paste predicted `stuff` regions to panoptic prediction. + instance_stuff_regions = instance_maps == 0 + semantic_ids, _ = tf.unique(tf.reshape(semantic_prediction, [-1])) + for semantic_id in semantic_ids: + if tf.reduce_sum(tf.cast(thing_class_ids == semantic_id, tf.int32)) > 0: + continue + # Check stuff area. + stuff_mask = tf.math.logical_and(semantic_prediction == semantic_id, + instance_stuff_regions) + stuff_area = tf.reduce_sum(tf.cast(stuff_mask, tf.int32)) + if stuff_area >= stuff_area_limit: + panoptic_prediction = tf.where( + stuff_mask, + tf.ones_like(panoptic_prediction) * semantic_id * label_divisor, + panoptic_prediction) + + return panoptic_prediction + + +class SemanticOnlyPostProcessor(tf.keras.layers.Layer): + """This class contains code of a semantic only post-processor.""" + + def __init__(self): + """Initializes a semantic only post-processor.""" + super(SemanticOnlyPostProcessor, self).__init__( + name='SemanticOnlyPostProcessor') + + def call(self, result_dict: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]: + """Performs the post-processing given model predicted results. + + Args: + result_dict: A dictionary of tf.Tensor containing model results. The dict + has to contain + - common.PRED_SEMANTIC_PROBS_KEY, + + Returns: + The post-processed dict of tf.Tensor, containing the following: + - common.PRED_SEMANTIC_KEY, + """ + processed_dict = {} + processed_dict[common.PRED_SEMANTIC_KEY] = _get_semantic_predictions( + result_dict[common.PRED_SEMANTIC_PROBS_KEY]) + return processed_dict + + +class PostProcessor(tf.keras.layers.Layer): + """This class contains code of a Panoptic-Deeplab post-processor.""" + + def __init__( + self, + config: config_pb2.ExperimentOptions, + dataset_descriptor: dataset.DatasetDescriptor): + """Initializes a Panoptic-Deeplab post-processor. + + Args: + config: A config_pb2.ExperimentOptions configuration. + dataset_descriptor: A dataset.DatasetDescriptor. + """ + super(PostProcessor, self).__init__(name='PostProcessor') + self._post_processor = functools.partial( + _get_panoptic_predictions, + center_threshold=config.evaluator_options.center_score_threshold, + thing_class_ids=tf.convert_to_tensor( + dataset_descriptor.class_has_instances_list), + label_divisor=dataset_descriptor.panoptic_label_divisor, + stuff_area_limit=config.evaluator_options.stuff_area_limit, + void_label=dataset_descriptor.ignore_label, + nms_kernel_size=config.evaluator_options.nms_kernel, + keep_k_centers=config.evaluator_options.keep_k_centers, + merge_semantic_and_instance_with_tf_op=( + config.evaluator_options.merge_semantic_and_instance_with_tf_op), + ) + + def call(self, result_dict: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]: + """Performs the post-processing given model predicted results. + + Args: + result_dict: A dictionary of tf.Tensor containing model results. 
The dict + has to contain + - common.PRED_SEMANTIC_PROBS_KEY, + - common.PRED_CENTER_HEATMAP_KEY, + - common.PRED_OFFSET_MAP_KEY, + + Returns: + The post-processed dict of tf.Tensor, containing the following: + - common.PRED_SEMANTIC_KEY, + - common.PRED_INSTANCE_KEY, + - common.PRED_PANOPTIC_KEY, + - common.PRED_INSTANCE_CENTER_KEY, + - common.PRED_INSTANCE_SCORES_KEY, + """ + processed_dict = {} + (processed_dict[common.PRED_PANOPTIC_KEY], + processed_dict[common.PRED_SEMANTIC_KEY], + processed_dict[common.PRED_INSTANCE_KEY], + processed_dict[common.PRED_INSTANCE_CENTER_KEY], + processed_dict[common.PRED_INSTANCE_SCORES_KEY] + ) = self._post_processor( + result_dict[common.PRED_SEMANTIC_PROBS_KEY], + result_dict[common.PRED_CENTER_HEATMAP_KEY], + result_dict[common.PRED_OFFSET_MAP_KEY]) + return processed_dict diff --git a/model/post_processor/panoptic_deeplab_test.py b/model/post_processor/panoptic_deeplab_test.py new file mode 100644 index 0000000000000000000000000000000000000000..01b23656a1854718af2a73225e8a441efcafe0eb --- /dev/null +++ b/model/post_processor/panoptic_deeplab_test.py @@ -0,0 +1,141 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test for panoptic_deeplab.py.""" +import numpy as np +import tensorflow as tf + +from deeplab2.model.post_processor import panoptic_deeplab + + +class PostProcessingTest(tf.test.TestCase): + + def test_py_func_merge_semantic_and_instance_maps_can_run(self): + batch = 1 + height = 5 + width = 5 + semantic_prediction = tf.random.uniform((batch, height, width), + minval=0, + maxval=20, + dtype=tf.int32) + instance_maps = tf.random.uniform((batch, height, width), + minval=0, + maxval=3, + dtype=tf.int32) + thing_class_ids = tf.convert_to_tensor([1, 2, 3]) + label_divisor = 256 + stuff_area_limit = 3 + void_label = 255 + panoptic_prediction = panoptic_deeplab._merge_semantic_and_instance_maps( + semantic_prediction, instance_maps, thing_class_ids, label_divisor, + stuff_area_limit, void_label) + self.assertListEqual(semantic_prediction.get_shape().as_list(), + panoptic_prediction.get_shape().as_list()) + + def test_merge_semantic_and_instance_maps_with_a_simple_example(self): + semantic_prediction = tf.convert_to_tensor( + [[[0, 0, 0, 0], + [0, 1, 1, 0], + [0, 2, 2, 0], + [2, 2, 3, 3]]], dtype=tf.int32) + instance_maps = tf.convert_to_tensor( + [[[0, 0, 0, 0], + [0, 0, 0, 0], + [0, 1, 1, 0], + [2, 2, 3, 3]]], dtype=tf.int32) + thing_class_ids = tf.convert_to_tensor([2, 3]) + label_divisor = 256 + stuff_area_limit = 3 + void_label = 255 + # The expected_panoptic_prediction is computed as follows. + # For `thing` segmentation, instance 1, 2, and 3 are kept, but instance 3 + # will have a new instance ID 1, since it is the first instance in its + # own semantic label. + # For `stuff` segmentation, class-0 region is kept, while class-1 region + # is re-labeled as `void_label * label_divisor` since its area is smaller + # than stuff_area_limit. 
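+    # For instance (reading off the inputs above): the class-2 instance that
+    # keeps instance ID 1 covers pixels (2, 1) and (2, 2), so both are encoded
+    # as 2 * label_divisor + 1 = 513 in the expected prediction below.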
+ expected_panoptic_prediction = tf.convert_to_tensor( + [[[0, 0, 0, 0], + [0, void_label * label_divisor, void_label * label_divisor, 0], + [0, 2 * label_divisor + 1, 2 * label_divisor + 1, 0], + [2 * label_divisor + 2, 2 * label_divisor + 2, 3 * label_divisor + 1, + 3 * label_divisor + 1]]], dtype=tf.int32) + panoptic_prediction = panoptic_deeplab._merge_semantic_and_instance_maps( + semantic_prediction, instance_maps, thing_class_ids, label_divisor, + stuff_area_limit, void_label) + np.testing.assert_equal(expected_panoptic_prediction.numpy(), + panoptic_prediction.numpy()) + + def test_gets_panoptic_predictions_with_score(self): + batch = 1 + height = 5 + width = 5 + classes = 3 + + semantic_logits = tf.random.uniform((batch, 1, 1, classes)) + semantic_logits = tf.tile(semantic_logits, (1, height, width, 1)) + + center_heatmap = tf.convert_to_tensor([ + [1.0, 0.0, 0.0, 0.0, 0.0], + [0.8, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.1, 0.7], + [0.0, 0.0, 0.0, 0.0, 0.2], + ], + dtype=tf.float32) + center_heatmap = tf.expand_dims(center_heatmap, 0) + center_heatmap = tf.expand_dims(center_heatmap, 3) + + center_offsets = tf.zeros((batch, height, width, 2)) + center_threshold = 0.0 + thing_class_ids = tf.range(classes) # No "stuff" classes. + label_divisor = 256 + stuff_area_limit = 16 + void_label = classes + nms_kernel_size = 3 + keep_k_centers = 2 + merge_semantic_and_instance_with_tf_op = True + + result = panoptic_deeplab._get_panoptic_predictions( + semantic_logits, center_heatmap, center_offsets, center_threshold, + thing_class_ids, label_divisor, stuff_area_limit, void_label, + nms_kernel_size, keep_k_centers, merge_semantic_and_instance_with_tf_op) + instance_maps = result[2].numpy() + instance_scores = result[4].numpy() + + self.assertSequenceEqual(instance_maps.shape, (batch, height, width)) + expected_instances = [[ + [1, 1, 1, 1, 2], + [1, 1, 1, 2, 2], + [1, 1, 2, 2, 2], + [1, 2, 2, 2, 2], + [1, 2, 2, 2, 2], + ]] + np.testing.assert_array_equal(instance_maps, expected_instances) + + self.assertSequenceEqual(instance_scores.shape, (batch, height, width)) + expected_instance_scores = [[ + [1.0, 1.0, 1.0, 1.0, 0.7], + [1.0, 1.0, 1.0, 0.7, 0.7], + [1.0, 1.0, 0.7, 0.7, 0.7], + [1.0, 0.7, 0.7, 0.7, 0.7], + [1.0, 0.7, 0.7, 0.7, 0.7], + ]] + np.testing.assert_array_almost_equal(instance_scores, + expected_instance_scores) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/post_processor/post_processor_builder.py b/model/post_processor/post_processor_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..1ca93928236718d510eb65457cfe3da09c72efb5 --- /dev/null +++ b/model/post_processor/post_processor_builder.py @@ -0,0 +1,45 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""This file contains a post-processor builder used in the DeepLab model.""" + +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.data import dataset +from deeplab2.model import utils +from deeplab2.model.post_processor import max_deeplab +from deeplab2.model.post_processor import panoptic_deeplab + + +def get_post_processor( + config: config_pb2.ExperimentOptions, + dataset_descriptor: dataset.DatasetDescriptor) -> tf.keras.layers.Layer: + """Initializes a DeepLab post-processor. + + Args: + config: A config_pb2.ExperimentOptions configuration. + dataset_descriptor: A dataset.DatasetDescriptor. + + Returns: + PostProcessor: A post-processor depending on the configuration. + """ + supported_tasks = utils.get_supported_tasks(config) + if config.model_options.WhichOneof('meta_architecture') == 'max_deeplab': + return max_deeplab.PostProcessor(config, dataset_descriptor) + if common.TASK_PANOPTIC_SEGMENTATION in supported_tasks: + return panoptic_deeplab.PostProcessor(config, dataset_descriptor) + return panoptic_deeplab.SemanticOnlyPostProcessor() diff --git a/model/post_processor/post_processor_builder_test.py b/model/post_processor/post_processor_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7c6ad49c819b7651fd3671332f84ce9bbc8f843e --- /dev/null +++ b/model/post_processor/post_processor_builder_test.py @@ -0,0 +1,77 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for post_processor_builder.py.""" + +import tensorflow as tf + +from google.protobuf import text_format +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.data import dataset +from deeplab2.model.post_processor import post_processor_builder + + +class EvaluatorTest(tf.test.TestCase): + + def test_evaluates_panoptic_deeplab_model(self): + experiment_options_textproto = """ + experiment_name: "evaluation_test" + eval_dataset_options { + dataset: "cityscapes_panoptic" + file_pattern: "EMPTY" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. 
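+        # With min/max_resize_value of 0, inputs keep the 1025x2049 crop
+        # size used by the tensors in this test.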
+        min_resize_value: 0
+        max_resize_value: 0
+      }
+      evaluator_options {
+        continuous_eval_timeout: 43200
+        stuff_area_limit: 2048
+        center_score_threshold: 0.1
+        nms_kernel: 13
+        save_predictions: true
+        save_raw_predictions: false
+      }
+    """
+    config = text_format.Parse(experiment_options_textproto,
+                               config_pb2.ExperimentOptions())
+    config.model_options.panoptic_deeplab.instance.enable = True
+    post_processor = post_processor_builder.get_post_processor(
+        config, dataset.CITYSCAPES_PANOPTIC_INFORMATION)
+
+    result_dict = {
+        common.PRED_SEMANTIC_PROBS_KEY:
+            tf.zeros([1, 1025, 2049, 19], dtype=tf.float32),
+        common.PRED_CENTER_HEATMAP_KEY:
+            tf.zeros([1, 1025, 2049, 1], dtype=tf.float32),
+        common.PRED_OFFSET_MAP_KEY:
+            tf.zeros([1, 1025, 2049, 2], dtype=tf.float32)
+    }
+    processed_dict = post_processor(result_dict)
+    expected_keys = {
+        common.PRED_PANOPTIC_KEY,
+        common.PRED_SEMANTIC_KEY,
+        common.PRED_INSTANCE_KEY,
+        common.PRED_INSTANCE_CENTER_KEY,
+        common.PRED_INSTANCE_SCORES_KEY
+    }
+    self.assertCountEqual(processed_dict.keys(), expected_keys)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/post_processor/vip_deeplab.py b/model/post_processor/vip_deeplab.py
new file mode 100644
index 0000000000000000000000000000000000000000..552841110d94b053776e0539353f835e8ae095a8
--- /dev/null
+++ b/model/post_processor/vip_deeplab.py
@@ -0,0 +1,106 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains functions to post-process ViP-DeepLab results."""
+
+import numpy as np
+
+
+def stitch_video_panoptic_prediction(
+    concat_panoptic: np.ndarray,
+    next_panoptic: np.ndarray,
+    label_divisor: int,
+    overlap_offset: int = 128,
+    combine_offset: int = 2 ** 32) -> np.ndarray:
+  """The stitching algorithm in ViP-DeepLab.
+
+  This function stitches a pair of image panoptic predictions to form video
+  panoptic predictions by propagating instance IDs from concat_panoptic to
+  next_panoptic based on IoU matching.
+
+  Siyuan Qiao, Yukun Zhu, Hartwig Adam, Alan Yuille, and Liang-Chieh Chen.
+  "ViP-DeepLab: Learning Visual Perception with Depth-aware Video Panoptic
+  Segmentation." CVPR, 2021.
+
+  Args:
+    concat_panoptic: Panoptic prediction of the next frame by concatenating
+      it with the current frame.
+    next_panoptic: Panoptic prediction of the next frame.
+    label_divisor: An integer specifying the label divisor of the dataset.
+    overlap_offset: An integer offset to avoid overlap between the IDs in
+      next_panoptic and the propagated IDs from concat_panoptic.
+    combine_offset: An integer offset to combine concat and next panoptic.
+
+  Returns:
+    Panoptic prediction of the next frame with the instance IDs propagated
+    from the concatenated panoptic prediction.
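+
+  Example (an illustrative call, mirroring the unit test in
+  vip_deeplab_test.py):
+
+    new_panoptic = stitch_video_panoptic_prediction(
+        concat_panoptic, next_panoptic, label_divisor=1000)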
+ """ + def _ids_to_counts(id_array: np.ndarray): + """Given a numpy array, a mapping from each entry to its count.""" + ids, counts = np.unique(id_array, return_counts=True) + return dict(zip(ids, counts)) + new_panoptic = next_panoptic.copy() + # Increase the panoptic instance ID to avoid overlap. + new_category = new_panoptic // label_divisor + new_instance = new_panoptic % label_divisor + # We skip 0 which is reserved for crowd. + instance_mask = new_instance > 0 + new_instance[instance_mask] = new_instance[instance_mask] + overlap_offset + new_panoptic = new_category * label_divisor + new_instance + # Pre-compute areas for all the segments. + concat_segment_areas = _ids_to_counts(concat_panoptic) + next_segment_areas = _ids_to_counts(next_panoptic) + # Combine concat_panoptic and next_panoptic. + intersection_id_array = (concat_panoptic.astype(np.int64) * + combine_offset + next_panoptic.astype(np.int64)) + intersection_areas = _ids_to_counts(intersection_id_array) + # Compute IoU and sort them. + intersection_ious = [] + for intersection_id, intersection_area in intersection_areas.items(): + concat_panoptic_label = int(intersection_id // combine_offset) + next_panoptic_label = int(intersection_id % combine_offset) + concat_category_label = concat_panoptic_label // label_divisor + next_category_label = next_panoptic_label // label_divisor + if concat_category_label != next_category_label: + continue + concat_instance_label = concat_panoptic_label % label_divisor + next_instance_label = next_panoptic_label % label_divisor + # We skip 0 which is reserved for crowd. + if concat_instance_label == 0 or next_instance_label == 0: + continue + union = ( + concat_segment_areas[concat_panoptic_label] + + next_segment_areas[next_panoptic_label] - + intersection_area) + iou = intersection_area / union + intersection_ious.append([ + concat_panoptic_label, next_panoptic_label, iou]) + intersection_ious = sorted( + intersection_ious, key=lambda e: e[2]) + # Build mapping and inverse mapping. Two-way mapping guarantees 1-to-1 + # matching. + map_concat_to_next = {} + map_next_to_concat = {} + for (concat_panoptic_label, next_panoptic_label, + iou) in intersection_ious: + map_concat_to_next[concat_panoptic_label] = next_panoptic_label + map_next_to_concat[next_panoptic_label] = concat_panoptic_label + # Match and propagate. + for (concat_panoptic_label, + next_panoptic_label) in map_concat_to_next.items(): + if map_next_to_concat[next_panoptic_label] == concat_panoptic_label: + propagate_mask = next_panoptic == next_panoptic_label + new_panoptic[propagate_mask] = concat_panoptic_label + return new_panoptic diff --git a/model/post_processor/vip_deeplab_test.py b/model/post_processor/vip_deeplab_test.py new file mode 100644 index 0000000000000000000000000000000000000000..e742fe470f5d2410b5c69005130977e9ee50e8a0 --- /dev/null +++ b/model/post_processor/vip_deeplab_test.py @@ -0,0 +1,67 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Test for vip_deeplab.py.""" +import numpy as np +import tensorflow as tf + +from deeplab2.model.post_processor import vip_deeplab + + +class PostProcessingTest(tf.test.TestCase): + + def test_stitch_video_panoptic_prediction(self): + concat_semantic = np.array( + [[[0, 0, 0, 0], + [0, 1, 1, 0], + [0, 2, 2, 0], + [2, 2, 3, 3]]], dtype=np.int32) + concat_instance = np.array( + [[[1, 1, 2, 2], + [1, 0, 0, 2], + [1, 1, 1, 2], + [2, 2, 1, 1]]], dtype=np.int32) + next_semantic = np.array( + [[[0, 1, 1, 0], + [0, 1, 1, 0], + [0, 2, 2, 0], + [2, 2, 3, 3]]], dtype=np.int32) + next_instance = np.array( + [[[2, 0, 0, 1], + [2, 0, 0, 1], + [2, 4, 4, 1], + [5, 5, 3, 3]]], dtype=np.int32) + label_divisor = 1000 + concat_panoptic = concat_semantic * label_divisor + concat_instance + next_panoptic = next_semantic * label_divisor + next_instance + new_panoptic = vip_deeplab.stitch_video_panoptic_prediction( + concat_panoptic, + next_panoptic, + label_divisor) + # The expected instance is manually computed. It should receive the IDs + # propagated from concat_instance by IoU matching between concat_panoptic + # and next_panoptic. + expected_semantic = next_semantic + expected_instance = np.array( + [[[1, 0, 0, 2], + [1, 0, 0, 2], + [1, 1, 1, 2], + [2, 2, 1, 1]]], dtype=np.int32) + expected_panoptic = expected_semantic * label_divisor + expected_instance + np.testing.assert_array_equal(expected_panoptic, new_panoptic) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/test_utils.py b/model/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2c0933405a86927eeab8ffa5ed076b88a88738f7 --- /dev/null +++ b/model/test_utils.py @@ -0,0 +1,31 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains utility functions for the model tests.""" +import numpy as np +import tensorflow as tf + + +def create_test_input(batch, height, width, channels): + """Creates test input tensor.""" + input_tensor = np.tile( + np.reshape( + np.reshape(np.arange(height), [height, 1]) + + np.reshape(np.arange(width), [1, width]), + [1, height, width, 1]), + [batch, 1, 1, channels]) + # Normalize the input tensor so that the outputs are not too large. + input_tensor = (input_tensor * 2 / np.max(input_tensor)) - 1 + return tf.cast(input_tensor, tf.float32) diff --git a/model/test_utils_test.py b/model/test_utils_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b0b676228beedca7ccd01fbe9bf3f7806497b2f3 --- /dev/null +++ b/model/test_utils_test.py @@ -0,0 +1,32 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for test_utils."""
+
+import tensorflow as tf
+
+from deeplab2.model import test_utils
+
+
+class TestUtilsTest(tf.test.TestCase):
+
+  def test_create_test_input(self):
+    input_shape = [1, 2, 3, 4]
+    input_tensor = test_utils.create_test_input(*input_shape)
+    self.assertListEqual(input_tensor.get_shape().as_list(), input_shape)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/utils.py b/model/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b28a19ea3b18c8eff5039a2c6eb2270e197c8a20
--- /dev/null
+++ b/model/utils.py
@@ -0,0 +1,485 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains utility functions for the model code."""
+
+from typing import Any, List, MutableMapping, MutableSequence, Optional, Set
+
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2 import config_pb2
+
+layers = tf.keras.layers
+
+_PREDICTION_WITH_NEAREST_UPSAMPLING = (
+    common.PRED_INSTANCE_KEY,
+    common.PRED_INSTANCE_CENTER_KEY,
+    common.PRED_INSTANCE_SCORES_KEY,
+    common.PRED_PANOPTIC_KEY,
+    common.PRED_SEMANTIC_KEY,
+    common.PRED_NEXT_PANOPTIC_KEY,
+    common.PRED_CONCAT_NEXT_PANOPTIC_KEY,
+    common.PRED_CENTER_HEATMAP_KEY,
+)
+
+_PREDICTION_WITH_BILINEAR_UPSAMPLING = (
+    common.PRED_SEMANTIC_PROBS_KEY,
+    common.PRED_OFFSET_MAP_KEY,
+)
+
+_INPUT_WITH_NEAREST_UPSAMPLING = (
+    common.GT_INSTANCE_CENTER_KEY,
+)
+
+_INPUT_WITH_BILINEAR_UPSAMPLING = (
+    common.IMAGE,
+    common.GT_INSTANCE_REGRESSION_KEY
+)
+
+
+def _scale_helper(value, scale):
+  if isinstance(value, tf.Tensor):
+    return tf.cast(
+        (tf.cast(value, dtype=tf.float32) - 1.0) * scale + 1.0,
+        dtype=tf.int32)
+  else:
+    return int((float(value) - 1.0) * scale + 1.0)
+
+
+def scale_mutable_sequence(input_sequence: MutableSequence[int],
+                           scale: float) -> MutableSequence[int]:
+  return [_scale_helper(x, scale) for x in input_sequence]
+
+
+def scale_int_list(int_list, scale):
+  return [int(x * scale) for x in int_list]
+
+
+def undo_image_preprocessing(image_in: tf.Tensor, method: str,
+                             perform_crop: bool,
+                             regions_to_crop: List[int],
+                             output_shape: List[int]) -> tf.Tensor:
+  """Undoes the image preprocessing.
+
+  In particular, this function slices out the valid regions (determined by
+  `regions_to_crop`) in the input when perform_crop is True. After
+  that, we resize the results to the desired `output_shape`.
+
+  Args:
+    image_in: Input image Tensor with shape [batch, height, width, n_channels].
+    method: Image resize method.
+    perform_crop: Boolean, whether to perform cropping.
+ regions_to_crop: The regions to crop [height, width]. Will only apply + cropping at the bottom right. + output_shape: Desired shape after resizing [height, width]. + + Returns: + Outputs after cropping (if perform_crop = True) and resizing. + """ + if perform_crop: + image_out = image_in[ + :, :regions_to_crop[0], :regions_to_crop[1], :] + else: + image_out = image_in + return resize_align_corners(image_out, output_shape, method=method) + + +def undo_preprocessing(input_or_prediction_dict: MutableMapping[str, Any], + regions_to_crop: List[int], + output_shape: List[int]) -> MutableMapping[str, Any]: + """Undoes preprocessing for predictions. + + Args: + input_or_prediction_dict: A dictionary storing different types of inputs or + predictions. + regions_to_crop: The regions to crop [height, width]. Will only apply + cropping at the bottom right. + output_shape: Desired shape after resizing [height, width]. + + Returns: + inputs or predictions after cropping (if perform_crop = True) and resizing. + """ + for key in input_or_prediction_dict.keys(): + if key in _PREDICTION_WITH_NEAREST_UPSAMPLING or key in _INPUT_WITH_NEAREST_UPSAMPLING: + input_or_prediction_dict[key] = tf.squeeze( + undo_image_preprocessing( + tf.expand_dims(input_or_prediction_dict[key], 3), + 'nearest', + perform_crop=True, + regions_to_crop=regions_to_crop, + output_shape=output_shape), + axis=3) + elif key in _PREDICTION_WITH_BILINEAR_UPSAMPLING or key in _INPUT_WITH_BILINEAR_UPSAMPLING: + input_or_prediction_dict[key] = undo_image_preprocessing( + input_or_prediction_dict[key], + 'bilinear', + perform_crop=True, + regions_to_crop=regions_to_crop, + output_shape=output_shape) + else: + # We only undo preprocessing for those defined in + # _{PREDICTION,INPUT}_WITH_{NEAREST,BILINEAR}_UPSAMPLING. + # Other intermediate results are skipped. + continue + return input_or_prediction_dict + + +def add_zero_padding(input_tensor: tf.Tensor, kernel_size: int, + rank: int) -> tf.Tensor: + """Adds zero-padding to the input_tensor.""" + pad_total = kernel_size - 1 + pad_begin = pad_total // 2 + pad_end = pad_total - pad_begin + if rank == 3: + return tf.pad( + input_tensor, + paddings=[[pad_begin, pad_end], [pad_begin, pad_end], [0, 0]]) + else: + return tf.pad( + input_tensor, + paddings=[[0, 0], [pad_begin, pad_end], [pad_begin, pad_end], [0, 0]]) + + +def resize_and_rescale_offsets(input_tensor: tf.Tensor, target_size): + """Bilinearly resizes and rescales the offsets. + + Args: + input_tensor: A tf.Tensor of shape [batch, height, width, 2]. + target_size: A list or tuple or 1D tf.Tensor that specifies the height and + width after resizing. + + Returns: + The input_tensor resized to shape `[batch, target_height, target_width, 2]`. + Moreover, the offsets along the y-axis are rescaled by a factor equal to + (target_height - 1) / (reference_height - 1) and the offsets along the + x-axis are rescaled by a factor equal to + (target_width - 1) / (reference_width - 1). 
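+
+    For example, resizing a [batch, 33, 33, 2] offset map to target_size
+    [65, 65] rescales both offset channels by (65 - 1) / (33 - 1) = 2.0.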
+ """ + input_size_y = tf.shape(input_tensor)[1] + input_size_x = tf.shape(input_tensor)[2] + + scale_y = tf.cast(target_size[0] - 1, tf.float32) / tf.cast( + input_size_y - 1, tf.float32) + scale_x = tf.cast(target_size[1] - 1, tf.float32) / tf.cast( + input_size_x - 1, tf.float32) + + target_y, target_x = tf.split( + value=input_tensor, num_or_size_splits=2, axis=3) + target_y *= scale_y + target_x *= scale_x + target = tf.concat([target_y, target_x], 3) + return resize_bilinear(target, target_size) + + +def resize_align_corners(input_tensor, target_size, method='bilinear'): + """Resizes the input_tensor to target_size. + + This returns the same output as tf.compat.v1.image.resize(input_tensor, + target_size, align_corners=True). + + Args: + input_tensor: A tf.Tensor of shape [batch, height, width, channels]. + target_size: A list or tuple or 1D tf.Tensor that specifies the height and + width after resizing. + method: An optional string specifying the method used for resizing. + Supported options are 'nearest' and 'bilinear'. + + Returns: + The resized tensor. + + Raises: + ValueError: An error occurs if 1) the input tensor's rank is not 4 or 2) the + resizing method is not supported. + """ + if method == 'bilinear': + tf_method = tf.compat.v1.image.ResizeMethod.BILINEAR + elif method == 'nearest': + tf_method = tf.compat.v1.image.ResizeMethod.NEAREST_NEIGHBOR + else: + raise ValueError('The given method %s is not supported. Please use bilinear' + ' or nearest.' % method) + + tf.debugging.assert_rank( + input_tensor, 4, + message='Input tensor to resize method should have rank of 4.') + + return tf.compat.v1.image.resize( + input_tensor, + target_size, + method=tf_method, + align_corners=True, + name='resize_align_corners') + + +def resize_bilinear(images, + size, + align_corners=True, + name=None): + """TPU memory efficient version of tf.compat.v1.image.resize_bilinear. + + ResizeBilinear on TPU requires padded batch and channel dimensions. On a + TPUv3, the worst case could lead to 256x memory consumption, if the + input is, for example, [1, 257, 513, 1]. In this function, we replace the + default resize_bilinear by two resize_bilinear operations, which put one image + axis on the channel axis. This reduces TPU padding when batch * channel is + small and height * width is large. + + Args: + images: Input image of shape [B, H, W, C]. + size: A list of two elements: [height, width]. The new size for the images. + align_corners: Whether to align corners of the image. + name: Name of the operation. + + Returns: + Resized image. + """ + _, height, width, channel = images.get_shape().as_list() + if height == size[0] and width == size[1]: + return images + dtype = images.dtype + images = tf.cast(images, tf.float32) + # We check the channel axis only since the batch size is similar (usually 1 or + # 2). In this way, this if-else easily supports dynamic batch size without + # using tf.cond(). 
+  if channel > 32 or not align_corners:
+    images = tf.compat.v1.image.resize_bilinear(
+        images, size,
+        align_corners=align_corners,
+        name=name)
+  else:
+    images = tf.transpose(images, [0, 3, 1, 2])
+    images = tf.compat.v1.image.resize_bilinear(
+        images, [channel, size[0]],
+        align_corners=align_corners,
+        name=name + '_height' if name else None)
+    images = tf.transpose(images, [0, 1, 3, 2])
+    images = tf.compat.v1.image.resize_bilinear(
+        images, [channel, size[1]],
+        align_corners=align_corners,
+        name=name + '_width' if name else None)
+    images = tf.transpose(images, [0, 3, 2, 1])
+  return tf.cast(images, dtype)
+
+
+def make_divisible(value: float,
+                   divisor: int,
+                   min_value: Optional[float] = None) -> int:
+  """Ensures all layers have channels that are divisible by the divisor.
+
+  Args:
+    value: A `float` of original value.
+    divisor: An `int` of the divisor that needs to be checked upon.
+    min_value: A `float` of minimum value threshold.
+
+  Returns:
+    The adjusted value in `int` that is divisible by divisor.
+
+  Raises:
+    ValueError: Minimum value should be divisible by divisor.
+  """
+  if min_value is None:
+    min_value = divisor
+  elif min_value % divisor != 0:
+    raise ValueError('Minimum value should be divisible by divisor.')
+
+  new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
+  # Make sure that round down does not go down by more than 10%.
+  if new_value < 0.9 * value:
+    new_value += divisor
+  return int(new_value)
+
+
+def transpose_and_reshape_for_attention_operation(inputs):
+  """Sequentially transposes and reshapes the tensor.
+
+  Args:
+    inputs: An input [batch, num_heads, length, channel] tensor.
+
+  Returns:
+    output: An output [batch, length, num_heads * channel] tensor.
+  """
+  _, num_heads, length, channel = inputs.get_shape().as_list()
+  transposed_inputs = tf.transpose(inputs, [0, 2, 1, 3])
+  return tf.reshape(transposed_inputs, [-1, length, num_heads * channel])
+
+
+def reshape_and_transpose_for_attention_operation(inputs, num_heads):
+  """Sequentially reshapes and transposes the tensor.
+
+  Args:
+    inputs: An input [batch, length, num_heads * channel] tensor.
+    num_heads: An integer, the number of attention heads.
+
+  Returns:
+    output: An output [batch, num_heads, length, channel] tensor.
+  """
+  _, length, channels = inputs.get_shape().as_list()
+  inputs = tf.reshape(inputs, [-1, length, num_heads, channels // num_heads])
+  return tf.transpose(inputs, [0, 2, 1, 3])
+
+
+def get_layer_name(private_attribute_name):
+  if private_attribute_name[0] != '_':
+    raise ValueError('Private attribute name should start with a \'_\'.')
+  return private_attribute_name[1:]
+
+
+def get_stem_current_name(index):
+  return '_basic_block{}'.format(index + 1)
+
+
+def get_low_level_conv_fusion_conv_current_names(index):
+  return ('_low_level_conv{}'.format(index + 1),
+          '_fusion_conv{}'.format(index + 1))
+
+
+def get_conv_bn_act_current_name(index, use_bn, activation):
+  name = '_conv{}'.format(index + 1)
+  if use_bn:
+    name += '_bn'
+  if (activation is not None and
+      activation.lower() != 'none' and
+      activation.lower() != 'linear'):
+    name += '_act'
+  return name
+
+
+def safe_setattr(obj, name, value):
+  """A conflict-safe version of setattr().
+
+  Different from setattr(), this function raises ValueError if the object
+  already has an attribute with the same name.
+
+  Args:
+    obj: An object whose attribute has to be set.
+    name: A string, the name of the attribute.
+    value: Any type, the value given to the attribute.
+
+  Raises:
+    ValueError: If the object already has an attribute with the same name.
+  """
+  if hasattr(obj, name):
+    raise ValueError('The object already has an attribute with the same name.')
+  setattr(obj, name, value)
+
+
+def pad_sequence_with_none(sequence, target_length):
+  return list(sequence) + [None] * (target_length - len(sequence))
+
+
+def strided_downsample(input_tensor, target_size):
+  """Strided downsamples a tensor to the target size.
+
+  The stride_height and stride_width are computed by (height - 1) //
+  (target_height - 1) and (width - 1) // (target_width - 1). We raise an error
+  if stride_height != stride_width, since this is not intended in our current
+  use cases. But this check can be removed if different strides are desired.
+  This function supports static shape only.
+
+  Args:
+    input_tensor: A [batch, height, width] tf.Tensor to be downsampled.
+    target_size: A list of two integers, [target_height, target_width], the
+      target size after downsampling.
+
+  Returns:
+    output_tensor: A [batch, target_height, target_width] tf.Tensor, the
+      downsampled result.
+
+  Raises:
+    ValueError: If the input cannot be downsampled with integer stride, i.e.,
+      (height - 1) % (target_height - 1) != 0, or (width - 1) % (target_width -
+      1) != 0.
+    ValueError: If the height axis stride does not equal the width axis
+      stride.
+  """
+  input_height, input_width = input_tensor.get_shape().as_list()[1:3]
+  target_height, target_width = target_size
+
+  if ((input_height - 1) % (target_height - 1) or
+      (input_width - 1) % (target_width - 1)):
+    raise ValueError('The input cannot be downsampled with integer striding. '
+                     'Please ensure (height - 1) % (target_height - 1) == 0 '
+                     'and (width - 1) % (target_width - 1) == 0.')
+  stride_height = (input_height - 1) // (target_height - 1)
+  stride_width = (input_width - 1) // (target_width - 1)
+  if stride_height != stride_width:
+    raise ValueError('The height axis stride does not equal the width axis '
+                     'stride.')
+  if stride_height > 1 or stride_width > 1:
+    return input_tensor[:, ::stride_height, ::stride_width]
+  return input_tensor
+
+
+def get_stuff_class_ids(num_thing_stuff_classes: int,
+                        thing_class_ids: List[int],
+                        void_label: int) -> List[int]:
+  """Computes stuff_class_ids.
+
+  The stuff_class_ids are computed from the num_thing_stuff_classes, the
+  thing_class_ids and the void_label.
+
+  Args:
+    num_thing_stuff_classes: An integer specifying the number of stuff and
+      thing classes, not including `void` class.
+    thing_class_ids: A List of integers of length [num_thing_classes]
+      containing thing class indices.
+    void_label: An integer specifying the void label.
+
+  Returns:
+    stuff_class_ids: A sorted List of integers of length [num_stuff_classes]
+      containing stuff class indices.
+  """
+  if void_label >= num_thing_stuff_classes:
+    thing_stuff_class_ids = list(range(num_thing_stuff_classes))
+  else:
+    # Use != (rather than `is not`) for value comparison of integer labels.
+    thing_stuff_class_ids = [_ for _ in range(num_thing_stuff_classes + 1)
+                             if _ != void_label]
+  return sorted(set(thing_stuff_class_ids) - set(thing_class_ids))
+
+
+def get_supported_tasks(
+    config: config_pb2.ExperimentOptions) -> Set[str]:
+  """Gets currently supported tasks for each meta_architecture.
+
+  Args:
+    config: A config_pb2.ExperimentOptions configuration.
+ + Returns: + supported_tasks: A set of strings (see common.py), optionally + - common.TASK_PANOPTIC_SEGMENTATION, + - common.TASK_INSTANCE_SEGMENTATION, + - common.TASK_VIDEO_PANOPTIC_SEGMENTATION, + """ + supported_tasks = set() + meta_architecture = config.model_options.WhichOneof('meta_architecture') + is_max_deeplab = meta_architecture == 'max_deeplab' + is_motion_deeplab = meta_architecture == 'motion_deeplab' + is_panoptic_deeplab = meta_architecture == 'panoptic_deeplab' + is_vip_deeplab = meta_architecture == 'vip_deeplab' + is_panoptic = ( + (config.model_options.panoptic_deeplab.instance.enable and + is_panoptic_deeplab) or + is_motion_deeplab or is_max_deeplab or is_vip_deeplab) + if is_panoptic: + supported_tasks.add(common.TASK_PANOPTIC_SEGMENTATION) + # MaX-DeepLab does not support evaluating instance segmentation mask AP yet. + if not is_max_deeplab: + supported_tasks.add(common.TASK_INSTANCE_SEGMENTATION) + if is_motion_deeplab or is_vip_deeplab: + supported_tasks.add(common.TASK_VIDEO_PANOPTIC_SEGMENTATION) + if is_vip_deeplab: + supported_tasks.add(common.TASK_DEPTH_AWARE_VIDEO_PANOPTIC_SEGMENTATION) + return supported_tasks diff --git a/model/utils_test.py b/model/utils_test.py new file mode 100644 index 0000000000000000000000000000000000000000..1f3848148a8d5eb447c15ae45b5d883d240b6a8f --- /dev/null +++ b/model/utils_test.py @@ -0,0 +1,201 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for utils.""" + +import itertools + +import numpy as np +import tensorflow as tf + +from deeplab2.model import utils + + +class UtilsTest(tf.test.TestCase): + + def test_resize_logits_graph_mode(self): + @tf.function + def graph_mode_wrapper(*args): + return utils.resize_and_rescale_offsets(*args) + + resized_logits = graph_mode_wrapper(tf.ones((2, 33, 33, 2)), [65, 65]) + resized_logits_2 = graph_mode_wrapper(tf.ones((2, 33, 33, 2)), [33, 33]) + self.assertListEqual(resized_logits.shape.as_list(), [2, 65, 65, 2]) + self.assertListEqual(resized_logits_2.shape.as_list(), [2, 33, 33, 2]) + + def test_resize_logits(self): + offset_logits = tf.convert_to_tensor([[[[2, 2], [2, 1], [2, 0]], + [[1, 2], [1, 1], [1, 0]], + [[0, 2], [0, 1], [0, 0]]]], + dtype=tf.float32) + target_size = [5, 5] + resized_logits = utils.resize_and_rescale_offsets(offset_logits, + target_size) + + self.assertListEqual(resized_logits.shape.as_list(), [1, 5, 5, 2]) + for i in range(5): + for j in range(5): + np.testing.assert_array_almost_equal(resized_logits.numpy()[0, i, j, :], + [4 - i, 4 - j]) + + def test_zero_padding(self): + input_tensor = tf.ones(shape=(2, 5, 5, 2)) + input_tensor_2 = tf.ones(shape=(5, 5, 2)) + padded_tensor = utils.add_zero_padding(input_tensor, kernel_size=5, rank=4) + padded_tensor_2 = utils.add_zero_padding( + input_tensor_2, kernel_size=5, rank=3) + + self.assertEqual(tf.reduce_sum(padded_tensor), 100) + self.assertEqual(tf.reduce_sum(padded_tensor_2), 50) + self.assertListEqual(padded_tensor.shape.as_list(), [2, 9, 9, 2]) + self.assertListEqual(padded_tensor_2.shape.as_list(), [9, 9, 2]) + # Count zero elements. + self.assertEqual(tf.reduce_sum(padded_tensor-1), -224) + self.assertEqual(tf.reduce_sum(padded_tensor_2-1), -112) + + def test_resize_function_error(self): + input_tensor = tf.random.uniform(shape=(2, 10, 10, 2)) + with self.assertRaises(ValueError): + _ = utils.resize_align_corners(input_tensor, [19, 19], + method='not_a_valid_method') + + def test_resize_function_shape(self): + input_tensor = tf.random.uniform(shape=(2, 10, 10, 2)) + result_tensor = utils.resize_align_corners(input_tensor, [19, 19]) + + self.assertListEqual(result_tensor.shape.as_list(), [2, 19, 19, 2]) + + def test_resize_graph_mode(self): + @tf.function + def graph_mode_wrapper(*args): + return utils.resize_align_corners(*args) + + result_tensor = graph_mode_wrapper(tf.ones((2, 33, 33, 2)), [65, 65]) + result_tensor_2 = graph_mode_wrapper(tf.ones((2, 33, 33, 2)), [33, 33]) + self.assertListEqual(result_tensor.shape.as_list(), [2, 65, 65, 2]) + self.assertListEqual(result_tensor_2.shape.as_list(), [2, 33, 33, 2]) + + def test_resize_function_constant_input(self): + input_tensor = tf.ones(shape=(2, 10, 10, 2)) + result_tensor = utils.resize_align_corners(input_tensor, [19, 19]) + + self.assertTrue(tf.keras.backend.all(result_tensor == 1)) + + def test_resize_function_invalid_rank(self): + input_tensor = tf.keras.Input(shape=(None, 2)) + with self.assertRaisesRegex( + ValueError, 'should have rank of 4'): + _ = utils.resize_align_corners(input_tensor, [19, 19]) + + def test_resize_function_v1_compatibility(self): + # Test for odd and even input, and output shapes. 
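+    # Each (shape, target_size, method) combination is checked against
+    # tf.compat.v1.image.resize with align_corners=True as the reference.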
+ input_shapes = [(2, 10, 10, 3), (2, 11, 11, 3)] + target_sizes = [[19, 19], [20, 20]] + methods = ['bilinear', 'nearest'] + + for shape, target_size, method in itertools.product(input_shapes, + target_sizes, methods): + input_tensor = tf.random.uniform(shape=shape) + + result_tensor = utils.resize_align_corners(input_tensor, target_size, + method) + if method == 'bilinear': + expected_tensor = tf.compat.v1.image.resize( + input_tensor, + target_size, + align_corners=True, + method=tf.compat.v1.image.ResizeMethod.BILINEAR) + else: + expected_tensor = tf.compat.v1.image.resize( + input_tensor, + target_size, + align_corners=True, + method=tf.compat.v1.image.ResizeMethod.NEAREST_NEIGHBOR) + + np.testing.assert_equal(result_tensor.numpy(), expected_tensor.numpy()) + + def test_resize_bilinear_v1_compatibility(self): + # Test for odd and even input, and output shapes. + input_shapes = [(2, 10, 10, 3), (2, 11, 11, 3), (1, 11, 11, 64)] + target_sizes = [[19, 19], [20, 20], [10, 10]] + + for shape, target_size in itertools.product(input_shapes, target_sizes): + input_tensor = tf.random.uniform(shape=shape) + result_tensor = utils.resize_bilinear(input_tensor, target_size) + expected_tensor = tf.compat.v1.image.resize( + input_tensor, + target_size, + align_corners=True, + method=tf.compat.v1.image.ResizeMethod.BILINEAR) + self.assertAllClose(result_tensor, expected_tensor) + + def test_make_divisible(self): + value, divisor, min_value = 17, 2, 8 + new_value = utils.make_divisible(value, divisor, min_value) + self.assertAllEqual(new_value, 18) + + value, divisor, min_value = 17, 2, 22 + new_value = utils.make_divisible(value, divisor, min_value) + self.assertAllEqual(new_value, 22) + + def test_transpose_and_reshape_for_attention_operation(self): + images = tf.zeros([2, 8, 11, 2]) + output = utils.transpose_and_reshape_for_attention_operation(images) + self.assertEqual(output.get_shape().as_list(), [2, 11, 16]) + + def test_reshape_and_transpose_for_attention_operation(self): + images = tf.zeros([2, 11, 16]) + output = utils.reshape_and_transpose_for_attention_operation(images, + num_heads=8) + self.assertEqual(output.get_shape().as_list(), [2, 8, 11, 2]) + + def test_safe_setattr_raise_error(self): + layer = tf.keras.layers.Conv2D(1, 1) + with self.assertRaises(ValueError): + utils.safe_setattr(layer, 'filters', 3) + + utils.safe_setattr(layer, 'another_conv', tf.keras.layers.Conv2D(1, 1)) + with self.assertRaises(ValueError): + utils.safe_setattr(layer, 'another_conv', tf.keras.layers.Conv2D(1, 1)) + + def test_pad_sequence_with_none(self): + sequence = [1, 2] + output_2 = utils.pad_sequence_with_none(sequence, target_length=2) + self.assertEqual(output_2, [1, 2]) + output_3 = utils.pad_sequence_with_none(sequence, target_length=3) + self.assertEqual(output_3, [1, 2, None]) + + def test_strided_downsample(self): + inputs = tf.zeros([2, 11, 11]) + output = utils.strided_downsample(inputs, target_size=[6, 6]) + self.assertEqual(output.get_shape().as_list(), [2, 6, 6]) + + def test_get_stuff_class_ids(self): + # num_thing_stuff_classes does not include `void` class. 
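+    # With void_label = 5 (>= num_thing_stuff_classes), the candidate classes
+    # are [0..4]; removing thing classes {3, 4} leaves [0, 1, 2]. With
+    # void_label = 0, the candidates are [0..5] minus the void class, i.e.
+    # {1, 2, 3, 4, 5}; removing {3, 4} leaves [1, 2, 5].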
+ num_thing_stuff_classes = 5 + thing_class_ids = [3, 4] + void_label_list = [5, 0] + expected_stuff_class_ids_list = [ + [0, 1, 2], [1, 2, 5] + ] + for void_label, expected_stuff_class_ids in zip( + void_label_list, expected_stuff_class_ids_list): + stuff_class_ids = utils.get_stuff_class_ids( + num_thing_stuff_classes, thing_class_ids, void_label) + np.testing.assert_equal(stuff_class_ids, + expected_stuff_class_ids) + +if __name__ == '__main__': + tf.test.main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..17b9290276f31b40f51a33fd3c8e5937ac32ddb2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +matplotlib +numpy +Pillow +tensorflow +gradio diff --git a/tensorflow_ops/kernels/merge_semantic_and_instance_maps_op.cc b/tensorflow_ops/kernels/merge_semantic_and_instance_maps_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a5d1dd333627a8b6206bdfd6fa2ec902086df87a --- /dev/null +++ b/tensorflow_ops/kernels/merge_semantic_and_instance_maps_op.cc @@ -0,0 +1,86 @@ +// Copyright 2021 The Deeplab2 Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include /*third_party*/"tensorflow/core/framework/op.h" +#include /*third_party*/"tensorflow/core/framework/shape_inference.h" + +namespace tensorflow_models { +namespace deeplab { +namespace deeplab2 { + +using tensorflow::shape_inference::DimensionHandle; +using tensorflow::shape_inference::InferenceContext; +using tensorflow::shape_inference::ShapeHandle; + +REGISTER_OP("MergeSemanticAndInstanceMaps") + .Input("semantic_maps: int32") + .Input("instance_maps: int32") + .Input("thing_ids: int32") + .Attr("label_divisor: int = 256") + .Attr("stuff_area_limit: int = 0") + .Attr("void_label: int = 0") + .Output("parsing_maps: int32") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle semantic_maps; + ShapeHandle instance_maps; + ShapeHandle thing_ids; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &semantic_maps)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &instance_maps)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &thing_ids)); + DimensionHandle batch = c->Dim(semantic_maps, 0); + DimensionHandle height = c->Dim(semantic_maps, 1); + DimensionHandle width = c->Dim(semantic_maps, 2); + c->set_output(0, c->MakeShape({batch, height, width})); + return tensorflow::Status::OK(); + }) + .Doc(R"doc( +Generates parsing maps from semantic maps and instance maps. + +Parsing maps, or panoptic segmentation, are merged from the predicted semantic +maps and class-agnostic instance maps. This function merges the maps in the +following way: + +1) If a pixel belongs to `stuff` class (e.g., sky), the function directly uses + the semantic label from the semantic map and uses 0 as the instance label. +2) If a pixel belongs to `thing` class (e.g., person), it uses the instance + label from the instance map and uses the majority of the semantic labels of + the same instance as the final semantic label. 
+3) The function relabels each instance, so that the instance label of each
+   semantic class is in the range of [1, num_instances_of_the_semantic_class].
+
+Note that this operation is first proposed in the DeeperLab paper and adopted
+by the Panoptic-DeepLab framework.
+ - DeeperLab: Single-Shot Image Parser, T-J Yang, et al. arXiv:1902.05093.
+ - Panoptic-DeepLab, B. Cheng, et al. In CVPR, 2020.
+
+semantic_maps: An int32 Tensor with shape `[batch, height, width]` whose value
+  indicates the predicted semantic label of each pixel.
+instance_maps: An int32 Tensor with shape `[batch, height, width]` whose value
+  indicates the predicted instance label of each pixel.
+thing_ids: An int32 Tensor with shape `[num_thing_ids]` whose value refers to
+  the semantic ids of the thing classes.
+label_divisor: An integer. The value used to combine the semantic and instance
+  map to generate the parsing map. In particular, the value of a pixel in the
+  parsing map is equal to its corresponding semantic label times label_divisor
+  plus instance label (i.e., semantic_label * label_divisor + instance_label).
+stuff_area_limit: An integer. Predicted stuff segments whose areas are smaller
+  than this threshold are assigned to the VOID label.
+void_label: An integer, specifying the VOID label.
+parsing_maps: An int32 Tensor with shape `[batch, height, width]` whose value
+  indicates the merged semantic and instance label of each pixel.
+)doc");
+
+}  // namespace deeplab2
+}  // namespace deeplab
+}  // namespace tensorflow_models
diff --git a/tensorflow_ops/kernels/merge_semantic_and_instance_maps_op_kernel.cc b/tensorflow_ops/kernels/merge_semantic_and_instance_maps_op_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2a5071bb21e0b06a472be9efaba2f7438e6e9f35
--- /dev/null
+++ b/tensorflow_ops/kernels/merge_semantic_and_instance_maps_op_kernel.cc
@@ -0,0 +1,279 @@
+// Copyright 2021 The Deeplab2 Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include