Spaces:
Running
on
Zero
Running
on
Zero
#python3.10
"""Hierarchical configuration for different pipelines, using `yacs`
(refer to https://github.com/rbgirshick/yacs)

This project contains the configuration for three aspects:
    the regular config for experiment setting

NOTE: Each experiment will be assigned a separate working space, and the
intermediate results will be saved in the working space. The experiment
folder structure is as follows:
{
    /${ROOT_WORK_DIR}/
    └── ${PIPELINES_NAME}/
        ├── ${EXP_NAME}/
            ├── ${CHECKPOINT_DIR}/
            ├── ${RESULT_DIR}/
            ├── meta.json/
            └── ${LOG_DIR}
}
"""
import os, sys | |
from .yacs import CfgNode as CN | |
import argparse | |
import numpy as np | |
# the parser for boolean | |
def bool_parser(arg):
    """Parse a command-line style value into a boolean.

    Args:
        arg: A ``bool`` (returned unchanged), ``None`` (treated as ``False``),
            or any value whose string form, ignoring case and surrounding
            whitespace, is one of ``1/true/t/yes/y`` or ``0/false/f/no/n``.

    Returns:
        The boolean that ``arg`` represents.

    Raises:
        ValueError: If ``arg`` cannot be interpreted as a boolean.
    """
    if isinstance(arg, bool):
        return arg
    if arg is None:
        return False
    # Normalise through ``str`` so that non-string inputs (e.g. the int ``1``)
    # are either accepted or rejected with ValueError, never AttributeError.
    token = str(arg).strip().lower()
    if token in ('1', 'true', 't', 'yes', 'y'):
        return True
    if token in ('0', 'false', 'f', 'no', 'n'):
        return False
    raise ValueError(f'`{arg}` cannot be converted to boolean!')
# ----------------------------------------------------------------------------- | |
# base cfg | |
# ----------------------------------------------------------------------------- | |
# Base defaults, grouped by concern. The same keys are also declared as
# command-line options further down in this file and merged into `cfg` via
# `cfg.merge_from_dict`.
# NOTE(review): a few defaults here disagree with the parser defaults
# (`exp_name`: "" vs "base", `eval_datasets`: [""] vs ["things", "badja"],
# `traj_per_sample`: 384 vs 768, `corr_dp`: True vs False) -- presumably the
# parsed values take precedence after the merge; confirm which are intended.
_BASE_DEFAULTS = {
    # configuration for basic experiments
    "save_dir": "./checkpoints",
    "restore_ckpt": "",
    "model_name": "cotracker",
    "exp_name": "",
    # NOTE: configuration for datasets and augmentation
    "dataset_root": "",
    "eval_datasets": [""],
    "dont_use_augs": False,
    "crop_size": [384, 512],
    "traj_per_sample": 384,
    "sample_vis_1st_frame": False,
    "depth_near": 0.01,  # meter
    "depth_far": 65.0,  # meter
    "sequence_len": 24,
    # NOTE: configuration for network arch
    "sliding_window_len": 8,
    "remove_space_attn": False,
    "updateformer_hidden_size": 384,
    "updateformer_num_heads": 8,
    "updateformer_space_depth": 6,
    "updateformer_time_depth": 6,
    "model_stride": 4,
    "train_iters": 4,
    "if_ARAP": False,
    "Embed3D": False,
    "Loss_W_feat": 5e-1,
    "Loss_W_cls": 1e-4,
    "depth_color": False,
    "flash_attn": False,
    "corr_dp": True,
    "support_grid": 0,
    "backbone": "CNN",
    "enc_only": False,
    "init_match": False,
    "Nblock": 4,
    # NOTE: configuration for training and saving
    "nodes_num": 1,
    "batch_size": 1,
    "num_workers": 6,
    "mixed_precision": False,
    "lr": 0.0005,
    "wdecay": 0.00001,
    "num_steps": 200000,
    "evaluate_every_n_epoch": 1,
    "save_every_n_epoch": 1,
    "validate_at_start": False,
    "save_freq": 100,
    "eval_max_seq_len": 1000,
    "debug": False,
    "fine_tune": False,
    "aug_wind_sample": False,
    "use_video_flip": False,
    "fix_backbone": False,
    "tune_backbone": False,
    "tune_arap": False,
    "tune_per_scene": False,
    "use_hier_encoder": False,
    "scales": [4, 2],
    # NOTE: configuration for monocular depth estimator
    "mde_name": "zoedepth_nk",
}

cfg = CN()
# Assign through attribute access, exactly as `cfg.<name> = <value>` would.
for _name, _default in _BASE_DEFAULTS.items():
    setattr(cfg, _name, _default)
# ----------------------------------------------------------------------------- | |
# configurations for the command line | |
parser = argparse.ArgumentParser()
# config for the basic experiment: (flag, keyword arguments for add_argument),
# registered in the order listed.
for _flag, _options in (
    ("--save_dir", dict(default="./checkpoints", type=str, help="path to save checkpoints")),
    ("--restore_ckpt", dict(default="", help="path to restore a checkpoint")),
    ("--model_name", dict(default="cotracker", help="model name")),
    ("--exp_name", dict(type=str, default="base", help="the name for experiment")),
):
    parser.add_argument(_flag, **_options)
# config for dataset and augmentation
# Each entry is (flag, keyword arguments for add_argument); options are
# registered on the module-level `parser` in the order listed.
# `--dataset_root` has no default, so it parses to None when omitted.
# NOTE(review): `--traj_per_sample` defaults to 768 here while the base cfg
# uses 384 -- confirm which value is intended.
for _flag, _options in (
    ("--dataset_root", dict(
        type=str, help="path to all the datasets (train and eval)")),
    ("--eval_datasets", dict(
        nargs="+", default=["things", "badja"],
        help="what datasets to use for evaluation")),
    ("--dont_use_augs", dict(
        action="store_true", default=False,
        help="don't apply augmentations during training")),
    ("--crop_size", dict(
        type=int, nargs="+", default=[384, 512],
        help="crop videos to this resolution during training")),
    ("--traj_per_sample", dict(
        type=int, default=768,
        help="the number of trajectories to sample for training")),
    ("--depth_near", dict(type=float, default=0.01, help="near plane depth")),
    ("--depth_far", dict(type=float, default=65.0, help="far plane depth")),
    ("--sample_vis_1st_frame", dict(
        action="store_true", default=False,
        help="only sample trajectories with points visible on the first frame")),
    ("--sequence_len", dict(type=int, default=24, help="train sequence length")),
):
    parser.add_argument(_flag, **_options)
# configuration for network arch
# Shared keyword arguments for on/off switches (absent -> False, present -> True).
_switch = dict(action="store_true", default=False)
# NOTE(review): `--corr_dp` defaults to False here while the base cfg sets
# `corr_dp = True` -- confirm which default is intended.
for _flag, _options in (
    ("--sliding_window_len", dict(
        type=int, default=8, help="length of the CoTracker sliding window")),
    ("--remove_space_attn", dict(
        _switch, help="remove space attention from CoTracker")),
    ("--updateformer_hidden_size", dict(
        type=int, default=384,
        help="hidden dimension of the CoTracker transformer model")),
    ("--updateformer_num_heads", dict(
        type=int, default=8,
        help="number of heads of the CoTracker transformer model")),
    ("--updateformer_space_depth", dict(
        type=int, default=6,
        help="number of group attention layers in the CoTracker transformer model")),
    ("--updateformer_time_depth", dict(
        type=int, default=6,
        help="number of time attention layers in the CoTracker transformer model")),
    ("--model_stride", dict(
        type=int, default=4, help="stride of the CoTracker feature network")),
    ("--train_iters", dict(
        type=int, default=4,
        help="number of updates to the disparity field in each forward pass.")),
    ("--if_ARAP", dict(_switch, help="if using ARAP loss in the optimization")),
    ("--Embed3D", dict(_switch, help="if using the 3D embedding for image")),
    ("--Loss_W_feat", dict(
        type=float, default=5e-1, help="weight for the feature loss")),
    ("--Loss_W_cls", dict(
        type=float, default=1e-4, help="weight for the classification loss")),
    ("--depth_color", dict(_switch, help="if using the color for depth")),
    ("--flash_attn", dict(_switch, help="if using the flash attention")),
    ("--corr_dp", dict(_switch, help="if using the correlation of depth")),
    ("--support_grid", dict(
        type=int, default=0, help="if using the support grid")),
    ("--backbone", dict(
        type=str, default="CNN",
        help="backbone for the CoTracker feature network")),
    ("--enc_only", dict(_switch, help="if using the encoder only")),
    ("--init_match", dict(_switch, help="if using the initial matching")),
    ("--Nblock", dict(
        type=int, default=4,
        help="number of blocks in the CoTracker feature network")),
):
    parser.add_argument(_flag, **_options)
# configuration for training and saving
# Shared keyword arguments for on/off switches (absent -> False, present -> True).
_switch = dict(action="store_true", default=False)
# Each entry is (flag, keyword arguments for add_argument); options are
# registered on the module-level `parser` in the order listed.
# NOTE(review): the `--debug` help text talks about a visibility mask, which
# looks copied from another option -- confirm what the flag controls.
for _flag, _options in (
    ("--nodes_num", dict(
        type=int, default=1, help="number of nodes used for training.")),
    ("--batch_size", dict(
        type=int, default=1, help="batch size used during training.")),
    ("--num_workers", dict(
        type=int, default=6, help="number of dataloader workers")),
    ("--mixed_precision", dict(_switch, help="use mixed precision")),
    ("--lr", dict(type=float, default=0.0005, help="max learning rate.")),
    ("--wdecay", dict(
        type=float, default=0.00001, help="Weight decay in optimizer.")),
    ("--num_steps", dict(
        type=int, default=200000, help="length of training schedule.")),
    ("--evaluate_every_n_epoch", dict(
        type=int, default=1,
        help="evaluate during training after every n epochs, after every epoch by default")),
    ("--save_every_n_epoch", dict(
        type=int, default=1,
        help="save checkpoints during training after every n epochs, after every epoch by default")),
    ("--validate_at_start", dict(
        _switch, help="whether to run evaluation before training starts")),
    ("--save_freq", dict(
        type=int, default=100,
        help="frequency of trajectory visualization during training")),
    ("--eval_max_seq_len", dict(
        type=int, default=1000, help="maximum length of evaluation videos")),
    ("--debug", dict(_switch, help="if using the visibility mask")),
    ("--fine_tune", dict(_switch, help="if fine tune the model")),
    ("--aug_wind_sample", dict(_switch, help="if using the window sampling")),
    ("--use_video_flip", dict(_switch, help="if using the video flip")),
    ("--fix_backbone", dict(_switch, help="if fix the backbone")),
    ("--tune_backbone", dict(_switch, help="if tune the backbone")),
    # The help text used to repeat the `--fix_backbone` description.
    ("--tune_arap", dict(_switch, help="if tune the arap")),
    ("--tune_per_scene", dict(_switch, help="if tune one scene")),
    ("--use_hier_encoder", dict(
        _switch, help="if using the hierarchical encoder")),
    ("--scales", dict(
        type=int, nargs="+", default=[4, 2],
        help="scales for the CoTracker feature network")),
):
    parser.add_argument(_flag, **_options)
# config for monocular depth estimator
_mde_options = dict(type=str, default="zoedepth_nk", help="name of the MDE model")
parser.add_argument("--mde_name", **_mde_options)
# NOTE: this runs at import time; `parse_args()` without arguments reads the
# options from `sys.argv`, whichever script imported this module.
args = parser.parse_args()
args_dict = vars(args)
# -----------------------------------------------------------------------------
# merge the `args` to the `cfg`
cfg.merge_from_dict(args_dict)
# Checkpoint directory: <save_dir>/<model_name>/<exp_name>, set after the merge.
_ckpt_parts = (args.save_dir, args.model_name, args.exp_name)
cfg.ckpt_path = os.path.join(*_ckpt_parts)