# Base config(s) this file builds on.
# NOTE(review): presumably resolved by the project's config loader, which is
# not visible here -- confirm how 'models' is looked up.
_base_ = ['models']

# model settings
# NOTE(review): each nested dict carries a `type` string plus keyword
# settings; presumably these are passed to a registry/builder -- confirm.
model = dict(
    type='R2Tuning',
    arch='ViT-B/32',
    init=False,
    dims=256,
    strides=(1, 2, 4, 8),
    buffer_size=1024,
    max_num_moment=50,
    adapter_cfg=dict(
        type='R2Block',
        k=4,
        dropout=0.5,
        use_tef=True,
        pos_cfg=dict(type='PositionalEncoding', normalize=True, max_len=1024),
        tem_cfg=dict(
            type='TransformerDecoderLayer',
            heads=8,
            ratio=4,
            att_dropout=0.0,
            ffn_dropout=0.0,
            att_out_dropout=0.0,
            ffn_out_dropout=0.0,
            droppath=0.1,
            pre_norm=False,
            bias=True,
            norm_cfg=dict(type='LN'),
            act_cfg=dict(type='ReLU', inplace=True),
            order=('cross_att', 'self_att', 'ffn'),
            att_init_cfg=dict(type='xavier', distribution='uniform'),
            ffn_init_cfg=dict(type='kaiming'))),
    pyramid_cfg=dict(type='ConvPyramid'),
    pooling_cfg=dict(type='AdaPooling'),
    # NOTE(review): the spelling 'kernal_size' is kept exactly as written; it
    # is a keyword name read at runtime, so it must match whatever the
    # 'ConvHead' implementation expects -- confirm before renaming.
    class_head_cfg=dict(type='ConvHead', kernal_size=3),
    coord_head_cfg=dict(type='ConvHead', kernal_size=3),
    loss_cfg=dict(
        type='BundleLoss',
        sample_radius=1.5,
        loss_cls=dict(type='FocalLoss', loss_weight=1.0),
        loss_reg=dict(type='L1Loss', loss_weight=0.2),
        loss_sal=dict(type='SampledNCELoss', loss_weight=0.1),
        loss_video_cal=dict(type='InfoNCELoss', loss_weight=0.1),
        loss_layer_cal=dict(type='InfoNCELoss', loss_weight=0.1)))