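# NOTE(review): the three lines below look like hosting-page status text that
# was captured together with the file, not configuration — confirm and remove.
# Spaces:
# Sleeping
# Sleeping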
# Configuration script: declares the base config list and the `model` settings
# dictionary. Every component is described by a nested dict whose `type` key
# names the class to build; the remaining keys are passed through as options.
# presumably `_base_` lists configs to inherit from (OpenMMLab/nncore-style
# convention) — TODO confirm against the config loader in use.
_base_ = ['models']
# model settings
model = dict(
    type='R2Tuning',
    arch='ViT-B/32',
    init=False,
    dims=256,
    strides=(1, 2, 4, 8),
    buffer_size=1024,
    max_num_moment=50,
    # Adapter block settings, including its positional encoding and the
    # transformer decoder layer configuration.
    adapter_cfg=dict(
        type='R2Block',
        k=4,
        dropout=0.5,
        use_tef=True,
        pos_cfg=dict(type='PositionalEncoding', normalize=True, max_len=1024),
        tem_cfg=dict(
            type='TransformerDecoderLayer',
            heads=8,
            ratio=4,
            att_dropout=0.0,
            ffn_dropout=0.0,
            att_out_dropout=0.0,
            ffn_out_dropout=0.0,
            droppath=0.1,
            pre_norm=False,
            bias=True,
            norm_cfg=dict(type='LN'),
            act_cfg=dict(type='ReLU', inplace=True),
            order=('cross_att', 'self_att', 'ffn'),
            att_init_cfg=dict(type='xavier', distribution='uniform'),
            ffn_init_cfg=dict(type='kaiming'))),
    pyramid_cfg=dict(type='ConvPyramid'),
    pooling_cfg=dict(type='AdaPooling'),
    # NOTE(review): the 'kernal_size' spelling is kept byte-for-byte; it must
    # match the keyword accepted by the ConvHead class, which is not visible
    # here — verify before renaming.
    class_head_cfg=dict(type='ConvHead', kernal_size=3),
    coord_head_cfg=dict(type='ConvHead', kernal_size=3),
    # Loss settings: one sub-dict per loss term, each with its own weight.
    loss_cfg=dict(
        type='BundleLoss',
        sample_radius=1.5,
        loss_cls=dict(type='FocalLoss', loss_weight=1.0),
        loss_reg=dict(type='L1Loss', loss_weight=0.2),
        loss_sal=dict(type='SampledNCELoss', loss_weight=0.1),
        loss_video_cal=dict(type='InfoNCELoss', loss_weight=0.1),
        loss_layer_cal=dict(type='InfoNCELoss', loss_weight=0.1)))