|
from sacred import Experiment |
|
|
|
# Module-level Sacred experiment; `@ex.config` below registers the default config.
ex = Experiment("VLMo")
|
|
|
|
|
def _loss_names(d): |
|
ret = { |
|
"itm": 0, |
|
"itc": 0, |
|
"caption": 0, |
|
"mvlm": 0, |
|
"textmlm": 0, |
|
"imagemlm": 0, |
|
"vqa": 0, |
|
"nlvr2": 0, |
|
"irtr": 0, |
|
} |
|
ret.update(d) |
|
return ret |
|
|
|
|
|
@ex.config

def config():
    """Default Sacred configuration for the VLMo experiment.

    Every local variable assigned in this scope becomes a config entry of
    ``ex``; inline comments double as Sacred's per-entry documentation.
    Values here are overridable from named configs / the command line.
    """

    # ----- experiment & tasks -----
    exp_name = "vlmo"  # experiment name (used for logging/run identification)

    seed = 1  # global random seed

    datasets = ["coco", "vg", "sbu", "gcc"]  # dataset keys to train on

    loss_names = _loss_names({"itm": 0, "itc": 0, "mvlm": 0})  # per-task loss weights; 0 disables a task

    batch_size = 1024  # desired global batch size — presumably reconciled with per_gpu_batchsize * num_gpus * num_nodes; confirm in trainer setup



    # ----- transformer encoder architecture -----
    encoder_layers = 12

    encoder_embed_dim = 768

    out_embed_dim = 768  # output/projection embedding dim

    beit_version = "base"

    beit3_vl_layers = 3  # NOTE(review): looks like the number of vision-language fusion layers — confirm against model code

    deepnorm_init = True  # use DeepNorm-style initialization

    share_layer = False

    share_attn = False

    one_attn = False



    # ----- image settings -----
    train_transform_keys = ["square_transform_randaug"]  # training image augmentation pipeline key(s)

    val_transform_keys = ["square_transform"]  # eval image transform key(s)

    image_size = 224

    reclip_image_size = None

    patch_size = 16

    draw_false_image = 0  # number of negative images sampled per example

    image_only = False

    text_only = False



    # ----- video settings -----
    video_num_frm = None  # frames per video clip; None presumably means image-only mode — confirm with dataloader



    # ----- visual tokenizer (for masked image modeling) -----
    tokenizer_model = "beit2_visual_tokenizer"

    codebook_size = 8192

    codebook_dim = 32

    visual_mask_size = 14  # mask grid size (14 = 224/16 patches per side)

    visual_mask_num = 80  # number of visual patches to mask



    # ----- text settings -----
    lang = 'cn'

    vqav2_label_size = 3129

    max_text_len = 40

    max_text_len_of_initckpt = 196  # text length of the initialization checkpoint (for positional embedding resize)

    tokenizer_type = "BertTokenizer"

    vocab_size = 21128  # 21128 matches the bert-base-chinese vocab size — consistent with lang='cn'; verify against tokenizer file

    tokenizer = "./vocab.txt"  # path to tokenizer vocab file

    whole_word_masking = True

    mlm_prob = 0.15  # masking probability for text-only MLM

    draw_false_text = 0  # number of negative texts sampled per example

    mvlm_prob = 0.50  # masking probability for masked vision-language modeling

    mask_ratio = 0



    # captioning: use text-only branch
    cap_onlytext = False



    # imagemlm: split data across workers/devices — TODO confirm exact semantics in dataloader
    split_data_for_imagemlm = False



    # ----- contrastive (ITC) options -----
    itc_mask = False  # apply masking in the ITC branch

    aggregate_nodes = -1  # number of nodes to gather features across; -1 presumably means all — confirm



    # ----- model variant -----
    model_arch = "vlmo_base_patch16"

    drop_path_rate = 0.1  # stochastic depth rate



    # ----- evaluation / retrieval metrics -----
    get_recall_metric = False

    get_recall_rerank_metric = False

    get_zeroshot_metric = False

    get_muge_feat = False

    get_f30k_feat = False

    k_test = 32  # top-k candidates for retrieval rerank



    # ----- PyTorch Lightning trainer settings -----
    resume_from = None  # checkpoint path to resume from

    fast_dev_run = False

    val_check_interval = 1.0

    test_only = False

    use_sharded_training = False

    resume_during_training = False

    save_top_k = 10  # number of best checkpoints to keep

    every_n_train_steps = 2000  # checkpoint frequency (steps)

    log_metric_steps = 100  # metric logging frequency (steps)



    # ----- pcache (remote storage) -----
    use_pcache = False

    pcache_root = ""



    # ----- environment -----
    gpu_env = "main_site"

    data_root = ""



    # ----- runtime / hardware -----
    log_dir = "result"

    per_gpu_batchsize = 4  # actual micro-batch size per GPU

    num_gpus = 1

    num_nodes = 1

    load_path = ""  # checkpoint to load weights from (no resume)

    num_workers = 8  # dataloader workers

    precision = 16  # trainer numeric precision (16 = mixed precision)

    local_run = True

    flash_attn = False

    deepspeed_config = None

    coalesce_backbone = False

    mask_data = "v+l"  # which modalities to mask — presumably "v+l" = both vision and language; confirm

    communication_benchmark = False

    checkpoint_activations = False  # activation checkpointing to save memory



    # ----- caption sampling -----
    single_cap = True  # use a single caption per image

    random_one = False  # randomly pick one caption when multiple exist



    # ----- ITC feature / distillation options -----
    itc_feats_name = "cls_vlffn_feats"  # which feature tensor to use for the contrastive head

    itc_distill = ""

    itc_distill_dim = 1024

    itc_teacher_weights = ""  # path to teacher weights for ITC distillation



    # ----- muP (maximal update parametrization) scaling -----
    mup = False

    base_encoder_embed_dim = 1

    delta_encoder_embed_dim = 2

    mup_encoder_attention_heads = 1

    base_encoder_ffn_embed_dim = 1

    delta_encoder_ffn_embed_dim = 2



    # ----- atorch options -----
    atorch_config = None

    compile_op = False

    optimizer_state_shard_save = False

    model_state_shard_save = False



    # ----- contrastive loss options -----
    local_loss = False  # compute contrastive loss with local features only

    use_dual_softmax = False

    num_frames = 1  # frames per sample (1 = still image)



    deepnorm = False  # enable DeepNorm residual scaling
|
|
|
|