from sacred import Experiment

ex = Experiment("VLMo")


def _loss_names(d):
    ret = {
        "itm": 0,  # image-text matching loss
        "itc": 0,  # image-text contrastive loss
        "caption": 0,  # image captioning loss
        "mvlm": 0,  # masked vision-language modeling loss
        "textmlm": 0,  # text-only masked language modeling
        "imagemlm": 0,  # image-only masked language modeling
        "vqa": 0,
        "nlvr2": 0,
        "irtr": 0,  # retrieval task fine-tuning
    }
    ret.update(d)
    return ret


@ex.config
def config():
    exp_name = "vlmo"
    seed = 1
    datasets = ["coco", "vg", "sbu", "gcc"]  # dataset names; definitions are in vlmo/datamodules/__init__.py  # noqa
    loss_names = _loss_names({"itm": 0, "itc": 0, "mvlm": 0})  # training losses
    batch_size = 1024  # desired global batch size; the PL trainer accumulates gradients to reach it.

    # BEiT-v3 setting
    encoder_layers = 12  # number of layers in the backbone
    encoder_embed_dim = 768  # hidden size of the backbone encoder
    out_embed_dim = 768  # hidden size of the output embedding
    beit_version = "base"  # model size: base(0.4B)|large(1B)|huge(10B)
    beit3_vl_layers = 3  # number of layers in the vl_backbone
    deepnorm_init = True  # init method
    share_layer = False  # whether to share weights between layers within the backbone
    share_attn = False  # whether to share attention weights across layers
    one_attn = False  # whether to share attention weights between vision and language

    # Image setting
    train_transform_keys = ["square_transform_randaug"]  # train transform: refer to vlmo/transforms/__init__.py
    val_transform_keys = ["square_transform"]  # test transform: refer to vlmo/transforms/__init__.py
    image_size = 224  # image size
    reclip_image_size = None  # reclip image size
    patch_size = 16  # patch size
    draw_false_image = 0  # number of negative images to draw
    image_only = False  # only input image
    text_only = False  # only input text

    # Video setting; a non-None video_num_frm means video input
    video_num_frm = None

    # Visual tokenizer setting based on beit2
    tokenizer_model = "beit2_visual_tokenizer"
    codebook_size = 8192
    codebook_dim = 32
    visual_mask_size = 14
    visual_mask_num = 80

    # Text Setting
    lang = "cn"  # language for zero-shot ImageNet testing: cn|en
    vqav2_label_size = 3129
    max_text_len = 40  # maximum text length, in characters
    max_text_len_of_initckpt = 196
    tokenizer_type = "BertTokenizer"  # Chinese text
    vocab_size = 21128
    tokenizer = "./vocab.txt"
    whole_word_masking = True
    mlm_prob = 0.15  # language mask ratio
    draw_false_text = 0
    mvlm_prob = 0.50  # vision-language mlm task
    mask_ratio = 0  # FLIP-style mask ratio for images

    # cap setting
    cap_onlytext = False  # default captioning is image to text

    # imagemlm setting
    split_data_for_imagemlm = False  # if True, split a batch into two parts and use the first part for imagemlm.
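    # How the batch-size settings interact (a sketch; the exact formula lives in the
    # training entry point, not in this config): "batch_size" above is the desired
    # global batch size, while per_gpu_batchsize / num_gpus / num_nodes (defined below)
    # determine the actual per-step batch. The PL trainer then accumulates gradients,
    # roughly as:
    #   accumulate_grad_batches = max(batch_size // (per_gpu_batchsize * num_gpus * num_nodes), 1)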
    # itc setting
    itc_mask = False  # whether itc uses masked tokens
    aggregate_nodes = -1  # number of nodes to aggregate for compute_itc; -1 (default) uses all nodes

    # Transformer Setting
    model_arch = "vlmo_base_patch16"
    drop_path_rate = 0.1

    # Downstream Setting
    get_recall_metric = False
    get_recall_rerank_metric = False
    get_zeroshot_metric = False
    get_muge_feat = False
    get_f30k_feat = False
    k_test = 32

    # PL Trainer Setting
    resume_from = None
    fast_dev_run = False
    val_check_interval = 1.0
    test_only = False
    use_sharded_training = False
    resume_during_training = False
    save_top_k = 10
    every_n_train_steps = 2000  # checkpoint saving interval, in steps
    log_metric_steps = 100  # metric logging interval, in steps

    # the params below vary with the environment
    use_pcache = False  # data storage backend: pcache or nas
    pcache_root = ""
    # main_site: pcache://multimodalproxyi-pool.cz50c.alipay.com:39999/mnt/
    # public_cloud: pcache://pcache_public_cloud.pcache.local:39999/mnt/abc7c88079a60b45ddfce7afa40720b7/
    gpu_env = "main_site"  # public_cloud or main_site
    data_root = ""  # data root for the data list
    log_dir = "result"
    per_gpu_batchsize = 4  # you should set this manually, e.g. with per_gpu_batchsize=#
    num_gpus = 1
    num_nodes = 1
    load_path = ""
    num_workers = 8
    precision = 16
    local_run = True
    flash_attn = False
    deepspeed_config = None  # e.g. "ds_config.json"
    coalesce_backbone = False
    mask_data = "v+l"  # 'v+l': choose input for the imagemlm+textmlm tasks; 'vl': choose input for the mvlm task.
    communication_benchmark = False
    checkpoint_activations = False

    # dataset setting
    single_cap = True  # whether each image has only one caption
    random_one = False  # whether to choose one caption at random from the caption list

    # ITC setting
    itc_feats_name = "cls_vlffn_feats"  # features used for the itc loss
    itc_distill = ""
    itc_distill_dim = 1024
    itc_teacher_weights = ""

    # mup training setting
    mup = False
    base_encoder_embed_dim = 1
    delta_encoder_embed_dim = 2
    mup_encoder_attention_heads = 1
    base_encoder_ffn_embed_dim = 1
    delta_encoder_ffn_embed_dim = 2

    # atorch
    atorch_config = None
    compile_op = False
    optimizer_state_shard_save = False
    model_state_shard_save = False

    # itc loss
    local_loss = False
    use_dual_softmax = False

    num_frames = 1

    # ----------------------- LMM pretraining config -----------------------
    # norm setting
    deepnorm = False
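
# Illustrative only: the named configs shipped with the repo may differ. This sketch shows
# how a Sacred named config can override the base config above, using _loss_names() to
# switch on the losses for a given task; the name "task_example_itc_itm" and its values
# are hypothetical. It would typically be selected on the command line, e.g.
# `python run.py with task_example_itc_itm` (assuming the repo's usual entry point).
@ex.named_config
def task_example_itc_itm():
    exp_name = "example_itc_itm"  # hypothetical experiment name
    loss_names = _loss_names({"itc": 1, "itm": 1})  # enable contrastive + matching losses
    batch_size = 512  # hypothetical global batch size for this task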