# M2_Encoder_Large/vlmo/config.py
from sacred import Experiment
ex = Experiment("VLMo")


def _loss_names(d):
    # Start from all losses disabled, then enable the ones requested in d.
    ret = {
        "itm": 0,       # image-text matching loss
        "itc": 0,       # image-text contrastive loss
        "caption": 0,   # image captioning loss
        "mvlm": 0,      # masked vision-language modeling loss
        "textmlm": 0,   # text-only masked language modeling
        "imagemlm": 0,  # image-only masked language modeling
        "vqa": 0,       # visual question answering fine-tuning
        "nlvr2": 0,     # NLVR2 fine-tuning
        "irtr": 0,      # image-text retrieval fine-tuning
    }
    ret.update(d)
    return ret
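

# Example (illustrative, not in the original file): _loss_names({"itm": 1, "itc": 1})
# returns the dict above with "itm" and "itc" switched on and every other loss left at 0;
# task-specific configs flip on only the losses they train with.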
@ex.config
def config():
    exp_name = "vlmo"
    seed = 1
    datasets = ["coco", "vg", "sbu", "gcc"]  # dataset names; definitions in vlmo/datamodules/__init__.py  # noqa
    loss_names = _loss_names({"itm": 0, "itc": 0, "mvlm": 0})  # training losses
    batch_size = 1024  # desired effective batch size; the PL trainer accumulates gradients to reach it
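    # (Assumption, for orientation only: the training script typically derives the
    #  gradient-accumulation factor as batch_size // (per_gpu_batchsize * num_gpus * num_nodes);
    #  the exact formula lives in the training code, not in this file.)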
    # BEiT-v3 setting
    encoder_layers = 12  # number of layers in the backbone
    encoder_embed_dim = 768  # hidden size of the backbone encoder
    out_embed_dim = 768  # hidden size of the output embedding
    beit_version = "base"  # model size: base(0.4B)|large(1B)|huge(10B)
    beit3_vl_layers = 3  # number of layers in the vision-language (vl) backbone
    deepnorm_init = True  # initialization method
    share_layer = False  # whether to share weights across layers within the backbone
    share_attn = False  # whether to share attention weights across layers
    one_attn = False  # whether vision and language share one set of attention weights

    # Image setting
    train_transform_keys = ["square_transform_randaug"]  # train transforms: see vlmo/transforms/__init__.py
    val_transform_keys = ["square_transform"]  # test transforms: see vlmo/transforms/__init__.py
    image_size = 224  # input image size
    reclip_image_size = None  # reclip image size
    patch_size = 16  # patch size
    draw_false_image = 0  # number of negative (false) images to draw
    image_only = False  # use image input only
    text_only = False  # use text input only

    # Video setting: a non-None video_num_frm means video input
    video_num_frm = None

    # Visual tokenizer setting (based on BEiT-2)
    tokenizer_model = "beit2_visual_tokenizer"
    codebook_size = 8192
    codebook_dim = 32
    visual_mask_size = 14
    visual_mask_num = 80
    # Text Setting
    lang = 'cn'  # language for zero-shot ImageNet testing: cn|en
    vqav2_label_size = 3129
    max_text_len = 40  # maximum text length in tokens (roughly characters for Chinese)
    max_text_len_of_initckpt = 196  # maximum text length of the initialization checkpoint
    tokenizer_type = "BertTokenizer"  # Chinese text tokenizer
    vocab_size = 21128
    tokenizer = "./vocab.txt"  # path to the vocabulary file
    whole_word_masking = True
    mlm_prob = 0.15  # mask ratio for text MLM
    draw_false_text = 0  # number of negative (false) texts to draw
    mvlm_prob = 0.50  # mask ratio for the vision-language MLM (mvlm) task
    mask_ratio = 0  # FLIP-style mask ratio for image patches

    # Caption setting
    cap_onlytext = False  # default captioning is image-to-text

    # imagemlm setting
    split_data_for_imagemlm = False  # if True, split a batch into two parts and use the first part for imagemlm

    # ITC setting
    itc_mask = False  # use masked tokens for ITC
    aggregate_nodes = -1  # number of nodes to aggregate over in compute_itc; -1 means all nodes
    # Transformer Setting
    model_arch = "vlmo_base_patch16"
    drop_path_rate = 0.1

    # Downstream Setting
    get_recall_metric = False
    get_recall_rerank_metric = False
    get_zeroshot_metric = False
    get_muge_feat = False
    get_f30k_feat = False
    k_test = 32

    # PL Trainer Setting
    resume_from = None
    fast_dev_run = False
    val_check_interval = 1.0
    test_only = False
    use_sharded_training = False
    resume_during_training = False
    save_top_k = 10
    every_n_train_steps = 2000  # save a checkpoint every N training steps
    log_metric_steps = 100  # log metrics every N training steps
    # The parameters below vary with the environment
    use_pcache = False  # data storage backend: pcache or nas
    pcache_root = ""
    # main_site: pcache://multimodalproxyi-pool.cz50c.alipay.com:39999/mnt/
    # public_cloud: pcache://pcache_public_cloud.pcache.local:39999/mnt/abc7c88079a60b45ddfce7afa40720b7/
    gpu_env = "main_site"  # public_cloud or main_site
    data_root = ""  # data root for the data list
    log_dir = "result"
    per_gpu_batchsize = 4  # set this manually, e.g. per_gpu_batchsize=<N> on the command line
    num_gpus = 1
    num_nodes = 1
    load_path = ""
    num_workers = 8
    precision = 16
    local_run = True
    flash_attn = False
    deepspeed_config = None  # e.g. "ds_config.json"
    coalesce_backbone = False
    mask_data = "v+l"  # 'v+l': use the inputs of the imagemlm+textmlm tasks; 'vl': use the inputs of the mvlm task
    communication_benchmark = False
    checkpoint_activations = False
    # Dataset setting
    single_cap = True  # whether each image has only one caption
    random_one = False  # whether to randomly pick one caption from the caption list

    # ITC setting
    itc_feats_name = "cls_vlffn_feats"  # features used for the ITC loss
    itc_distill = ""
    itc_distill_dim = 1024
    itc_teacher_weights = ""

    # muP training setting
    mup = False
    base_encoder_embed_dim = 1
    delta_encoder_embed_dim = 2
    mup_encoder_attention_heads = 1
    base_encoder_ffn_embed_dim = 1
    delta_encoder_ffn_embed_dim = 2

    # atorch
    atorch_config = None
    compile_op = False
    optimizer_state_shard_save = False
    model_state_shard_save = False

    # ITC loss
    local_loss = False
    use_dual_softmax = False
    num_frames = 1

    # ----------------------- LMM pretraining config -----------------------
    # Norm setting
    deepnorm = False
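

# --- Illustrative usage (not part of the original config) ---
# Sacred lets runs override the defaults above via named configs or command-line
# "with" arguments. The named config below is a hedged sketch: its name and the
# chosen values are assumptions for illustration, not configs shipped with VLMo.
@ex.named_config
def example_itc_itm_pretrain():
    exp_name = "example_itc_itm_pretrain"  # illustrative experiment name
    loss_names = _loss_names({"itm": 1, "itc": 1})  # enable ITM + ITC only
    batch_size = 1024

# Command-line overrides use sacred's standard "with" syntax, e.g. (assuming the
# training entry point is a sacred-enabled script such as run.py):
#   python run.py with example_itc_itm_pretrain num_gpus=8 per_gpu_batchsize=32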