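# NOTE(review): the next three lines look like web-page residue rather than
# shell commands; preserved here as comments so the file can be sourced.
# Spaces:
# Sleeping
# Sleeping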
#######################################
# Run fairseq-train with the options shared by every train_* function in
# this file. Caller-supplied options are appended after the shared ones, so a
# caller may repeat a flag (e.g. --batch-size) to supply a different value.
# NOTE(review): this relies on the fairseq-train option parser honouring the
# last occurrence of a repeated flag -- confirm against the fairseq CLI.
# Globals:
#   DATA (read) - passed as the first positional argument to fairseq-train
# Arguments:
#   "$@" - extra fairseq-train options (model size, overrides, ...)
# Returns:
#   1 if DATA is unset or empty; otherwise the exit status of fairseq-train
#######################################
train_common() {
  # Fail early with a clear message instead of handing fairseq-train an empty
  # positional argument.
  if [[ -z "${DATA:-}" ]]; then
    printf 'train_common: DATA must be set (first argument to fairseq-train)\n' >&2
    return 1
  fi

  # Kept as an array so each flag/value stays a separate word and the
  # "(0.9, 0.98)" value survives as a single argument.
  local -a shared_args=(
    # Data subsets, validation and checkpoint cadence.
    --combine-val
    --train-subset train
    --num-workers 2
    --validate-interval-updates 1000
    --save-interval-updates 1000
    --no-epoch-checkpoints
    # Distributed backend, fp16 and activation checkpointing.
    --ddp-backend fully_sharded
    --memory-efficient-fp16
    --fp16-init-scale 4
    --checkpoint-activations
    # Model family and task.
    --arch transformer_lm_gpt
    --activation-fn gelu
    --share-decoder-input-output-embed
    --task language_modeling
    --sample-break-mode none
    --tokens-per-sample 2048
    # Optimizer and learning-rate schedule.
    --optimizer adam --adam-betas "(0.9, 0.98)"
    --adam-eps 1e-08
    --clip-norm 0.0
    --lr-scheduler polynomial_decay
    --warmup-updates 750
    # Regularization.
    --dropout 0.1
    --attention-dropout 0.1
    --weight-decay 0.01
    # Batch shape and run length.
    --batch-size 16
    --update-freq 2
    --required-batch-size-multiple 1
    --total-num-update 572204
    --max-update 572204
    # Reproducibility, logging and process layout.
    --seed 1
    --log-format json --log-interval 1
    --distributed-world-size 8 --distributed-port 13177
  )

  fairseq-train "$DATA" "${shared_args[@]}" "$@"
}
#######################################
# Launch training for the configuration named 125M: 12 decoder layers,
# embedding dim 768, FFN dim 3072, 12 attention heads. All other settings
# come from train_common.
# Arguments:
#   "$@" - extra options, forwarded after the size-specific ones
# Returns:
#   the exit status of train_common
#######################################
train_125M() {
  train_common \
    --decoder-layers 12 \
    --decoder-embed-dim 768 \
    --decoder-ffn-embed-dim 3072 \
    --decoder-attention-heads 12 \
    "$@"
}
#######################################
# Launch training for the configuration named 355M: 24 decoder layers,
# embedding dim 1024, FFN dim 4096, 16 attention heads. Also passes
# --dropout 0.0 and --attention-dropout 0.0, which follow the 0.1 values that
# train_common puts earlier on the command line.
# NOTE(review): presumably the later values take effect -- confirm against the
# fairseq CLI parser.
# Arguments:
#   "$@" - extra options, forwarded after the size-specific ones
# Returns:
#   the exit status of train_common
#######################################
train_355M() {
  train_common \
    --decoder-layers 24 \
    --decoder-embed-dim 1024 \
    --decoder-ffn-embed-dim 4096 \
    --decoder-attention-heads 16 \
    --dropout 0.0 \
    --attention-dropout 0.0 \
    "$@"
}
#######################################
# Launch training for the configuration named 1.3B: 24 decoder layers,
# embedding dim 2048, FFN dim 8192, 32 attention heads. Also passes
# --batch-size 4, --update-freq 16 and an update budget of 286102, which
# follow the values train_common puts earlier on the command line.
# The dotted function name is valid in Bash but not in POSIX sh.
# Arguments:
#   "$@" - extra options, forwarded after the size-specific ones
# Returns:
#   the exit status of train_common
#######################################
train_1.3B() {
  train_common \
    --decoder-layers 24 \
    --decoder-embed-dim 2048 \
    --decoder-ffn-embed-dim 8192 \
    --decoder-attention-heads 32 \
    --batch-size 4 \
    --update-freq 16 \
    --total-num-update 286102 \
    --max-update 286102 \
    "$@"
}
#######################################
# Launch training for the configuration named 2.7B: 32 decoder layers,
# embedding dim 2560, FFN dim 10240, 32 attention heads. Also passes
# --batch-size 4, --update-freq 16 and an update budget of 286102, which
# follow the values train_common puts earlier on the command line.
# The dotted function name is valid in Bash but not in POSIX sh.
# Arguments:
#   "$@" - extra options, forwarded after the size-specific ones
# Returns:
#   the exit status of train_common
#######################################
train_2.7B() {
  train_common \
    --decoder-layers 32 \
    --decoder-embed-dim 2560 \
    --decoder-ffn-embed-dim 10240 \
    --decoder-attention-heads 32 \
    --batch-size 4 \
    --update-freq 16 \
    --total-num-update 286102 \
    --max-update 286102 \
    "$@"
}