File size: 7,118 Bytes
7718235 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
#!/bin/bash
# $1 is the name of the scripts folder; it must contain pretrain.seed.0.yaml
# $2 is the name of your task; several space-separated task names may be given
# $3 is the training data file of your task
# $4 is the testing data file of your task
# $5 is the type of your task, either GLOF or DMS
# $6 is the number of dimensions of mode of action (written to output_dim)
# pretrain.seed.0.yaml is the main file: the configuration of the pretrained model
# First, select the best model for TL based on the validation dataset in pretraining
if [ ! -f $1/pretrain.seed.0.summary ] || [ ! -s $1/pretrain.seed.0.summary ]; then
Rscript visualize.train.process/plot.test.AUC.by.step.R $1/pretrain.seed.0.yaml > $1/pretrain.seed.0.summary
fi
number=$(cat $1/pretrain.seed.0.summary | grep 'val' | grep -oE '\([0-9]+\)' | sed 's/[(|)]//g')
logdir=$(cat $1/pretrain.seed.0.yaml | grep log_dir | sed 's/.*: //')
if [ -z $number ]; then
best_model="null"
else
best_model=$logdir"model.step."$number".pt"
fi
echo "Best model is: "$best_model
# origin hyper paramters
lr_warmup_steps=$(cat $1/pretrain.seed.0.yaml | grep lr_warmup_steps | sed 's/.*: //' | sed 's/ #.*//g')
num_save_batches=$(cat $1/pretrain.seed.0.yaml | grep num_save_batches | sed 's/.*: //' | sed 's/ #.*//g')
target_num_save_batches=400
num_epochs=$(cat $1/pretrain.seed.0.yaml | grep num_epochs | sed 's/.*: //' | sed 's/ #.*//g')
batch_size=$(cat $1/pretrain.seed.0.yaml | grep batch_size | sed 's/.*: //' | sed 's/ #.*//g')
lr=$(cat $1/pretrain.seed.0.yaml | grep lr: | sed 's/.*: //' | sed 's/ #.*//g')
half_lr=$(printf "%.1e" "$(echo "scale=10; $(printf "%f" "$lr")" | bc)")
five_lr=$(printf "%.1e" "$(echo "scale=10; $(printf "%f" "$lr") * 5" | bc)")
lr_min=$(cat $1/pretrain.seed.0.yaml | grep lr_min: | sed 's/.*: //' | sed 's/ #.*//g')
half_lr_min=$(echo "$lr_min" | awk '{ printf "%.1e", $1/10 }')
data_split=$(cat $1/pretrain.seed.0.yaml | grep data_split_fn | sed 's/.*: //' | sed 's/ #.*//g')
loss_fn=$(cat $1/pretrain.seed.0.yaml | grep ^loss_fn | sed 's/.*: //' | sed 's/ #.*//g')
drop_out=$(cat $1/pretrain.seed.0.yaml | grep drop_out | sed 's/.*: //' | sed 's/ #.*//g')
num_steps_update=$(cat $1/pretrain.seed.0.yaml | grep num_steps_update | sed 's/.*: //' | sed 's/ #.*//g')
ngpus=$(cat $1/pretrain.seed.0.yaml | grep ngpus | sed 's/.*: //' | sed 's/ #.*//g')
nworkers=$(cat $1/pretrain.seed.0.yaml | grep num_workers | sed 's/.*: //' | sed 's/ #.*//g')
target_nworkers=0
batch_size=$(cat $1/pretrain.seed.0.yaml | grep batch_size | sed 's/.*: //' | sed 's/ #.*//g')
data_file_train=$(cat $1/pretrain.seed.0.yaml | grep data_file_train: | sed 's/.*: //' | sed 's/ #.*//g')
data_file_test=$(cat $1/pretrain.seed.0.yaml | grep data_file_test: | sed 's/.*: //' | sed 's/ #.*//g')
echo "loss_fn was: "$loss_fn
changed_data=false
if grep -q "_by_anno" $1/pretrain.seed.0.yaml; then
echo "modify data-file-train in original yaml"
if [ ! -f $1/pretrain.seed.0.yaml.bak ]; then
cp $1/pretrain.seed.0.yaml $1/pretrain.seed.0.yaml.bak
fi
sed -i 's|_by_anno|""|g' $1/pretrain.seed.0.yaml
changed_data=true
fi
# prepare new yaml files for all tasks
for gene in $2
do
# use original yaml as template
cp $1/pretrain.seed.0.yaml $1/$gene.yaml
# ngpu should be 1
sed -i "s|ngpus: "$ngpus"|ngpus: 1\nuse_lora: |g" $1/$gene.yaml
# learning rate should be half
sed -i "s|lr: "$lr"|lr: "$half_lr"|g" $1/$gene.yaml
sed -i "s|lr_min: "$lr_min"|lr_min: "$half_lr_min"|g" $1/$gene.yaml
# change data type
sed -i "s|data_type: ClinVar|data_type: "$5"|g" $1/$gene.yaml
# change loss fn, if DMS, use mse_loss, if GLOF, use weighted_loss
if [[ "DMS" == *"$5"* ]]; then
sed -i "s|loss_fn: "$loss_fn"|loss_fn: mse_loss|g" $1/$gene.yaml
else
if [[ "GLOF" == *"$5"* ]]; then
sed -i "s|loss_fn: "$loss_fn"|loss_fn: weighted_loss|g" $1/$gene.yaml
fi
fi
# change logdir
sed -i "s|log_dir: "$logdir"|log_dir: "$logdir"TL."$gene".seed.0/|g" $1/$gene.yaml
# change drop out rate
sed -i "s|drop_out: "$drop_out"|drop_out: 0.1|g" $1/$gene.yaml
# change num workers in dataloader
sed -i "s|num_workers: "$nworkers"|num_workers: "$target_nworkers"|g" $1/$gene.yaml
# change loaded msa
if grep -q "loaded_msa" $1/pretrain.seed.0.yaml; then
sed -i "s|loaded_msa: false|loaded_msa: true|g" $1/$gene.yaml
else
echo "loaded_msa: true" >> $1/$gene.yaml
fi
if grep -q "loaded_esm" $1/pretrain.seed.0.yaml; then
sed -i "s|loaded_esm: false|loaded_esm: true|g" $1/$gene.yaml
else
echo "loaded_esm: true" >> $1/$gene.yaml
fi
# change load model
orig_load_model=$(cat $1/pretrain.seed.0.yaml | grep ^load_model | sed 's/.*: //' | sed 's/ #.*//g')
sed -i "s|load_model: "$orig_load_model"|load_model: "$best_model"|g" $1/$gene.yaml
sed -i "s|partial_load_model: true|partial_load_model: false|g" $1/$gene.yaml
# change num epochs to 2 times larger if DMS
if [[ "DMS" == *"$5"* ]]; then
sed -i "s|num_epochs: "$num_epochs"|num_epochs: "$(($num_epochs*2))"|g" $1/$gene.yaml
else
if [[ "GLOF" == *"$5"* ]]; then
sed -i "s|num_epochs: "$num_epochs"|num_epochs: "$(($num_epochs))"|g" $1/$gene.yaml
fi
fi
# warm up steps should be 20 times lower
sed -i "s|lr_warmup_steps: "$lr_warmup_steps"|lr_warmup_steps: "$(($lr_warmup_steps/20))"|g" $1/$gene.yaml
# num saved batches should be 20 times lower
if [[ "DMS" == *"$5"* ]]; then
sed -i "s|num_save_batches: "$num_save_batches"|num_save_batches: "$(($target_num_save_batches))"|g" $1/$gene.yaml
else
if [[ "GLOF" == *"$5"* ]]; then
sed -i "s|num_save_batches: "$num_save_batches"|num_save_batches: "$(($target_num_save_batches/80))"|g" $1/$gene.yaml
fi
fi
sed -i "s|num_steps_update: "$num_steps_update"|num_steps_update: 1|g" $1/$gene.yaml
# change the output dimension
sed -i "s|output_dim: 1|output_dim: "$6"|g" $1/$gene.yaml
# if is GLOF task, train/val split should be 0.75/0.25
if [[ "GLOF" == *"$5"* ]]; then
sed -i "s|train_size: 0.95|train_size: 0.75|g" $1/$gene.yaml
sed -i "s|val_size: 0.05|val_size: 0.25|g" $1/$gene.yaml
fi
# change the data file train
sed -i "s|data_file_train: "$data_file_train"|data_file_train: "$3"|g" $1/$gene.yaml
# change the data file test
sed -i "s|data_file_test: "$data_file_test"|data_file_test: "$4"|g" $1/$gene.yaml
done
# make 5 seeds
for gene in $2
do
mkdir -p $1/$gene/
mv $1/$gene.yaml $1/$gene/$gene.seed.0.yaml
for seed in {1..4}
do
cp $1/$gene/$gene.seed.0.yaml $1/$gene/$gene.seed.$seed.yaml
sed -i "s|log_dir: "$logdir"TL."$gene".seed.0/|log_dir: "$logdir"TL."$gene".seed."$seed"/|g" $1/$gene/$gene.seed.$seed.yaml
sed -i "s|seed: 0|seed: "$seed"|g" $1/$gene/$gene.seed.$seed.yaml
done
done
# make large window version for 5 seeds, if GLOF
if [[ "GLOF" == *"$5"* ]]; then
for gene in $2
do
for seed in {0..4}
do
cp $1/$gene/$gene.seed.$seed.yaml $1/$gene/$gene.seed.$seed.large.window.yaml
sed -i "s|max_len: 251|max_len: 1251|g" $1/$gene/$gene.seed.$seed.large.window.yaml
# change logdir
sed -i "s|log_dir: "$logdir"TL."$gene".seed."$seed"/|log_dir: "$logdir"TL."$gene".seed."$seed".large.window/|g" $1/$gene/$gene.seed.$seed.large.window.yaml
done
done
fi |