# ####################################
# SpeechLM-P Base model              #
# ####################################

[ $# -lt 2 ] && echo "Usage: $0 <data_dir> <text_data_dir> [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1
[ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1

DATA_DIR=$1          # speech data and phoneme ("phn") label directory
TEXT_DATA_DIR=$2     # binarized (bin-idx) paired text data
mount=$3
world_size=$4
update_freq=$5
[ -z "$mount" ] && mount=${PWD}
[ -z "$world_size" ] && world_size=32
[ -z "$update_freq" ] && update_freq=1

CODE_ROOT=${PWD}
MODEL_DIR="${mount}/exp/pretrain/base_speechlmp_${world_size}gpu_${update_freq}accum"
[ -d "$MODEL_DIR" ] || mkdir -p "$MODEL_DIR"

python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \
  --config-dir $CODE_ROOT/speechlm/config/pretrain \
  --config-name speechlm_base_librispeech \
  common.user_dir=$CODE_ROOT/speechlm \
  \
  task.labels='["phn"]' \
  model.label_rate=100 \
  task.data=$DATA_DIR \
  task.label_dir=$DATA_DIR \
  task.text_cfg.text_data=$TEXT_DATA_DIR \
  \
  dataset.train_subset=\"train_960+train_text.phn-ltr\" \
  dataset.valid_subset=\"dev_clean+dev_clean.phn-ltr\" \
  dataset.num_workers=0 \
  dataset.max_tokens=1400000 \
  distributed_training.distributed_world_size=${world_size} \
  optimization.update_freq=[${update_freq}] \
  \
  common.tensorboard_logdir=$MODEL_DIR \
  checkpoint.save_dir=$MODEL_DIR \
  hydra.run.dir=$MODEL_DIR \
  hydra.job.name=pretrain

# data_dir="/stdblob/users/v-ziqzhang/dataset/LibriLM/phn2char_sanych/tri4b_mono_label"
# text_data_dir="/stdblob/users/v-ziqzhang/dataset/LibriLM/phn2char_sanych/filt2k_sil025_m5std25_sil14_spn32/bin-idx"
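
# Example invocation (a sketch, not part of the original script): the script
# filename and the data locations below are placeholders; substitute your own
# checked-out repo path, prepared phoneme-label directory, and binarized text
# data. The script must be launched from the SpeechLM/ root (see the check above).
#
#   cd /path/to/SpeechLM
#   bash base_speechlmp.sh \
#       /path/to/librispeech/phn_labels \
#       /path/to/paired_text/bin-idx \
#       ${PWD} 32 1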