#!/bin/bash
set -x

METHOD=$1
PRETRAIN_PATH=$2
DATA_PATH=$3
REWARD_API=$4
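
# Positional arguments:
#   $1  METHOD         advantage estimator (passed to --advantage_estimator)
#   $2  PRETRAIN_PATH  initial model to train (passed to --pretrain)
#   $3  DATA_PATH      prompt dataset (passed to --prompt_data; the "query" column is used)
#   $4  REWARD_API     remote reward-model endpoint (passed to --remote_rm_url)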

# Optional overrides: replace the values below to hardcode paths instead of
# passing them as CLI arguments.
PRETRAIN_PATH=${PRETRAIN_PATH}
DATA_PATH=${DATA_PATH}
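
# Training log and checkpoints are written under ${working_dir}/${EXP_NAME}.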
working_dir=$(pwd)

# Experiment name (assumed default: the advantage-estimator name; override via the environment if desired).
EXP_NAME=${EXP_NAME:-${METHOD}}

LOG_PATH=${working_dir}/${EXP_NAME}/train.log
SAVE_PATH=${working_dir}/${EXP_NAME}/checkpoint
mkdir -p ${SAVE_PATH}
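
# Launch a single-node Ray head on all 8 local GPUs; the actor, the reference
# model, and the vLLM engines below share these GPUs (--colocate_all_models).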
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

ray start --head --node-ip-address 0.0.0.0 --num-gpus 8
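
# Submit the training job via Ray's job-submission API (the Ray dashboard
# listens on 127.0.0.1:8265 by default).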
ray job submit --address="http://127.0.0.1:8265" \
   -- python3 -m openrlhf.cli.train_ppo_ray \
   --ref_num_nodes 1 \
   --ref_num_gpus_per_node 8 \
   --actor_num_nodes 1 \
   --actor_num_gpus_per_node 8 \
   --vllm_num_engines 8 \
   --vllm_tensor_parallel_size 1 \
   --colocate_all_models \
   --vllm_gpu_memory_utilization 0.5 \
   --vllm_enable_sleep \
   --deepspeed_enable_sleep \
   --enforce_eager \
   --pretrain ${PRETRAIN_PATH} \
   --remote_rm_url ${REWARD_API} \
   --save_path ${SAVE_PATH} \
   --micro_train_batch_size 8 \
   --train_batch_size 128 \
   --micro_rollout_batch_size 16 \
   --rollout_batch_size 128 \
   --n_samples_per_prompt 4 \
   --max_samples 30000 \
   --max_epochs 1 \
   --prompt_max_len 1024 \
   --generate_max_len 1024 \
   --zero_stage 3 \
   --bf16 \
   --actor_learning_rate 5e-7 \
   --init_kl_coef 0.01 \
   --use_kl_loss \
   --advantage_estimator ${METHOD} \
   --prompt_data ${DATA_PATH} \
   --input_key query \
   --apply_chat_template \
   --packing_samples \
   --normalize_reward \
   --adam_offload \
   --flash_attn \
   --gradient_checkpointing \
   2>&1 | tee ${LOG_PATH}
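
# Example invocation (script name and values below are illustrative only):
#   bash train_ppo_ray.sh group_norm /path/to/sft_model /path/to/prompt_dataset http://127.0.0.1:5000/get_reward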