# Safetensors
# qwen2
# virtuoussy's picture
# Upload 3 files
# 1680393 verified
# Start a single-node Ray head and submit an `openrlhf.cli.train_ppo_ray` job
# to it, mirroring the job's combined stdout/stderr into a log file.
#
# Usage: <script> METHOD PRETRAIN_PATH DATA_PATH REWARD_API
#   METHOD         forwarded to --advantage_estimator
#                  (e.g. reinforce_baseline, reinforce, rloo)
#   PRETRAIN_PATH  forwarded to --pretrain      (e.g. Qwen/Qwen2.5-7B)
#   DATA_PATH      forwarded to --prompt_data   (e.g. virtuoussy/Math-RLVR)
#   REWARD_API     forwarded to --remote_rm_url
#                  (e.g. http://127.0.0.1:8000/get_reward)
#
# Environment:
#   EXP_NAME  optional; when set, outputs go under $(pwd)/${EXP_NAME}/,
#             otherwise directly under $(pwd)/.
#
# Outputs:
#   <run dir>/train.log    combined stdout/stderr of the submitted job
#   <run dir>/checkpoint   directory passed to --save_path
#
# Exit status: 2 on bad usage, 1 if the output directory cannot be created,
# otherwise the status of the `ray job submit | tee` pipeline.

# Fail on unset variables. `-e` is deliberately NOT enabled: `ray start` may
# return non-zero (e.g. a head node is already up) and the original script
# carried on to the submit step in that case.
set -u
# Report a failed job submission instead of tee's status. Probe in a subshell
# first so shells without `pipefail` do not abort on the unknown option.
(set -o pipefail) 2>/dev/null && set -o pipefail
set -x

if [ "$#" -lt 4 ]; then
  printf 'Usage: %s METHOD PRETRAIN_PATH DATA_PATH REWARD_API\n' "${0##*/}" >&2
  exit 2
fi

METHOD=$1        # reinforce_baseline, reinforce, rloo
PRETRAIN_PATH=$2 # Qwen/Qwen2.5-7B
DATA_PATH=$3     # virtuoussy/Math-RLVR
REWARD_API=$4    # http://127.0.0.1:8000/get_reward

working_dir=$(pwd)
# EXP_NAME is not defined by this script; it is honoured only if the caller
# exported it. `:+` adds the path component only when it is non-empty, which
# avoids a `//` in the paths and is safe under `set -u`.
run_dir=${working_dir}${EXP_NAME:+/${EXP_NAME}}
LOG_PATH=${run_dir}/train.log
SAVE_PATH=${run_dir}/checkpoint
# Creating the checkpoint directory also creates run_dir, where tee writes.
mkdir -p -- "${SAVE_PATH}" || exit 1

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

ray start --head --node-ip-address 0.0.0.0 --num-gpus 8

# Everything after `--` is the entrypoint command run by the Ray job. The whole
# invocation must stay one logical line (every line ends in `\`), so no
# comments can be placed between the flags below.
ray job submit --address="http://127.0.0.1:8265" \
  -- python3 -m openrlhf.cli.train_ppo_ray \
  --ref_num_nodes 1 \
  --ref_num_gpus_per_node 8 \
  --actor_num_nodes 1 \
  --actor_num_gpus_per_node 8 \
  --vllm_num_engines 8 \
  --vllm_tensor_parallel_size 1 \
  --colocate_all_models \
  --vllm_gpu_memory_utilization 0.5 \
  --vllm_enable_sleep \
  --deepspeed_enable_sleep \
  --enforce_eager \
  --pretrain "${PRETRAIN_PATH}" \
  --remote_rm_url "${REWARD_API}" \
  --save_path "${SAVE_PATH}" \
  --micro_train_batch_size 8 \
  --train_batch_size 128 \
  --micro_rollout_batch_size 16 \
  --rollout_batch_size 128 \
  --n_samples_per_prompt 4 \
  --max_samples 30000 \
  --max_epochs 1 \
  --prompt_max_len 1024 \
  --generate_max_len 1024 \
  --zero_stage 3 \
  --bf16 \
  --actor_learning_rate 5e-7 \
  --init_kl_coef 0.01 \
  --use_kl_loss \
  --advantage_estimator "${METHOD}" \
  --prompt_data "${DATA_PATH}" \
  --input_key query \
  --apply_chat_template \
  --packing_samples \
  --normalize_reward \
  --adam_offload \
  --flash_attn \
  --gradient_checkpointing \
  2>&1 | tee -- "${LOG_PATH}"