### model
model_name_or_path: Qwen/Qwen2.5-Coder-7B-Instruct
trust_remote_code: true
rope_scaling: llama3
flash_attn: auto
enable_liger_kernel: true

### method
stage: sft
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 2
freeze_trainable_modules: all
use_llama_pro: true
use_apollo: true
apollo_rank: 256
apollo_scale: 1
apollo_target: all
apollo_update_interval: 200

### dataset
dataset: codes_330k_ns
dataset_dir: data
template: qwen
cutoff_len: 4096
max_samples: 50000000
packing: true
neat_packing: true
preprocessing_num_workers: 16

### output
output_dir: saves/Qwen2.5-Coder-7B-Instruct/freeze/qwen_ns
logging_steps: 1
save_steps: 500
plot_loss: true
report_to: none
include_num_input_tokens_seen: true

### train
per_device_train_batch_size: 16
gradient_accumulation_steps: 8
learning_rate: 5.0e-05
num_train_epochs: 1.0
lr_scheduler_type: cosine
warmup_steps: 0
max_grad_norm: 1.0
bf16: true
ddp_timeout: 180000000
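
### usage (assumptions, not part of the original config)
# The keys above match LLaMA-Factory's training arguments, so a config like this
# would typically be launched with its CLI; the file name qwen_ns_freeze_sft.yaml
# is hypothetical:
#   llamafactory-cli train qwen_ns_freeze_sft.yaml
# Effective batch size per optimizer step: per_device_train_batch_size (16)
# x gradient_accumulation_steps (8) = 128 packed sequences (each up to
# cutoff_len tokens), times the number of devices under DDP.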