checkpointing:
  checkpoints_dir: checkpoints
  evaluation:
    eval_results_dir: eval_results
  fabric_checkpoint_dir: fabric_state
  fabric_checkpoint_filename: checkpoint.pt
  hf_checkpoint:
    collection_slug: null
    repo_id: pico-lm/pico-decoder-tiny
  learning_dynamics:
    batch_size: 256
    eval_data: pico-lm/pretokenized-paloma-tinsy
    layer_suffixes:
      - attention.v_proj
      - attention.o_proj
      - swiglu.w_2
    sequence_idx: -1
  learning_dynamics_dir: learning_dynamics
  logs_dir: logs
  run_name: pico-decoder-tiny-1
  runs_dir: runs
  save_every_n_steps: 1000
  save_to_hf: true
  training:
    auto_resume: true
data:
  dataloader:
    batch_size: 1024
  dataset:
    name: pico-lm/pretokenized-dolma
  tokenizer:
    name: allenai/OLMo-7B-0724-hf
    vocab_size: 50304
evaluation:
  metrics:
    - paloma
  paloma:
    batch_size: 32
    dataset_name: pico-lm/pretokenized-paloma-tinsy
    dataset_split: val
    max_length: 2048
model:
  activation_hidden_dim: 384
  attention_n_heads: 12
  attention_n_kv_heads: 4
  batch_size: 1024
  d_model: 96
  max_seq_len: 2048
  model_type: pico_decoder
  n_layers: 12
  norm_eps: 1.0e-06
  position_emb_theta: 10000.0
  vocab_size: 50304
monitoring:
  logging:
    log_every_n_steps: 100
    log_level: INFO
  save_to_wandb: true
  wandb:
    entity: pico-lm
    project: pico-decoder
training:
  fabric:
    accelerator: cuda
    num_devices: 4
    num_nodes: 4
    precision: bf16-mixed
  max_steps: 200000
  optimization:
    gradient_accumulation_steps: 4
    lr: 0.0003
    lr_scheduler: linear_with_warmup
    lr_warmup_steps: 2500
    optimizer: adamw
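
# Rough derived quantities (a sketch only; the exact semantics depend on how the
# training framework interprets batch_size, which is assumed here to be the global
# batch size per optimizer step, not per device):
#   total devices           = num_nodes (4) x num_devices (4)                 = 16 GPUs
#   per-device micro-batch  = 1024 / (16 devices x 4 grad-accum steps)        = 16 sequences
#   tokens per step         = batch_size (1024) x max_seq_len (2048)          = ~2.1M tokens
#   total token budget      = tokens per step x max_steps (200000)            = ~420B tokens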