---
# Training-run configuration.
# NOTE(review): the nesting below was reconstructed from key order after the
# file had been collapsed onto a single line; confirm it against the
# consumer's config schema before launching a run.

name: comma_v0p1_yolooooo
dump_dir: /fsx/craffel/lingua_logs/comma_v0p1/
seed: 777
grad_acc_steps: 4
gc_collect_freq: 1000
probe_freq: null
steps: 500000

data:
  root_dir: /scratch/craffel/lingua/data/
  # One entry per data source; the values sum to approximately 1, so they are
  # presumably relative sampling weights (verify against the data loader).
  sources:
    peS2o: 0.274065475510351
    stackexchange: 0.134617935796937
    stackv2_edu: 0.127770669195666
    cccc: 0.0871992270000557
    wikimedia: 0.0861800315862719
    github_archive: 0.0606452345122248
    uspto: 0.0413469377516883
    pubmed: 0.0367902799837971
    arxiv_papers: 0.0292395449667613
    caselaw_access_project: 0.0193875362722656
    wikiteam: 0.0137485410839637
    doab: 0.0180439781895451
    uk_hansard: 0.0144498535570883
    pre_1929_books: 0.0115755547988338
    ubuntu_irc: 0.00794254267719456
    regulations: 0.00762583706405442
    data_provenance_initiative: 0.00512264496834867
    project_gutenberg: 0.00502100654070129
    youtube: 0.00465917165839394
    arxiv_abstracts: 0.00359635066160403
    stackv2_html: 0.00225924255952781
    usgpo: 0.00226024581728848
    library_of_congress: 0.00222469340783564
    biodiversity_heritage_library: 0.00221737524370278
    pressbooks: 0.000865101033213598
    libretexts: 0.00054149556727006
    news: 0.000372716196818104
    foodista: 0.000125363443065615
    oercommons: 7.78696843693821e-05
    python_enhancement_proposals: 1.69983991984805e-05
    public_domain_review: 1.05448719635173e-05
  batch_size: 2
  seq_len: 4096
  n_views: 2
  # Separate from the top-level `seed` (777) and from `model.seed`.
  seed: 42
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 4096
  tokenizer:
    name: tiktoken
    path: /fsx/craffel/lingua/tokenizers/common-pile-tokenizer.tiktoken

optim:
  lr: 0.001
  weight_decay: 0.2
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  clip: 1.0
  scheduler: cosine
  warmup: 2000
  lr_min_ratio: 1.0e-06
  cycle_length: 1.0
  cosine_theta: 1.0
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5

model:
  dim: 4096
  n_layers: 32
  head_dim: null
  n_heads: 32
  n_kv_heads: null
  ffn_dim_multiplier: 1.0
  multiple_of: 256
  norm_eps: 1.0e-05
  rope_theta: 100000.0
  init_base_std: null
  init_std_factor: disabled
  # Same value as `data.seq_len`.
  max_seqlen: 4096
  seed: 42
  vocab_size: 64256
  weight_tying: false
  sliding_window: null

distributed:
  dp_shard: 1
  dp_replicate: 64
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  model_dtype: bf16
  float8_recipe: null
  # Single-quoted so the backslashes and brackets stay literal.
  float8_filter: 'layers\.[0-9]+\.'
  matmul_allow_tf32: false
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver

# Numeric-looking values are quoted so they load as strings, not integers.
env:
  MKL_SERVICE_FORCE_INTEL: GNU
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  NCCL_DEBUG: INFO
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'

checkpoint:
  dump:
    every: 10000
    keep: -1
  eval:
    every: 2000
    keep: 3
  # Located under `dump_dir`.
  path: /fsx/craffel/lingua_logs/comma_v0p1/checkpoints
  init_ckpt_path: null
  continue_training_from_init: false

profiling:
  run: true
  trace_folder: profiling
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4

logging:
  freq: 1
  acc_freq: null
  wandb: null

async_eval_gpus: 8

eval:
  harness:
    # Entries are either a bare task name or a mapping with `task` plus
    # per-task `dataset_kwargs`.
    tasks:
      - hellaswag
      - task: boolq
        dataset_kwargs:
          trust_remote_code: true
      - piqa
      - task: social_iqa
        dataset_kwargs:
          trust_remote_code: true
      - winogrande
      - openbookqa
      - arc_easy
      - arc_challenge
      - race
      - commonsense_qa
      - task: copa
        dataset_kwargs:
          trust_remote_code: true
      - mmlu
      - mmlu_pro
  generator:
    max_tokens: 8192
    dtype: bf16