craffel HF Staff committed on
Commit
2705b16
·
verified ·
1 Parent(s): 630b43a

Upload config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.yaml +154 -0
config.yaml ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: comma_v0p1_yolooooo
2
+ dump_dir: /fsx/craffel/lingua_logs/comma_v0p1/
3
+ seed: 777
4
+ grad_acc_steps: 4
5
+ gc_collect_freq: 1000
6
+ probe_freq: null
7
+ steps: 500000
8
+ data:
9
+ root_dir: /scratch/craffel/lingua/data/
10
+ sources:
11
+ peS2o: 0.274065475510351
12
+ stackexchange: 0.134617935796937
13
+ stackv2_edu: 0.127770669195666
14
+ cccc: 0.0871992270000557
15
+ wikimedia: 0.0861800315862719
16
+ github_archive: 0.0606452345122248
17
+ uspto: 0.0413469377516883
18
+ pubmed: 0.0367902799837971
19
+ arxiv_papers: 0.0292395449667613
20
+ caselaw_access_project: 0.0193875362722656
21
+ wikiteam: 0.0137485410839637
22
+ doab: 0.0180439781895451
23
+ uk_hansard: 0.0144498535570883
24
+ pre_1929_books: 0.0115755547988338
25
+ ubuntu_irc: 0.00794254267719456
26
+ regulations: 0.00762583706405442
27
+ data_provenance_initiative: 0.00512264496834867
28
+ project_gutenberg: 0.00502100654070129
29
+ youtube: 0.00465917165839394
30
+ arxiv_abstracts: 0.00359635066160403
31
+ stackv2_html: 0.00225924255952781
32
+ usgpo: 0.00226024581728848
33
+ library_of_congress: 0.00222469340783564
34
+ biodiversity_heritage_library: 0.00221737524370278
35
+ pressbooks: 0.000865101033213598
36
+ libretexts: 0.00054149556727006
37
+ news: 0.000372716196818104
38
+ foodista: 0.000125363443065615
39
+ oercommons: 7.78696843693821e-05
40
+ python_enhancement_proposals: 1.69983991984805e-05
41
+ public_domain_review: 1.05448719635173e-05
42
+ batch_size: 2
43
+ seq_len: 4096
44
+ n_views: 2
45
+ seed: 42
46
+ add_bos: true
47
+ add_eos: true
48
+ load_async: true
49
+ prefetch_size: 4096
50
+ tokenizer:
51
+ name: tiktoken
52
+ path: /fsx/craffel/lingua/tokenizers/common-pile-tokenizer.tiktoken
53
+ optim:
54
+ lr: 0.001
55
+ weight_decay: 0.2
56
+ epsilon: 1.0e-08
57
+ beta1: 0.9
58
+ beta2: 0.95
59
+ clip: 1.0
60
+ scheduler: cosine
61
+ warmup: 2000
62
+ lr_min_ratio: 1.0e-06
63
+ cycle_length: 1.0
64
+ cosine_theta: 1.0
65
+ annealing_step: 1000
66
+ decay_fraction: 0.1
67
+ exp_factor: 0.5
68
+ model:
69
+ dim: 4096
70
+ n_layers: 32
71
+ head_dim: null
72
+ n_heads: 32
73
+ n_kv_heads: null
74
+ ffn_dim_multiplier: 1.0
75
+ multiple_of: 256
76
+ norm_eps: 1.0e-05
77
+ rope_theta: 100000.0
78
+ init_base_std: null
79
+ init_std_factor: disabled
80
+ max_seqlen: 4096
81
+ seed: 42
82
+ vocab_size: 64256
83
+ weight_tying: false
84
+ sliding_window: null
85
+ distributed:
86
+ dp_shard: 1
87
+ dp_replicate: 64
88
+ tp_size: 1
89
+ selective_activation_checkpointing: false
90
+ compile: true
91
+ fsdp_type: full_shard
92
+ model_dtype: bf16
93
+ float8_recipe: null
94
+ float8_filter: layers\.[0-9]+\.
95
+ matmul_allow_tf32: false
96
+ detect_anomaly: false
97
+ compile_cache_size_limit: 8
98
+ spawn_method: forkserver
99
+ env:
100
+ MKL_SERVICE_FORCE_INTEL: GNU
101
+ OMP_NUM_THREADS: '1'
102
+ MKL_NUM_THREADS: '1'
103
+ ENABLE_INTRA_NODE_COMM: '1'
104
+ TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
105
+ NCCL_IB_TIMEOUT: '22'
106
+ NCCL_DEBUG: INFO
107
+ TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
108
+ checkpoint:
109
+ dump:
110
+ every: 10000
111
+ keep: -1
112
+ eval:
113
+ every: 2000
114
+ keep: 3
115
+ path: /fsx/craffel/lingua_logs/comma_v0p1/checkpoints
116
+ init_ckpt_path: null
117
+ continue_training_from_init: false
118
+ profiling:
119
+ run: true
120
+ trace_folder: profiling
121
+ mem_warmup: 0
122
+ mem_steps: 4
123
+ profile_warmup: 100
124
+ profile_steps: 4
125
+ logging:
126
+ freq: 1
127
+ acc_freq: null
128
+ wandb: null
129
+ async_eval_gpus: 8
130
+ eval:
131
+ harness:
132
+ tasks:
133
+ - hellaswag
134
+ - task: boolq
135
+ dataset_kwargs:
136
+ trust_remote_code: true
137
+ - piqa
138
+ - task: social_iqa
139
+ dataset_kwargs:
140
+ trust_remote_code: true
141
+ - winogrande
142
+ - openbookqa
143
+ - arc_easy
144
+ - arc_challenge
145
+ - race
146
+ - commonsense_qa
147
+ - task: copa
148
+ dataset_kwargs:
149
+ trust_remote_code: true
150
+ - mmlu
151
+ - mmlu_pro
152
+ generator:
153
+ max_tokens: 8192
154
+ dtype: bf16