# Spaces: Running on Zero
# Template config: before running, set dump_dir and fill in every `???` placeholder (root_dir, preprocess_dir and bpe_tokenizer_path).
# Evals are disabled by default (see `eval: null` at the end of this file); activate them by supplying their config there.
# Example launch: python -m launchers.stool config=apps/main/configs/debug.yaml nodes=8 account=fair_amaia_cw_codegen qos=lowest
# Top-level run settings.
dump_dir: /tmp/  # template value; the file header asks for this to be changed
name: "debug"
steps: 100_000  # 100000 written with a YAML 1.1-style digit separator — confirm the loader reads it as an int
probe_freq: null
seed: 777
# Optimizer settings.
optim:
  lr: 4e-04
  warmup: 500  # presumably a number of warmup steps — TODO confirm unit
  lr_min_ratio: 0.1
  clip: 10.0  # presumably the gradient-clipping threshold — TODO confirm
# Distributed-training settings.
distributed:
  fsdp_type: full_shard
  model_dtype: bf16
  matmul_allow_tf32: false
  selective_activation_checkpointing: false
  tp_size: 1
# Model architecture settings.
# NOTE(review): every key below is assumed to be a direct child of `model`
# (nesting reconstructed from a flattened copy) — verify against the model args schema.
model:
  n_heads: 8
  dim: 512
  vocab_size: 260
  dim_token: 256
  patch_size: 6
  patching_mode: "space"
  tie_local_encoder_decoder_logits: false
  patch_in_forward: false
  max_encoder_seq_length: 12288  # same value as max_seqlen further down
  pad_to_max_length: true
  patching_threshold: 3.1439168453216553
  encoder_hash_byte_group_size: [4]
  encoder_hash_byte_group_vocab: 50002
  encoder_hash_byte_group_nb_functions: 3
  encoder_enable_byte_ngrams: false
  cross_attn_encoder: true # assuming cross_attention is true
  cross_attn_decoder: true # assuming cross_attention is true
  cross_attn_window_encoder: 512
  cross_attn_window_decoder: 512
  dim_local_encoder: 256
  dim_local_decoder: 256
  cross_attn_k: 8
  cross_attn_nheads: 4
  cross_attn_all_layers_decoder: true
  cross_attn_all_layers_encoder: true
  cross_attn_use_flex_attention: true
  cross_attn_init_by_pooling: true
  log_patch_lengths: true
  non_linearity: "swiglu"
  use_rope: true
  recompute_fc1_out: false
  recompute_fc3_out: false
  recompute_attn: false
  custom_bwd: false
  layer_ckpt: "none"
  use_local_encoder_transformer: true
  init_use_gaussian: true
  init_use_depth: "current"
  attn_impl: "xformers"
  attn_bias_type: "block_causal"
  alpha_depth: "disabled"
  max_length: 256
  local_attention_window_len: 512
  max_seqlen: 12288
  downsampling_by_pooling: "max"
# Data-loading settings.
# `???` marks a value that has to be supplied before the config is usable
# (presumably OmegaConf's mandatory-missing marker — confirm against the loader).
# NOTE(review): nesting reconstructed from a flattened copy; `sources` is assumed to
# hold only the dataset-name -> weight entry, and `tokenizer_args` to live under `data`.
data:
  root_dir: ???
  sources:
    dclm_baseline_1.0: 1.0
  batch_size: 2
  prefetch_size: 64
  seq_len: 4096
  load_async: true
  preprocess_dir: ???
  tokenizer_args:
    name: blt
    init_kwargs:
      bpe_tokenizer_path: ???
# Profiler settings; `run: false` leaves it switched off.
profiling:
  run: false
# Checkpointing settings, with separate `every`/`keep` values for `dump` and `eval`.
# NOTE(review): nesting reconstructed from a flattened copy — `dump` and `eval` are
# assumed to be sub-mappings of `checkpoint`, each with its own every/keep pair.
checkpoint:
  dump:
    every: 500
    keep: 3
  eval:
    every: 1000
    keep: -1  # presumably "keep all" — TODO confirm the meaning of -1
# Logging settings.
logging:
  freq: 10  # presumably in training steps — TODO confirm unit
# Evaluation settings.
# NOTE(review): both keys are assumed to be top-level (a second `eval` key inside
# `checkpoint` would be a duplicate) — verify against the training args schema.
eval_on_gpus: 8
eval: null  # no eval config is provided in this template