defaults:
  - _self_

# Model configuration
model:
  name: "unsloth/SmolLM2-135M-Instruct-bnb-4bit"
  # name: "HuggingFaceTB/SmolLM2-135M-Instruct"
  max_seq_length: 2048  # Unsloth auto-supports RoPE scaling internally
  provider: "openai"
  dtype: null  # null = auto-detect; float16 for Tesla T4/V100, bfloat16 for Ampere and newer GPUs
  load_in_4bit: true  # Use 4bit quantization to reduce memory usage
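  # The "-bnb-4bit" checkpoint is pre-quantized with bitsandbytes; with load_in_4bit
  # the base weights stay in 4-bit and only the LoRA adapters train in higher precision (QLoRA-style)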

# PEFT configuration
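# r is the LoRA rank; lora_alpha = 2 * r follows a common scaling heuristic.
# use_rslora switches the adapter scaling from alpha / r to alpha / sqrt(r),
# which tends to behave better at higher ranks.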
peft:
  r: 64
  lora_alpha: 128
  lora_dropout: 0.05
  bias: "none"
  use_gradient_checkpointing: "unsloth"
  random_state: 3407
  use_rslora: true
  loftq_config: null
  target_modules:
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"

# Dataset configuration
dataset:
  validation_split: 0.1  # 10% of data for validation
  seed: 3407  # Random seed for dataset splitting

# Training configuration
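# Effective batch size per device = per_device_train_batch_size * gradient_accumulation_steps
# = 2 * 16 = 32. Note that warmup_steps (100) covers most of max_steps (120),
# so the cosine schedule only decays over the final 20 optimizer steps.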
training:
  args:
    per_device_train_batch_size: 2
    per_device_eval_batch_size: 2
    gradient_accumulation_steps: 16
    warmup_steps: 100
    max_steps: 120
    learning_rate: 5e-5
    logging_steps: 1
    save_strategy: "steps"
    save_steps: 30
    eval_strategy: "steps"
    eval_steps: 30
    save_total_limit: 2
    optim: "adamw_8bit"
    weight_decay: 0.01
    lr_scheduler_type: "cosine_with_restarts"
    seed: 3407
    output_dir: "outputs"
    gradient_checkpointing: true
    load_best_model_at_end: true
    metric_for_best_model: "eval_loss"
    greater_is_better: false
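    # load_best_model_at_end requires save_strategy to match eval_strategy and
    # save_steps to be a round multiple of eval_steps (both are 30 here)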

  sft:
    dataset_num_proc: 2
    packing: false
    data_collator:
      mlm: false
      pad_to_multiple_of: 8
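      # mlm: false selects causal-LM (next-token) collation; pad_to_multiple_of: 8
      # pads sequence lengths to tensor-core-friendly multiples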

# Output configuration
output:
  dir: "final_model"

# Training control
train: false

# Testing configuration
test: true  # Whether to run testing after training
test_dataset:
  name: "gaia-benchmark/GAIA"
  config: "2023_level1"  # Use level 1 questions for testing
  split: "test"  # Use test split for testing
  max_samples: 3  # Number of samples to test on
  max_length: 2048  # Maximum sequence length for testing
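  # GAIA is a gated dataset on the Hugging Face Hub; accept its terms before
  # the test split can be downloaded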