Enhance serve.py to handle additional content types by converting dictionary text and joining list items. Update train.py to replace FastLanguageModel with FastModel and LiteLLMModel, streamline model loading, and adjust dataset preparation logic. Modify config.yaml to change max_samples for testing and add provider information for model configuration.
Commit 4395ceb
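The serve.py change handles message content that may arrive as a plain string, as a dict carrying a text field, or as a list of content parts. A minimal sketch of that normalization, assuming an OpenAI-style message layout (the helper name and field names are illustrative, not the actual serve.py code):

    def normalize_content(content):
        # Plain strings pass through unchanged.
        if isinstance(content, str):
            return content
        # Dict parts: pull out the "text" field.
        if isinstance(content, dict):
            return content.get("text", "")
        # List parts: normalize each item and join them.
        if isinstance(content, list):
            return "\n".join(normalize_content(item) for item in content)
        return str(content)

The rest of this section is the updated config.yaml, annotated section by section.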
defaults:
  - _self_

# Model configuration
model:
  name: "unsloth/SmolLM2-135M-Instruct-bnb-4bit"
  # name: "HuggingFaceTB/SmolLM2-135M-Instruct"
  max_seq_length: 2048 # Unsloth auto-supports RoPE scaling internally
  provider: "openai"
  dtype: null # null for auto-detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
  load_in_4bit: true # Use 4-bit quantization to reduce memory usage
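Per the commit, train.py now loads this model through Unsloth's FastModel rather than FastLanguageModel (the new provider key presumably feeds the LiteLLMModel the commit also mentions). A sketch of how these keys plausibly reach the loader, assuming FastModel.from_pretrained mirrors FastLanguageModel's signature; this is not the exact train.py code:

    from unsloth import FastModel

    model, tokenizer = FastModel.from_pretrained(
        model_name="unsloth/SmolLM2-135M-Instruct-bnb-4bit",  # config: model.name
        max_seq_length=2048,  # config: model.max_seq_length
        dtype=None,           # config: model.dtype (null = auto-detect)
        load_in_4bit=True,    # config: model.load_in_4bit
    )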
# PEFT configuration
peft:
  r: 64
  lora_alpha: 128
  lora_dropout: 0.05
  bias: "none"
  use_gradient_checkpointing: "unsloth"
  random_state: 3407
  use_rslora: true
  loftq_config: null
  target_modules:
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"
# Dataset configuration
dataset:
  validation_split: 0.1 # Fraction of data held out for validation (10%)
  seed: 3407 # Random seed for dataset splitting
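validation_split and seed correspond directly to Hugging Face datasets' train_test_split. A self-contained sketch with toy data (the actual dataset preparation in train.py is not shown here):

    from datasets import Dataset

    dataset = Dataset.from_dict({"text": [f"example {i}" for i in range(10)]})
    split = dataset.train_test_split(test_size=0.1, seed=3407)  # validation_split, seed
    train_dataset, eval_dataset = split["train"], split["test"]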
# Training configuration
training:
  args:
    per_device_train_batch_size: 2
    per_device_eval_batch_size: 2
    gradient_accumulation_steps: 16
    warmup_steps: 100
    max_steps: 120
    learning_rate: 5e-5
    logging_steps: 1
    save_strategy: "steps"
    save_steps: 30
    eval_strategy: "steps"
    eval_steps: 30
    save_total_limit: 2
    optim: "adamw_8bit"
    weight_decay: 0.01
    lr_scheduler_type: "cosine_with_restarts"
    seed: 3407
    output_dir: "outputs"
    gradient_checkpointing: true
    load_best_model_at_end: true
    metric_for_best_model: "eval_loss"
    greater_is_better: false
  sft:
    dataset_num_proc: 2
    packing: false
  data_collator:
    mlm: false
    pad_to_multiple_of: 8
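With per_device_train_batch_size 2 and gradient_accumulation_steps 16, the effective batch size is 32. The args block reads like TRL SFTConfig/TrainingArguments fields, with the sft and data_collator blocks feeding SFTTrainer; exactly how train.py wires them is an assumption, but a plausible sketch:

    from transformers import DataCollatorForLanguageModeling
    from trl import SFTConfig, SFTTrainer

    training_args = SFTConfig(
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=16,  # effective batch size: 2 x 16 = 32
        warmup_steps=100,
        max_steps=120,
        learning_rate=5e-5,
        logging_steps=1,
        save_strategy="steps",
        save_steps=30,
        eval_strategy="steps",
        eval_steps=30,
        save_total_limit=2,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine_with_restarts",
        seed=3407,
        output_dir="outputs",
        gradient_checkpointing=True,
        load_best_model_at_end=True,  # restores the checkpoint with the lowest eval_loss
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        dataset_num_proc=2,  # config: training.sft
        packing=False,
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,             # causal LM objective, not masked LM
            pad_to_multiple_of=8,  # pad for tensor-core efficiency
        ),
    )
    trainer.train()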
# Output configuration
output:
  dir: "final_model"
# Training control
train: false
# Testing configuration
test: true # Whether to run testing after training
test_dataset:
  name: "gaia-benchmark/GAIA"
  config: "2023_level1" # Use level-1 questions for testing
  split: "test" # Use the test split for testing
  max_samples: 3 # Number of samples to test on
  max_length: 2048 # Maximum sequence length for testing
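A sketch of how the test block might be consumed: load the configured GAIA subset and cap it at max_samples. GAIA is a gated dataset, so this assumes Hub access has been granted; depending on the datasets version, trust_remote_code=True may also be required:

    from datasets import load_dataset

    test_dataset = load_dataset(
        "gaia-benchmark/GAIA",
        "2023_level1",  # config: test_dataset.config
        split="test",   # config: test_dataset.split
    )
    test_dataset = test_dataset.select(range(min(3, len(test_dataset))))  # max_samples

    for example in test_dataset:
        print(example)  # each row carries the question and its metadata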