import unsloth
from unsloth import FastLanguageModel
import os
import zipfile
import pandas as pd
from datasets import Dataset
import torch
from transformers import TrainingArguments
from trl import SFTTrainer


def main():
    # 1) Load 4-bit model + tokenizer (SmolLM already chat-formatted)
    model_name = "HuggingFaceTB/SmolLM2-1.7B"
    max_seq_length = 768
    dtype = torch.float16
    load_in_4bit = True
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
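
    # With load_in_4bit=True the base weights are loaded as frozen 4-bit quantized
    # tensors (QLoRA-style, via bitsandbytes); only the LoRA adapters attached in
    # step 5 are trained, using the fp16 compute dtype selected above.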

    # 2) No manual special-tokens injection or resizing: the base model tokenizer
    #    already includes the chat markers.

    # 3) Load chat-formatted dataset
    df = pd.read_json("data.jsonl", lines=True)

    # Confirm each sample ends with the end-of-turn token
    assert df["text"].str.endswith("<|im_end|>").all(), "Some samples missing end-of-turn token"

    # 4) Create Hugging Face Dataset and split
    full_dataset = Dataset.from_pandas(df[["text"]])
    split = full_dataset.train_test_split(test_size=0.15, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f"✅ Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

    # 5) Apply LoRA adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing=True,
    )
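
    # With r=8 and lora_alpha=32 the adapter updates are scaled by alpha/r = 4
    # (standard PEFT LoRA scaling); gradient checkpointing trades extra compute
    # for lower activation memory during the backward pass.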

    # 6) Tokenization function
    def tokenize_fn(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=max_seq_length,
        )
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
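
    # Note: labels are a straight copy of input_ids, so padded positions carry the
    # pad-token id as their label. Whether those positions get masked to -100 before
    # the loss depends on the data collator applied by the installed TRL version; if
    # they are not masked, the model also spends capacity predicting padding.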

    # 7) Training arguments
    training_args = TrainingArguments(
        output_dir="./output_model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        fp16=True,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        eval_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        dataloader_num_workers=2,
    )
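
    # Effective batch size is 8 x 1 = 8 sequences per device per optimizer step;
    # eval_strategy="epoch" matches save_strategy so load_best_model_at_end can
    # compare checkpoints by eval_loss at each epoch boundary.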

    # 8) Train with SFTTrainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
    )
    print("🚀 Starting training...")
    trainer.train()
    print("✅ Training complete.")

    # 9) Save and zip model
    final_dir = "./output_model_final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)

    zip_path = "model.zip"
    print(f"🗜 Zipping model → {zip_path}")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(final_dir):
            for fname in files:
                full = os.path.join(root, fname)
                rel = os.path.relpath(full, final_dir)
                zf.write(full, rel)
    print(f"✅ Model zipped → {zip_path}")


if __name__ == "__main__":
    main()
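
# ------------------------------------------------------------------------------
# Optional sanity check (sketch, not run by this script): one way to reload the
# LoRA adapters saved in ./output_model_final for a quick generation test. Exact
# loading behaviour can vary with the installed Unsloth version, and the prompt
# format below simply assumes the same <|im_start|>/<|im_end|> markup used in
# data.jsonl, so treat this as an illustrative snippet, not part of the pipeline.
#
#   from unsloth import FastLanguageModel
#   import torch
#
#   model, tokenizer = FastLanguageModel.from_pretrained(
#       model_name="./output_model_final",  # adapter folder saved above
#       max_seq_length=768,
#       dtype=torch.float16,
#       load_in_4bit=True,
#   )
#   FastLanguageModel.for_inference(model)  # switch Unsloth to inference mode
#
#   prompt = "<|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\n"
#   inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
#   print(tokenizer.decode(model.generate(**inputs, max_new_tokens=64)[0]))
# ------------------------------------------------------------------------------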