import unsloth
from unsloth import FastLanguageModel
import os
import zipfile
import pandas as pd
from datasets import Dataset
import torch
from transformers import TrainingArguments
from trl import SFTTrainer
def main():
    # 1) Load 4-bit model + tokenizer (SmolLM2 already chat-formatted)
    model_name = "HuggingFaceTB/SmolLM2-1.7B"
    max_seq_length = 768
    dtype = torch.float16
    load_in_4bit = True
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
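    # Note: dtype=None would let Unsloth auto-detect bf16 vs fp16 for the current GPU;
    # float16 is pinned here so it matches fp16=True in the TrainingArguments below.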
    # 2) No manual special-tokens injection or resizing: the base model tokenizer already includes chat markers
    # 3) Load chat-formatted dataset
    df = pd.read_json("data.jsonl", lines=True)
    # Confirm each sample ends with the end-of-turn token
    assert df["text"].str.endswith("<|im_end|>").all(), "Some samples missing end-of-turn token"
    # 4) Create Hugging Face Dataset and split
    full_dataset = Dataset.from_pandas(df[["text"]])
    split = full_dataset.train_test_split(test_size=0.15, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f"Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")
    # 5) Apply LoRA adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing=True,
    )
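    # LoRA note: with r=8 and lora_alpha=32 the effective scaling (alpha / r) is 4.
    # Recent Unsloth releases also accept use_gradient_checkpointing="unsloth" for a
    # lower-memory checkpointing variant, if the installed version supports it.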
    # 6) Tokenization function
    def tokenize_fn(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=max_seq_length,
        )
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens
    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
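    # Note: labels are a straight copy of input_ids, so the loss is also computed on
    # padding tokens. A common variant (not used here; assumes tokenizer.pad_token_id
    # is set) masks pad positions with -100 so they are ignored by the loss:
    #   tokens["labels"] = [
    #       [tok if tok != tokenizer.pad_token_id else -100 for tok in ids]
    #       for ids in tokens["input_ids"]
    #   ]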
    # 7) Training arguments
    training_args = TrainingArguments(
        output_dir="./output_model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        fp16=True,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        eval_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        dataloader_num_workers=2,
    )
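    # Note: load_best_model_at_end=True needs the save and eval cadence to line up,
    # which save_strategy="epoch" and eval_strategy="epoch" satisfy. Older transformers
    # releases spell the latter option evaluation_strategy instead of eval_strategy.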
    # 8) Train with SFTTrainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
    )
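    # Note: newer trl releases rename SFTTrainer's tokenizer argument to processing_class;
    # use whichever keyword matches the installed trl version.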
print("π Starting training...")
trainer.train()
print("β
Training complete.")
    # 9) Save and zip model
    final_dir = "./output_model_final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    zip_path = "model.zip"
    print(f"Zipping model -> {zip_path}")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(final_dir):
            for fname in files:
                full = os.path.join(root, fname)
                rel = os.path.relpath(full, final_dir)
                zf.write(full, rel)
    print(f"Model zipped -> {zip_path}")
if __name__ == "__main__":
    main()
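
# A minimal sketch (not executed here) of reloading the saved LoRA adapters for
# inference with Unsloth, assuming the output directory and settings used above:
#
#   model, tokenizer = FastLanguageModel.from_pretrained(
#       model_name="./output_model_final",
#       max_seq_length=768,
#       load_in_4bit=True,
#   )
#   FastLanguageModel.for_inference(model)  # switch to Unsloth's faster inference mode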