# Import unsloth before transformers/trl so Unsloth's speed and memory
# patches are applied to the model classes first.
import unsloth
from unsloth import FastLanguageModel

import os
import zipfile
import pandas as pd
from datasets import Dataset
import torch
from transformers import TrainingArguments
from trl import SFTTrainer


def main():
    # 1) Load 4-bit model + tokenizer (the SmolLM2 tokenizer already ships the ChatML chat markers)
    model_name = "HuggingFaceTB/SmolLM2-1.7B"
    max_seq_length = 768
    dtype = torch.float16
    load_in_4bit = True

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
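
    # NOTE: load_in_4bit relies on bitsandbytes and a CUDA GPU. On an
    # unsupported setup, setting load_in_4bit=False should still work,
    # at the cost of noticeably higher VRAM use.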

    # 2) No manual special-token injection or resizing needed: the base model's tokenizer already includes the chat markers

    # 3) Load chat-formatted dataset
    df = pd.read_json("data.jsonl", lines=True)
    # Confirm each sample ends with the ChatML end-of-turn token
    assert df["text"].str.endswith("<|im_end|>").all(), "Some samples are missing the end-of-turn token"
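    # Illustrative shape of one data.jsonl row (hypothetical content, not from the dataset):
    #   {"text": "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\nHello!<|im_end|>"}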

    # 4) Create Hugging Face Dataset and split
    full_dataset = Dataset.from_pandas(df[["text"]])
    split = full_dataset.train_test_split(test_size=0.15, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f"βœ… Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

    # 5) Apply LoRA adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing=True,
    )
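
    # With r=8 and lora_alpha=32 the adapter update is scaled by alpha/r = 4;
    # only these low-rank matrices are trained while the 4-bit base weights
    # stay frozen. (use_gradient_checkpointing="unsloth" is Unsloth's
    # documented lower-VRAM variant, if your version supports it.)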

    # 6) Tokenization: pad/truncate to max_seq_length and build labels,
    #    masking padding positions with -100 so they are ignored by the loss
    def tokenize_fn(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=max_seq_length,
        )
        tokens["labels"] = [
            [tok if mask == 1 else -100 for tok, mask in zip(ids, attn_mask)]
            for ids, attn_mask in zip(tokens["input_ids"], tokens["attention_mask"])
        ]
        return tokens
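
    # Padding every sample to max_length keeps batches uniform but spends
    # compute on pad tokens; dynamic padding via a data collator is a
    # common alternative.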

    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

    # 7) Training arguments
    training_args = TrainingArguments(
        output_dir="./output_model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        fp16=True,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        eval_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        dataloader_num_workers=2,
    )
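
    # NOTE: `eval_strategy` is the newer transformers name for this option;
    # older releases call the same argument `evaluation_strategy`.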

    # 8) Train with SFTTrainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
    )
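
    # NOTE: newer TRL releases renamed the `tokenizer` argument to
    # `processing_class`; if construction fails on your TRL version,
    # swapping that keyword is the usual fix.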

    print("πŸš€ Starting training...")
    trainer.train()
    print("βœ… Training complete.")

    # 9) Save and zip model
    final_dir = "./output_model_final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
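
    # NOTE: save_pretrained on a PEFT-wrapped model writes only the LoRA
    # adapter weights, not merged base weights. To export a merged fp16
    # checkpoint, Unsloth documents model.save_pretrained_merged(...) as
    # an alternative (check your Unsloth version's docs).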

    zip_path = "model.zip"
    print(f"πŸ—œ Zipping model β†’ {zip_path}")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(final_dir):
            for fname in files:
                full = os.path.join(root, fname)
                rel = os.path.relpath(full, final_dir)
                zf.write(full, rel)
    print(f"βœ… Model zipped β†’ {zip_path}")

if __name__ == "__main__":
    main()
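
# --- Quick inference sketch (illustrative; not executed by this script) ---
# A minimal sketch assuming Unsloth can load the adapter directory saved
# above directly; otherwise, load the base model and attach the adapter
# with PEFT instead.
#
#   from unsloth import FastLanguageModel
#   model, tokenizer = FastLanguageModel.from_pretrained(
#       model_name="./output_model_final",
#       max_seq_length=768,
#       load_in_4bit=True,
#   )
#   FastLanguageModel.for_inference(model)  # enable faster generation
#   prompt = "<|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\n"
#   inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#   print(tokenizer.decode(model.generate(**inputs, max_new_tokens=64)[0]))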