Update train.py
train.py CHANGED
@@ -1,55 +1,73 @@
+import os
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
-from peft import LoraConfig, get_peft_model
+from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+from huggingface_hub import login

-#
-
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16)
+# 🔹 Get Hugging Face Token Securely
+hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")

-
+if not hf_token:
+    raise ValueError("❌ Error: Missing Hugging Face API token. Set 'HUGGING_FACE_HUB_TOKEN' as an environment variable.")
+
+# 🔹 Authenticate with Hugging Face
+login(token=hf_token)
+
+# 🔹 Load Pretrained Model & Tokenizer
+MODEL_NAME = "vv876803/tinyllama-victor"  # Change if using your own model
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=hf_token)
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16, use_auth_token=hf_token)
+
+# 🔹 Apply LoRA for Efficient Fine-Tuning
peft_config = LoraConfig(
-    r=8,
-    lora_alpha=16,
-    lora_dropout=0.05,
-    bias="none",
-    task_type="CAUSAL_LM"
+    r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
)
+model = prepare_model_for_kbit_training(model)  # Ensures LoRA compatibility
model = get_peft_model(model, peft_config)

-# Load Dataset (OASST1)
-dataset = load_dataset("OpenAssistant/oasst1", split="train[:10%]")
+# 🔹 Load Dataset (OpenAssistant OASST1 - 10% Sample)
+dataset = load_dataset("OpenAssistant/oasst1", split="train[:10%]")

-# Tokenization Function
+# 🔹 Tokenization Function
def tokenize_function(examples):
-
+    if "text" not in examples:  # Ensure dataset has expected fields
+        return {}
+    return tokenizer(examples["text"], truncation=True, max_length=512)
+
+# 🔹 Tokenize Dataset
+tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

-#
-
+# 🔹 Data Collator (for better training)
+data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

-# Training Arguments
+# 🔹 Training Arguments
training_args = TrainingArguments(
    output_dir="./tinyllama-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
-    per_device_train_batch_size=2,  # Adjust
+    per_device_train_batch_size=2,  # Adjust based on RAM
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    logging_dir="./logs",
-
+    logging_steps=10,
+    report_to="none",
+    save_total_limit=2,  # Prevents excessive checkpoint saving
+    fp16=False,  # Use bfloat16 for CPU (float16 needs a GPU)
+    group_by_length=True  # Packs similar-length sequences for better efficiency
)

-# Trainer
+# 🔹 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
+    data_collator=data_collator,
)

-# Start Training
+# 🔹 Start Training
trainer.train()

-# Save Fine-Tuned Model
+# 🔹 Save Fine-Tuned Model & Tokenizer
model.save_pretrained("./tinyllama-finetuned")
tokenizer.save_pretrained("./tinyllama-finetuned")
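
Note: the new script keeps evaluation_strategy="epoch" but still passes no eval_dataset to the Trainer, which transformers rejects (at construction time or at the first evaluation pass, depending on the version). A minimal sketch of one way to wire that up, not part of this commit; train_test_split, test_size=0.1, and seed=42 are illustrative choices:

# Hypothetical follow-up (not in this commit): carve out a held-out split so that
# evaluation_strategy="epoch" has data to evaluate on.
split = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    data_collator=data_collator,
)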
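
Because model is PEFT-wrapped, model.save_pretrained("./tinyllama-finetuned") stores only the LoRA adapter weights, not a merged checkpoint. A minimal inference sketch under that assumption: reload the base model named in the commit and attach the saved adapter; the prompt and max_new_tokens value are illustrative.

# Hypothetical usage sketch: reload the base model and attach the saved LoRA adapter.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("vv876803/tinyllama-victor", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("./tinyllama-finetuned")
model = PeftModel.from_pretrained(base, "./tinyllama-finetuned")

inputs = tokenizer("Hello, how are you?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))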