vv876803 committed
Commit 10c0e07 · verified · 1 Parent(s): d4f2755

Update train.py

Files changed (1)
  1. train.py +42 -24
train.py CHANGED
@@ -1,55 +1,73 @@
+import os
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 from datasets import load_dataset
-from peft import LoraConfig, get_peft_model
+from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+from huggingface_hub import login
 
-# Model & Tokenizer
-MODEL_NAME = "vv876803/tinyllama-victo"  # Adjust if using your own model
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16)
+# 🔹 Get Hugging Face Token Securely
+hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
 
-# Apply LoRA for Efficient Fine-Tuning
+if not hf_token:
+    raise ValueError("❌ Error: Missing Hugging Face API token. Set 'HUGGING_FACE_HUB_TOKEN' as an environment variable.")
+
+# 🔹 Authenticate with Hugging Face
+login(token=hf_token)
+
+# 🔹 Load Pretrained Model & Tokenizer
+MODEL_NAME = "vv876803/tinyllama-victor"  # Change if using your own model
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=hf_token)
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16, use_auth_token=hf_token)
+
+# 🔹 Apply LoRA for Efficient Fine-Tuning
 peft_config = LoraConfig(
-    r=8,  # Low-rank adaptation size
-    lora_alpha=16,
-    lora_dropout=0.05,
-    bias="none",
-    task_type="CAUSAL_LM"
+    r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
 )
+model = prepare_model_for_kbit_training(model)  # Ensures LoRA compatibility
 model = get_peft_model(model, peft_config)
 
-# Load Dataset (OASST1)
-dataset = load_dataset("OpenAssistant/oasst1", split="train[:10%]")  # Use 10% of dataset
+# 🔹 Load Dataset (OpenAssistant OASST1 - 10% Sample)
+dataset = load_dataset("OpenAssistant/oasst1", split="train[:10%]")
 
-# Tokenization Function
+# 🔹 Tokenization Function
 def tokenize_function(examples):
-    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
+    if "text" not in examples:  # Ensure dataset has expected fields
+        return {}
+    return tokenizer(examples["text"], truncation=True, max_length=512)
+
+# 🔹 Tokenize Dataset
+tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
 
-# Tokenize Dataset
-tokenized_datasets = dataset.map(tokenize_function, batched=True)
+# 🔹 Data Collator (for better training)
+data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
 
-# Training Arguments
+# 🔹 Training Arguments
 training_args = TrainingArguments(
     output_dir="./tinyllama-finetuned",
     evaluation_strategy="epoch",
     save_strategy="epoch",
-    per_device_train_batch_size=2,  # Adjust for CPU
+    per_device_train_batch_size=2,  # Adjust based on RAM
     per_device_eval_batch_size=2,
     num_train_epochs=3,
     logging_dir="./logs",
-    report_to="none"
+    logging_steps=10,
+    report_to="none",
+    save_total_limit=2,  # Prevents excessive checkpoint saving
+    fp16=False,  # Use bfloat16 for CPU (float16 needs a GPU)
+    group_by_length=True  # Packs similar-length sequences for better efficiency
 )
 
-# Trainer
+# 🔹 Trainer
 trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=tokenized_datasets,
+    data_collator=data_collator,
 )
 
-# Start Training
+# 🔹 Start Training
 trainer.train()
 
-# Save Fine-Tuned Model
+# 🔹 Save Fine-Tuned Model & Tokenizer
 model.save_pretrained("./tinyllama-finetuned")
 tokenizer.save_pretrained("./tinyllama-finetuned")
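
Since the updated train.py saves a LoRA adapter (plus the tokenizer) rather than a merged checkpoint, a minimal sketch of how that adapter might be loaded back for inference is shown below. The base checkpoint name and output directory are taken from the script above; the prompt and generation settings are purely illustrative assumptions, and a Hugging Face login may still be needed if the base repo is private.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "vv876803/tinyllama-victor"   # base checkpoint fine-tuned above
ADAPTER_DIR = "./tinyllama-finetuned"      # directory written by train.py

# Load the base weights, then attach the saved LoRA adapter on top.
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR)

# Illustrative prompt and generation settings.
inputs = tokenizer("Hello! What can you help me with?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Alternatively, model.merge_and_unload() could be called to fold the adapter weights into the base model before saving a standalone checkpoint.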