SagaMKM committed on
Commit 23fcc89 · verified · 1 Parent(s): 11cc572

Create training.py

Files changed (1)
  1. training.py +123 -0
training.py ADDED
@@ -0,0 +1,123 @@
# This file was our attempt at training the model, which ultimately failed.

!pip install datasets peft transformers
from google.colab import userdata
my_secret_key = userdata.get('Cli2')
from huggingface_hub import login
login(my_secret_key)

# Name of the fine-tuned model and its output folder.
model_output = "./BudgetAdvisor"

# Dataset loading and manipulation.
from datasets import load_dataset
dataset = load_dataset("gbharti/finance-alpaca")  # features: ['text', 'instruction', 'input', 'output']
# Remove the unused 'text' and 'input' columns from the dataset.
dataset = dataset.remove_columns(["text", "input"])
# Split the dataset into train and test sets: 90% for training, 10% for testing.
dataset = dataset["train"].train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Tokenizer and model settings.
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", use_fast=True)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

# Make sure a padding token exists so token sequences can be padded to the same length.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))

# Enable gradient checkpointing for memory efficiency.
model.gradient_checkpointing_enable()
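# Note: gradient checkpointing is typically paired with use_cache=False on the model config
# during training; otherwise transformers warns and disables the cache itself.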

# Parameter-Efficient Fine-Tuning (PEFT) with LoRA.
from peft import LoraConfig, get_peft_model
# Define a PEFT configuration for LoRA.
lora_config = LoraConfig(
    r=8,  # Reduced rank for faster training
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the base model with the LoRA adapters.
import torch
model = get_peft_model(model, lora_config)
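# The wrapped PeftModel can report how few parameters are actually trained,
# e.g. model.print_trainable_parameters()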
# Move the model to the GPU if CUDA is available, otherwise use the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Preprocessing function: build prompt/target text pairs.
def preprocess_data(examples):
    # Use the instruction as the prompt (the 'input' column was removed above,
    # and the answer must not leak into the prompt).
    inputs = [f"Instruction: {instr}\n" for instr in examples['instruction']]
    targets = list(examples['output'])
    return {'input_text': inputs, 'target_text': targets}

train_dataset = train_dataset.map(preprocess_data, batched=True)
eval_dataset = eval_dataset.map(preprocess_data, batched=True)

# Tokenization function
def tokenize_data(examples):
    model_inputs = tokenizer(
        examples['input_text'],
        max_length=128,  # Reduced max_length for faster processing
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        examples['target_text'],
        max_length=128,
        truncation=True,
        padding="max_length"
    )["input_ids"]
    model_inputs["labels"] = labels
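    # Caveat: the labels are the target tokenized on its own and padded with real token ids;
    # the usual causal-LM setup tokenizes prompt + target together and masks prompt/padding
    # positions with -100 so they are excluded from the loss.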
    return model_inputs

# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_data, batched=True, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(tokenize_data, batched=True, remove_columns=eval_dataset.column_names)

# Set the format for PyTorch tensors
train_dataset.set_format(type="torch")
eval_dataset.set_format(type="torch")

# Training arguments and trainer.
training_args = TrainingArguments(
    output_dir=model_output,        # "./BudgetAdvisor"
    per_device_train_batch_size=8,  # Increase if GPU memory allows
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    num_train_epochs=3,             # Increased epochs for better training
    learning_rate=5e-5,
    fp16=True,                      # Enable mixed precision for faster training
    logging_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none",               # Disable reporting to third-party services
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

# Message for testing.
print("Trainer is set up!")

# Trains the model and saves it.
trainer.train()
print("Model trained!")
trainer.save_model(model_output)         # "./BudgetAdvisor"
tokenizer.save_pretrained(model_output)  # "./BudgetAdvisor"

!zip -r BudgetAdvisor.zip ./BudgetAdvisor
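
For reference, a minimal sketch of how the saved adapter could be loaded back for inference, assuming the training run completes and trainer.save_model has written the LoRA weights to ./BudgetAdvisor. AutoPeftModelForCausalLM reads the adapter config and fetches the base model automatically; the example question below is hypothetical and only mirrors the prompt format used in preprocess_data.

# Minimal inference sketch (assumes the training above finished and saved the adapter).
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

model_output = "./BudgetAdvisor"
tokenizer = AutoTokenizer.from_pretrained(model_output)
model = AutoPeftModelForCausalLM.from_pretrained(model_output)  # base model + LoRA weights
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Same prompt format as preprocess_data above.
prompt = "Instruction: How much of my monthly income should go into savings?\n"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))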