nicoleathy committed on
Commit 391d684 · verified · 1 Parent(s): f754508

Delete competition/gemma-2-9b.py

Files changed (1)
  1. competition/gemma-2-9b.py +0 -128
competition/gemma-2-9b.py DELETED
@@ -1,128 +0,0 @@
-from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
-from datasets import Dataset
-import pandas as pd
-from sklearn.model_selection import train_test_split
-from peft import get_peft_model, LoraConfig, TaskType
-import evaluate
-import numpy as np
-
-# Load the dataset
-file_path = 'train_en.csv'
-dataset = pd.read_csv(file_path)
-
-# Map labels to expected responses
-label_mapping = {
-    "Yes": 0,
-    "No": 1,
-    "It doesn't matter": 2,
-    "Unimportant": 2,  # Assuming "unimportant" is synonymous with "It doesn't matter"
-    "Incorrect questioning": 3,
-    "Correct answers": 4
-}
-
-# Apply label mapping
-dataset['label'] = dataset['label'].map(label_mapping)
-
-# Handle NaN values: Drop rows where label is NaN
-dataset = dataset.dropna(subset=['label'])
-
-# Ensure labels are integers
-dataset['label'] = dataset['label'].astype(int)
-
-# Combine "text" and "puzzle" columns
-dataset['combined_text'] = dataset['text'] + " " + dataset['puzzle']
-
-# Split the dataset into training and validation sets
-train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)
-
-# Convert the dataframes to datasets
-train_dataset = Dataset.from_pandas(train_df)
-val_dataset = Dataset.from_pandas(val_df)
-
-# Load the tokenizer and model
-model_name = "google/gemma-2-9b"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
-
-# Tokenize the data
-def tokenize_function(examples):
-    return tokenizer(examples['combined_text'], truncation=True, padding='max_length', max_length=128)
-
-train_dataset = train_dataset.map(tokenize_function, batched=True)
-val_dataset = val_dataset.map(tokenize_function, batched=True)
-
-# Set the format for PyTorch
-train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
-val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
-
-# Define LoRA configuration
-lora_config = LoraConfig(
-    task_type=TaskType.SEQ_CLS,
-    r=16,
-    lora_alpha=16,
-    target_modules=["q_proj", "v_proj"],
-    lora_dropout=0.05,
-    bias="none"
-)
-
-# Apply LoRA to the model
-model = get_peft_model(model, lora_config)
-model.print_trainable_parameters()
-
-# Training arguments
-training_args = TrainingArguments(
-    output_dir='./results',
-    learning_rate=1e-4,
-    lr_scheduler_type="linear",
-    warmup_ratio=0.1,
-    max_grad_norm=0.3,
-    per_device_train_batch_size=4,
-    per_device_eval_batch_size=4,
-    num_train_epochs=3,
-    weight_decay=0.001,
-    evaluation_strategy="epoch",
-    save_strategy="epoch",
-    load_best_model_at_end=True,
-    report_to="wandb",
-    fp16=True,
-    gradient_checkpointing=True,
-    gradient_accumulation_steps=4,
-    dataloader_num_workers=4,
-    logging_steps=100,
-    save_total_limit=2,
-)
-
-def compute_metrics(eval_pred):
-    precision_metric = evaluate.load("precision")
-    recall_metric = evaluate.load("recall")
-    f1_metric = evaluate.load("f1")
-    accuracy_metric = evaluate.load("accuracy")
-
-    logits, labels = eval_pred
-    predictions = np.argmax(logits, axis=-1)
-
-    precision = precision_metric.compute(predictions=predictions, references=labels, average='macro')["precision"]
-    recall = recall_metric.compute(predictions=predictions, references=labels, average='macro')["recall"]
-    f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')["f1"]
-    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
-
-    return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy}
-
-# Initialize the Trainer
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=train_dataset,
-    eval_dataset=val_dataset,
-    compute_metrics=compute_metrics
-)
-
-# Train the model
-trainer.train()
-
-# Save the model
-model.save_pretrained('trained_gemma_model')
-tokenizer.save_pretrained('trained_gemma_model')
-
-# Evaluate the model
-trainer.evaluate()