nicoleathy committed on
Commit f11a0ff · verified · 1 Parent(s): 391d684

Delete competition/llama.py

Files changed (1):
  1. competition/llama.py  +0 -139
competition/llama.py DELETED
@@ -1,139 +0,0 @@
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
- from datasets import Dataset
- import pandas as pd
- from sklearn.model_selection import train_test_split
- from peft import get_peft_model, LoraConfig, TaskType
- import evaluate
- import numpy as np
- from tqdm import tqdm  # unused here; Trainer manages its own progress bars
-
- # Load the dataset
- file_path = 'train_en.csv'
- dataset = pd.read_csv(file_path)
-
- # Map labels to expected responses
- label_mapping = {
-     "Yes": 0,
-     "No": 1,
-     "It doesn't matter": 2,
-     "Unimportant": 2,
-     "Incorrect questioning": 3,
-     "Correct answers": 4
- }
-
- # Apply label mapping
- dataset['label'] = dataset['label'].map(label_mapping)
-
- # Handle NaN values: Drop rows where label is NaN
- dataset = dataset.dropna(subset=['label'])
-
- # Ensure labels are integers
- dataset['label'] = dataset['label'].astype(int)
-
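- # Note: two label strings ("It doesn't matter" / "Unimportant") share class 2,
- # so the mapping yields 5 distinct classes (0-4), matching num_labels=5 below.
-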
- # Combine "text" and "puzzle" columns
- dataset['combined_text'] = dataset['text'] + " " + dataset['puzzle']
-
- # Split the dataset into training and validation sets
- train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)
-
- # Convert the dataframes to datasets
- train_dataset = Dataset.from_pandas(train_df)
- val_dataset = Dataset.from_pandas(val_df)
-
- # Load the tokenizer and model
- model_name = "meta-llama/Meta-Llama-3-8B"  # Gated checkpoint; requires accepted access on the Hub
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
-
- # Add a padding token if it's not already present
- if tokenizer.pad_token is None:
-     tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
-     model.resize_token_embeddings(len(tokenizer))
-     tokenizer.pad_token = tokenizer.eos_token  # Set the padding token explicitly
-
- # Ensure the padding token is set correctly in the model configuration
- model.config.pad_token_id = tokenizer.pad_token_id
-
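- # (Llama tokenizers ship without a pad token, so batched sequence
- # classification needs one; reusing EOS is a common choice.)
-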
- # Tokenize the data
- def tokenize_function(examples):
-     return tokenizer(examples['combined_text'], truncation=True, padding='max_length', max_length=128)
-
- train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=4)  # Use multiprocessing
- val_dataset = val_dataset.map(tokenize_function, batched=True, num_proc=4)
-
- # Set the format for PyTorch
- train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
- val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
-
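- # (Trainer's default collator renames the 'label' column to the 'labels'
- # argument expected by the model's forward pass.)
-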
- # Define LoRA configuration
- lora_config = LoraConfig(
-     task_type=TaskType.SEQ_CLS,
-     r=16,
-     lora_alpha=16,
-     target_modules=["q_proj", "v_proj"],
-     lora_dropout=0.05,
-     bias="none"
- )
-
- # Apply LoRA to the model
- model = get_peft_model(model, lora_config)
- model.print_trainable_parameters()
-
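- # (q_proj/v_proj are the attention query/value projections in each Llama
- # block; with TaskType.SEQ_CLS, PEFT also keeps the new classification
- # head trainable alongside the adapters.)
-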
- # Training arguments
- training_args = TrainingArguments(
-     output_dir='./results',
-     learning_rate=1e-4,
-     lr_scheduler_type="linear",
-     warmup_ratio=0.1,
-     max_grad_norm=0.3,
-     per_device_train_batch_size=8,  # Increase batch size if memory allows
-     per_device_eval_batch_size=8,
-     num_train_epochs=3,
-     weight_decay=0.001,
-     evaluation_strategy="epoch",
-     save_strategy="epoch",
-     load_best_model_at_end=True,
-     report_to="wandb",
-     fp16=True,
-     gradient_checkpointing=True,
-     gradient_accumulation_steps=2,  # Adjust based on memory constraints
-     dataloader_num_workers=4,
-     logging_steps=100,
-     save_total_limit=2,
- )
-
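- # (Effective train batch size is 8 x 2 = 16 per device via gradient
- # accumulation; evaluation and save both run per epoch, so the best
- # checkpoint can be restored at the end of training.)
-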
- def compute_metrics(eval_pred):
-     precision_metric = evaluate.load("precision")
-     recall_metric = evaluate.load("recall")
-     f1_metric = evaluate.load("f1")
-     accuracy_metric = evaluate.load("accuracy")
-
-     logits, labels = eval_pred
-     predictions = np.argmax(logits, axis=-1)
-
-     precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")["precision"]
-     recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")["recall"]
-     f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]
-     accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
-
-     return {"precision": precision, "recall": recall, "f1-score": f1, "accuracy": accuracy}
-
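- # (average="weighted" weights each class's score by its support, so
- # majority classes dominate the reported precision/recall/F1.)
-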
- # Initialize the Trainer
- trainer = Trainer(
-     model=model,
-     args=training_args,
-     train_dataset=train_dataset,
-     eval_dataset=val_dataset,
-     compute_metrics=compute_metrics
- )
-
- # Train the model (Trainer displays its own progress bar)
- trainer.train()
-
- # Save the LoRA adapter and classification head (not the full base model) plus the tokenizer
- model.save_pretrained('trained_llama_model')
- tokenizer.save_pretrained('trained_llama_model')
-
- # Evaluate the best checkpoint on the validation set
- eval_results = trainer.evaluate()
- print(eval_results)
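
For reference, a minimal sketch (not part of this commit) of how the adapter directory saved by the deleted script could be reloaded for inference. The names used ('trained_llama_model', the base checkpoint, max_length=128) follow the script above; the exact calls assume compatible transformers/peft versions:

    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from peft import PeftModel
    import torch

    # Rebuild the base model with the same 5-way head, then attach the saved LoRA adapter.
    tokenizer = AutoTokenizer.from_pretrained('trained_llama_model')
    base = AutoModelForSequenceClassification.from_pretrained(
        "meta-llama/Meta-Llama-3-8B", num_labels=5
    )
    base.config.pad_token_id = tokenizer.pad_token_id
    model = PeftModel.from_pretrained(base, 'trained_llama_model').eval()

    # Classify one combined "text + puzzle" string, mirroring the training input format.
    inputs = tokenizer("example text example puzzle", truncation=True,
                       max_length=128, return_tensors="pt")
    with torch.no_grad():
        predicted_class = model(**inputs).logits.argmax(dim=-1).item()
    print(predicted_class)  # integer class 0-4, per label_mapping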