Berbex committed
Commit 624a2c3 · 1 Parent(s): 4ec7dd4

Update app.py

Files changed (1)
  1. app.py +56 -8
app.py CHANGED
@@ -33,21 +33,69 @@ def preprocess_data(examples):
 
 encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
 
-example = encoded_dataset['train'][9500]
-console.log(example['labels'])
-
 encoded_dataset.set_format("torch")
 
+id2label = {idx:label for idx, label in enumerate(labels)}
+label2id = {label:idx for idx, label in enumerate(labels)}
+
+model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
+                                                           num_labels=len(labels),
+                                                           id2label=id2label,
+                                                           label2id=label2id)
 
+batch_size = 8
+metric_name = "f1"
 
+from transformers import TrainingArguments, Trainer
 
+args = TrainingArguments(
+    f"bert-finetuned-sem_eval-english",
+    evaluation_strategy = "epoch",
+    save_strategy = "epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=batch_size,
+    per_device_eval_batch_size=batch_size,
+    num_train_epochs=5,
+    weight_decay=0.01,
+    load_best_model_at_end=True,
+    metric_for_best_model=metric_name,
+    #push_to_hub=True,
+)
 
-def sentiment_score(review):
-    tokens = tokenizer.encode(review, return_tensors='pt')
-    result = model(tokens)
-    return int(torch.argmax(result.logits))
 
-dataset['sentiment'] = dataset['train']['text'].apply(lambda x: sentiment_score(x[:512]))
+from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
+from transformers import EvalPrediction
+import torch
+
+# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
+def multi_label_metrics(predictions, labels, threshold=0.5):
+    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
+    sigmoid = torch.nn.Sigmoid()
+    probs = sigmoid(torch.Tensor(predictions))
+    # next, use threshold to turn them into integer predictions
+    y_pred = np.zeros(probs.shape)
+    y_pred[np.where(probs >= threshold)] = 1
+    # finally, compute metrics
+    y_true = labels
+    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
+    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
+    accuracy = accuracy_score(y_true, y_pred)
+    # return as dictionary
+    metrics = {'f1': f1_micro_average,
+               'roc_auc': roc_auc,
+               'accuracy': accuracy}
+    return metrics
+
+def compute_metrics(p: EvalPrediction):
+    preds = p.predictions[0] if isinstance(p.predictions,
+                                           tuple) else p.predictions
+    result = multi_label_metrics(
+        predictions=preds,
+        labels=p.label_ids)
+    return result
+
+
+print(encoded_dataset['train']['input_ids'][0])
 
 """
 
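For context, a minimal sketch of how the pieces added in this commit would typically be handed to the Hugging Face Trainer. This wiring is not part of the diff above; it assumes the model, args, encoded_dataset, and compute_metrics objects defined in app.py, the tokenizer created earlier in the file, and a "validation" split, which is an assumption here.

from transformers import Trainer

trainer = Trainer(
    model,                                        # BERT classifier configured above
    args,                                         # TrainingArguments configured above
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],   # assumed split name
    tokenizer=tokenizer,                          # tokenizer created earlier in app.py
    compute_metrics=compute_metrics,              # micro F1 / ROC AUC / accuracy from above
)
trainer.train()
trainer.evaluate()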