#!pip install -q transformers datasets torch gradio console_logging numpy scikit-learn

import gradio as gr
from transformers import pipeline

title = 'Text market sentiment'
text_ = "Bitcoin to the moon"

# Load the already fine-tuned checkpoint from the Hub as a text-classification
# pipeline; torch.load cannot resolve Hub paths on its own. The repo id below
# is taken from the original torch.load path.
classifier = pipeline("text-classification", model="Berbex/Model")

def predict_sentiment(text):
    return classifier(text)[0]["label"]

inp = gr.Textbox(label='Tweet', placeholder="What is the market saying?")
out = gr.Textbox(label='Sentiment')

interface = gr.Interface(fn=predict_sentiment, inputs=inp, outputs=out,
                         title=title,
                         examples=[[text_]]).launch()

import torch
import numpy as np
from datasets import load_dataset
from console_logging.console import Console
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          TrainingArguments, Trainer, EvalPrediction)

console = Console()

dataset = load_dataset("zeroshot/twitter-financial-news-sentiment")
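
# The dataset ships with "train" and "validation" splits; each row has a "text"
# column and an integer "label" (0 = Bearish, 1 = Bullish, 2 = Neutral), the
# same mapping used by the one-hot encoding below.
print(dataset)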


# Only the tokenizer is needed for preprocessing; the model itself is
# instantiated further below with the multi-label classification head.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Human-readable label names, in the same order as the dataset's integer ids
labels = ["Bearish", "Bullish", "Neutral"]

def preprocess_data(examples):
  # take a batch of texts
  text = examples["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # turn the integer class ids into a one-hot matrix of shape
  # (batch_size, num_labels), as expected for multi-label classification
  labels_matrix = np.zeros((len(text), len(labels)))
  for i, label_id in enumerate(examples["label"]):
    labels_matrix[i, label_id] = 1.0

  encoding["labels"] = labels_matrix.tolist()

  return encoding

encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

encoded_dataset.set_format("torch")
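
# Quick sanity check (optional): decode one processed example and confirm the
# multi-hot label vector lines up with the original integer label.
example = encoded_dataset['train'][0]
print(tokenizer.decode(example['input_ids']))
print(example['labels'])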

id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)
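
# With problem_type="multi_label_classification", the model's forward pass
# uses BCEWithLogitsLoss rather than the default cross-entropy, which is why
# the "labels" column above holds float multi-hot vectors instead of class ids.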

batch_size = 8
metric_name = "f1"

args = TrainingArguments(
    "bert-finetuned-twitter-financial-sentiment",  # output directory
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)
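
# Note: load_best_model_at_end requires evaluation_strategy and save_strategy
# to match (both "epoch" here), so the checkpoint with the best F1 as computed
# by compute_metrics below can be restored after training.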

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics
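
# Tiny worked example (illustrative only, with made-up logits): the 0.5
# sigmoid threshold turns each row of scores into a multi-hot prediction.
demo_logits = np.array([[2.0, -1.0, -3.0], [-2.0, 0.5, 1.0]])
demo_labels = np.array([[1, 0, 0], [0, 0, 1]])
print(multi_label_metrics(demo_logits, demo_labels))
# row 1 is predicted [1, 0, 0] (correct); row 2 [0, 1, 1] has one false positive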

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result


trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

trainer.evaluate()
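
# After training, the fine-tuned model can score unseen text directly. A
# minimal sketch, reusing the sigmoid threshold from multi_label_metrics above:
text = "Bitcoin to the moon"
encoding = tokenizer(text, return_tensors="pt").to(model.device)
with torch.no_grad():
    logits = model(**encoding).logits
probs = torch.sigmoid(logits.squeeze())
predicted = [id2label[i] for i, p in enumerate(probs) if p >= 0.5]
print(predicted)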