Spaces:
Runtime error
Runtime error
File size: 4,777 Bytes
55b44b5 33649cd 18ae97f ce2b636 877a87b ce2b636 efb2125 ce2b636 c3b69c6 33649cd c3b69c6 efb2125 877a87b 33649cd 8a75de9 709816c 7d33fb7 9c89421 7d33fb7 71d11a0 9429d1c 71d11a0 9429d1c 6fd5709 9429d1c 44aeeb4 9429d1c 624a2c3 71d11a0 624a2c3 9429d1c 624a2c3 9429d1c 624a2c3 9429d1c 624a2c3 1b13b28 efb2125 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
#!pip install -q transformers datasets torch gradio console_logging numpy
import torch
from datasets import load_dataset
from console_logging.console import Console
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
import gradio as gr
# REMOVE THIS IN COLAB
# Demo section: loads a saved model from disk and attempts to build and launch
# a Gradio UI before any of the training code further down is reached.
title = 'Text market sentiment'
text_ = "Bitcoin to the moon"
# NOTE(review): nothing in this section reads `model`, and the name is rebound
# further down by AutoModelForSequenceClassification.from_pretrained -- confirm
# whether the UI callback was meant to use it.
# NOTE(review): torch.load unpickles the file; presumably ./model.pt is a
# trusted artifact -- verify.
model = torch.load("./model.pt")
# NOTE(review): these two inputs ask for an API key and an audio file URL,
# which does not match the 'Text market sentiment' title -- looks copied from
# another app; confirm the intended inputs.
inp = [gr.Textbox(label='API Key', placeholder="What is your API Key?"), gr.Textbox(label='Audio File URL', placeholder="Audio file URL?")]
out = gr.Textbox(label='Output')
# NOTE(review): the Button is created outside any `with gr.Blocks():` context;
# check against the Gradio docs whether `.click()` is allowed here.
text_button = gr.Button("Flip")
# NOTE(review): `audio_to_text` is neither defined nor imported anywhere in the
# visible source, so this line is expected to raise NameError.
text_button.click(audio_to_text, inputs=inp, outputs=out)
# NOTE(review): gr.Interface.load is documented for loading a hosted
# model/Space by name; a UI built from components normally uses
# gr.Interface(fn=..., inputs=..., outputs=...). No `fn` is passed and the
# keywords are spelled `input`/`output` -- verify against the Gradio docs.
# `interface` is bound to whatever `.launch()` returns, not to the Interface.
interface = gr.Interface.load(input=inp,output=out,
title = title,
theme = "peach",
# A single example row holding one value, while `inp` lists two components.
examples = [[text_]]).launch()
###############
# Training section: shared objects used by the preprocessing and training
# code below.
console = Console()
dataset = load_dataset("zeroshot/twitter-financial-news-sentiment")
# The classifier itself is instantiated further down, once the label mappings
# exist; loading a default-head copy here would only be discarded unused.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#labels = [label for label in dataset['train'].features.keys() if label not in ['text']]
# Column order of the one-hot targets. preprocess_data maps integer label 0 to
# "Bearish", 1 to "Bullish" and every other value to "Neutral".
labels = ["Bearish", "Bullish", "Neutral"]
def preprocess_data(examples):
    """Tokenise a batch of texts and attach one-hot sentiment targets.

    Args:
        examples: Batch mapping indexed by ``"text"`` (the inputs handed to
            the tokenizer) and ``"label"`` (one integer per example).

    Returns:
        The tokenizer output with an extra ``"labels"`` entry: one row per
        example and one float column per name in the module-level ``labels``.
        Label 0 marks "Bearish", label 1 marks "Bullish", and any other
        value marks "Neutral".
    """
    texts = examples["text"]
    # Pad/truncate every text to exactly 128 tokens.
    encoding = tokenizer(texts, padding="max_length", truncation=True, max_length=128)

    raw_labels = examples["label"]
    # Per-class membership flags, one list entry per example.
    membership = {
        "Bearish": [value == 0 for value in raw_labels],
        "Bullish": [value == 1 for value in raw_labels],
        "Neutral": [not (value == 0 or value == 1) for value in raw_labels],
    }

    # Matrix of shape (batch_size, num_labels); booleans are stored as 0.0/1.0.
    targets = np.zeros((len(texts), len(labels)))
    for column, name in enumerate(labels):
        targets[:, column] = membership[name]

    encoding["labels"] = targets.tolist()
    return encoding
# Run preprocess_data over every split in batches, dropping the original
# columns so only the tokenizer output and the "labels" matrix remain.
encoded_dataset = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=dataset['train'].column_names,
)
encoded_dataset.set_format("torch")

# Index <-> name lookups handed to the model configuration.
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}

# BERT with a classification head sized to the label list, configured for
# the multi-label problem type.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
# Training hyper-parameters.
batch_size = 8
# Key in the compute_metrics result used to pick the best checkpoint.
metric_name = "f1"

args = TrainingArguments(
    output_dir="bert-finetuned-sem_eval-english",
    # Evaluate and checkpoint on the same schedule (once per epoch) so the
    # best checkpoint can be restored when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #push_to_hub=True,
)
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, micro-averaged ROC AUC and accuracy.

    Args:
        predictions: Raw model outputs (logits) of shape
            (batch_size, num_labels).
        labels: Ground-truth indicator targets, passed to the scikit-learn
            metrics unchanged.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict with the keys ``'f1'``, ``'roc_auc'`` and ``'accuracy'``.
    """
    # Sigmoid turns each logit into an independent per-label probability.
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()
    # Hard 0/1 decisions for the threshold-based metrics.
    y_pred = (probs >= threshold).astype(float)
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC ranks by score, so it needs the probabilities rather than the
    # thresholded decisions; feeding 0/1 values collapses the curve to a
    # single operating point.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}
def compute_metrics(p: EvalPrediction):
    """Compute the multi-label metrics for one evaluation pass.

    Args:
        p: Evaluation output exposing ``predictions`` and ``label_ids``.

    Returns:
        The metrics dict produced by ``multi_label_metrics``.
    """
    logits = p.predictions
    # When the predictions come as a tuple, only its first element is scored.
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)
# Wire the model, training arguments, encoded splits and metric callback
# into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, then run a final evaluation on the validation split.
trainer.train()
trainer.evaluate()
|