#!pip install -q transformers datasets torch gradio console_logging numpy scikit-learn
import gradio as gr
from transformers import pipeline

title = 'Text market sentiment'
text_ = "Bitcoin to the moon"

# Serve the published checkpoint from the Hub (assumes the fine-tuned model
# lives at the repo id "Berbex/Model").
classifier = pipeline("text-classification", model="Berbex/Model")

def predict(text):
    # The pipeline returns [{'label': ..., 'score': ...}] for a single input.
    result = classifier(text)[0]
    return f"{result['label']} ({result['score']:.2f})"

inp = gr.Textbox(label='Tweet', placeholder="Enter a financial tweet")
out = gr.Textbox(label='Output')
interface = gr.Interface(fn=predict, inputs=inp, outputs=out,
                         title=title, examples=[[text_]])
# The interface is launched at the end of the file, after training.
import torch
import numpy as np
from datasets import load_dataset
from console_logging.console import Console
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          TrainingArguments, Trainer, EvalPrediction)
console = Console()
dataset = load_dataset("zeroshot/twitter-financial-news-sentiment")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
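# The same "bert-base-uncased" checkpoint supplies both the WordPiece
# tokenizer here and, below, the encoder that is fine-tuned for classification.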
# The dataset stores one integer class id per tweet:
# 0 = Bearish, 1 = Bullish, 2 = Neutral.
labels = ["Bearish", "Bullish", "Neutral"]
def preprocess_data(examples):
    # Take a batch of texts and encode them.
    text = examples["text"]
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
    # Turn each integer class id into a one-hot float vector of shape
    # (batch_size, num_labels), as expected for multi-label training.
    labels_matrix = np.zeros((len(text), len(labels)))
    for i, label_id in enumerate(examples["label"]):
        labels_matrix[i, label_id] = 1.0
    encoding["labels"] = labels_matrix.tolist()
    return encoding
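# Illustrative check (hypothetical values): a batch like
#   {"text": ["Stocks rally"], "label": [1]}
# comes back with input_ids/attention_mask plus labels == [[0.0, 1.0, 0.0]],
# i.e. a one-hot vector in the order ["Bearish", "Bullish", "Neutral"].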
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset.set_format("torch")
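# set_format("torch") makes the dataset return PyTorch tensors, which is
# what the Trainer's default data collator expects.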
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
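# These mappings are stored in the model config so that pipeline outputs
# use readable label names instead of "LABEL_0"/"LABEL_1"/"LABEL_2".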
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
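# With problem_type="multi_label_classification", the model trains with
# BCEWithLogitsLoss, scoring each label independently through a sigmoid
# rather than a softmax over the three classes.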
batch_size = 8
metric_name = "f1"
args = TrainingArguments(
    "bert-finetuned-sem_eval-english",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)
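# load_best_model_at_end requires the eval and save strategies to match;
# with both set to "epoch", the checkpoint with the best micro-F1 is
# restored after training.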
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # Apply a sigmoid to the raw logits of shape (batch_size, num_labels).
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # Threshold the probabilities to get binary predictions per label.
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # Compute micro-averaged metrics against the one-hot ground truth.
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}
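# Illustrative example (hypothetical logits): for a single tweet with logits
# [2.0, -1.0, -3.0], the sigmoid gives roughly [0.88, 0.27, 0.05], so with
# threshold=0.5 only "Bearish" is predicted.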
def compute_metrics(p: EvalPrediction):
    # p.predictions is a tuple when the model returns extra outputs.
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    return multi_label_metrics(predictions=preds, labels=p.label_ids)
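# The Trainer calls compute_metrics after each evaluation pass with an
# EvalPrediction holding the stacked logits (predictions) and the one-hot
# label matrix (label_ids).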
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.evaluate()

# Launch the demo defined at the top of the file once training is done;
# launching it earlier would block and the training code would never run.
interface.launch()