from uvicorn.config import logger
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
import mlflow
import os
import time
from scipy.special import softmax

# HuggingFace model to be used for inference
MODEL = "gpicciuca/sentiment_trainer"
# MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"


class InferenceTask:
    """
    This class encapsulates the entire inference logic using HuggingFace's Transformers library.
    It offers a convenient "predict()" method that returns a list of dictionaries, where each
    dictionary contains the sentiment analysis for one of the evaluated messages.
    """

    def __init__(self):
        self.clear()
        self.load_model()

    def load_model(self):
        """
        Loads the classification model, its configuration and the tokenizer required for
        pre-processing any text that is passed in for inference later on.

        Returns:
            bool: True if loading succeeded, False otherwise
        """
        try:
            self.__tokenizer = AutoTokenizer.from_pretrained(MODEL)
            self.__config = AutoConfig.from_pretrained(MODEL)
            self.__model = AutoModelForSequenceClassification.from_pretrained(MODEL)
            self.__is_loaded = True
        except Exception as ex:
            logger.error(f"Failed to load inference model: {ex}")
            self.clear()
            return False
        return True

    def clear(self):
        """
        Resets the state of this instance.
        """
        self.__is_loaded = False
        self.__tokenizer = None
        self.__config = None
        self.__model = None

    def is_loaded(self):
        """
        Checks whether this instance is ready to be used, i.e. whether a model has been loaded.

        Returns:
            bool: True if a model was loaded, False otherwise
        """
        return self.__is_loaded

    def predict(self, messages: list[str]):
        """
        Performs sentiment classification on a list of messages.
        Each inference run is logged in MLflow under the experiment 'Sentiment Analysis'.
        For efficiency, only the average over the whole bulk request is logged.

        Args:
            messages (list[str]): List of messages to classify

        Returns:
            list[dict]: A list of dictionaries where each element contains the probabilities
                        for 'positive', 'neutral' and 'negative' sentiment.
        """
        if len(messages) == 0:
            return None

        if not self.is_loaded() and not self.load_model():
            return None

        mlflow.set_tracking_uri(os.environ["MLFLOW_ENDPOINT"])
        mlflow.set_experiment("Sentiment Analysis")

        with mlflow.start_run() as run:
            preprocessed_messages = self.__preprocess(messages)

            labelized_scores = []
            for message in preprocessed_messages:
                encoded_input = self.__tokenizer(message, return_tensors='pt', padding="longest")
                output = self.__model(**encoded_input)
                scores = output[0][0].detach().numpy()
                scores = softmax(scores)
                scores = self.__labelize(scores)
                labelized_scores.append(scores)

            mean_sentiment = self.__calculate_mean_sentiment(labelized_scores)
            mean_sentiment["samples"] = len(labelized_scores)

            logger.info(mean_sentiment)
            mlflow.log_metrics(mean_sentiment, step=int(time.time()))

        return labelized_scores

    def __calculate_mean_sentiment(self, labelized_scores: list):
        """
        Calculates the average sentiment over a list of classified messages.

        Args:
            labelized_scores (list): List of labelled scores resulting from the prediction step.

        Returns:
            dict: Dictionary with average values for 'positive', 'neutral' and 'negative'.
        """
        total_samples = float(len(labelized_scores))

        mean_sentiment = {
            "positive": 0.0,
            "neutral": 0.0,
            "negative": 0.0,
        }

        for score in labelized_scores:
            mean_sentiment["positive"] += score["positive"]
            mean_sentiment["neutral"] += score["neutral"]
            mean_sentiment["negative"] += score["negative"]

        mean_sentiment["positive"] /= total_samples
        mean_sentiment["neutral"] /= total_samples
        mean_sentiment["negative"] /= total_samples

        return mean_sentiment

    def __preprocess(self, messages: list[str]):
        """
        Preprocesses the messages to neutralize patterns that are not relevant for inference:
        user tags are replaced with '@user' and http links with 'http'.

        Args:
            messages (list[str]): List of messages to preprocess

        Returns:
            list[str]: List of processed messages with user tags and links masked out
        """
        msg_list = []
        for message in messages:
            new_message = []
            for t in message.split(" "):
                t = '@user' if t.startswith('@') and len(t) > 1 else t
                t = 'http' if t.startswith('http') else t
                new_message.append(t)
            msg_list.append(" ".join(new_message))
        return msg_list

    def __labelize(self, scores):
        """
        Helper method that maps the numeric class indices returned by the classifier back to
        their human-readable labels, using the model's configuration.

        Args:
            scores: Prediction scores for one individual message

        Returns:
            dict: Dictionary containing the sentiment prediction with human-readable labels
        """
        output = {}
        ranking = np.argsort(scores)
        ranking = ranking[::-1]
        for i in range(scores.shape[0]):
            l = self.__config.id2label[ranking[i]]
            s = float(scores[ranking[i]])
            output[l] = s
        return output

# Preload a global instance so that inference can be
# executed immediately when requested
infer_task = InferenceTask()
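

# Illustrative usage sketch: exercises the preloaded instance locally. It assumes the
# MLFLOW_ENDPOINT environment variable points at a reachable MLflow tracking server;
# the sample messages below are made up for demonstration purposes only.
if __name__ == "__main__":
    sample_messages = [
        "I absolutely love this product, it works great!",
        "It's okay, nothing special.",
        "Worst purchase I have ever made.",
    ]
    results = infer_task.predict(sample_messages)
    if results is not None:
        for message, sentiment in zip(sample_messages, results):
            # Each result maps 'positive', 'neutral' and 'negative' to their probabilities.
            logger.info(f"{message} -> {sentiment}")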