File size: 5,506 Bytes
058f1d9
 
 
 
 
 
 
 
 
 
eedbf0c
1e9eb0b
058f1d9
 
 
1e9eb0b
 
 
 
 
 
058f1d9
 
 
 
 
1e9eb0b
 
 
 
 
 
 
058f1d9
 
 
 
 
 
eedbf0c
058f1d9
 
 
 
 
 
1e9eb0b
 
 
058f1d9
 
 
 
 
 
1e9eb0b
 
 
 
 
 
 
058f1d9
 
 
1e9eb0b
 
 
 
 
 
 
 
 
 
 
 
058f1d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e9eb0b
 
 
 
 
 
 
 
 
058f1d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e9eb0b
 
 
 
 
 
 
 
 
 
058f1d9
 
 
 
 
 
 
 
 
 
 
1e9eb0b
 
 
 
 
 
 
 
 
 
 
058f1d9
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
from uvicorn.config import logger
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
import mlflow
import os
import time
from scipy.special import softmax

# HuggingFace model checkpoint used for inference (fine-tuned sentiment classifier).
MODEL = "gpicciuca/sentiment_trainer"
# Alternative public baseline with the same label set, kept for quick comparison:
# MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

class InferenceTask:
	"""Sentiment inference wrapper around a HuggingFace sequence-classification model.

	Loads the tokenizer, model configuration and classification model once,
	then exposes ``predict()``, which returns one dictionary per input
	message mapping the labels 'positive', 'neutral' and 'negative' to
	their softmax probabilities.
	"""

	def __init__(self):
		# Start from a clean state, then eagerly load the model so the
		# instance is ready for inference right after construction.
		self.clear()
		self.load_model()

	def load_model(self) -> bool:
		"""Load the tokenizer, model configuration and classification model.

		Returns:
			bool: True if loading succeeded, False otherwise.
		"""
		try:
			self.__tokenizer = AutoTokenizer.from_pretrained(MODEL)
			self.__config = AutoConfig.from_pretrained(MODEL)
			self.__model = AutoModelForSequenceClassification.from_pretrained(MODEL)
			self.__is_loaded = True
		except Exception as ex:
			# Best-effort: log and fall back to the unloaded state so a
			# later predict() call can retry the load.
			logger.error(f"Failed to load inference model: {ex}")
			self.clear()
			return False

		return True

	def clear(self):
		"""Reset the instance to its unloaded state."""
		self.__is_loaded = False
		self.__tokenizer = None
		self.__config = None
		self.__model = None

	def is_loaded(self) -> bool:
		"""Return True if a model has been loaded and inference is possible."""
		return self.__is_loaded

	def predict(self, messages: list[str]):
		"""Classify the sentiment of each message.

		Each inference run is logged in MLflow under the experiment
		'Sentiment Analysis'. For efficiency, only the average over the
		whole bulk request is logged as metrics.

		Args:
			messages (list[str]): Messages to classify.

		Returns:
			list[dict] | None: One dict per message with probabilities for
			'positive', 'neutral' and 'negative'; None when the input is
			empty or the model could not be loaded.
		"""
		if not messages:
			return None

		# Retry the load lazily if construction-time loading failed.
		if not self.is_loaded() and not self.load_model():
			return None

		# NOTE(review): raises KeyError if MLFLOW_ENDPOINT is unset —
		# presumably guaranteed by deployment config; confirm.
		mlflow.set_tracking_uri(os.environ["MLFLOW_ENDPOINT"])
		mlflow.set_experiment("Sentiment Analysis")

		with mlflow.start_run():
			preprocessed_messages = self.__preprocess(messages)
			labelized_scores = []

			for message in preprocessed_messages:
				encoded_input = self.__tokenizer(message, return_tensors='pt', padding="longest")
				output = self.__model(**encoded_input)
				# Logits of the single sequence -> probabilities -> labelled dict.
				scores = softmax(output[0][0].detach().numpy())
				labelized_scores.append(self.__labelize(scores))

			mean_sentiment = self.__calculate_mean_sentiment(labelized_scores)
			mean_sentiment["samples"] = len(labelized_scores)
			logger.info(mean_sentiment)

			mlflow.log_metrics(mean_sentiment, step=int(time.time()))

			return labelized_scores

	def __calculate_mean_sentiment(self, labelized_scores: list):
		"""Average the sentiment scores over a list of classified messages.

		Args:
			labelized_scores (list): Labelled score dicts from the prediction step.

		Returns:
			dict: Average values for 'positive', 'neutral' and 'negative'
			(all zeros for an empty input).
		"""
		mean_sentiment = {
			"positive": 0.0,
			"neutral": 0.0,
			"negative": 0.0,
		}

		# Guard against division by zero on an empty batch.
		if not labelized_scores:
			return mean_sentiment

		total_samples = float(len(labelized_scores))
		for label in mean_sentiment:
			mean_sentiment[label] = sum(s[label] for s in labelized_scores) / total_samples

		return mean_sentiment

	def __preprocess(self, messages: list[str]):
		"""Mask user tags and links, which carry no sentiment signal.

		'@'-mentions are replaced with '@user' and tokens starting with
		'http' with 'http', mirroring the preprocessing of the training data.

		Args:
			messages (list[str]): Messages to preprocess.

		Returns:
			list[str]: Processed messages without user tags and links.
		"""
		msg_list = []
		for message in messages:
			tokens = []
			for token in message.split(" "):
				if token.startswith('@') and len(token) > 1:
					token = '@user'
				elif token.startswith('http'):
					token = 'http'
				tokens.append(token)
			msg_list.append(" ".join(tokens))
		return msg_list

	def __labelize(self, scores):
		"""Map the model's numeric class scores to human-readable labels.

		Args:
			scores: 1-D array of softmax probabilities for one message.

		Returns:
			dict: Label -> probability, using the model config's id2label
			mapping, ordered from highest to lowest score.
		"""
		# argsort ascending, reversed -> indices from best to worst class.
		ranking = np.argsort(scores)[::-1]
		return {
			self.__config.id2label[idx]: float(scores[idx])
			for idx in ranking
		}

# Preload a module-level singleton so inference can be executed immediately
# when requested. NOTE(review): this triggers the model download/load at
# import time — confirm that is acceptable for every entry point importing
# this module.
infer_task = InferenceTask()