svenwey committed on
Commit
47362f6
·
1 Parent(s): a50fad5

refactor code into smaller, modular function components

Browse files
Files changed (1) hide show
  1. logmetric.py +119 -195
logmetric.py CHANGED
@@ -18,10 +18,8 @@ import datasets
18
  import re
19
  import dateutil.parser
20
  import numpy as np
21
- from difflib import SequenceMatcher
22
- import sacrebleu
23
 
24
- import time
25
 
26
 
27
  # TODO: Add BibTeX citation
@@ -68,19 +66,6 @@ BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
68
  class LogMetric(evaluate.Metric):
69
  """TODO: Short description of my evaluation module."""
70
 
71
- # Constant regex to get timestrings
72
- timestamp_regex = r'^\s*\[?\s*(\d{4}[-/.]\d{2}[-/.]\d{2}(?:[ T]\d{2}[:]\d{2}(?:[:]\d{2}(?:[.,]\d+)?)?(?:Z|[+-]\d{2}[:]\d{2})?)?)\s*\]?\s*'
73
- timestamp_pattern = re.compile(timestamp_regex, re.MULTILINE)
74
-
75
- int_regex = r'(-?\d+)'
76
- int_pattern = re.compile(int_regex)
77
-
78
- float_regex = r'(-?\d+\.\d+)'
79
- float_pattern = re.compile(float_regex)
80
-
81
- sacrebleu_metric = evaluate.load("evaluate-metric/sacrebleu")
82
-
83
-
84
  def _info(self):
85
  # TODO: Specifies the evaluate.EvaluationModuleInfo object
86
  return evaluate.MetricInfo(
@@ -107,25 +92,59 @@ class LogMetric(evaluate.Metric):
107
  # TODO: Download external resources if needed
108
  pass
109
 
110
- # Jaccard Similarity to measure closeness of two log-messages
111
- def get_jaccard_similarity(self, set1, set2):
112
- intersection = set1.intersection(set2)
113
- union = set1.union(set2)
114
- if (len(union) == 0):
115
- return 1.0
 
 
 
 
 
 
 
 
 
116
 
117
- return len(intersection) / len(union)
118
 
119
- # A score depending on the difference in length of two sentences
120
- def get_length_score(self, preds_split, refs_split):
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  pred_content_lengths = np.vectorize(len)(preds_split)
123
  ref_content_lengths = np.vectorize(len)(refs_split)
124
 
125
  return self.smapeScore(pred_content_lengths, ref_content_lengths)
126
 
127
  # helper function that computes the smape_score either between two numbers or two lists of numbers (must be the same length)
128
- def smapeScore(self, P, R):
129
  P_isnumber = isinstance(P, (int, float))
130
  R_isnumber = isinstance(R, (int, float))
131
 
@@ -136,10 +155,12 @@ class LogMetric(evaluate.Metric):
136
  assert(len(P) == len(R))
137
 
138
  if P_isnumber and R_isnumber:
139
- if P == 0 and R == 0: return 1.0 # since this leads to (|R| + |P|) = 0
 
140
  return 1 - (np.sum(np.abs(R - P) / (np.abs(R) + np.abs(P)))) # (n = 1)
141
  else:
142
- if len(P) == 0 and len(R) == 0: return 1.0 # since this leads to n = 0
 
143
  n = len(P)
144
  P = np.array(P)
145
  R = np.array(R)
@@ -150,157 +171,27 @@ class LogMetric(evaluate.Metric):
150
 
151
  return 1 - (1.0/n * np.sum(np.abs(R - P) / denominator))
152
 
153
- # splits both strings at \n and then computes the smape_score of their lengths
154
- def getLineCountScore(self, pred, ref):
155
- pred_lines_amt = len(pred.splitlines())
156
- ref_lines_amt = len(ref.splitlines())
157
-
158
- # print("#pred_lines:", pred_lines_amt)
159
- # print("#ref_lines:", ref_lines_amt)
160
-
161
- return self.smapeScore(pred_lines_amt, ref_lines_amt)
162
-
163
- def replaceNumbers(self, text:str):
164
  text = self.int_pattern.sub(r'<|INT|>', text)
165
  text = self.float_pattern.sub(r'<|FLOAT|>', text)
166
  return text
167
 
168
- # Get differenct scores regarding the content of a log-message
169
- def getLineContentScore(self, pred_logMessages, ref_logMessages):
170
- if pred_logMessages == [] and ref_logMessages == []:
171
- pred_logMessages = [""]
172
- ref_logMessages = [""]
173
- sacrebleu_score = self.sacrebleu_metric.compute(predictions=pred_logMessages, references=ref_logMessages)["score"] / 100.0
174
-
175
- smape_length_score = self.get_length_score(pred_logMessages, ref_logMessages)
176
-
177
- vectorized_replaceNumbers = np.vectorize(self.replaceNumbers)
178
-
179
- cleaned_pred_logMessages = vectorized_replaceNumbers(pred_logMessages)
180
- cleaned_ref_logMessages = vectorized_replaceNumbers(ref_logMessages)
181
-
182
- sacrebleu_withoutExplicitNumbers_score = self.sacrebleu_metric.compute(predictions=cleaned_pred_logMessages, references=cleaned_ref_logMessages)["score"] / 100.0
183
-
184
-
185
- return sacrebleu_score, sacrebleu_withoutExplicitNumbers_score, smape_length_score
186
-
187
- # get different scores regarding the timestamp
188
- def getTimestampsScore(self, pred_timestamps, ref_timestamps):
189
- timestamp_amt_score = self.smapeScore(len(pred_timestamps), len(ref_timestamps))
190
-
191
- if (len(pred_timestamps) == 0) and (len(ref_timestamps) == 0):
192
- return timestamp_amt_score, 1.0, 1.0, 1.0
193
-
194
- # if there are no predicted timestamps, return early. It is still consistent and monotonic.
195
- if (len(pred_timestamps) == 0) and (len(ref_timestamps) != 0):
196
- return timestamp_amt_score, 1.0, 1.0, 0.0
197
-
198
- # replace all digits in the reference timestamp (first timestamp) with '/d' to get
199
- # a regex that describes the format
200
- pred_timestring_pattern = re.sub(r'\d', r'\\d', re.escape(pred_timestamps[0]))
201
-
202
- matchesPatternScore = 1.0
203
- monotonicallyIncreasingScore = 1.0
204
- pred_timedeltas = []
205
-
206
- # A variable to save the previous timestamp (as datetime obj) to check monotonicity
207
- prev_datetime = None
208
- # Convert matches to datetime objects
209
-
210
- for i in range(len(pred_timestamps)):
211
- ts = pred_timestamps[i]
212
- try:
213
- # Check if the format matches with the format of the first timestamp
214
- # TODO!! Check this later, maybe it is too restricting for training a llm
215
- matchesPattern = re.fullmatch(pred_timestring_pattern, ts) is not None
216
- # Check if the timestamps are monotonically increasing
217
- cur_datetime = dateutil.parser.parse(ts)
218
- if prev_datetime == None:
219
- monotonicallyIncreasing = True
220
- else:
221
- monotonicallyIncreasing = prev_datetime <= cur_datetime
222
- pred_timedeltas.append((cur_datetime - prev_datetime).total_seconds())
223
-
224
- prev_datetime = cur_datetime
225
-
226
- # If one entry doesn't fulfill the matching pattern property or the monotinicity property, set to 0 for whole log
227
- matchesPatternScore = 0.0 if (not matchesPattern) else matchesPatternScore
228
- monotonicallyIncreasingScore = 0.0 if (not monotonicallyIncreasing) else monotonicallyIncreasingScore
229
-
230
-
231
- except Exception as e:
232
- # e.g. date format not parsable by dateutil.parser
233
- matchesPatternScore = 0.0
234
- monotonicallyIncreasingScore = 0.0
235
- pred_timedeltas.append(-1)
236
-
237
-
238
- if (len(pred_timestamps) != 0) and (len(ref_timestamps) == 0):
239
- return timestamp_amt_score, matchesPatternScore, monotonicallyIncreasingScore, 0.0
240
-
241
-
242
- ref_timedeltas = []
243
- prev_datetime = None
244
- for i in range(len(ref_timestamps)):
245
- ts = ref_timestamps[i]
246
- try:
247
- cur_datetime = dateutil.parser.parse(ts)
248
- if prev_datetime == None:
249
- pass
250
- else:
251
- ref_timedeltas.append((cur_datetime - prev_datetime).total_seconds())
252
-
253
- prev_datetime = cur_datetime
254
-
255
- except Exception as e:
256
- ref_timedeltas.append(-1)
257
-
258
- minlength = min(len(pred_timedeltas), len(ref_timedeltas))
259
-
260
- pred_timedeltas = pred_timedeltas[:minlength]
261
- ref_timedeltas = ref_timedeltas[:minlength]
262
-
263
- print("pred_timedeltas:", pred_timedeltas)
264
- print("ref_timedeltas:", ref_timedeltas)
265
-
266
-
267
- timestampDeltaScore = self.smapeScore(pred_timedeltas, ref_timedeltas)
268
-
269
- print("timestampDeltaScore:", timestampDeltaScore)
270
- # matchesPatternScore and monotonicallyIncreasingScore are in {0,1}
271
- return timestamp_amt_score, matchesPatternScore, monotonicallyIncreasingScore, timestampDeltaScore
272
-
273
-
274
-
275
- def getLogMetric(self, pred : str, ref : str):
276
- ref = ref.strip(' \t\n\r')
277
- pred = pred.strip(' \t\n\r')
278
-
279
- linecount_difference_SMAPE = self.getLineCountScore(pred, ref)
280
-
281
-
282
- # Split log on timestamps
283
  pred_split_log = self.timestamp_pattern.split(pred)
284
  ref_split_log = self.timestamp_pattern.split(ref)
285
 
286
  # One logentry always consists of timestamp + log-message
287
- # pred_logentries = []
288
- # ref_logentries = []
289
-
290
- pred_timestamps = []
291
- pred_logMessages = []
292
 
293
- ref_timestamps = []
294
- ref_logMessages = []
295
  # reorganize log into logentry-tuples, consisting of timestamp + log-message
296
  for i in range(1, len(pred_split_log), 2):
297
- # pred_logentries.append((pred_split_log[i],pred_split_log[i+1]))
298
  pred_timestamps.append(pred_split_log[i])
299
  pred_logMessages.append(pred_split_log[i+1])
300
 
301
-
302
  for i in range(1, len(ref_split_log), 2):
303
- # ref_logentries.append((ref_split_log[i],ref_split_log[i+1]))
304
  ref_timestamps.append(ref_split_log[i])
305
  ref_logMessages.append(ref_split_log[i+1])
306
 
@@ -310,44 +201,77 @@ class LogMetric(evaluate.Metric):
310
  pred_logMessages += (max_logentries - len(pred_logMessages)) * [" "]
311
  ref_logMessages += (max_logentries- len(ref_logMessages)) * [" "]
312
 
313
- linecontent_sacrebleu, linecontent_sacrebleu_withoutExplicitNumbers, linecontentlength_difference_SMAPE = self.getLineContentScore(pred_logMessages, ref_logMessages)
314
-
315
- timestamps_difference_SMAPE, timestamps_formatConsistency_absolute, timestamps_monotinicity_absolute, timestamps_delta_SMAPE = self.getTimestampsScore(pred_timestamps, ref_timestamps)
316
 
 
317
 
318
- # return weighted overall score of all the different scores
319
- return {"linecount_difference_SMAPE_score": linecount_difference_SMAPE,
320
- "linecontentlength_difference_SMAPE_score": linecontentlength_difference_SMAPE,
321
- "linecontent_sacrebleu_score": linecontent_sacrebleu,
322
- "linecontent_sacrebleu_withoutExplicitNumbers_score": linecontent_sacrebleu_withoutExplicitNumbers,
323
- "timestamps_SMAPE_difference_score": timestamps_difference_SMAPE,
324
- "timestamps_formatConsistency_score": timestamps_formatConsistency_absolute,
325
- "timestamps_monotinicity_score": timestamps_monotinicity_absolute,
326
- "timestamps_delta_SMAPE_score" : timestamps_delta_SMAPE
327
- }
 
 
 
328
 
329
- def _compute(self, predictions, references):
330
- """Returns the scores"""
 
 
 
 
331
 
332
- # TODO: get separate log entries (split before timestamps), replace timestamps with token and compare the log entry with BLEU
 
 
 
 
333
 
334
- t_before_logmetric = time.perf_counter()
335
- metric_dicts = [self.getLogMetric(p,r) for p,r in zip(predictions,references)]
336
- # Extract keys (assuming all dictionaries have the same keys)
337
- keys = metric_dicts[0].keys()
338
-
339
- # Convert list of dictionaries into a 2D numpy array
340
- values = np.array([list(d.values()) for d in metric_dicts])
 
 
 
 
 
341
 
342
- # Calculate the mean along the vertical axis (axis=0)
343
- mean_values = np.mean(values, axis=0)
344
 
345
- # a dictionary, matching the keys with their corresponding mean values
346
- metric_result = dict(zip(keys, mean_values))
347
 
348
- t_after_logmetric = time.perf_counter()
349
- logmetric_duration = f"{t_after_logmetric - t_before_logmetric:0.10f}"
 
 
 
 
350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
- return metric_result
353
-
 
18
  import re
19
  import dateutil.parser
20
  import numpy as np
21
+ from typing import List, Dict, Any
 
22
 
 
23
 
24
 
25
  # TODO: Add BibTeX citation
 
66
  class LogMetric(evaluate.Metric):
67
  """TODO: Short description of my evaluation module."""
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def _info(self):
70
  # TODO: Specifies the evaluate.EvaluationModuleInfo object
71
  return evaluate.MetricInfo(
 
92
  # TODO: Download external resources if needed
93
  pass
94
 
95
+
96
+ def _compute(self, predictions, references):
97
+ # TODO: get separate log entries (split before timestamps), replace timestamps with token and compare the log entry with BLEU
98
+ metric_dicts = [PredRefScore(p,r).run() for p,r in zip(predictions,references)]
99
+ # Extract keys (assuming all dictionaries have the same keys)
100
+ keys = metric_dicts[0].keys()
101
+
102
+ # Convert list of dictionaries into a 2D numpy array
103
+ values = np.array([list(d.values()) for d in metric_dicts])
104
+
105
+ # Calculate the mean along the vertical axis (axis=0)
106
+ mean_values = np.mean(values, axis=0)
107
+
108
+ # a dictionary, matching the keys with their corresponding mean values
109
+ metric_result = dict(zip(keys, mean_values))
110
 
111
+ return metric_result
112
 
 
 
113
 
114
+ class PredRefScore:
115
+ # Constant regex to get timestrings
116
+ timestamp_regex = r'^\s*\[?\s*(\d{4}[-/.]\d{2}[-/.]\d{2}(?:[ T]\d{2}[:]\d{2}(?:[:]\d{2}(?:[.,]\d+)?)?(?:Z|[+-]\d{2}[:]\d{2})?)?)\s*\]?\s*'
117
+ timestamp_pattern = re.compile(timestamp_regex, re.MULTILINE)
118
+
119
+ int_pattern = re.compile(r'(-?\d+)')
120
+ float_pattern = re.compile(r'(-?\d+\.\d+)')
121
+
122
+ scores : Dict[str, float]= {}
123
+
124
+ sacrebleu_metric = evaluate.load("evaluate-metric/sacrebleu")
125
+
126
+ def __init__(self, prediction : str, reference: str) -> Dict[str, float]:
127
+ self.reference = reference.strip(' \t\n\r')
128
+ self.prediction = prediction.strip(' \t\n\r')
129
+
130
+ def run(self):
131
+ self.getLogMetric()
132
+ return self.scores
133
+
134
+
135
+ ##### Convenience Methods #####
136
+
137
+ # TODO: also set pred_ts, ref_ts, pred_msgs and ref_msgs as fields
138
+
139
+ # A score depending on the difference in length of two sentences
140
+ def get_length_score(self, preds_split : List[Any], refs_split : List[Any]) -> float:
141
  pred_content_lengths = np.vectorize(len)(preds_split)
142
  ref_content_lengths = np.vectorize(len)(refs_split)
143
 
144
  return self.smapeScore(pred_content_lengths, ref_content_lengths)
145
 
146
  # helper function that computes the smape_score either between two numbers or two lists of numbers (must be the same length)
147
+ def smapeScore(self, P, R) -> float:
148
  P_isnumber = isinstance(P, (int, float))
149
  R_isnumber = isinstance(R, (int, float))
150
 
 
155
  assert(len(P) == len(R))
156
 
157
  if P_isnumber and R_isnumber:
158
+ if P == 0 and R == 0:
159
+ return 1.0 # since this leads to (|R| + |P|) = 0
160
  return 1 - (np.sum(np.abs(R - P) / (np.abs(R) + np.abs(P)))) # (n = 1)
161
  else:
162
+ if len(P) == 0 and len(R) == 0:
163
+ return 1.0 # since this leads to n = 0
164
  n = len(P)
165
  P = np.array(P)
166
  R = np.array(R)
 
171
 
172
  return 1 - (1.0/n * np.sum(np.abs(R - P) / denominator))
173
 
174
+ # Replaces numbers in a string with a placeholder
175
+ def replaceNumbers(self, text : str) -> str:
 
 
 
 
 
 
 
 
 
176
  text = self.int_pattern.sub(r'<|INT|>', text)
177
  text = self.float_pattern.sub(r'<|FLOAT|>', text)
178
  return text
179
 
180
+ # Split all log-entries in timestamps and log-messages
181
+ def split_log_entry(self, pred : str, ref: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  pred_split_log = self.timestamp_pattern.split(pred)
183
  ref_split_log = self.timestamp_pattern.split(ref)
184
 
185
  # One logentry always consists of timestamp + log-message
186
+ pred_timestamps, pred_logMessages = [], []
187
+ ref_timestamps, ref_logMessages = [], []
 
 
 
188
 
 
 
189
  # reorganize log into logentry-tuples, consisting of timestamp + log-message
190
  for i in range(1, len(pred_split_log), 2):
 
191
  pred_timestamps.append(pred_split_log[i])
192
  pred_logMessages.append(pred_split_log[i+1])
193
 
 
194
  for i in range(1, len(ref_split_log), 2):
 
195
  ref_timestamps.append(ref_split_log[i])
196
  ref_logMessages.append(ref_split_log[i+1])
197
 
 
201
  pred_logMessages += (max_logentries - len(pred_logMessages)) * [" "]
202
  ref_logMessages += (max_logentries- len(ref_logMessages)) * [" "]
203
 
204
+ return pred_timestamps, pred_logMessages, ref_timestamps, ref_logMessages
 
 
205
 
206
+ ##### Individual Setter Methods for Scores #####
207
 
208
+ # splits both strings at \n and then computes the smape_score of their lengths
209
+ def set_linecount_score(self, pred : str, ref : str) -> None:
210
+ pred_lines_amt = len(pred.splitlines())
211
+ ref_lines_amt = len(ref.splitlines())
212
+ self.scores["linecount_difference_SMAPE_score"] = self.smapeScore(pred_lines_amt, ref_lines_amt)
213
+
214
+ def set_sacrebleu_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
215
+ sacrebleu_score = self.sacrebleu_metric.compute(predictions=pred_log_messages, references=ref_log_messages)["score"] / 100.0
216
+ self.scores["linecontent_sacrebleu_score"] = sacrebleu_score
217
+
218
+ def set_smape_length_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
219
+ smape_length_score = self.get_length_score(pred_log_messages, ref_log_messages)
220
+ self.scores["linecontentlength_difference_SMAPE_score"] = smape_length_score
221
 
222
+ def set_sacrebleu_withoutexplnumbers_score(self, pred_log_messages : List[str], ref_log_messages : List[str]):
223
+ vectorized_replaceNumbers = np.vectorize(self.replaceNumbers)
224
+ cleaned_pred_logMessages = vectorized_replaceNumbers(pred_log_messages)
225
+ cleaned_ref_logMessages = vectorized_replaceNumbers(ref_log_messages)
226
+ sacrebleu_withoutExplicitNumbers_score = self.sacrebleu_metric.compute(predictions=cleaned_pred_logMessages, references=cleaned_ref_logMessages)["score"] / 100.0
227
+ self.scores["linecontent_sacrebleu_withoutExplicitNumbers_score"] = sacrebleu_withoutExplicitNumbers_score
228
 
229
+ # Get differenct scores regarding the content of a log-message
230
+ def all_linecontent_scores(self, pred_logMessages : List[str], ref_logMessages: List[str]) -> None:
231
+ if pred_logMessages == [] and ref_logMessages == []:
232
+ pred_logMessages = [""]
233
+ ref_logMessages = [""]
234
 
235
+ self.set_sacrebleu_score(pred_logMessages, ref_logMessages)
236
+ self.set_smape_length_score(pred_logMessages, ref_logMessages)
237
+ self.set_sacrebleu_withoutexplnumbers_score(pred_logMessages, ref_logMessages)
238
+
239
+ def set_timestamp_amt_score(self, pred_timestamps : List[str], ref_timestamps : List[str]):
240
+ timestamp_amt_score = self.smapeScore(len(pred_timestamps), len(ref_timestamps))
241
+ self.scores["timestamps_SMAPE_difference_score"] = timestamp_amt_score
242
+
243
+ def set_timestamp_format_consistency_score(self, pred_timestamps, ref_timestamps):
244
+ if (len(pred_timestamps) == 0):
245
+ self.scores["timestamps_formatConsistency_score"] = 1.0
246
+ return
247
 
248
+ pred_timestring_pattern = re.sub(r'\d', r'\\d', re.escape(pred_timestamps[0])).strip()
249
+ all_consistent = all(re.fullmatch(pred_timestring_pattern, ts.strip()) is not None for ts in ref_timestamps)
250
 
251
+ self.scores["timestamps_formatConsistency_score"] = 1.0 if all_consistent else 0.0
 
252
 
253
+ def set_timestamp_monotonicity_score(self, pred_timestamps) -> None:
254
+ try:
255
+ parsed_times = [dateutil.parser.parse(ts) for ts in pred_timestamps] # Parse all timestamps
256
+ except dateutil.parser.ParserError:
257
+ self.scores["timestamps_monotinicity_score"] = 0.0
258
+ return
259
 
260
+ # Check if the timestamps are monotonically increasing
261
+ all_monotone = all(t1 <= t2 for t1, t2 in zip(parsed_times, parsed_times[1:]))
262
+ self.scores["timestamps_monotinicity_score"] = 1.0 if all_monotone else 0.0
263
+
264
+ # get different scores regarding the timestamp
265
+ def all_timestamp_scores(self, pred_timestamps, ref_timestamps) -> None:
266
+ self.set_timestamp_amt_score(pred_timestamps, ref_timestamps)
267
+ self.set_timestamp_format_consistency_score(pred_timestamps, ref_timestamps)
268
+ self.set_timestamp_monotonicity_score(pred_timestamps)
269
+
270
+ # driver method for different score computations
271
+ def getLogMetric(self):
272
+ self.set_linecount_score(self.prediction, self.reference)
273
+ # Split log on timestamps
274
+ pred_timestamps, pred_logMessages, ref_timestamps, ref_logMessages = self.split_log_entry(self.prediction, self.reference)
275
+ self.all_linecontent_scores(pred_logMessages, ref_logMessages)
276
+ self.all_timestamp_scores(pred_timestamps, ref_timestamps)
277