svenwey committed on
Commit
dd8881c
·
1 Parent(s): 47362f6

make regex patterns constants

Browse files
Files changed (1) hide show
  1. logmetric.py +12 -15
logmetric.py CHANGED
@@ -20,7 +20,13 @@ import dateutil.parser
20
  import numpy as np
21
  from typing import List, Dict, Any
22
 
 
 
 
23
 
 
 
 
24
 
25
  # TODO: Add BibTeX citation
26
  _CITATION = """\
@@ -112,17 +118,8 @@ class LogMetric(evaluate.Metric):
112
 
113
 
114
  class PredRefScore:
115
- # Constant regex to get timestrings
116
- timestamp_regex = r'^\s*\[?\s*(\d{4}[-/.]\d{2}[-/.]\d{2}(?:[ T]\d{2}[:]\d{2}(?:[:]\d{2}(?:[.,]\d+)?)?(?:Z|[+-]\d{2}[:]\d{2})?)?)\s*\]?\s*'
117
- timestamp_pattern = re.compile(timestamp_regex, re.MULTILINE)
118
-
119
- int_pattern = re.compile(r'(-?\d+)')
120
- float_pattern = re.compile(r'(-?\d+\.\d+)')
121
-
122
  scores : Dict[str, float]= {}
123
 
124
- sacrebleu_metric = evaluate.load("evaluate-metric/sacrebleu")
125
-
126
  def __init__(self, prediction : str, reference: str) -> Dict[str, float]:
127
  self.reference = reference.strip(' \t\n\r')
128
  self.prediction = prediction.strip(' \t\n\r')
@@ -173,14 +170,14 @@ class PredRefScore:
173
 
174
  # Replaces numbers in a string with a placeholder
175
  def replaceNumbers(self, text : str) -> str:
176
- text = self.int_pattern.sub(r'<|INT|>', text)
177
- text = self.float_pattern.sub(r'<|FLOAT|>', text)
178
  return text
179
 
180
  # Split all log-entries in timestamps and log-messages
181
  def split_log_entry(self, pred : str, ref: str):
182
- pred_split_log = self.timestamp_pattern.split(pred)
183
- ref_split_log = self.timestamp_pattern.split(ref)
184
 
185
  # One logentry always consists of timestamp + log-message
186
  pred_timestamps, pred_logMessages = [], []
@@ -212,7 +209,7 @@ class PredRefScore:
212
  self.scores["linecount_difference_SMAPE_score"] = self.smapeScore(pred_lines_amt, ref_lines_amt)
213
 
214
  def set_sacrebleu_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
215
- sacrebleu_score = self.sacrebleu_metric.compute(predictions=pred_log_messages, references=ref_log_messages)["score"] / 100.0
216
  self.scores["linecontent_sacrebleu_score"] = sacrebleu_score
217
 
218
  def set_smape_length_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
@@ -223,7 +220,7 @@ class PredRefScore:
223
  vectorized_replaceNumbers = np.vectorize(self.replaceNumbers)
224
  cleaned_pred_logMessages = vectorized_replaceNumbers(pred_log_messages)
225
  cleaned_ref_logMessages = vectorized_replaceNumbers(ref_log_messages)
226
- sacrebleu_withoutExplicitNumbers_score = self.sacrebleu_metric.compute(predictions=cleaned_pred_logMessages, references=cleaned_ref_logMessages)["score"] / 100.0
227
  self.scores["linecontent_sacrebleu_withoutExplicitNumbers_score"] = sacrebleu_withoutExplicitNumbers_score
228
 
229
  # Get different scores regarding the content of a log-message
 
20
  import numpy as np
21
  from typing import List, Dict, Any
22
 
23
+ # Constant regex to get timestrings
24
+ timestamp_regex = r'^\s*\[?\s*(\d{4}[-/.]\d{2}[-/.]\d{2}(?:[ T]\d{2}[:]\d{2}(?:[:]\d{2}(?:[.,]\d+)?)?(?:Z|[+-]\d{2}[:]\d{2})?)?)\s*\]?\s*'
25
+ TIMESTAMP_PATTERN = re.compile(timestamp_regex, re.MULTILINE)
26
 
27
+ INT_PATTERN = re.compile(r'(-?\d+)')
28
+ FLOAT_PATTERN = re.compile(r'(-?\d+\.\d+)')
29
+ SACREBLEU_METRIC = evaluate.load("evaluate-metric/sacrebleu")
30
 
31
  # TODO: Add BibTeX citation
32
  _CITATION = """\
 
118
 
119
 
120
  class PredRefScore:
 
 
 
 
 
 
 
121
  scores : Dict[str, float]= {}
122
 
 
 
123
  def __init__(self, prediction : str, reference: str) -> Dict[str, float]:
124
  self.reference = reference.strip(' \t\n\r')
125
  self.prediction = prediction.strip(' \t\n\r')
 
170
 
171
  # Replaces numbers in a string with a placeholder
172
  def replaceNumbers(self, text : str) -> str:
173
+ text = INT_PATTERN.sub(r'<|INT|>', text)
174
+ text = FLOAT_PATTERN.sub(r'<|FLOAT|>', text)
175
  return text
176
 
177
  # Split all log-entries in timestamps and log-messages
178
  def split_log_entry(self, pred : str, ref: str):
179
+ pred_split_log = TIMESTAMP_PATTERN.split(pred)
180
+ ref_split_log = TIMESTAMP_PATTERN.split(ref)
181
 
182
  # One logentry always consists of timestamp + log-message
183
  pred_timestamps, pred_logMessages = [], []
 
209
  self.scores["linecount_difference_SMAPE_score"] = self.smapeScore(pred_lines_amt, ref_lines_amt)
210
 
211
  def set_sacrebleu_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
212
+ sacrebleu_score = SACREBLEU_METRIC.compute(predictions=pred_log_messages, references=ref_log_messages)["score"] / 100.0
213
  self.scores["linecontent_sacrebleu_score"] = sacrebleu_score
214
 
215
  def set_smape_length_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
 
220
  vectorized_replaceNumbers = np.vectorize(self.replaceNumbers)
221
  cleaned_pred_logMessages = vectorized_replaceNumbers(pred_log_messages)
222
  cleaned_ref_logMessages = vectorized_replaceNumbers(ref_log_messages)
223
+ sacrebleu_withoutExplicitNumbers_score = SACREBLEU_METRIC.compute(predictions=cleaned_pred_logMessages, references=cleaned_ref_logMessages)["score"] / 100.0
224
  self.scores["linecontent_sacrebleu_withoutExplicitNumbers_score"] = sacrebleu_withoutExplicitNumbers_score
225
 
226
  # Get different scores regarding the content of a log-message