Make regex patterns constants
Browse files- logmetric.py +12 -15
logmetric.py
CHANGED
@@ -20,7 +20,13 @@ import dateutil.parser
|
|
20 |
import numpy as np
|
21 |
from typing import List, Dict, Any
|
22 |
|
|
|
|
|
|
|
23 |
|
|
|
|
|
|
|
24 |
|
25 |
# TODO: Add BibTeX citation
|
26 |
_CITATION = """\
|
@@ -112,17 +118,8 @@ class LogMetric(evaluate.Metric):
|
|
112 |
|
113 |
|
114 |
class PredRefScore:
|
115 |
-
# Constant regex to get timestrings
|
116 |
-
timestamp_regex = r'^\s*\[?\s*(\d{4}[-/.]\d{2}[-/.]\d{2}(?:[ T]\d{2}[:]\d{2}(?:[:]\d{2}(?:[.,]\d+)?)?(?:Z|[+-]\d{2}[:]\d{2})?)?)\s*\]?\s*'
|
117 |
-
timestamp_pattern = re.compile(timestamp_regex, re.MULTILINE)
|
118 |
-
|
119 |
-
int_pattern = re.compile(r'(-?\d+)')
|
120 |
-
float_pattern = re.compile(r'(-?\d+\.\d+)')
|
121 |
-
|
122 |
scores : Dict[str, float]= {}
|
123 |
|
124 |
-
sacrebleu_metric = evaluate.load("evaluate-metric/sacrebleu")
|
125 |
-
|
126 |
def __init__(self, prediction : str, reference: str) -> Dict[str, float]:
|
127 |
self.reference = reference.strip(' \t\n\r')
|
128 |
self.prediction = prediction.strip(' \t\n\r')
|
@@ -173,14 +170,14 @@ class PredRefScore:
|
|
173 |
|
174 |
# Replaces numbers in a string with a placeholder
|
175 |
def replaceNumbers(self, text : str) -> str:
|
176 |
-
text =
|
177 |
-
text =
|
178 |
return text
|
179 |
|
180 |
# Split all log-entries in timestamps and log-messages
|
181 |
def split_log_entry(self, pred : str, ref: str):
|
182 |
-
pred_split_log =
|
183 |
-
ref_split_log =
|
184 |
|
185 |
# One logentry always consists of timestamp + log-message
|
186 |
pred_timestamps, pred_logMessages = [], []
|
@@ -212,7 +209,7 @@ class PredRefScore:
|
|
212 |
self.scores["linecount_difference_SMAPE_score"] = self.smapeScore(pred_lines_amt, ref_lines_amt)
|
213 |
|
214 |
def set_sacrebleu_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
|
215 |
-
sacrebleu_score =
|
216 |
self.scores["linecontent_sacrebleu_score"] = sacrebleu_score
|
217 |
|
218 |
def set_smape_length_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
|
@@ -223,7 +220,7 @@ class PredRefScore:
|
|
223 |
vectorized_replaceNumbers = np.vectorize(self.replaceNumbers)
|
224 |
cleaned_pred_logMessages = vectorized_replaceNumbers(pred_log_messages)
|
225 |
cleaned_ref_logMessages = vectorized_replaceNumbers(ref_log_messages)
|
226 |
-
sacrebleu_withoutExplicitNumbers_score =
|
227 |
self.scores["linecontent_sacrebleu_withoutExplicitNumbers_score"] = sacrebleu_withoutExplicitNumbers_score
|
228 |
|
229 |
# Get different scores regarding the content of a log-message
|
|
|
20 |
import numpy as np
|
21 |
from typing import List, Dict, Any
|
22 |
|
23 |
+
# Constant regex to get timestrings
|
24 |
+
timestamp_regex = r'^\s*\[?\s*(\d{4}[-/.]\d{2}[-/.]\d{2}(?:[ T]\d{2}[:]\d{2}(?:[:]\d{2}(?:[.,]\d+)?)?(?:Z|[+-]\d{2}[:]\d{2})?)?)\s*\]?\s*'
|
25 |
+
TIMESTAMP_PATTERN = re.compile(timestamp_regex, re.MULTILINE)
|
26 |
|
27 |
+
INT_PATTERN = re.compile(r'(-?\d+)')
|
28 |
+
FLOAT_PATTERN = re.compile(r'(-?\d+\.\d+)')
|
29 |
+
SACREBLEU_METRIC = evaluate.load("evaluate-metric/sacrebleu")
|
30 |
|
31 |
# TODO: Add BibTeX citation
|
32 |
_CITATION = """\
|
|
|
118 |
|
119 |
|
120 |
class PredRefScore:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
scores : Dict[str, float]= {}
|
122 |
|
|
|
|
|
123 |
def __init__(self, prediction : str, reference: str) -> Dict[str, float]:
|
124 |
self.reference = reference.strip(' \t\n\r')
|
125 |
self.prediction = prediction.strip(' \t\n\r')
|
|
|
170 |
|
171 |
# Replaces numbers in a string with a placeholder
|
172 |
def replaceNumbers(self, text : str) -> str:
|
173 |
+
text = INT_PATTERN.sub(r'<|INT|>', text)
|
174 |
+
text = FLOAT_PATTERN.sub(r'<|FLOAT|>', text)
|
175 |
return text
|
176 |
|
177 |
# Split all log-entries in timestamps and log-messages
|
178 |
def split_log_entry(self, pred : str, ref: str):
|
179 |
+
pred_split_log = TIMESTAMP_PATTERN.split(pred)
|
180 |
+
ref_split_log = TIMESTAMP_PATTERN.split(ref)
|
181 |
|
182 |
# One logentry always consists of timestamp + log-message
|
183 |
pred_timestamps, pred_logMessages = [], []
|
|
|
209 |
self.scores["linecount_difference_SMAPE_score"] = self.smapeScore(pred_lines_amt, ref_lines_amt)
|
210 |
|
211 |
def set_sacrebleu_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
|
212 |
+
sacrebleu_score = SACREBLEU_METRIC.compute(predictions=pred_log_messages, references=ref_log_messages)["score"] / 100.0
|
213 |
self.scores["linecontent_sacrebleu_score"] = sacrebleu_score
|
214 |
|
215 |
def set_smape_length_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
|
|
|
220 |
vectorized_replaceNumbers = np.vectorize(self.replaceNumbers)
|
221 |
cleaned_pred_logMessages = vectorized_replaceNumbers(pred_log_messages)
|
222 |
cleaned_ref_logMessages = vectorized_replaceNumbers(ref_log_messages)
|
223 |
+
sacrebleu_withoutExplicitNumbers_score = SACREBLEU_METRIC.compute(predictions=cleaned_pred_logMessages, references=cleaned_ref_logMessages)["score"] / 100.0
|
224 |
self.scores["linecontent_sacrebleu_withoutExplicitNumbers_score"] = sacrebleu_withoutExplicitNumbers_score
|
225 |
|
226 |
# Get different scores regarding the content of a log-message
|