svenwey committed on
Commit
47362f6
·
1 Parent(s): a50fad5

refactor code into smaller, modular function components

Browse files
Files changed (1) hide show
  1. logmetric.py +119 -195
logmetric.py CHANGED
@@ -18,10 +18,8 @@ import datasets
18
  import re
19
  import dateutil.parser
20
  import numpy as np
21
- from difflib import SequenceMatcher
22
- import sacrebleu
23
 
24
- import time
25
 
26
 
27
  # TODO: Add BibTeX citation
@@ -68,19 +66,6 @@ BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
68
  class LogMetric(evaluate.Metric):
69
  """TODO: Short description of my evaluation module."""
70
 
71
- # Constant regex to get timestrings
72
- timestamp_regex = r'^\s*\[?\s*(\d{4}[-/.]\d{2}[-/.]\d{2}(?:[ T]\d{2}[:]\d{2}(?:[:]\d{2}(?:[.,]\d+)?)?(?:Z|[+-]\d{2}[:]\d{2})?)?)\s*\]?\s*'
73
- timestamp_pattern = re.compile(timestamp_regex, re.MULTILINE)
74
-
75
- int_regex = r'(-?\d+)'
76
- int_pattern = re.compile(int_regex)
77
-
78
- float_regex = r'(-?\d+\.\d+)'
79
- float_pattern = re.compile(float_regex)
80
-
81
- sacrebleu_metric = evaluate.load("evaluate-metric/sacrebleu")
82
-
83
-
84
  def _info(self):
85
  # TODO: Specifies the evaluate.EvaluationModuleInfo object
86
  return evaluate.MetricInfo(
@@ -107,25 +92,59 @@ class LogMetric(evaluate.Metric):
107
  # TODO: Download external resources if needed
108
  pass
109
 
110
- # Jaccard Similarity to measure closeness of two log-messages
111
- def get_jaccard_similarity(self, set1, set2):
112
- intersection = set1.intersection(set2)
113
- union = set1.union(set2)
114
- if (len(union) == 0):
115
- return 1.0
 
 
 
 
 
 
 
 
 
116
 
117
- return len(intersection) / len(union)
118
 
119
- # A score depending on the difference in length of two sentences
120
- def get_length_score(self, preds_split, refs_split):
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  pred_content_lengths = np.vectorize(len)(preds_split)
123
  ref_content_lengths = np.vectorize(len)(refs_split)
124
 
125
  return self.smapeScore(pred_content_lengths, ref_content_lengths)
126
 
127
  # helper function that computes the smape_score either between two numbers or two lists of numbers (must be the same length)
128
- def smapeScore(self, P, R):
129
  P_isnumber = isinstance(P, (int, float))
130
  R_isnumber = isinstance(R, (int, float))
131
 
@@ -136,10 +155,12 @@ class LogMetric(evaluate.Metric):
136
  assert(len(P) == len(R))
137
 
138
  if P_isnumber and R_isnumber:
139
- if P == 0 and R == 0: return 1.0 # since this leads to (|R| + |P|) = 0
 
140
  return 1 - (np.sum(np.abs(R - P) / (np.abs(R) + np.abs(P)))) # (n = 1)
141
  else:
142
- if len(P) == 0 and len(R) == 0: return 1.0 # since this leads to n = 0
 
143
  n = len(P)
144
  P = np.array(P)
145
  R = np.array(R)
@@ -150,157 +171,27 @@ class LogMetric(evaluate.Metric):
150
 
151
  return 1 - (1.0/n * np.sum(np.abs(R - P) / denominator))
152
 
153
- # splits both strings at \n and then computes the smape_score of their lengths
154
- def getLineCountScore(self, pred, ref):
155
- pred_lines_amt = len(pred.splitlines())
156
- ref_lines_amt = len(ref.splitlines())
157
-
158
- # print("#pred_lines:", pred_lines_amt)
159
- # print("#ref_lines:", ref_lines_amt)
160
-
161
- return self.smapeScore(pred_lines_amt, ref_lines_amt)
162
-
163
- def replaceNumbers(self, text:str):
164
  text = self.int_pattern.sub(r'<|INT|>', text)
165
  text = self.float_pattern.sub(r'<|FLOAT|>', text)
166
  return text
167
 
168
- # Get differenct scores regarding the content of a log-message
169
- def getLineContentScore(self, pred_logMessages, ref_logMessages):
170
- if pred_logMessages == [] and ref_logMessages == []:
171
- pred_logMessages = [""]
172
- ref_logMessages = [""]
173
- sacrebleu_score = self.sacrebleu_metric.compute(predictions=pred_logMessages, references=ref_logMessages)["score"] / 100.0
174
-
175
- smape_length_score = self.get_length_score(pred_logMessages, ref_logMessages)
176
-
177
- vectorized_replaceNumbers = np.vectorize(self.replaceNumbers)
178
-
179
- cleaned_pred_logMessages = vectorized_replaceNumbers(pred_logMessages)
180
- cleaned_ref_logMessages = vectorized_replaceNumbers(ref_logMessages)
181
-
182
- sacrebleu_withoutExplicitNumbers_score = self.sacrebleu_metric.compute(predictions=cleaned_pred_logMessages, references=cleaned_ref_logMessages)["score"] / 100.0
183
-
184
-
185
- return sacrebleu_score, sacrebleu_withoutExplicitNumbers_score, smape_length_score
186
-
187
- # get different scores regarding the timestamp
188
- def getTimestampsScore(self, pred_timestamps, ref_timestamps):
189
- timestamp_amt_score = self.smapeScore(len(pred_timestamps), len(ref_timestamps))
190
-
191
- if (len(pred_timestamps) == 0) and (len(ref_timestamps) == 0):
192
- return timestamp_amt_score, 1.0, 1.0, 1.0
193
-
194
- # if there are no predicted timestamps, return early. It is still consistent and monotonic.
195
- if (len(pred_timestamps) == 0) and (len(ref_timestamps) != 0):
196
- return timestamp_amt_score, 1.0, 1.0, 0.0
197
-
198
- # replace all digits in the reference timestamp (first timestamp) with '/d' to get
199
- # a regex that describes the format
200
- pred_timestring_pattern = re.sub(r'\d', r'\\d', re.escape(pred_timestamps[0]))
201
-
202
- matchesPatternScore = 1.0
203
- monotonicallyIncreasingScore = 1.0
204
- pred_timedeltas = []
205
-
206
- # A variable to save the previous timestamp (as datetime obj) to check monotonicity
207
- prev_datetime = None
208
- # Convert matches to datetime objects
209
-
210
- for i in range(len(pred_timestamps)):
211
- ts = pred_timestamps[i]
212
- try:
213
- # Check if the format matches with the format of the first timestamp
214
- # TODO!! Check this later, maybe it is too restricting for training a llm
215
- matchesPattern = re.fullmatch(pred_timestring_pattern, ts) is not None
216
- # Check if the timestamps are monotonically increasing
217
- cur_datetime = dateutil.parser.parse(ts)
218
- if prev_datetime == None:
219
- monotonicallyIncreasing = True
220
- else:
221
- monotonicallyIncreasing = prev_datetime <= cur_datetime
222
- pred_timedeltas.append((cur_datetime - prev_datetime).total_seconds())
223
-
224
- prev_datetime = cur_datetime
225
-
226
- # If one entry doesn't fulfill the matching pattern property or the monotinicity property, set to 0 for whole log
227
- matchesPatternScore = 0.0 if (not matchesPattern) else matchesPatternScore
228
- monotonicallyIncreasingScore = 0.0 if (not monotonicallyIncreasing) else monotonicallyIncreasingScore
229
-
230
-
231
- except Exception as e:
232
- # e.g. date format not parsable by dateutil.parser
233
- matchesPatternScore = 0.0
234
- monotonicallyIncreasingScore = 0.0
235
- pred_timedeltas.append(-1)
236
-
237
-
238
- if (len(pred_timestamps) != 0) and (len(ref_timestamps) == 0):
239
- return timestamp_amt_score, matchesPatternScore, monotonicallyIncreasingScore, 0.0
240
-
241
-
242
- ref_timedeltas = []
243
- prev_datetime = None
244
- for i in range(len(ref_timestamps)):
245
- ts = ref_timestamps[i]
246
- try:
247
- cur_datetime = dateutil.parser.parse(ts)
248
- if prev_datetime == None:
249
- pass
250
- else:
251
- ref_timedeltas.append((cur_datetime - prev_datetime).total_seconds())
252
-
253
- prev_datetime = cur_datetime
254
-
255
- except Exception as e:
256
- ref_timedeltas.append(-1)
257
-
258
- minlength = min(len(pred_timedeltas), len(ref_timedeltas))
259
-
260
- pred_timedeltas = pred_timedeltas[:minlength]
261
- ref_timedeltas = ref_timedeltas[:minlength]
262
-
263
- print("pred_timedeltas:", pred_timedeltas)
264
- print("ref_timedeltas:", ref_timedeltas)
265
-
266
-
267
- timestampDeltaScore = self.smapeScore(pred_timedeltas, ref_timedeltas)
268
-
269
- print("timestampDeltaScore:", timestampDeltaScore)
270
- # matchesPatternScore and monotonicallyIncreasingScore are in {0,1}
271
- return timestamp_amt_score, matchesPatternScore, monotonicallyIncreasingScore, timestampDeltaScore
272
-
273
-
274
-
275
- def getLogMetric(self, pred : str, ref : str):
276
- ref = ref.strip(' \t\n\r')
277
- pred = pred.strip(' \t\n\r')
278
-
279
- linecount_difference_SMAPE = self.getLineCountScore(pred, ref)
280
-
281
-
282
- # Split log on timestamps
283
  pred_split_log = self.timestamp_pattern.split(pred)
284
  ref_split_log = self.timestamp_pattern.split(ref)
285
 
286
  # One logentry always consists of timestamp + log-message
287
- # pred_logentries = []
288
- # ref_logentries = []
289
-
290
- pred_timestamps = []
291
- pred_logMessages = []
292
 
293
- ref_timestamps = []
294
- ref_logMessages = []
295
  # reorganize log into logentry-tuples, consisting of timestamp + log-message
296
  for i in range(1, len(pred_split_log), 2):
297
- # pred_logentries.append((pred_split_log[i],pred_split_log[i+1]))
298
  pred_timestamps.append(pred_split_log[i])
299
  pred_logMessages.append(pred_split_log[i+1])
300
 
301
-
302
  for i in range(1, len(ref_split_log), 2):
303
- # ref_logentries.append((ref_split_log[i],ref_split_log[i+1]))
304
  ref_timestamps.append(ref_split_log[i])
305
  ref_logMessages.append(ref_split_log[i+1])
306
 
@@ -310,44 +201,77 @@ class LogMetric(evaluate.Metric):
310
  pred_logMessages += (max_logentries - len(pred_logMessages)) * [" "]
311
  ref_logMessages += (max_logentries- len(ref_logMessages)) * [" "]
312
 
313
- linecontent_sacrebleu, linecontent_sacrebleu_withoutExplicitNumbers, linecontentlength_difference_SMAPE = self.getLineContentScore(pred_logMessages, ref_logMessages)
314
-
315
- timestamps_difference_SMAPE, timestamps_formatConsistency_absolute, timestamps_monotinicity_absolute, timestamps_delta_SMAPE = self.getTimestampsScore(pred_timestamps, ref_timestamps)
316
 
 
317
 
318
- # return weighted overall score of all the different scores
319
- return {"linecount_difference_SMAPE_score": linecount_difference_SMAPE,
320
- "linecontentlength_difference_SMAPE_score": linecontentlength_difference_SMAPE,
321
- "linecontent_sacrebleu_score": linecontent_sacrebleu,
322
- "linecontent_sacrebleu_withoutExplicitNumbers_score": linecontent_sacrebleu_withoutExplicitNumbers,
323
- "timestamps_SMAPE_difference_score": timestamps_difference_SMAPE,
324
- "timestamps_formatConsistency_score": timestamps_formatConsistency_absolute,
325
- "timestamps_monotinicity_score": timestamps_monotinicity_absolute,
326
- "timestamps_delta_SMAPE_score" : timestamps_delta_SMAPE
327
- }
 
 
 
328
 
329
- def _compute(self, predictions, references):
330
- """Returns the scores"""
 
 
 
 
331
 
332
- # TODO: get separate log entries (split before timestamps), replace timestamps with token and compare the log entry with BLEU
 
 
 
 
333
 
334
- t_before_logmetric = time.perf_counter()
335
- metric_dicts = [self.getLogMetric(p,r) for p,r in zip(predictions,references)]
336
- # Extract keys (assuming all dictionaries have the same keys)
337
- keys = metric_dicts[0].keys()
338
-
339
- # Convert list of dictionaries into a 2D numpy array
340
- values = np.array([list(d.values()) for d in metric_dicts])
 
 
 
 
 
341
 
342
- # Calculate the mean along the vertical axis (axis=0)
343
- mean_values = np.mean(values, axis=0)
344
 
345
- # a dictionary, matching the keys with their corresponding mean values
346
- metric_result = dict(zip(keys, mean_values))
347
 
348
- t_after_logmetric = time.perf_counter()
349
- logmetric_duration = f"{t_after_logmetric - t_before_logmetric:0.10f}"
 
 
 
 
350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
- return metric_result
353
-
 
18
  import re
19
  import dateutil.parser
20
  import numpy as np
21
+ from typing import List, Dict, Any
 
22
 
 
23
 
24
 
25
  # TODO: Add BibTeX citation
 
66
  class LogMetric(evaluate.Metric):
67
  """TODO: Short description of my evaluation module."""
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def _info(self):
70
  # TODO: Specifies the evaluate.EvaluationModuleInfo object
71
  return evaluate.MetricInfo(
 
92
  # TODO: Download external resources if needed
93
  pass
94
 
95
+
96
+ def _compute(self, predictions, references):
97
+ # TODO: get separate log entries (split before timestamps), replace timestamps with token and compare the log entry with BLEU
98
+ metric_dicts = [PredRefScore(p,r).run() for p,r in zip(predictions,references)]
99
+ # Extract keys (assuming all dictionaries have the same keys)
100
+ keys = metric_dicts[0].keys()
101
+
102
+ # Convert list of dictionaries into a 2D numpy array
103
+ values = np.array([list(d.values()) for d in metric_dicts])
104
+
105
+ # Calculate the mean along the vertical axis (axis=0)
106
+ mean_values = np.mean(values, axis=0)
107
+
108
+ # a dictionary, matching the keys with their corresponding mean values
109
+ metric_result = dict(zip(keys, mean_values))
110
 
111
+ return metric_result
112
 
 
 
113
 
114
+ class PredRefScore:
115
+ # Constant regex to get timestrings
116
+ timestamp_regex = r'^\s*\[?\s*(\d{4}[-/.]\d{2}[-/.]\d{2}(?:[ T]\d{2}[:]\d{2}(?:[:]\d{2}(?:[.,]\d+)?)?(?:Z|[+-]\d{2}[:]\d{2})?)?)\s*\]?\s*'
117
+ timestamp_pattern = re.compile(timestamp_regex, re.MULTILINE)
118
+
119
+ int_pattern = re.compile(r'(-?\d+)')
120
+ float_pattern = re.compile(r'(-?\d+\.\d+)')
121
+
122
+ scores : Dict[str, float]= {}
123
+
124
+ sacrebleu_metric = evaluate.load("evaluate-metric/sacrebleu")
125
+
126
+ def __init__(self, prediction : str, reference: str) -> Dict[str, float]:
127
+ self.reference = reference.strip(' \t\n\r')
128
+ self.prediction = prediction.strip(' \t\n\r')
129
+
130
+ def run(self):
131
+ self.getLogMetric()
132
+ return self.scores
133
+
134
+
135
+ ##### Convenience Methods #####
136
+
137
+ # TODO: also set pred_ts, ref_ts, pred_msgs and ref_msgs as fields
138
+
139
+ # A score depending on the difference in length of two sentences
140
+ def get_length_score(self, preds_split : List[Any], refs_split : List[Any]) -> float:
141
  pred_content_lengths = np.vectorize(len)(preds_split)
142
  ref_content_lengths = np.vectorize(len)(refs_split)
143
 
144
  return self.smapeScore(pred_content_lengths, ref_content_lengths)
145
 
146
  # helper function that computes the smape_score either between two numbers or two lists of numbers (must be the same length)
147
+ def smapeScore(self, P, R) -> float:
148
  P_isnumber = isinstance(P, (int, float))
149
  R_isnumber = isinstance(R, (int, float))
150
 
 
155
  assert(len(P) == len(R))
156
 
157
  if P_isnumber and R_isnumber:
158
+ if P == 0 and R == 0:
159
+ return 1.0 # since this leads to (|R| + |P|) = 0
160
  return 1 - (np.sum(np.abs(R - P) / (np.abs(R) + np.abs(P)))) # (n = 1)
161
  else:
162
+ if len(P) == 0 and len(R) == 0:
163
+ return 1.0 # since this leads to n = 0
164
  n = len(P)
165
  P = np.array(P)
166
  R = np.array(R)
 
171
 
172
  return 1 - (1.0/n * np.sum(np.abs(R - P) / denominator))
173
 
174
+ # Replaces numbers in a string with a placeholder
175
+ def replaceNumbers(self, text : str) -> str:
 
 
 
 
 
 
 
 
 
176
  text = self.int_pattern.sub(r'<|INT|>', text)
177
  text = self.float_pattern.sub(r'<|FLOAT|>', text)
178
  return text
179
 
180
+ # Split all log-entries in timestamps and log-messages
181
+ def split_log_entry(self, pred : str, ref: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  pred_split_log = self.timestamp_pattern.split(pred)
183
  ref_split_log = self.timestamp_pattern.split(ref)
184
 
185
  # One logentry always consists of timestamp + log-message
186
+ pred_timestamps, pred_logMessages = [], []
187
+ ref_timestamps, ref_logMessages = [], []
 
 
 
188
 
 
 
189
  # reorganize log into logentry-tuples, consisting of timestamp + log-message
190
  for i in range(1, len(pred_split_log), 2):
 
191
  pred_timestamps.append(pred_split_log[i])
192
  pred_logMessages.append(pred_split_log[i+1])
193
 
 
194
  for i in range(1, len(ref_split_log), 2):
 
195
  ref_timestamps.append(ref_split_log[i])
196
  ref_logMessages.append(ref_split_log[i+1])
197
 
 
201
  pred_logMessages += (max_logentries - len(pred_logMessages)) * [" "]
202
  ref_logMessages += (max_logentries- len(ref_logMessages)) * [" "]
203
 
204
+ return pred_timestamps, pred_logMessages, ref_timestamps, ref_logMessages
 
 
205
 
206
+ ##### Individual Setter Methods for Scores #####
207
 
208
+ # splits both strings at \n and then computes the smape_score of their lengths
209
+ def set_linecount_score(self, pred : str, ref : str) -> None:
210
+ pred_lines_amt = len(pred.splitlines())
211
+ ref_lines_amt = len(ref.splitlines())
212
+ self.scores["linecount_difference_SMAPE_score"] = self.smapeScore(pred_lines_amt, ref_lines_amt)
213
+
214
+ def set_sacrebleu_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
215
+ sacrebleu_score = self.sacrebleu_metric.compute(predictions=pred_log_messages, references=ref_log_messages)["score"] / 100.0
216
+ self.scores["linecontent_sacrebleu_score"] = sacrebleu_score
217
+
218
+ def set_smape_length_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
219
+ smape_length_score = self.get_length_score(pred_log_messages, ref_log_messages)
220
+ self.scores["linecontentlength_difference_SMAPE_score"] = smape_length_score
221
 
222
+ def set_sacrebleu_withoutexplnumbers_score(self, pred_log_messages : List[str], ref_log_messages : List[str]):
223
+ vectorized_replaceNumbers = np.vectorize(self.replaceNumbers)
224
+ cleaned_pred_logMessages = vectorized_replaceNumbers(pred_log_messages)
225
+ cleaned_ref_logMessages = vectorized_replaceNumbers(ref_log_messages)
226
+ sacrebleu_withoutExplicitNumbers_score = self.sacrebleu_metric.compute(predictions=cleaned_pred_logMessages, references=cleaned_ref_logMessages)["score"] / 100.0
227
+ self.scores["linecontent_sacrebleu_withoutExplicitNumbers_score"] = sacrebleu_withoutExplicitNumbers_score
228
 
229
+ # Get differenct scores regarding the content of a log-message
230
+ def all_linecontent_scores(self, pred_logMessages : List[str], ref_logMessages: List[str]) -> None:
231
+ if pred_logMessages == [] and ref_logMessages == []:
232
+ pred_logMessages = [""]
233
+ ref_logMessages = [""]
234
 
235
+ self.set_sacrebleu_score(pred_logMessages, ref_logMessages)
236
+ self.set_smape_length_score(pred_logMessages, ref_logMessages)
237
+ self.set_sacrebleu_withoutexplnumbers_score(pred_logMessages, ref_logMessages)
238
+
239
+ def set_timestamp_amt_score(self, pred_timestamps : List[str], ref_timestamps : List[str]):
240
+ timestamp_amt_score = self.smapeScore(len(pred_timestamps), len(ref_timestamps))
241
+ self.scores["timestamps_SMAPE_difference_score"] = timestamp_amt_score
242
+
243
+ def set_timestamp_format_consistency_score(self, pred_timestamps, ref_timestamps):
244
+ if (len(pred_timestamps) == 0):
245
+ self.scores["timestamps_formatConsistency_score"] = 1.0
246
+ return
247
 
248
+ pred_timestring_pattern = re.sub(r'\d', r'\\d', re.escape(pred_timestamps[0])).strip()
249
+ all_consistent = all(re.fullmatch(pred_timestring_pattern, ts.strip()) is not None for ts in ref_timestamps)
250
 
251
+ self.scores["timestamps_formatConsistency_score"] = 1.0 if all_consistent else 0.0
 
252
 
253
+ def set_timestamp_monotonicity_score(self, pred_timestamps) -> None:
254
+ try:
255
+ parsed_times = [dateutil.parser.parse(ts) for ts in pred_timestamps] # Parse all timestamps
256
+ except dateutil.parser.ParserError:
257
+ self.scores["timestamps_monotinicity_score"] = 0.0
258
+ return
259
 
260
+ # Check if the timestamps are monotonically increasing
261
+ all_monotone = all(t1 <= t2 for t1, t2 in zip(parsed_times, parsed_times[1:]))
262
+ self.scores["timestamps_monotinicity_score"] = 1.0 if all_monotone else 0.0
263
+
264
+ # get different scores regarding the timestamp
265
+ def all_timestamp_scores(self, pred_timestamps, ref_timestamps) -> None:
266
+ self.set_timestamp_amt_score(pred_timestamps, ref_timestamps)
267
+ self.set_timestamp_format_consistency_score(pred_timestamps, ref_timestamps)
268
+ self.set_timestamp_monotonicity_score(pred_timestamps)
269
+
270
+ # driver method for different score computations
271
+ def getLogMetric(self):
272
+ self.set_linecount_score(self.prediction, self.reference)
273
+ # Split log on timestamps
274
+ pred_timestamps, pred_logMessages, ref_timestamps, ref_logMessages = self.split_log_entry(self.prediction, self.reference)
275
+ self.all_linecontent_scores(pred_logMessages, ref_logMessages)
276
+ self.all_timestamp_scores(pred_timestamps, ref_timestamps)
277