# Copyright (c) Microsoft Corporation. # Licensed under the MIT license. """ROUGe metric implementation. This is a modified and slightly extended verison of https://github.com/miso-belica/sumy/blob/dev/sumy/evaluation/rouge.py. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import itertools import numpy as np # pylint: disable=C0103 def _get_ngrams(n, text): """Calcualtes n-grams. Args: n: which n-grams to calculate text: An array of tokens Returns: A set of n-grams """ ngram_set = {} text_length = len(text) max_index_ngram_start = text_length - n for i in range(max_index_ngram_start + 1): k = " ".join(text[i : i + n]) if k not in ngram_set: ngram_set[k] = 0 ngram_set[k] += 1 return ngram_set def _get_su(dist, text): """Calcualtes skip-grams and unigram Args: n: which n-grams to calculate text: An array of tokens Returns: A set of n-grams """ su_set = {} text_length = len(text) for i in range(text_length): k = text[i] if k not in su_set: su_set[k] = 0 su_set[k] += 1 for j in range(i + 1, text_length): if j - i - 1 > dist: break k = text[i] + " " + text[j] if k not in su_set: su_set[k] = 0 su_set[k] += 1 return su_set def _split_into_words(sentences): """Splits multiple sentences into words and flattens the result""" return list(itertools.chain(*[_.split(" ") for _ in sentences])) def _get_word_ngrams(n, sentences): """Calculates word n-grams for multiple sentences.""" assert len(sentences) > 0 assert n > 0 words = _split_into_words(sentences) return _get_ngrams(n, words) def _get_word_su(dist, sentences): """Calculates word skip-dist-grams for multiple sentences.""" assert len(sentences) > 0 assert dist > 0 words = _split_into_words(sentences) return _get_su(dist, words) def _len_lcs(x, y): """ Returns the length of the Longest Common Subsequence between sequences x and y. Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence Args: x: sequence of words y: sequence of words Returns integer: Length of LCS between x and y """ table = _lcs(x, y) n, m = len(x), len(y) return table[n, m] def _lcs(x, y): """ Computes the length of the longest common subsequence (lcs) between two strings. The implementation below uses a DP programming algorithm and runs in O(nm) time where n = len(x) and m = len(y). Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence Args: x: collection of words y: collection of words Returns: Table of dictionary of coord and len lcs """ n, m = len(x), len(y) table = dict() for i in range(n + 1): for j in range(m + 1): if i == 0 or j == 0: table[i, j] = 0 elif x[i - 1] == y[j - 1]: table[i, j] = table[i - 1, j - 1] + 1 else: table[i, j] = max(table[i - 1, j], table[i, j - 1]) return table def _recon_lcs(x, y): """ Returns the Longest Subsequence between x and y. Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence Args: x: sequence of words y: sequence of words Returns: sequence: LCS of x and y """ i, j = len(x), len(y) table = _lcs(x, y) def _recon(i, j): """private recon calculation""" if i == 0 or j == 0: return [] elif x[i - 1] == y[j - 1]: return _recon(i - 1, j - 1) + [(x[i - 1], i)] elif table[i - 1, j] > table[i, j - 1]: return _recon(i - 1, j) else: return _recon(i, j - 1) recon_tuple = tuple(map(lambda x: x[0], _recon(i, j))) return recon_tuple def rouge_su(evaluated_sentences, reference_sentences, dist=4): """ Computes ROUGE-SU_dist of two text collections of sentences. Sourece: http://research.microsoft.com/en-us/um/people/cyl/download/ papers/rouge-working-note-v1.3.1.pdf Args: evaluated_sentences: The sentences that have been picked by the summarizer reference_sentences: The sentences from the referene set n: maximum distance between two tokens. Defaults to 4. Returns: A tuple (f1, precision, recall) for ROUGE-SU4 Raises: ValueError: raises exception if a param has len <= 0 """ return rouge_n(evaluated_sentences, reference_sentences, dist=dist, su=True) def rouge_n(evaluated_sentences, reference_sentences, n=2, dist=4, su=False): """ Computes ROUGE-N of two text collections of sentences. Sourece: http://research.microsoft.com/en-us/um/people/cyl/download/ papers/rouge-working-note-v1.3.1.pdf Args: evaluated_sentences: The sentences that have been picked by the summarizer reference_sentences: The sentences from the referene set n: Size of ngram. Defaults to 2. su: if true, we are computing rouge_su Returns: A tuple (f1, precision, recall) for ROUGE-N Raises: ValueError: raises exception if a param has len <= 0 """ if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: raise ValueError("Collections must contain at least 1 sentence.") if su == True: evaluated_ngrams = _get_word_su(dist, evaluated_sentences) reference_ngrams = _get_word_su(dist, reference_sentences) else: evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) reference_ngrams = _get_word_ngrams(n, reference_sentences) reference_count = sum([v for k, v in reference_ngrams.items()]) evaluated_count = sum([v for k, v in evaluated_ngrams.items()]) # Gets the overlapping ngrams between evaluated and reference overlapping_count = 0 for k, v in reference_ngrams.items(): if k in evaluated_ngrams: if evaluated_ngrams[k] < v: overlapping_count += evaluated_ngrams[k] else: overlapping_count += v # Handle edge case. This isn't mathematically correct, but it's good enough if evaluated_count == 0: precision = 0.0 else: precision = overlapping_count / evaluated_count if reference_count == 0: recall = 0.0 else: recall = overlapping_count / reference_count f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8)) # return overlapping_count / reference_count return f1_score, precision, recall def _f_p_r_lcs(llcs, m, n): """ Computes the LCS-based F-measure score Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ rouge-working-note-v1.3.1.pdf Args: llcs: Length of LCS m: number of words in reference summary n: number of words in candidate summary Returns: Float. LCS-based F-measure score """ r_lcs = llcs / m p_lcs = llcs / n beta = p_lcs / (r_lcs + 1e-12) num = (1 + (beta ** 2)) * r_lcs * p_lcs denom = r_lcs + ((beta ** 2) * p_lcs) f_lcs = num / (denom + 1e-12) return f_lcs, p_lcs, r_lcs def rouge_l_sentence_level(evaluated_sentences, reference_sentences): """ Computes ROUGE-L (sentence level) of two text collections of sentences. http://research.microsoft.com/en-us/um/people/cyl/download/papers/ rouge-working-note-v1.3.1.pdf Calculated according to: R_lcs = LCS(X,Y)/m P_lcs = LCS(X,Y)/n F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) where: X = reference summary Y = Candidate summary m = length of reference summary n = length of candidate summary Args: evaluated_sentences: The sentences that have been picked by the summarizer reference_sentences: The sentences from the referene set Returns: A float: F_lcs Raises: ValueError: raises exception if a param has len <= 0 """ if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: raise ValueError("Collections must contain at least 1 sentence.") reference_words = _split_into_words(reference_sentences) evaluated_words = _split_into_words(evaluated_sentences) m = len(reference_words) n = len(evaluated_words) lcs = _len_lcs(evaluated_words, reference_words) return _f_p_r_lcs(lcs, m, n) def _union_lcs(evaluated_sentences, reference_sentence): """ Returns LCS_u(r_i, C) which is the LCS score of the union longest common subsequence between reference sentence ri and candidate summary C. For example if r_i= w1 w2 w3 w4 w5, and C contains two sentences: c1 = w1 w2 w6 w7 w8 and c2 = w1 w3 w8 w9 w5, then the longest common subsequence of r_i and c1 is “w1 w2” and the longest common subsequence of r_i and c2 is “w1 w3 w5”. The union longest common subsequence of r_i, c1, and c2 is “w1 w2 w3 w5” and LCS_u(r_i, C) = 4/5. Args: evaluated_sentences: The sentences that have been picked by the summarizer reference_sentence: One of the sentences in the reference summaries Returns: float: LCS_u(r_i, C) ValueError: Raises exception if a param has len <= 0 """ if len(evaluated_sentences) <= 0: raise ValueError("Collections must contain at least 1 sentence.") lcs_union = set() reference_words = _split_into_words([reference_sentence]) combined_lcs_length = 0 for eval_s in evaluated_sentences: evaluated_words = _split_into_words([eval_s]) lcs = set(_recon_lcs(reference_words, evaluated_words)) combined_lcs_length += len(lcs) lcs_union = lcs_union.union(lcs) union_lcs_count = len(lcs_union) union_lcs_value = union_lcs_count / combined_lcs_length return union_lcs_value def rouge_l_summary_level(evaluated_sentences, reference_sentences): """ Computes ROUGE-L (summary level) of two text collections of sentences. http://research.microsoft.com/en-us/um/people/cyl/download/papers/ rouge-working-note-v1.3.1.pdf Calculated according to: R_lcs = SUM(1, u)[LCS(r_i,C)]/m P_lcs = SUM(1, u)[LCS(r_i,C)]/n F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) where: SUM(i,u) = SUM from i through u u = number of sentences in reference summary C = Candidate summary made up of v sentences m = number of words in reference summary n = number of words in candidate summary Args: evaluated_sentences: The sentences that have been picked by the summarizer reference_sentence: One of the sentences in the reference summaries Returns: A float: F_lcs Raises: ValueError: raises exception if a param has len <= 0 """ if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: raise ValueError("Collections must contain at least 1 sentence.") # total number of words in reference sentences m = len(_split_into_words(reference_sentences)) # total number of words in evaluated sentences n = len(_split_into_words(evaluated_sentences)) union_lcs_sum_across_all_references = 0 for ref_s in reference_sentences: union_lcs_sum_across_all_references += _union_lcs(evaluated_sentences, ref_s) return _f_p_r_lcs(union_lcs_sum_across_all_references, m, n) def rouge(hypotheses, references): """Calculates average rouge scores for a list of hypotheses and references""" # Filter out hyps that are of 0 length # hyps_and_refs = zip(hypotheses, references) # hyps_and_refs = [_ for _ in hyps_and_refs if len(_[0]) > 0] # hypotheses, references = zip(*hyps_and_refs) # Calculate ROUGE-1 F1, precision, recall scores rouge_1 = [rouge_n([hyp], [ref], 1) for hyp, ref in zip(hypotheses, references)] rouge_1_f, rouge_1_p, rouge_1_r = map(np.mean, zip(*rouge_1)) # Calculate ROUGE-2 F1, precision, recall scores rouge_2 = [rouge_n([hyp], [ref], 2) for hyp, ref in zip(hypotheses, references)] rouge_2_f, rouge_2_p, rouge_2_r = map(np.mean, zip(*rouge_2)) # Calculate ROUGE-SU4 F1, precision, recall scores rouge_su4 = [rouge_su([hyp], [ref], 4) for hyp, ref in zip(hypotheses, references)] rouge_su4_f, rouge_su4_p, rouge_su4_r = map(np.mean, zip(*rouge_su4)) # Calculate ROUGE-L F1, precision, recall scores rouge_l = [ rouge_l_sentence_level([hyp], [ref]) for hyp, ref in zip(hypotheses, references) ] rouge_l_f, rouge_l_p, rouge_l_r = map(np.mean, zip(*rouge_l)) return { "rouge_1_f_score": rouge_1_f, "rouge_2_f_score": rouge_2_f, "rouge_su4_f_score": rouge_su4_f, "rouge_l_f_score": rouge_l_f, } class OldROUGEEval: def __init__(self): pass def make_html_safe(self, s): s.replace("<", "<") s.replace(">", ">") return s def eval(self, predictions, groundtruths): predictions = [self.make_html_safe(w) for w in predictions] groundtruths = [self.make_html_safe(w) for w in groundtruths] results = rouge(predictions, groundtruths) return results