from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge


def compute_scores(gts, res):
    """
    Performs the MS COCO evaluation using the Python 3 implementation
    (https://github.com/salaniz/pycocoevalcap)

    :param gts: Dictionary with the image ids and their gold captions
    :param res: Dictionary with the image ids and their generated captions
    :return: Dictionary with the evaluation score (the mean of the scores
             over all instances) for each measure
    """
    # Set up scorers
    scorers = [
        (Bleu(4), ["BLEU_1", "BLEU_2", "BLEU_3", "BLEU_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
    ]
    eval_res = {}

    # Compute score for each metric
    for scorer, method in scorers:
        try:
            score, scores = scorer.compute_score(gts, res, verbose=0)
        except TypeError:
            # Some scorers (e.g. Meteor, Rouge) do not accept a `verbose` argument
            score, scores = scorer.compute_score(gts, res)
        if isinstance(method, list):
            # Bleu returns one score per n-gram order
            for sc, m in zip(score, method):
                eval_res[m] = sc
        else:
            eval_res[method] = score
    return eval_res
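

# A minimal usage sketch (an assumption for illustration, not part of the
# original module): both dictionaries map an image id to a list of caption
# strings, with exactly one hypothesis per id in `res`, which is the input
# format the pycocoevalcap scorers expect. The image ids and captions below
# are made up.
if __name__ == "__main__":
    gts = {
        "img_1": ["a dog is running on the grass",
                  "a brown dog runs across a field"],
        "img_2": ["two people ride bicycles on a road"],
    }
    res = {
        "img_1": ["a dog runs on the grass"],
        "img_2": ["two people are riding bikes"],
    }
    scores = compute_scores(gts, res)
    for metric, value in scores.items():
        print(f"{metric}: {value:.4f}")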