from refTools.evaluation.tokenizer.ptbtokenizer import PTBTokenizer
from refTools.evaluation.bleu.bleu import Bleu
from refTools.evaluation.meteor.meteor import Meteor
from refTools.evaluation.rouge.rouge import Rouge
from refTools.evaluation.cider.cider import Cider
"""
Input: refer and Res = [{ref_id, sent}]
Things of interest
evalRefs - list of ['ref_id', 'CIDEr', 'Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'ROUGE_L', 'METEOR']
eval - dict of {metric: score}
refToEval - dict of {ref_id: ['ref_id', 'CIDEr', 'Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'ROUGE_L', 'METEOR']}
"""
class RefEvaluation:

    def __init__(self, refer, Res):
        """
        :param refer: REFER instance for the current dataset
        :param Res: [{'ref_id', 'sent'}] predicted expressions to evaluate
        """
        self.evalRefs = []
        self.eval = {}
        self.refToEval = {}
        self.refer = refer
        self.Res = Res

    def evaluate(self):
        evalRefIds = [ann['ref_id'] for ann in self.Res]

        # gather ground-truth expressions per ref_id
        refToGts = {}
        for ref_id in evalRefIds:
            ref = self.refer.Refs[ref_id]
            gt_sents = [sent['sent'].encode('ascii', 'ignore').decode('ascii') for sent in ref['sentences']]  # up to 3 expressions
            refToGts[ref_id] = gt_sents
        # one predicted expression per ref_id
        refToRes = {ann['ref_id']: [ann['sent']] for ann in self.Res}

        print('tokenization...')
        tokenizer = PTBTokenizer()
        self.refToRes = tokenizer.tokenize(refToRes)
        self.refToGts = tokenizer.tokenize(refToGts)

        # =================================================
        # Set up scorers
        # =================================================
        print('setting up scorers...')
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            print('computing %s score...' % (scorer.method()))
            # score: overall metric value(s); scores: per-ref values, which
            # setRefToEvalRefs pairs with the refToGts keys
            score, scores = scorer.compute_score(self.refToGts, self.refToRes)
            if isinstance(method, list):  # Bleu returns one value per n-gram order
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setRefToEvalRefs(scs, self.refToGts.keys(), m)
                    print("%s: %0.3f" % (m, sc))
            else:
                self.setEval(score, method)
                self.setRefToEvalRefs(scores, self.refToGts.keys(), method)
                print("%s: %0.3f" % (method, score))
        self.setEvalRefs()

    def setEval(self, score, method):
        self.eval[method] = score

    def setRefToEvalRefs(self, scores, refIds, method):
        for refId, score in zip(refIds, scores):
            if refId not in self.refToEval:
                self.refToEval[refId] = {}
                self.refToEval[refId]["ref_id"] = refId
            self.refToEval[refId][method] = score

    def setEvalRefs(self):
        self.evalRefs = list(self.refToEval.values())


if __name__ == '__main__':

    import os.path as osp
    import sys
    ROOT_DIR = osp.abspath(osp.join(osp.dirname(__file__), '..', '..'))
    sys.path.insert(0, osp.join(ROOT_DIR, 'lib', 'datasets'))
    from refer import REFER

    # load refer of dataset
    dataset = 'refcoco'
    refer = REFER(dataset, splitBy='google')

    # mimic some Res
    val_refIds = refer.getRefIds(split='test')
    ref_id = 49767
    print("GT: %s" % refer.Refs[ref_id]['sentences'])
    Res = [{'ref_id': ref_id, 'sent': 'left bottle'}]

    # evaluate some refer expressions
    refEval = RefEvaluation(refer, Res)
    refEval.evaluate()

    # print output evaluation scores
    for metric, score in refEval.eval.items():
        print('%s: %.3f' % (metric, score))

    # demo how to use evalRefs to retrieve low-score results
    # evals = [eva for eva in refEval.evalRefs if eva['CIDEr'] < 30]
    # print('ground truth sents')
    # refId = evals[0]['ref_id']
    # print('refId: %s' % refId)
    # print([sent['sent'] for sent in refer.Refs[refId]['sentences']])
    #
    # print('generated sent (CIDEr score %0.1f)' % (evals[0]['CIDEr']))
    # print(refEval.refToEval[8])
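
    # A minimal runnable variant of the commented demo above (a sketch, assuming
    # evaluate() has already run; it inspects the lowest-CIDEr ref instead of the
    # hard-coded threshold and ref_id used in the comments):
    worst = min(refEval.evalRefs, key=lambda e: e['CIDEr'])
    print('lowest CIDEr ref_id: %s (score %.3f)' % (worst['ref_id'], worst['CIDEr']))
    print('ground truth sents: %s' % [s['sent'] for s in refer.Refs[worst['ref_id']]['sentences']])
    print('per-metric scores: %s' % refEval.refToEval[worst['ref_id']])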