|
from rouge_chinese import Rouge |
|
import jieba |
|
from nltk.translate.gleu_score import corpus_gleu |
|
|
|
def compute_f1_two_sets(pred_set, gt_set):
    """Return the F1 score between a predicted set and a ground-truth set.

    Precision is the fraction of ``pred_set`` that also appears in ``gt_set``;
    recall is the fraction of ``gt_set`` that also appears in ``pred_set``.
    An empty set yields a precision (or recall) of 0.

    Args:
        pred_set: Set of predicted items.
        gt_set: Set of ground-truth items.

    Returns:
        The harmonic mean of precision and recall, or 0 when both are 0.
    """
    n_pred = len(pred_set)
    n_gt = len(gt_set)
    n_common = len(pred_set.intersection(gt_set))

    precision = 0
    if n_pred > 0:
        precision = n_common / n_pred

    recall = 0
    if n_gt > 0:
        recall = n_common / n_gt

    # Guard against 0 / 0 when the two sets share nothing.
    if precision + recall > 0:
        return 2 * precision * recall / (precision + recall)
    return 0
|
|
|
def multi_choice_judge(prediction, option_list, answer_token):
    """Score a multiple-choice prediction by the options it mentions.

    An option counts as mentioned when it occurs anywhere in ``prediction``.
    The outcome is:

    * no option mentioned: abstention (score 0, abstention 1);
    * exactly one option mentioned and it is ``answer_token``: score 1;
    * anything else (wrong option, or several options): score 0.

    An ``answer_token`` that is not in ``option_list`` can never be matched,
    so it scores 0 rather than raising ``KeyError``.

    Args:
        prediction: Model output to inspect (a string in the usual case).
        option_list: Candidate option tokens, e.g. ``["A", "B", "C", "D"]``.
        answer_token: The correct option token.

    Returns:
        A dict ``{"score": 0 or 1, "abstention": 0 or 1}``.
    """
    mentioned = {option for option in option_list if option in prediction}

    if not mentioned:
        return {"score": 0, "abstention": 1}

    # Correct only when the answer is the single option mentioned; naming
    # several options is treated as wrong even if the answer is among them.
    accuracy = 1 if mentioned == {answer_token} else 0
    return {"score": accuracy, "abstention": 0}
|
|
|
""" |
|
compute the rouge score. |
|
hyps and refs are lists of hypothesis and reference strings |
|
empty predictions are replaced with 无内容 |
|
""" |
|
|
|
|
|
def compute_rouge(hyps, refs):
    """Compute ROUGE scores for paired hypothesis and reference strings.

    Each string is segmented with jieba and the tokens are joined with single
    spaces. A hypothesis that is blank after segmentation is replaced with the
    placeholder "无内容" (presumably because the scorer rejects empty
    hypotheses -- confirm against rouge_chinese).

    Args:
        hyps: List of hypothesis (prediction) strings.
        refs: List of reference strings, the same length as ``hyps``.

    Returns:
        Whatever ``Rouge().get_scores`` returns for the segmented lists.
        NOTE(review): presumably one score dict per pair -- confirm.

    Raises:
        AssertionError: If ``hyps`` and ``refs`` differ in length.
    """
    assert len(hyps) == len(refs)

    segmented_hyps = []
    for hyp in hyps:
        joined = ' '.join(jieba.cut(hyp))
        # Substitute the placeholder for predictions with no visible content.
        segmented_hyps.append("无内容" if joined.strip() == "" else joined)

    segmented_refs = [' '.join(jieba.cut(ref)) for ref in refs]

    scorer = Rouge()
    return scorer.get_scores(segmented_hyps, segmented_refs)
|
|
|
""" |
|
compute the gleu score. |
|
hyps and refs are lists of hypothesis and reference strings |
|
empty predictions are replaced with 无内容 |
|
""" |
|
def compute_gleu(hyps, refs):
    """Compute the corpus-level GLEU score over jieba word tokens.

    Each string is segmented with jieba and passed to ``corpus_gleu`` as a
    list of tokens. A hypothesis with no non-blank tokens is replaced with the
    single placeholder token "无内容". Every hypothesis is scored against
    exactly one reference.

    Args:
        hyps: List of hypothesis (prediction) strings.
        refs: List of reference strings, the same length as ``hyps``.

    Returns:
        The score returned by ``nltk.translate.gleu_score.corpus_gleu``.

    Raises:
        AssertionError: If ``hyps`` and ``refs`` differ in length.
    """
    assert len(hyps) == len(refs)

    def _tokenize(text):
        # Whitespace-only tokens carry no content and would otherwise be
        # counted as words in the n-gram statistics.
        return [token for token in jieba.cut(text) if token.strip() != ""]

    # corpus_gleu expects token sequences. A space-joined string would be
    # iterated character by character (spaces included), discarding the
    # jieba segmentation.
    hyp_tokens = [_tokenize(hyp) or ["无内容"] for hyp in hyps]
    ref_tokens = [[_tokenize(ref)] for ref in refs]

    return corpus_gleu(ref_tokens, hyp_tokens)
|
|