# flake8: noqa: E501
"""Subjective dataset whose model responses are scored by an LLM judge
against a list of reference answers, with English and Chinese judge prompts."""
from datasets import Dataset

from opencompass.registry import LOAD_DATASET

from .subjective_cmp import SubjectiveCmpDataset

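# Judge prompt template for English samples: asks the judge model for a
# [[Correct]] / [[Wrong]] verdict followed by its reasoning, given the
# question and the list of candidate reference answers.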
eng_base_prefix = """
You are a judge. Please impartially judge whether an AI model's response to a question is correct based on the reference answers. You need to provide a conclusion of "correct" or "wrong", followed by the corresponding reasoning.

Note that since the reference answer is a candidate list, the AI model's response only needs to align with one item in the list to be deemed "correct."

Your judgment must strictly adhere to the following format:
Conclusion: [[Correct]]
Reasoning: xxx.

Conclusion: [[Wrong]]
Reasoning: xxx.

[Question Start]
{question}
[Question End]

[Reference Answers Start]
{ref}
[Reference Answers End]

[Model Response Start]
"""

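# Chinese counterpart of the prompt above. English gloss: "You are a judge;
# based on the reference answers, impartially decide whether the AI model's
# answer is correct. Give a verdict of [[对]] (correct) or [[错]] (wrong),
# followed by the reasoning. Since the reference answer is a candidate list,
# matching any single item counts as correct."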
chn_base_prefix = """
你是一个评判者,请你基于参考答案,公正地评判一个AI模型对于问题的回答是否正确。你需要给出“对或错”的结论,然后再给出相应的理由。
请注意,由于参考答案是一个候选列表,因此AI模型的回答只要符合列表中的某一项即可判断为“对”。
你的评判必须严格遵守以下格式:
结论:[[对]]
理由:xxx。

结论:[[错]]
理由:xxx。

[问题开始]
{question}
[问题结束]

[参考答案开始]
{ref}
[参考答案结束]

[模型回答开始]
"""


def prompt_construct(sample):
    """Build the judge prompt for one sample.

    Returns a ``(prefix, suffix)`` pair in the language tagged under
    ``sample['others']['lan']``; the model's response is inserted between
    the two when the full judge prompt is assembled.
    """
    lan = sample['others']['lan']
    question = sample['question']
    ref = str(sample['others']['answers'])
    if lan == 'zh':
        prefix = chn_base_prefix.format(question=question, ref=ref)
        suffix = '\n[模型回答结束]\n'
    elif lan == 'en':
        prefix = eng_base_prefix.format(question=question, ref=ref)
        suffix = '\n[Model Response End]\n'
    else:
        raise ValueError(f"Unsupported language tag {lan!r}; expected 'zh' or 'en'.")
    return prefix, suffix


@LOAD_DATASET.register_module()
class IRDataset(SubjectiveCmpDataset):
    """Dataset whose samples carry pre-built judge prompts and stringified
    reference answers for subjective evaluation."""

    def load(self, path: str, name: str) -> Dataset:
        dataset = list(super().load(path, name))
        subject_dataset = []
        for data in dataset:
            # Pre-rendered judge prompt pieces; the model's response is
            # placed between the prefix and the suffix at evaluation time.
            data['gpt4_prefix'], data['gpt4_suffix'] = prompt_construct(data)
            # Copy the per-sample metadata into the judge field as well.
            data['judge']['others'] = data['others']
            # Stringified list of candidate reference answers.
            data['ref'] = str(data['others']['answers'])
            subject_dataset.append(data)
        return Dataset.from_list(subject_dataset)
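

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only): a hand-written sample that
    # mimics the record schema expected by prompt_construct ('question' plus
    # 'others.lan' / 'others.answers'). Real records are produced by
    # SubjectiveCmpDataset.load(), so the values below are assumptions.
    demo_sample = {
        'question': 'In which year did the first crewed moon landing happen?',
        'others': {
            'lan': 'en',
            'answers': ['1969'],
        },
    }
    prefix, suffix = prompt_construct(demo_sample)
    model_response = 'It happened in 1969.'
    # The judge model is shown: prefix + model response + suffix.
    print(prefix + model_response + suffix)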