# Source: api-demo / opencompass-my-api / build/lib/opencompass/datasets/subjective/information_retrival.py
# flake8: noqa: E501
"""Subjective information-retrieval dataset.

Wraps ``SubjectiveCmpDataset`` and attaches a GPT-judge prompt
(English or Chinese, per sample) to every loaded sample.
"""
import json
import os.path as osp
import re
from typing import Optional

from datasets import Dataset, DatasetDict

from opencompass.registry import LOAD_DATASET

from .subjective_cmp import SubjectiveCmpDataset
# English judge prompt. The judge must answer [[Correct]] / [[Wrong]];
# the reference is a *candidate list*, so matching any one entry counts
# as correct. Placeholders: {question}, {ref}. The model's response is
# appended after this prefix and closed by the English suffix built in
# prompt_construct().
eng_base_prefix = """
You are a judger. Please impartially judge whether an AI model's response to a question is correct based on the reference answers. You need to provide a conclusion of "correct" or "wrong," followed by the corresponding reasoning.

Note that since the reference answer is a candidate list, the AI model's response only needs to align with one item in the list to be deemed "correct."

Your judgment must strictly adhere to the following format:
Conclusion: [[Correct]]
Reasoning: xxx.

Conclusion: [[Wrong]]
Reasoning: xxx.

[Question Start]
{question}
[Question End]

[Reference Answers Start]
{ref}
[Reference Answers End]

[Model Response Start]
"""
# Chinese judge prompt — mirror of eng_base_prefix. The judge answers
# [[对]] (correct) / [[错]] (wrong) against a candidate answer list.
# Placeholders: {question}, {ref}. Runtime string is intentionally in
# Chinese; do not translate.
chn_base_prefix = """
你是一个评判者,请你基于参考答案,公正地评判一个AI模型对于问题的回答是否正确。你需要给出“对或错”的结论,然后再给出相应的理由。

请注意,由于参考答案是一个候选列表,因此AI模型的回答只要符合列表中的某一项即可判断为“对”。

你的评判必须严格遵守以下格式:
结论:[[对]]
理由:xxx。

结论:[[错]]
理由:xxx。

[问题开始]
{question}
[问题结束]

[参考答案开始]
{ref}
[参考答案结束]

[模型回答开始]
"""
def prompt_construct(sample):
    """Build the judge prompt prefix and closing suffix for one sample.

    Args:
        sample: dict with keys ``'question'`` and ``'others'``; ``others``
            must contain ``'lan'`` (``'zh'`` or ``'en'``) and ``'answers'``
            (the candidate reference-answer list).

    Returns:
        Tuple ``(prefix, suffix)`` — ``prefix`` is the language-matched
        judge instruction with the question and stringified reference
        answers filled in; ``suffix`` closes the model-response section.

    Raises:
        ValueError: if ``lan`` is neither ``'zh'`` nor ``'en'``.
    """
    lan = sample['others']['lan']
    question = sample['question']
    ref = str(sample['others']['answers'])
    if lan == 'zh':
        prefix = chn_base_prefix.format(question=question, ref=ref)
        suffix = '\n[模型回答结束]\n'
    elif lan == 'en':
        prefix = eng_base_prefix.format(question=question, ref=ref)
        suffix = '\n[Model Response End]\n'
    else:
        # Previously an unknown language fell through and crashed with
        # UnboundLocalError at the return; fail loudly and clearly instead.
        raise ValueError(f"Unsupported language tag {lan!r}; expected 'zh' or 'en'")
    return prefix, suffix
class IRDataset(SubjectiveCmpDataset):
    """Subjective information-retrieval dataset.

    Extends the base ``load`` by attaching to each sample the GPT-judge
    prompt prefix/suffix and the stringified reference-answer list.
    """

    def load(
        self,
        path: str,
        name: str,
    ):
        """Load samples via the parent class and enrich each for judging."""
        enriched = []
        for sample in super().load(path, name):
            prefix, suffix = prompt_construct(sample)
            sample['gpt4_prefix'] = prefix
            sample['gpt4_suffix'] = suffix
            # Mirror the raw metadata onto the judge record so the judge
            # side sees the same 'others' payload.
            sample['judge']['others'] = sample['others']
            sample['ref'] = str(sample['others']['answers'])
            enriched.append(sample)
        return Dataset.from_list(enriched)