UltraRonin committed on
Commit
3a8cf08
·
1 Parent(s): 84010af
Files changed (6)
  1. app.py +33 -41
  2. src/about.py +18 -24
  3. src/envs.py +3 -2
  4. src/evaluation.py +423 -0
  5. src/leaderboard/read_evals.py +1 -1
  6. src/populate.py +2 -2
app.py CHANGED
@@ -4,6 +4,7 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
+from datasets import load_dataset
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -27,9 +28,10 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.envs import API, EVAL_RESULTS_PATH, GOLDEN_REPO, REPO_ID, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from src.evaluation import evaluate
 
 import pdb
 
@@ -52,16 +54,17 @@ def restart_space():
 # except Exception:
 #     restart_space()
 
-task = ['Overall', 'Acrostic', 'Crossword', 'Cryptogram', 'Logic_Puzzle', 'Sudoku', 'Drop_Quote']
+try:
+    golden = load_dataset(GOLDEN_REPO, token=TOKEN)
+    print(golden)
+except Exception:
+    restart_space()
+
+task = ['Overall', 'Crossword', 'Acrostic', 'Logic_Puzzle', 'Cryptogram', 'Sudoku', 'Drop_Quote']
 leaderboard_dict = {}
 for t in task:
-    leaderboard_dict[t] = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, task=t)
+    leaderboard_dict[t] = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, task=t)
 
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
@@ -82,43 +85,31 @@ def init_leaderboard(dataframe):
         column_widths=[180, 60, 80, 80, 80, 80, 60],
     )
 
-    # return Leaderboard(
-    #     value=dataframe,
-    #     datatype=[c.type for c in fields(AutoEvalColumn)],
-    #     select_columns=SelectColumns(
-    #         default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-    #         cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-    #         label="Select Columns to Display:",
-    #     ),
-    #     # search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-    #     # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-    #     # filter_columns=[
-    #     #     ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-    #     #     ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-    #     #     ColumnFilter(
-    #     #         AutoEvalColumn.params.name,
-    #     #         type="slider",
-    #     #         min=0.01,
-    #     #         max=150,
-    #     #         label="Select the number of parameters (B)",
-    #     #     ),
-    #     #     ColumnFilter(
-    #     #         AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-    #     #     ),
-    #     # ],
-    #     # bool_checkboxgroup_label="Hide models",
-    #     interactive=False,
-    # )
-
-def process_json(file):
-    """Read the user-uploaded JSON file and return the parsed data."""
+
+
+def eval_json(file):
     try:
         with open(file.name, 'r', encoding='utf-8') as f:
             data = json.load(f)
-        return json.dumps(data, indent=4, ensure_ascii=False)  # format the JSON for display
+
+        tasks = ["crossword", "acrostic", "logic", "cryptogram", "sudoku", "drop"]
+
+        eval_dict = {}
+
+        for task in tasks:
+            data_list = data["results"][task]
+            golden_list = golden[task]
+            result = evaluate(data_list, golden_list, task)
+            eval_dict[task] = result
+
+        return json.dumps(eval_dict, indent=4)
+
+
     except Exception as e:
         return str(e)
 
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -143,12 +134,13 @@ with demo:
             gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
 
             gr.Markdown("## Submission Template", elem_classes="markdown-text")
+            gr.Markdown("See [submission_template.json](https://github.com/Ultramarine-spec/LR2Bench/blob/main/submission_template.json) for detail.", elem_classes="markdown-text")
             gr.Markdown(SUBMIT_TEMPLATE, elem_classes="markdown-text", height=250)
 
             file_input = gr.File(label="Upload JSON File", file_types=[".json"], height=150)
-            json_output = gr.JSON(label="Parsed JSON Data")  # output the JSON data
+            json_output = gr.JSON(label="Your Model Performance")  # output the JSON data
             submit_button = gr.Button("Submit")
-            submit_button.click(fn=process_json, inputs=file_input, outputs=json_output)
+            submit_button.click(fn=eval_json, inputs=file_input, outputs=json_output)
 
 
     with gr.Row():
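For context, the Submit button now scores the uploaded file directly instead of echoing it back. A minimal sketch of that flow outside the Gradio UI (illustrative only: it assumes `golden` has already been loaded at startup via `load_dataset(GOLDEN_REPO, token=TOKEN)` as in the new code above, and the file name `submission.json` is a placeholder for a file following the template in `src/about.py`):

```python
# Illustrative sketch of the upload -> scoring path wired to the Submit button.
# Assumes `golden` is the dataset loaded at startup and behaves like a dict of
# per-task record lists; "submission.json" is a hypothetical user submission.
import json

from src.evaluation import evaluate

with open("submission.json", "r", encoding="utf-8") as f:
    data = json.load(f)

scores = {}
for task in ["crossword", "acrostic", "logic", "cryptogram", "sudoku", "drop"]:
    # golden[task] yields {"tag", "level", "answer"} records; the submission's
    # data["results"][task] entries are matched against them by tag and level.
    scores[task] = evaluate(data["results"][task], golden[task], task)

print(json.dumps(scores, indent=4))  # same per-task metric dict that gr.JSON displays
```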
src/about.py CHANGED
@@ -64,30 +64,24 @@ SUBMIT_TEMPLATE = """
     "show_on_leaderboard": true, # whether to show your model on the leaderboard
   },
   "results": {
-    "Acrostic": {
-      "TAG1": "RESPONSE1",
-      "TAG2": "RESPONSE2",
-    },
-    "Crossword": {
-      "TAG1": "RESPONSE1",
-      "TAG2": "RESPONSE2",
-    },
-    "Cryptogram": {
-      "TAG1": "RESPONSE1",
-      "TAG2": "RESPONSE2",
-    },
-    "Logic_Puzzle": {
-      "TAG1": "RESPONSE1",
-      "TAG2": "RESPONSE2",
-    },
-    "Sudoku": {
-      "TAG1": "RESPONSE1",
-      "TAG2": "RESPONSE2",
-    },
-    "Drop_Quote": {
-      "TAG1": "RESPONSE1",
-      "TAG2": "RESPONSE2",
-    }
+    "crossword": [
+      {"tag": "TAG", "level": "LEVEL", "answer": "ANSWER"},
+    ],
+    "acrostic": [
+      {"tag": "TAG", "level": "LEVEL", "answer": "ANSWER"},
+    ],
+    "logic": [
+      {"tag": "TAG", "level": "LEVEL", "answer": "ANSWER"},
+    ],
+    "cryptogram": [
+      {"tag": "TAG", "level": "LEVEL", "answer": "ANSWER"},
+    ],
+    "sudoku": [
+      {"tag": "TAG", "level": "LEVEL", "answer": "ANSWER"},
+    ],
+    "drop": [
+      {"tag": "TAG", "level": "LEVEL", "answer": "ANSWER"},
+    ]
   }
 }
 ```
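To make the new schema concrete, a hypothetical filled-in fragment of the `results` field is shown below as a Python dict. The tag, level, and answer values are invented placeholders; real submissions must use the tag/level values from the golden dataset, with each `answer` serialized as a string that parses to the structure the task's scorer expects (a clue-to-answer dict for crossword/acrostic/cryptogram, a list of dicts for logic, a grid for sudoku/drop).

```python
# Hypothetical fragment of "results" (placeholder tags, levels, and answers).
example_results = {
    "crossword": [
        {"tag": "0", "level": "5_5", "answer": "{'1 Across': 'APPLE', '1 Down': 'ARGUE'}"},
    ],
    "logic": [
        {"tag": "3", "level": "4_4", "answer": "[{'name': 'alice', 'pet': 'cat', 'color': 'red'}]"},
    ],
    "sudoku": [
        {"tag": "7", "level": "4_4_easy", "answer": "[[1, 2, 3, 4], [3, 4, 1, 2], [2, 1, 4, 3], [4, 3, 2, 1]]"},
    ],
}
# The scorers in src/evaluation.py parse each answer string into its structure:
print(type(eval(example_results["sudoku"][0]["answer"])))  # <class 'list'>
```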
src/envs.py CHANGED
@@ -9,9 +9,10 @@ TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 OWNER = "UltraRonin" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/leaderboard"
+REPO_ID = f"{OWNER}/LR2Bench"
+GOLDEN_REPO = f"{OWNER}/LR2Bench_answer"
 QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/LR2Bench"
+
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
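A small sketch of how the new `GOLDEN_REPO` constant is consumed (this mirrors the startup code added in app.py; the split names and record fields below are expectations inferred from the evaluation code, not guaranteed by the commit):

```python
# Sketch only (assumes read access to the dataset): load the reference answers and peek at one record.
from datasets import load_dataset
from src.envs import GOLDEN_REPO, TOKEN

golden = load_dataset(GOLDEN_REPO, token=TOKEN)
print(list(golden.keys()))     # expected: ['crossword', 'acrostic', 'logic', 'cryptogram', 'sudoku', 'drop']
print(golden["crossword"][0])  # expected fields: 'tag', 'level', 'answer' (answer is a JSON-encoded string)
```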
src/evaluation.py ADDED
@@ -0,0 +1,423 @@
+import json
+import traceback
+from collections import defaultdict
+
+
+level_dict = {
+    "crossword": ["5_5", "10_10", "15_15"],
+    "acrostic": ["easy", "hard"],
+    "logic": ["4_4", "4_5", "4_6", "4_7"],
+    "cryptogram": ["easy", "hard"],
+    "sudoku": ["4_4_easy", "4_4_hard", "9_9_easy", "9_9_hard"],
+    "drop": ["easy", "hard"]
+}
+
+
+def norm_dict(d):
+    if d:
+        return {str(key).lower(): str(value).replace(" ", "").lower() for key, value in d.items()}
+    else:
+        return {}
+
+
+def calculate_dict_correct(gold, prediction_text):
+    try:
+        prediction = eval(prediction_text)
+        gold = norm_dict(gold)
+        prediction = norm_dict(prediction)
+
+        matching_dict = {}
+        correct_cnt = 0
+
+        for key, gold_value in gold.items():
+            predicted_value = prediction.get(key, "MISSING")
+            is_correct = (gold_value == predicted_value)
+            correct_cnt += is_correct
+            matching_dict[key] = {
+                "gold": gold_value,
+                "model": predicted_value,
+                "correct": is_correct
+            }
+
+        correct_100 = (correct_cnt == len(gold))
+        correct_50 = (correct_cnt / len(gold) >= 0.5)
+
+
+    except Exception as e:
+        print(prediction_text)
+        print(f"Error: {e}")
+        print(traceback.format_exc())
+
+        correct_cnt = 0
+        correct_100 = False
+        correct_50 = False
+
+        matching_dict = {
+            key: {
+                "gold": gold[key],
+                "model": f"ERROR: {str(e)}",
+                "correct": False
+            }
+            for key in gold.keys()
+        }
+
+    return correct_cnt, correct_100, correct_50, matching_dict
+
+
+def calculate_logic_answer_correct(gold, prediction_text):
+    def norm(ans):
+        return [{str(key).lower(): str(value).lower() for key, value in d.items()} for d in ans]
+    try:
+        prediction = eval(prediction_text)
+        gold = norm(gold)
+        prediction = norm(prediction)
+    except Exception as e:
+        print(f"Error: {e}")
+        print(traceback.format_exc())
+        prediction = []
+
+    correct_cnt = 0
+    all_cnt = 0
+    for d_gold in gold:
+        first_pair = list(d_gold.items())[0]
+        d_prediction = [d for d in prediction if first_pair in list(d.items())]
+        if not d_prediction:
+            d_prediction = {}
+        else:
+            d_prediction = d_prediction[0]
+
+        for key, gold_value in d_gold.items():
+            if key == first_pair[0]:
+                continue
+            all_cnt += 1
+            predicted_value = d_prediction.get(key, "")
+            if gold_value == predicted_value:
+                correct_cnt += 1
+
+    correct_100 = (correct_cnt == all_cnt)
+    correct_50 = (correct_cnt / all_cnt >= 0.5)
+
+    return correct_cnt, all_cnt, correct_100, correct_50
+
+
+def calculate_sudoku_answer_correct(grid, gold, prediction_text):
+    try:
+        prediction = eval(prediction_text)
+    except Exception as e:
+        print(f"Error: {e}")
+        print(traceback.format_exc())
+        prediction = [[]]
+
+    all_cnt = sum([row.count(0) for row in grid])
+    correct_cnt = 0
+    for i in range(min(len(gold), len(prediction))):
+        for j in range(min(len(gold[i]), len(prediction[i]))):
+            if gold[i][j] == prediction[i][j] and grid[i][j] == 0:
+                correct_cnt += 1
+
+    if correct_cnt > all_cnt:
+        print("Error: correct_cnt > all_cnt")
+        correct_cnt = all_cnt
+
+    correct_100 = (correct_cnt == all_cnt)
+    correct_50 = (correct_cnt / all_cnt >= 0.5)
+
+    return correct_cnt, all_cnt, correct_100, correct_50
+
+
+def calculate_drop_answer_correct(gold, prediction_text):
+    try:
+        prediction = eval(prediction_text)
+    except Exception as e:
+        print(f"Error: {e}")
+        print(traceback.format_exc())
+        prediction = [[]]
+
+    all_cnt = len([x for row in gold for x in row if x != "#"])
+    correct_cnt = 0
+    for i in range(min(len(gold), len(prediction))):
+        for j in range(min(len(gold[i]), len(prediction[i]))):
+            if gold[i][j] != "#" and gold[i][j] == prediction[i][j]:
+                correct_cnt += 1
+
+    if correct_cnt > all_cnt:
+        print("Error: correct_cnt > all_cnt")
+        correct_cnt = all_cnt
+
+    correct_100 = (correct_cnt == all_cnt)
+    correct_50 = (correct_cnt / all_cnt >= 0.5)
+
+    return correct_cnt, all_cnt, correct_100, correct_50
+
+
+
+
+def eval_crossword(data_list, golden_list):
+    eval_dict = defaultdict(dict)
+    for level in level_dict["crossword"]:
+        golden = [g for g in golden_list if g["level"] == level]
+        golden_dict = {g["tag"]: g for g in golden}
+
+        data = [d for d in data_list if d["level"] == level]
+
+        answer_exist_cnt = 0
+        subtask_cnt = 0
+        subtask_correct_cnt = 0
+
+        sample_correct_100_cnt = 0
+        sample_correct_50_cnt = 0
+
+
+        for d in data:
+            tag = str(d["tag"])
+            model_answer = d['answer']
+            gold = json.loads(golden_dict[tag]['answer'])
+
+            if model_answer != "{}":
+                answer_exist_cnt += 1
+
+            curr_subtask_correct_cnt, curr_correct_100, curr_correct_50, matching_dict = calculate_dict_correct(gold, model_answer)
+
+            subtask_cnt += len(gold)
+            subtask_correct_cnt += curr_subtask_correct_cnt
+
+            sample_correct_100_cnt += curr_correct_100
+            sample_correct_50_cnt += curr_correct_50
+
+        eval_dict[level] = {
+            "CR": answer_exist_cnt / len(data),
+            "S-Acc": subtask_correct_cnt / subtask_cnt,
+            "EM": sample_correct_100_cnt / len(data),
+            "PM-0.5": sample_correct_50_cnt / len(data),
+        }
+
+    return eval_dict
+
+
+def eval_acrostic(data_list, golden_list):
+    eval_dict = defaultdict(dict)
+    for level in level_dict["acrostic"]:
+        golden = [g for g in golden_list if g["level"] == level]
+        golden_dict = {g["tag"]: g for g in golden}
+
+        data = [d for d in data_list if d["level"] == level]
+
+        answer_exist_cnt = 0
+        subtask_cnt = 0
+        subtask_correct_cnt = 0
+
+        sample_correct_100_cnt = 0
+        sample_correct_50_cnt = 0
+
+
+        for d in data:
+            tag = str(d["tag"])
+            model_answer = d['answer']
+            gold = json.loads(golden_dict[tag]['answer'])
+
+            if model_answer != "{}":
+                answer_exist_cnt += 1
+
+            curr_subtask_correct_cnt, curr_correct_100, curr_correct_50, matching_dict = calculate_dict_correct(gold, model_answer)
+
+            subtask_cnt += len(gold)
+            subtask_correct_cnt += curr_subtask_correct_cnt
+
+            sample_correct_100_cnt += curr_correct_100
+            sample_correct_50_cnt += curr_correct_50
+
+        eval_dict[level] = {
+            "CR": answer_exist_cnt / len(data),
+            "S-Acc": subtask_correct_cnt / subtask_cnt,
+            "EM": sample_correct_100_cnt / len(data),
+            "PM-0.5": sample_correct_50_cnt / len(data),
+        }
+
+    return eval_dict
+
+
+def eval_logic(data_list, golden_list):
+    eval_dict = defaultdict(dict)
+    for level in level_dict["logic"]:
+        golden = [g for g in golden_list if g["level"] == level]
+        golden_dict = {g["tag"]: g for g in golden}
+
+        data = [d for d in data_list if d["level"] == level]
+
+        answer_exist_cnt = 0
+        subtask_cnt = 0
+        subtask_correct_cnt = 0
+
+        sample_correct_100_cnt = 0
+        sample_correct_50_cnt = 0
+
+
+        for d in data:
+            tag = str(d["tag"])
+            model_answer = d['answer']
+            gold = json.loads(golden_dict[tag]['answer'])
+
+            if model_answer != "[]":
+                answer_exist_cnt += 1
+
+            curr_subtask_correct_cnt, curr_subtask_cnt, curr_correct_100, curr_correct_50 = calculate_logic_answer_correct(gold, model_answer)
+
+            subtask_cnt += curr_subtask_cnt
+            subtask_correct_cnt += curr_subtask_correct_cnt
+
+            sample_correct_100_cnt += curr_correct_100
+            sample_correct_50_cnt += curr_correct_50
+
+        eval_dict[level] = {
+            "CR": answer_exist_cnt / len(data),
+            "S-Acc": subtask_correct_cnt / subtask_cnt,
+            "EM": sample_correct_100_cnt / len(data),
+            "PM-0.5": sample_correct_50_cnt / len(data),
+        }
+
+    return eval_dict
+
+
+def eval_cryptogram(data_list, golden_list):
+    eval_dict = defaultdict(dict)
+    for level in level_dict["cryptogram"]:
+        golden = [g for g in golden_list if g["level"] == level]
+        golden_dict = {g["tag"]: g for g in golden}
+
+        data = [d for d in data_list if d["level"] == level]
+
+        answer_exist_cnt = 0
+        subtask_cnt = 0
+        subtask_correct_cnt = 0
+
+        sample_correct_100_cnt = 0
+        sample_correct_50_cnt = 0
+
+
+        for d in data:
+            tag = str(d["tag"])
+            model_answer = d['answer']
+            gold = json.loads(golden_dict[tag]['answer'])
+
+            if model_answer != "{}":
+                answer_exist_cnt += 1
+
+            curr_subtask_correct_cnt, curr_correct_100, curr_correct_50, matching_dict = calculate_dict_correct(gold, model_answer)
+
+            subtask_cnt += len(gold)
+            subtask_correct_cnt += curr_subtask_correct_cnt
+
+            sample_correct_100_cnt += curr_correct_100
+            sample_correct_50_cnt += curr_correct_50
+
+        eval_dict[level] = {
+            "CR": answer_exist_cnt / len(data),
+            "S-Acc": subtask_correct_cnt / subtask_cnt,
+            "EM": sample_correct_100_cnt / len(data),
+            "PM-0.5": sample_correct_50_cnt / len(data),
+        }
+
+    return eval_dict
+
+
+def eval_sudoku(data_list, golden_list):
+    eval_dict = defaultdict(dict)
+    for level in level_dict["sudoku"]:
+        golden = [g for g in golden_list if g["level"] == level]
+        golden_dict = {g["tag"]: g for g in golden}
+
+        data = [d for d in data_list if d["level"] == level]
+
+        answer_exist_cnt = 0
+        subtask_cnt = 0
+        subtask_correct_cnt = 0
+
+        sample_correct_100_cnt = 0
+        sample_correct_50_cnt = 0
+
+
+        for d in data:
+            tag = str(d["tag"])
+            model_answer = d['answer']
+            gold = json.loads(golden_dict[tag]['answer'])
+            grid = gold["grid"]
+            gold = gold["answer"]
+
+            if model_answer != "[[]]":
+                answer_exist_cnt += 1
+
+            curr_subtask_correct_cnt, curr_subtask_cnt, curr_correct_100, curr_correct_50 = calculate_sudoku_answer_correct(grid, gold, model_answer)
+
+            subtask_cnt += curr_subtask_cnt
+            subtask_correct_cnt += curr_subtask_correct_cnt
+
+            sample_correct_100_cnt += curr_correct_100
+            sample_correct_50_cnt += curr_correct_50
+
+        eval_dict[level] = {
+            "CR": answer_exist_cnt / len(data),
+            "S-Acc": subtask_correct_cnt / subtask_cnt,
+            "EM": sample_correct_100_cnt / len(data),
+            "PM-0.5": sample_correct_50_cnt / len(data),
+        }
+
+    return eval_dict
+
+
+def eval_drop(data_list, golden_list):
+    eval_dict = defaultdict(dict)
+    for level in level_dict["drop"]:
+        golden = [g for g in golden_list if g["level"] == level]
+        golden_dict = {g["tag"]: g for g in golden}
+
+        data = [d for d in data_list if d["level"] == level]
+
+        answer_exist_cnt = 0
+        subtask_cnt = 0
+        subtask_correct_cnt = 0
+
+        sample_correct_100_cnt = 0
+        sample_correct_50_cnt = 0
+
+
+        for d in data:
+            tag = str(d["tag"])
+            model_answer = d['answer']
+            gold = json.loads(golden_dict[tag]['answer'])
+
+            if model_answer != "[[]]":
+                answer_exist_cnt += 1
+
+            curr_subtask_correct_cnt, curr_subtask_cnt, curr_correct_100, curr_correct_50 = calculate_drop_answer_correct(gold, model_answer)
+
+            subtask_cnt += curr_subtask_cnt
+            subtask_correct_cnt += curr_subtask_correct_cnt
+
+            sample_correct_100_cnt += curr_correct_100
+            sample_correct_50_cnt += curr_correct_50
+
+        eval_dict[level] = {
+            "CR": answer_exist_cnt / len(data),
+            "S-Acc": subtask_correct_cnt / subtask_cnt,
+            "EM": sample_correct_100_cnt / len(data),
+            "PM-0.5": sample_correct_50_cnt / len(data),
+        }
+
+    return eval_dict
+
+
+def evaluate(data_list, golden_list, task):
+    if task == "crossword":
+        return eval_crossword(data_list, golden_list)
+    elif task == "acrostic":
+        return eval_acrostic(data_list, golden_list)
+    elif task == "logic":
+        return eval_logic(data_list, golden_list)
+    elif task == "cryptogram":
+        return eval_cryptogram(data_list, golden_list)
+    elif task == "sudoku":
+        return eval_sudoku(data_list, golden_list)
+    elif task == "drop":
+        return eval_drop(data_list, golden_list)
+    else:
+        raise ValueError(f"Invalid task: {task}")
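A toy walk-through of the scoring primitive above (invented clues, not benchmark data): `calculate_dict_correct` parses the model's answer string with `eval`, normalizes keys and values via `norm_dict`, and counts per-key matches, from which the CR / S-Acc / EM / PM-0.5 metrics are aggregated per level. Note that `evaluate` expects predictions for every level of a task, since each level's metrics divide by `len(data)`; `ast.literal_eval` would be a stricter alternative to `eval` for literal-only answer strings, if that trade-off were ever wanted.

```python
# Toy example of the per-sample scoring; the clues and answers are invented.
from src.evaluation import calculate_dict_correct

gold = {"1 Across": "APPLE", "2 Across": "PEAR", "1 Down": "PLUM", "2 Down": "KIWI"}
pred_text = "{'1 Across': 'APPLE', '2 Across': 'PEAR', '1 Down': 'PLUM', '2 Down': 'LIME'}"

correct_cnt, exact, half, detail = calculate_dict_correct(gold, pred_text)
print(correct_cnt, exact, half)  # -> 3 False True  (3/4 sub-answers correct: no EM, but PM-0.5 is met)
print(detail["1 down"])          # keys are lower-cased by norm_dict; shows gold vs. model per clue
```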
src/leaderboard/read_evals.py CHANGED
@@ -175,7 +175,7 @@ class EvalResult:
     # return request_file
 
 
-def get_raw_eval_results(results_path: str, requests_path: str, task: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str, task: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
src/populate.py CHANGED
@@ -8,10 +8,10 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, task) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, cols: list, task) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     # import pdb; pdb.set_trace()
-    raw_data = get_raw_eval_results(results_path, requests_path, task)
+    raw_data = get_raw_eval_results(results_path, task)
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
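The simplified signatures propagate through the whole read path: app.py now builds each per-task leaderboard without threading `requests_path` or `benchmark_cols` through. A sketch of the resulting call chain (the import location of `COLS` is assumed from the standard leaderboard template this Space is based on):

```python
# Sketch of the post-commit call chain; COLS is assumed to live in src.display.utils.
from src.display.utils import COLS
from src.envs import EVAL_RESULTS_PATH
from src.populate import get_leaderboard_df

tasks = ['Overall', 'Crossword', 'Acrostic', 'Logic_Puzzle', 'Cryptogram', 'Sudoku', 'Drop_Quote']
leaderboard_dict = {}
for t in tasks:
    # internally calls get_raw_eval_results(EVAL_RESULTS_PATH, t); requests_path and benchmark_cols are gone
    leaderboard_dict[t] = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, task=t)
```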