Spaces:

UltraRonin
/

LR2Bench

Running

App Files Files Community

UltraRonin committed on Mar 13

Commit

3a8cf08

1 Parent(s): 84010af

add

Browse files

Files changed (6) hide show

app.py +33 -41
src/about.py +18 -24
src/envs.py +3 -2
src/evaluation.py +423 -0
src/leaderboard/read_evals.py +1 -1
src/populate.py +2 -2

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -27,9 +28,10 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 import pdb
@@ -52,16 +54,17 @@ def restart_space():
 # except Exception:
 #     restart_space()
-task = ['Overall', 'Acrostic', 'Crossword', 'Cryptogram', 'Logic_Puzzle', 'Sudoku', 'Drop_Quote']
 leaderboard_dict = {}
 for t in task:
-    leaderboard_dict[t] = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, task=t)
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
@@ -82,43 +85,31 @@ def init_leaderboard(dataframe):
         column_widths=[180, 60, 80, 80, 80, 80, 60],
     )
-    # return Leaderboard(
-    #     value=dataframe,
-    #     datatype=[c.type for c in fields(AutoEvalColumn)],
-    #     select_columns=SelectColumns(
-    #         default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-    #         cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-    #         label="Select Columns to Display:",
-    #     ),
-    #     # search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-    #     # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-    #     # filter_columns=[
-    #     #     ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-    #     #     ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-    #     #     ColumnFilter(
-    #     #         AutoEvalColumn.params.name,
-    #     #         type="slider",
-    #     #         min=0.01,
-    #     #         max=150,
-    #     #         label="Select the number of parameters (B)",
-    #     #     ),
-    #     #     ColumnFilter(
-    #     #         AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-    #     #     ),
-    #     # ],
-    #     # bool_checkboxgroup_label="Hide models",
-    #     interactive=False,
-    # )
-def process_json(file):
-    """ 读取用户上传的 JSON 文件并返回解析后的数据 """
     try:
         with open(file.name, 'r', encoding='utf-8') as f:
             data = json.load(f)
-        return json.dumps(data, indent=4, ensure_ascii=False)  # 格式化 JSON 以便显示
     except Exception as e:
         return str(e)
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -143,12 +134,13 @@ with demo:
                 gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
             gr.Markdown("## Submission Template", elem_classes="markdown-text")
             gr.Markdown(SUBMIT_TEMPLATE, elem_classes="markdown-text", height=250)
             file_input = gr.File(label="Upload JSON File", file_types=[".json"], height=150)
-            json_output = gr.JSON(label="Parsed JSON Data")  # 输出 JSON 数据
             submit_button = gr.Button("Submit")
-            submit_button.click(fn=process_json, inputs=file_input, outputs=json_output)
     with gr.Row():

 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
+from datasets import load_dataset
 from src.about import (
     CITATION_BUTTON_LABEL,
     WeightType,
     Precision
 )
+from src.envs import API, EVAL_RESULTS_PATH, GOLDEN_REPO, REPO_ID, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from src.evaluation import evaluate
 import pdb
 # except Exception:
 #     restart_space()
+try:
+    golden = load_dataset(GOLDEN_REPO, token=TOKEN)
+    print(golden)
+except Exception:
+    restart_space()
+task = ['Overall', 'Crossword', 'Acrostic', 'Logic_Puzzle', 'Cryptogram', 'Sudoku', 'Drop_Quote']
 leaderboard_dict = {}
 for t in task:
+    leaderboard_dict[t] = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, task=t)
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         column_widths=[180, 60, 80, 80, 80, 80, 60],
     )
def eval_json(file):
    """Score an uploaded submission file against the golden answers.

    Args:
        file: The value delivered by the ``gr.File`` input. Its ``name``
            attribute is used as the path of the JSON file to open. ``None``
            is handled explicitly (presumably what Gradio passes when nothing
            was uploaded -- TODO confirm for the installed version).

    Returns:
        A JSON string. On success it maps each task name to the result of
        ``evaluate`` for that task; on failure it is ``{"error": "..."}``.
    """
    if file is None:
        return json.dumps({"error": "Please upload a JSON submission file before submitting."})
    try:
        with open(file.name, 'r', encoding='utf-8') as f:
            data = json.load(f)
        tasks = ["crossword", "acrostic", "logic", "cryptogram", "sudoku", "drop"]
        # `name` rather than `task`, so the module-level `task` list is not shadowed.
        eval_dict = {
            name: evaluate(data["results"][name], golden[name], name)
            for name in tasks
        }
        return json.dumps(eval_dict, indent=4)
    except Exception as e:
        # Deliberate catch-all at the UI boundary: any failure is reported to
        # the user instead of crashing the handler. The message is wrapped in a
        # JSON object because the output component is gr.JSON and a bare
        # message such as "division by zero" is not a valid JSON document.
        return json.dumps({"error": f"{type(e).__name__}: {e}"})
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
                 gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
             gr.Markdown("## Submission Template", elem_classes="markdown-text")
+            gr.Markdown("See [submission_template.json](https://github.com/Ultramarine-spec/LR2Bench/blob/main/submission_template.json) for detail.", elem_classes="markdown-text")
             gr.Markdown(SUBMIT_TEMPLATE, elem_classes="markdown-text", height=250)
             file_input = gr.File(label="Upload JSON File", file_types=[".json"], height=150)
+            json_output = gr.JSON(label="Your Model Performance")  # 输出 JSON 数据
             submit_button = gr.Button("Submit")
+            submit_button.click(fn=eval_json, inputs=file_input, outputs=json_output)
     with gr.Row():

src/about.py CHANGED Viewed

@@ -64,30 +64,24 @@ SUBMIT_TEMPLATE = """
         "show_on_leaderboard": true, # whether to show your model on the leaderboard
     },
     "results": {
-        "Acrostic": {
-            "TAG1": "RESPONSE1",
-            "TAG2": "RESPONSE2",
-        },
-        "Crossword": {
-            "TAG1": "RESPONSE1",
-            "TAG2": "RESPONSE2",
-        },
-        "Cryptogram": {
-            "TAG1": "RESPONSE1",
-            "TAG2": "RESPONSE2",
-        },
-        "Logic_Puzzle": {
-            "TAG1": "RESPONSE1",
-            "TAG2": "RESPONSE2",
-        },
-        "Sudoku": {
-            "TAG1": "RESPONSE1",
-            "TAG2": "RESPONSE2",
-        },
-        "Drop_Quote": {
-            "TAG1": "RESPONSE1",
-            "TAG2": "RESPONSE2",
-        }
     }
 }
 ```

         "show_on_leaderboard": true, # whether to show your model on the leaderboard
     },
     "results": {
+        "crossword": [
+            {"tag": "TAG", "level": "LEVEL", "answer": "ANSWER"},
+        ],
+        "acrostic": [
+            {"tag": "TAG", "level": "LEVEL", "answer": "ANSWER"},
+        ],
+        "logic": [
+            {"tag": "TAG", "level": "LEVEL", "answer": "ANSWER"},
+        ],
+        "cryptogram": [
+            {"tag": "TAG", "level": "LEVEL", "answer": "ANSWER"},
+        ],
+        "sudoku": [
+            {"tag": "TAG", "level": "LEVEL", "answer": "ANSWER"},
+        ],
+        "drop": [
+            {"tag": "TAG", "level": "LEVEL", "answer": "ANSWER"},
+        ]
     }
 }
 ```

src/envs.py CHANGED Viewed

@@ -9,9 +9,10 @@ TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 OWNER = "UltraRonin" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
-REPO_ID = f"{OWNER}/leaderboard"
 QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/LR2Bench"
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")

 OWNER = "UltraRonin" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
+REPO_ID = f"{OWNER}/LR2Bench"
+GOLDEN_REPO = f"{OWNER}/LR2Bench_answer"
 QUEUE_REPO = f"{OWNER}/requests"
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")

src/evaluation.py ADDED Viewed

	@@ -0,0 +1,423 @@

+import json
+import traceback
+from collections import defaultdict
# Difficulty levels scored for each task; the eval_* functions iterate these
# lists in order and emit one metrics entry per level.
level_dict = dict(
    crossword=["5_5", "10_10", "15_15"],
    acrostic=["easy", "hard"],
    logic=["4_4", "4_5", "4_6", "4_7"],
    cryptogram=["easy", "hard"],
    sudoku=["4_4_easy", "4_4_hard", "9_9_easy", "9_9_hard"],
    drop=["easy", "hard"],
)
def norm_dict(d):
    """Normalise a dict so answers compare case- and space-insensitively.

    Keys are converted to lower-case strings; values are converted to strings,
    have every space character removed and are lower-cased. Any falsy ``d``
    (``None``, ``{}``, ...) yields an empty dict.
    """
    if not d:
        return {}
    return {str(key).lower(): str(value).replace(" ", "").lower() for key, value in d.items()}


def calculate_dict_correct(gold, prediction_text):
    """Score a dict-shaped answer against its gold dict, key by key.

    Args:
        gold: Mapping of sub-question key to expected answer.
        prediction_text: The submitted answer, as the text of a Python dict
            literal.

    Returns:
        ``(correct_cnt, correct_100, correct_50, matching_dict)`` where
        ``correct_cnt`` is the number of gold keys answered correctly,
        ``correct_100`` is True when every gold key is correct, ``correct_50``
        is True when at least half are, and ``matching_dict`` maps each gold
        key to ``{"gold", "model", "correct"}``. A key absent from the
        prediction is recorded as ``"MISSING"``. If the prediction cannot be
        parsed or is not a mapping, nothing is counted as correct and each
        entry's ``"model"`` field carries the error text. An empty ``gold``
        scores ``(0, False, False, {})``.
    """
    import ast  # function-scope import keeps this block self-contained

    try:
        # SECURITY: prediction_text originates from an uploaded submission, so
        # it must never be executed. literal_eval accepts only Python literals
        # (dict/list/str/number/...) and raises on anything else.
        prediction = ast.literal_eval(prediction_text)
        gold = norm_dict(gold)
        prediction = norm_dict(prediction)
        matching_dict = {}
        correct_cnt = 0
        for key, gold_value in gold.items():
            predicted_value = prediction.get(key, "MISSING")
            is_correct = gold_value == predicted_value
            correct_cnt += is_correct
            matching_dict[key] = {
                "gold": gold_value,
                "model": predicted_value,
                "correct": is_correct,
            }
        total = len(gold)
        # With nothing to score, neither threshold is considered met; this is
        # stated explicitly instead of relying on a caught ZeroDivisionError.
        correct_100 = total > 0 and correct_cnt == total
        correct_50 = total > 0 and correct_cnt / total >= 0.5
    except Exception as e:
        # Best-effort scoring: a malformed answer counts as fully wrong rather
        # than aborting the whole evaluation run.
        print(prediction_text)
        print(f"Error: {e}")
        print(traceback.format_exc())
        correct_cnt = 0
        correct_100 = False
        correct_50 = False
        matching_dict = {
            key: {
                "gold": gold[key],
                "model": f"ERROR: {str(e)}",
                "correct": False,
            }
            for key in gold.keys()
        }
    return correct_cnt, correct_100, correct_50, matching_dict
def calculate_logic_answer_correct(gold, prediction_text):
    """Score a logic-puzzle answer given as a list of attribute dicts.

    Each gold row is matched to the first predicted row that contains the gold
    row's first ``(key, value)`` pair; that pair only identifies the row and is
    not scored. Every remaining gold key counts as one subtask. Keys and values
    are compared as lower-cased strings.

    Args:
        gold: List of dicts holding the expected assignment.
        prediction_text: The submitted answer, as the text of a Python literal
            (a list of dicts).

    Returns:
        ``(correct_cnt, all_cnt, correct_100, correct_50)``. An unparsable or
        wrongly shaped prediction scores zero correct. When there is nothing
        to score (``all_cnt == 0``) both flags are False.
    """
    import ast  # function-scope import keeps this block self-contained

    def norm(ans):
        return [{str(key).lower(): str(value).lower() for key, value in d.items()} for d in ans]

    # Gold is normalised outside the try so that a bad prediction can never
    # leave it un-normalised.
    gold = norm(gold)
    try:
        # SECURITY: prediction_text originates from an uploaded submission, so
        # it is parsed as a literal and never executed.
        prediction = norm(ast.literal_eval(prediction_text))
    except Exception as e:
        # Best-effort scoring: a malformed answer counts as fully wrong.
        print(f"Error: {e}")
        print(traceback.format_exc())
        prediction = []

    correct_cnt = 0
    all_cnt = 0
    for d_gold in gold:
        if not d_gold:
            # An empty gold row has no identifying pair and nothing to score.
            continue
        pairs = iter(d_gold.items())
        first_pair = next(pairs)
        d_prediction = next((d for d in prediction if first_pair in d.items()), {})
        for key, gold_value in pairs:
            all_cnt += 1
            if d_prediction.get(key, "") == gold_value:
                correct_cnt += 1

    correct_100 = all_cnt > 0 and correct_cnt == all_cnt
    correct_50 = all_cnt > 0 and correct_cnt / all_cnt >= 0.5
    return correct_cnt, all_cnt, correct_100, correct_50
def calculate_sudoku_answer_correct(grid, gold, prediction_text):
    """Score a sudoku answer on the cells that were blank in the puzzle.

    Args:
        grid: The puzzle as a list of rows; a cell equal to ``0`` is a blank
            that must be filled in.
        gold: The solved grid, same layout as ``grid``.
        prediction_text: The submitted answer, as the text of a Python literal
            (a list of rows).

    Returns:
        ``(correct_cnt, all_cnt, correct_100, correct_50)`` where ``all_cnt``
        is the number of blanks in ``grid`` and ``correct_cnt`` the number of
        blanks whose predicted value equals the gold value. Rows and cells
        missing from the prediction simply score nothing. An unparsable or
        wrongly shaped prediction scores zero. When the puzzle has no blanks
        both flags are False.
    """
    import ast  # function-scope import keeps this block self-contained

    try:
        # SECURITY: prediction_text originates from an uploaded submission, so
        # it is parsed as a literal and never executed.
        prediction = ast.literal_eval(prediction_text)
    except Exception as e:
        # Best-effort scoring: a malformed answer counts as fully wrong.
        print(f"Error: {e}")
        print(traceback.format_exc())
        prediction = [[]]
    if not isinstance(prediction, (list, tuple)):
        # A parsable but non-grid literal (e.g. "5") would otherwise raise
        # TypeError below; treat it as an empty answer.
        print("Error: prediction is not a list of rows")
        prediction = []

    all_cnt = sum(row.count(0) for row in grid)
    correct_cnt = 0
    # zip() stops at the shortest operand, so truncated predictions are safe.
    for grid_row, gold_row, predicted_row in zip(grid, gold, prediction):
        if not isinstance(predicted_row, (list, tuple, str)):
            continue
        for given, expected, guessed in zip(grid_row, gold_row, predicted_row):
            if given == 0 and expected == guessed:
                correct_cnt += 1
    # correct_cnt cannot exceed all_cnt: only blank cells are counted, each at
    # most once, so no clamping is needed.

    correct_100 = all_cnt > 0 and correct_cnt == all_cnt
    correct_50 = all_cnt > 0 and correct_cnt / all_cnt >= 0.5
    return correct_cnt, all_cnt, correct_100, correct_50
def calculate_drop_answer_correct(gold, prediction_text):
    """Score a drop-quote answer cell by cell, ignoring ``"#"`` cells.

    Args:
        gold: The solved grid as a list of rows; cells equal to ``"#"`` are
            not scored.
        prediction_text: The submitted answer, as the text of a Python literal
            (a list of rows).

    Returns:
        ``(correct_cnt, all_cnt, correct_100, correct_50)`` where ``all_cnt``
        is the number of non-``"#"`` gold cells and ``correct_cnt`` the number
        of those cells whose predicted value equals the gold value. Rows and
        cells missing from the prediction score nothing. An unparsable or
        wrongly shaped prediction scores zero. When there are no scorable
        cells both flags are False.
    """
    import ast  # function-scope import keeps this block self-contained

    try:
        # SECURITY: prediction_text originates from an uploaded submission, so
        # it is parsed as a literal and never executed.
        prediction = ast.literal_eval(prediction_text)
    except Exception as e:
        # Best-effort scoring: a malformed answer counts as fully wrong.
        print(f"Error: {e}")
        print(traceback.format_exc())
        prediction = [[]]
    if not isinstance(prediction, (list, tuple)):
        # A parsable but non-grid literal (e.g. "7") would otherwise raise
        # TypeError below; treat it as an empty answer.
        print("Error: prediction is not a list of rows")
        prediction = []

    all_cnt = sum(1 for row in gold for cell in row if cell != "#")
    correct_cnt = 0
    # zip() stops at the shortest operand, so truncated predictions are safe.
    for gold_row, predicted_row in zip(gold, prediction):
        # Rows may be lists/tuples of cells or plain strings of characters.
        if not isinstance(predicted_row, (list, tuple, str)):
            continue
        for expected, guessed in zip(gold_row, predicted_row):
            if expected != "#" and expected == guessed:
                correct_cnt += 1
    # correct_cnt cannot exceed all_cnt: only non-"#" gold cells are counted,
    # each at most once, so no clamping is needed.

    correct_100 = all_cnt > 0 and correct_cnt == all_cnt
    correct_50 = all_cnt > 0 and correct_cnt / all_cnt >= 0.5
    return correct_cnt, all_cnt, correct_100, correct_50
def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _score_dict_sample(gold, model_answer):
    """Adapt ``calculate_dict_correct`` to ``(correct, total, all_ok, half_ok)``."""
    correct_cnt, correct_100, correct_50, _ = calculate_dict_correct(gold, model_answer)
    return correct_cnt, len(gold), correct_100, correct_50


def _score_sudoku_sample(gold, model_answer):
    """Split the sudoku gold record into puzzle grid and solution, then score."""
    return calculate_sudoku_answer_correct(gold["grid"], gold["answer"], model_answer)


def _eval_by_level(task, data_list, golden_list, empty_answer, score_sample):
    """Compute the per-level metrics shared by every task evaluator.

    Args:
        task: Key into ``level_dict`` selecting the levels to report.
        data_list: Submitted items; each is read through its ``"tag"``,
            ``"level"`` and ``"answer"`` keys.
        golden_list: Reference items; each is read through ``"tag"``,
            ``"level"`` and ``"answer"`` (a JSON-encoded string).
        empty_answer: Answer text that marks "no answer given" for this task.
        score_sample: Callable ``(gold, model_answer)`` returning
            ``(correct_cnt, total_cnt, correct_100, correct_50)``.

    Returns:
        A ``defaultdict(dict)`` mapping each level to
        ``{"CR", "S-Acc", "EM", "PM-0.5"}``:
        CR is the share of submitted samples whose answer differs from
        ``empty_answer``; S-Acc is correct subtasks over all subtasks; EM and
        PM-0.5 are the shares of samples with ``correct_100`` / ``correct_50``.
        A level with no submitted samples reports 0.0 for every metric.

    Raises:
        KeyError: A submitted tag has no golden entry at that level.
    """
    eval_dict = defaultdict(dict)
    for level in level_dict[task]:
        # Tags are looked up as strings below, so key the index the same way.
        golden_dict = {str(g["tag"]): g for g in golden_list if g["level"] == level}
        data = [d for d in data_list if d["level"] == level]
        answer_exist_cnt = 0
        subtask_cnt = 0
        subtask_correct_cnt = 0
        sample_correct_100_cnt = 0
        sample_correct_50_cnt = 0
        for d in data:
            tag = str(d["tag"])
            if tag not in golden_dict:
                raise KeyError(f"Unknown tag {tag!r} for task {task!r} at level {level!r}")
            model_answer = d["answer"]
            gold = json.loads(golden_dict[tag]["answer"])
            if model_answer != empty_answer:
                answer_exist_cnt += 1
            curr_correct_cnt, curr_subtask_cnt, curr_correct_100, curr_correct_50 = score_sample(gold, model_answer)
            subtask_cnt += curr_subtask_cnt
            subtask_correct_cnt += curr_correct_cnt
            sample_correct_100_cnt += curr_correct_100
            sample_correct_50_cnt += curr_correct_50
        # NOTE(review): the sample-level denominators are the number of
        # *submitted* samples, not the number of golden samples, so a partial
        # submission is not penalised for what it omits -- confirm intended.
        sample_cnt = len(data)
        eval_dict[level] = {
            "CR": _ratio(answer_exist_cnt, sample_cnt),
            "S-Acc": _ratio(subtask_correct_cnt, subtask_cnt),
            "EM": _ratio(sample_correct_100_cnt, sample_cnt),
            "PM-0.5": _ratio(sample_correct_50_cnt, sample_cnt),
        }
    return eval_dict


def eval_crossword(data_list, golden_list):
    """Per-level metrics for crossword; answers are dict literals, empty is ``"{}"``."""
    return _eval_by_level("crossword", data_list, golden_list, "{}", _score_dict_sample)


def eval_acrostic(data_list, golden_list):
    """Per-level metrics for acrostic; answers are dict literals, empty is ``"{}"``."""
    return _eval_by_level("acrostic", data_list, golden_list, "{}", _score_dict_sample)


def eval_logic(data_list, golden_list):
    """Per-level metrics for logic puzzles; answers are lists of dicts, empty is ``"[]"``."""
    return _eval_by_level("logic", data_list, golden_list, "[]", calculate_logic_answer_correct)


def eval_cryptogram(data_list, golden_list):
    """Per-level metrics for cryptogram; answers are dict literals, empty is ``"{}"``."""
    return _eval_by_level("cryptogram", data_list, golden_list, "{}", _score_dict_sample)


def eval_sudoku(data_list, golden_list):
    """Per-level metrics for sudoku; gold holds ``grid`` and ``answer``, empty is ``"[[]]"``."""
    return _eval_by_level("sudoku", data_list, golden_list, "[[]]", _score_sudoku_sample)


def eval_drop(data_list, golden_list):
    """Per-level metrics for drop quote; answers are lists of rows, empty is ``"[[]]"``."""
    return _eval_by_level("drop", data_list, golden_list, "[[]]", calculate_drop_answer_correct)
def evaluate(data_list, golden_list, task):
    """Run the evaluator that belongs to ``task``.

    Args:
        data_list: Submitted items for the task.
        golden_list: Reference items for the task.
        task: One of ``"crossword"``, ``"acrostic"``, ``"logic"``,
            ``"cryptogram"``, ``"sudoku"`` or ``"drop"``.

    Returns:
        Whatever the selected ``eval_*`` function returns.

    Raises:
        ValueError: ``task`` is not one of the names above.
    """
    handlers = (
        ("crossword", eval_crossword),
        ("acrostic", eval_acrostic),
        ("logic", eval_logic),
        ("cryptogram", eval_cryptogram),
        ("sudoku", eval_sudoku),
        ("drop", eval_drop),
    )
    # Compared with == (not a dict lookup) so unhashable values still end in
    # the ValueError below.
    for name, handler in handlers:
        if task == name:
            return handler(data_list, golden_list)
    raise ValueError(f"Invalid task: {task}")

src/leaderboard/read_evals.py CHANGED Viewed

@@ -175,7 +175,7 @@ class EvalResult:
 #     return request_file
-def get_raw_eval_results(results_path: str, requests_path: str, task: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []

 #     return request_file
+def get_raw_eval_results(results_path: str, task: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []

src/populate.py CHANGED Viewed

@@ -8,10 +8,10 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, task) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     # import pdb; pdb.set_trace()
-    raw_data = get_raw_eval_results(results_path, requests_path, task)
     all_data_json = [v.to_dict() for v in raw_data]
     df = pd.DataFrame.from_records(all_data_json)

 from src.leaderboard.read_evals import get_raw_eval_results
+def get_leaderboard_df(results_path: str, cols: list, task) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     # import pdb; pdb.set_trace()
+    raw_data = get_raw_eval_results(results_path, task)
     all_data_json = [v.to_dict() for v in raw_data]
     df = pd.DataFrame.from_records(all_data_json)