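"""Tests for loading AIR-Bench eval results and building leaderboard DataFrames."""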
from pathlib import Path

from src.models import FullEvalResult
from src.read_evals import load_raw_eval_results
from src.utils import get_leaderboard_df

# Location of this test file; fixture paths are resolved relative to it
cur_fp = Path(__file__)


def test_init_from_json_file():
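    """init_from_json_file should parse the toy JSON into one result per
    (task, domain, language, metric, dataset) combination and keep the model names."""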
    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
    num_different_task_domain_lang_metric_dataset_combination = 6
    assert len(full_eval_result.results) == num_different_task_domain_lang_metric_dataset_combination
    assert full_eval_result.retrieval_model == "bge-m3"
    assert full_eval_result.reranking_model == "bge-reranker-v2-m3"


def test_to_dict():
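    """to_dict should flatten the eval result into one leaderboard row for the
    given task and metric, keyed by domain_language columns such as wiki_en."""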
    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
    result_list = full_eval_result.to_dict(task="qa", metric="ndcg_at_1")
    assert len(result_list) == 1
    result_dict = result_list[0]
    assert result_dict["Retrieval Model"] == "bge-m3"
    assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
    assert result_dict["wiki_en"] is not None
    assert result_dict["wiki_zh"] is not None


def test_get_raw_eval_results():
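    """load_raw_eval_results should load only the latest result file for each
    retrieval/reranking model pair found under the results directory."""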
    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
    results = load_raw_eval_results(results_path)
    # only load the latest results
    assert len(results) == 4
    assert results[0].eval_name == "bge-base-en-v1.5_NoReranker"
    assert len(results[0].results) == 70
    assert results[1].eval_name == "bge-base-en-v1.5_bge-reranker-v2-m3"
    assert len(results[1].results) == 70


def test_get_leaderboard_df():
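    """get_leaderboard_df should produce one row per model pair for the QA task."""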
    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
    raw_data = load_raw_eval_results(results_path)
    df = get_leaderboard_df(raw_data, "qa", "ndcg_at_10")
    assert df.shape[0] == 4
    # the results contain only one embedding model
    # for i in range(4):
    #     assert df["Retrieval Model"][i] == "bge-m3"
    # # the results contain only two reranking models
    # assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
    # assert df["Reranking Model"][1] == "NoReranker"
    # assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
    # assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh', ]].isnull().values.any()


def test_get_leaderboard_df_long_doc():
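    """get_leaderboard_df should rank the reranked run above NoReranker on the
    long-doc task and leave no null averages."""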
    results_path = cur_fp.parents[2] / "toydata" / "test_results"
    raw_data = load_raw_eval_results(results_path)
    df = get_leaderboard_df(raw_data, "long-doc", "ndcg_at_1")
    assert df.shape[0] == 2
    # the results contain only one embedding model
    for i in range(2):
        assert df["Retrieval Model"][i] == "bge-m3"
    # the results contain only two reranking models
    assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
    assert df["Reranking Model"][1] == "NoReranker"
    assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
    assert (
        not df[
            [
                "Average ⬆️",
                "law_en_lex_files_500k_600k",
            ]
        ]
        .isnull()
        .values.any()
    )