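"""Unit tests for the leaderboard's result-loading utilities.

The tests below exercise FullEvalResult parsing, raw-result loading, and
leaderboard DataFrame construction against the toy fixtures under toydata/.
They are plain pytest-style functions and can be run with pytest.
"""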
from pathlib import Path
from src.models import FullEvalResult
from src.read_evals import load_raw_eval_results
from src.utils import get_leaderboard_df
cur_fp = Path(__file__)
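
# init_from_json_file should expand the toy fixture into one entry per
# (task, domain, language, metric, dataset) combination.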
def test_init_from_json_file():
    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
    num_different_task_domain_lang_metric_dataset_combination = 6
    assert len(full_eval_result.results) == num_different_task_domain_lang_metric_dataset_combination
    assert full_eval_result.retrieval_model == "bge-m3"
    assert full_eval_result.reranking_model == "bge-reranker-v2-m3"
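
# to_dict flattens a FullEvalResult into leaderboard rows for a single
# task/metric pair; the toy fixture carries scores for wiki_en and wiki_zh.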
def test_to_dict():
    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
    result_list = full_eval_result.to_dict(task="qa", metric="ndcg_at_1")
    assert len(result_list) == 1
    result_dict = result_list[0]
    assert result_dict["Retrieval Model"] == "bge-m3"
    assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
    assert result_dict["wiki_en"] is not None
    assert result_dict["wiki_zh"] is not None
def test_get_raw_eval_results():
    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
    results = load_raw_eval_results(results_path)
    # only load the latest results
    assert len(results) == 4
    assert results[0].eval_name == "bge-base-en-v1.5_NoReranker"
    assert len(results[0].results) == 70
    assert results[1].eval_name == "bge-base-en-v1.5_bge-reranker-v2-m3"
    assert len(results[1].results) == 70
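
# get_leaderboard_df should produce one QA leaderboard row per loaded result;
# the finer-grained column checks are kept below, commented out.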
def test_get_leaderboard_df():
    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
    raw_data = load_raw_eval_results(results_path)
    df = get_leaderboard_df(raw_data, "qa", "ndcg_at_10")
    assert df.shape[0] == 4
    # the results contain only one embedding model
    # for i in range(4):
    #     assert df["Retrieval Model"][i] == "bge-m3"
    # # the results contain only two reranking models
    # assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
    # assert df["Reranking Model"][1] == "NoReranker"
    # assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
    # assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh', ]].isnull().values.any()
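
# Same construction for the long-doc task: rows should come back sorted by
# average score, with the reranked run ranked above the NoReranker run.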
def test_get_leaderboard_df_long_doc():
    results_path = cur_fp.parents[2] / "toydata" / "test_results"
    raw_data = load_raw_eval_results(results_path)
    df = get_leaderboard_df(raw_data, "long-doc", "ndcg_at_1")
    assert df.shape[0] == 2
    # the results contain only one embedding model
    for i in range(2):
        assert df["Retrieval Model"][i] == "bge-m3"
    # the results contain only two reranking models
    assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
    assert df["Reranking Model"][1] == "NoReranker"
    assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
    assert (
        not df[
            [
                "Average ⬆️",
                "law_en_lex_files_500k_600k",
            ]
        ]
        .isnull()
        .values.any()
    )