# yourbench_space/evaluation.py
import asyncio
import os

# INIT_MODELS is expected to be an iterable of (model_name, provider) pairs,
# one per model to evaluate (see run_evaluations below).
from src.env import INIT_MODELS

# On a Space, write to the persistent /data volume; locally, use the working directory.
ON_SPACES = os.environ.get("system") == "spaces"
OUTPUT_DIR = "/data" if ON_SPACES else "."
def create_eval_file(eval_ds_name):
    """Write a lighteval custom-task module for the generated evaluation dataset."""
    # TODO: replace by Nathan's call
    content = """
from aenum import extend_enum
from lighteval.metrics.metrics import Metrics
from lighteval.metrics.utils.metric_utils import (
    CorpusLevelMetricGrouping,
    MetricCategory,
    MetricUseCase,
)
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.extended.hle.main import JudgeLLMHLE
from lighteval.tasks.requests import Doc


def prompt_function(line, task_name: str = None):
    # Skip rows that carry an image: only text questions are evaluated.
    if line["image"] not in [None, ""]:
        return
    return Doc(
        task_name=task_name,
        query="Question: " + line["question"] + "\\nAnswer:",
        choices=[line["answer"]],
        gold_index=0,
        specific={"question": line["question"]},
    )
""" + f"""
hle = LightevalTaskConfig(
    name="{eval_ds_name.replace('/', '_')}",
    suite=["custom"],
    prompt_function=prompt_function,
    hf_repo="{eval_ds_name}",
    hf_subset="default",
    hf_avail_splits=["test"],
    evaluation_splits=["test"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=8192,
    metric=[Metrics.exact_match],
    stop_sequence=[],
    trust_dataset=True,
    version=0,
)

TASKS_TABLE = [hle]
"""
    with open(f"{OUTPUT_DIR}/custom_task.py", "w") as f:
        f.write(content)
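
# Illustrative example (hypothetical dataset name): create_eval_file("my-org/my-eval-ds")
# writes {OUTPUT_DIR}/custom_task.py defining a "custom"-suite task named
# "my-org_my-eval-ds", which run_evaluations() below selects as
# "custom|my-org_my-eval-ds|0|0".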
async def run_process(args: list) -> dict:
    """Run a command asynchronously and capture its output (3-minute timeout)."""
    process = await asyncio.create_subprocess_exec(
        *args,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    await asyncio.wait_for(process.wait(), timeout=180)
    stdout = await process.stdout.read()
    stderr = await process.stderr.read()
    return {
        "pid": process.pid,
        "stdout": stdout.decode(),
        "stderr": stderr.decode(),
    }
async def run_evaluations(eval_ds_name: str, org: str) -> str:
    """Run lighteval on the generated dataset for every configured model, in parallel."""
    tasks = []
    for model_name, provider in INIT_MODELS:
        args = [
            "lighteval",
            "endpoint", "inference-providers", f"model={model_name},provider={provider}",
            f"custom|{eval_ds_name.replace('/', '_')}|0|0",
            "--custom-tasks", f"{OUTPUT_DIR}/custom_task.py",
            "--max-samples", "10",
            "--output-dir", OUTPUT_DIR,
            "--save-details",
            "--results-org", org,
            "--push-to-hub",
        ]
        tasks.append(run_process(args))
    # return_exceptions=True: a failed run (e.g. a timeout in run_process) shows up
    # as an Exception in the results instead of cancelling the other evaluations.
    processes = await asyncio.gather(*tasks, return_exceptions=True)
    if all(not isinstance(result, Exception) for result in processes):
        return "✅"
    return "At least one model failed"