File size: 1,770 Bytes
ea047ad
3adea5e
ea047ad
 
1a6cc70
67741f2
ea047ad
 
67741f2
 
ea047ad
3adea5e
 
 
ea047ad
67741f2
 
ea047ad
67741f2
 
 
 
ea047ad
 
67741f2
 
3adea5e
67741f2
 
 
ea047ad
 
 
 
3adea5e
ea047ad
3adea5e
ea047ad
3adea5e
ea047ad
 
 
 
 
 
67741f2
 
 
 
 
 
ea047ad
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import subprocess
import asyncio

from yourbench_space.leaderboard_space.env import INIT_MODELS


# True when running inside a Hugging Face Space — the platform sets the
# "system" environment variable to "spaces". NOTE(review): lowercase "system"
# looks unusual for an env var; confirm it matches the Space's configuration.
ON_SPACES = os.environ.get("system") == "spaces"
# Spaces expose a persistent /data volume; fall back to the CWD locally.
OUTPUT_DIR = "/data" if ON_SPACES else "."


def create_eval_file(eval_ds_name: str):
    """Generate a lighteval custom-task file for *eval_ds_name*.

    The dataset name is sanitized ("/" -> "_") into a task name, then
    ``lighteval tasks create`` instantiates the yourbench task template
    against the dataset.

    Args:
        eval_ds_name: Hub dataset name, e.g. "org/dataset".

    Returns:
        The ``subprocess.CompletedProcess`` of the ``lighteval`` invocation,
        so callers can inspect ``returncode`` (previously the result was
        discarded, hiding failures). Callers that ignore the return value
        are unaffected.
    """
    task_name = eval_ds_name.replace("/", "_")
    # List-form argv (shell=False): dataset names cannot inject shell syntax.
    return subprocess.run(
        [
            "lighteval",
            "tasks",
            "create",
            "examples/custom_tasks_templates/custom_yourbench_task.py",
            task_name,
            eval_ds_name,
        ]
    )

async def run_process(args: list) -> dict:
    """Run *args* as a subprocess and capture its output.

    Args:
        args: argv list for the child process (args[0] is the executable).

    Returns:
        Dict with keys "pid", "stdout", "stderr" (both streams decoded).

    Raises:
        asyncio.TimeoutError: if the process does not finish within 180 s;
            the child is killed first so no orphan is left behind.
    """
    process = await asyncio.create_subprocess_exec(
        *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )
    try:
        # communicate() drains both pipes while waiting. The previous
        # wait()-then-read order could deadlock: with stdout/stderr set to
        # PIPE, a child that fills a pipe buffer blocks forever because
        # nothing reads until wait() returns (documented asyncio pitfall).
        stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=180)
    except asyncio.TimeoutError:
        # Kill the child before propagating, otherwise it keeps running.
        process.kill()
        await process.wait()
        raise
    return {"pid": process.pid, "stdout": stdout.decode(), "stderr": stderr.decode()}


async def run_evaluations(eval_ds_name: str, org: str) -> str:
    """Launch one lighteval evaluation per configured model, concurrently.

    Args:
        eval_ds_name: Hub dataset name ("org/dataset") of the eval dataset;
            sanitized the same way as in ``create_eval_file`` so the custom
            task file generated there is found.
        org: Hub organization the results are pushed to.

    Returns:
        "✅" when every model's evaluation process completed without raising,
        otherwise "At least one model failed". (Annotation fixed: the
        function returns a str, not a list.)
    """
    task_name = eval_ds_name.replace("/", "_")
    coros = []
    for model_name, provider in INIT_MODELS:
        args = [
            "lighteval",
            "endpoint",
            "inference-providers",
            f"model={model_name},provider={provider}",
            f"custom|{task_name}|0|0",
            "--custom-tasks",
            f"custom_{task_name}_task.py",
            "--max-samples",
            "30",
            "--output-dir",
            f"{OUTPUT_DIR}",
            "--save-details",
            "--results-org",
            org,
            "--push-to-hub",
        ]
        coros.append(run_process(args))
    # return_exceptions=True: a failed run is captured as an Exception result
    # instead of cancelling the sibling evaluations.
    results = await asyncio.gather(*coros, return_exceptions=True)
    if all(not isinstance(result, Exception) for result in results):
        return "✅"
    return "At least one model failed"