File size: 2,051 Bytes
ea047ad
3adea5e
ea047ad
9562cba
ea047ad
1a6cc70
4e21b7f
ea047ad
 
18a3d4c
67741f2
ea047ad
3adea5e
 
9562cba
 
ea047ad
67741f2
 
ea047ad
67741f2
 
 
 
ea047ad
 
67741f2
 
3adea5e
67741f2
 
 
ea047ad
 
 
 
3adea5e
ea047ad
3adea5e
ea047ad
3adea5e
ea047ad
 
 
 
 
 
67741f2
 
 
 
4e21b7f
 
 
 
 
67741f2
 
ea047ad
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import subprocess
import asyncio
from pathlib import Path

from yourbench_space.leaderboard_space.env import INIT_MODELS
from loguru import logger

ON_SPACES = os.environ.get("system") == "spaces"
OUTPUT_DIR = "/data" if ON_SPACES else "." # TODO: fix the space folder


def create_eval_file(eval_ds_name: str) -> None:
    """Generate a custom lighteval task file for the given evaluation dataset.

    Args:
        eval_ds_name: Hub dataset id (e.g. "org/name"); slashes are replaced
            with underscores to form a filesystem/task-safe task name.

    Raises:
        subprocess.CalledProcessError: if ``lighteval tasks create`` exits
            with a non-zero status.
    """
    task_name = eval_ds_name.replace("/", "_")
    # NOTE(review): hard-coded Spaces path — presumably only valid on the HF
    # Space container; confirm for local runs.
    template_path = Path("/home/user/app/yourbench_space/lighteval_task/yourbench_task.py")
    # check=True surfaces a failed task creation instead of silently leaving
    # no task file for the later evaluation step to find.
    subprocess.run(
        ["lighteval", "tasks", "create", str(template_path), task_name, eval_ds_name],
        check=True,
    )

async def run_process(args: list) -> dict:
    """Run a command asynchronously and capture its output.

    Args:
        args: Command and arguments to execute (argv-style list).

    Returns:
        dict with keys ``"pid"``, ``"stdout"`` and ``"stderr"`` (the last
        two decoded to ``str``).

    Raises:
        asyncio.TimeoutError: if the process does not finish within 180
            seconds; the child is killed before the error propagates.
    """
    process = await asyncio.create_subprocess_exec(
        *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )
    try:
        # communicate() drains both pipes while waiting. The original
        # wait()-then-read() pattern can deadlock once a pipe buffer fills
        # up before the child exits.
        stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=180)
    except asyncio.TimeoutError:
        # Don't leak the child process on timeout.
        process.kill()
        await process.wait()
        raise
    return {"pid": process.pid, "stdout": stdout.decode(), "stderr": stderr.decode()}


async def run_evaluations(eval_ds_name: str, org: str) -> str:
    """Evaluate every configured model on the dataset's custom task.

    Launches one ``lighteval`` subprocess per (model, provider) pair from
    ``INIT_MODELS``, runs them concurrently, logs their output, and pushes
    results to the given Hub organization.

    Args:
        eval_ds_name: Hub dataset id of the evaluation dataset.
        org: Hub organization the results are pushed to (``--results-org``).

    Returns:
        ``"✅"`` when every evaluation succeeded, otherwise
        ``"At least one model failed"``.
    """
    task_name = eval_ds_name.replace("/", "_")
    tasks = []
    for model_name, provider in INIT_MODELS:
        args = [
            "lighteval",
            "endpoint",
            "inference-providers",
            f"model={model_name},provider={provider}",
            f"custom|{task_name}|0|0",
            "--custom-tasks",
            f"custom_{task_name}_task.py",
            "--max-samples",
            "30",
            "--output-dir",
            f"{OUTPUT_DIR}",
            "--save-details",
            "--results-org",
            org,
            "--push-to-hub",
        ]
        tasks.append(run_process(args))
    # return_exceptions=True: a failed task is returned as the exception
    # object instead of cancelling the whole gather.
    processes = await asyncio.gather(*tasks, return_exceptions=True)
    for process in processes:
        if isinstance(process, Exception):
            # Exceptions are not subscriptable — the original code crashed
            # here with a TypeError on process["stdout"].
            logger.error(f"Process failed: {process!r}")
            continue
        logger.info("Logs for process:")
        logger.info(process["stdout"])
        logger.info(process["stderr"])

    if all(not isinstance(result, Exception) for result in processes):
        return "✅"
    return "At least one model failed"