# yourbench_space/evaluation.py
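"""Create lighteval task files and run model evaluations for a YourBench Space."""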
import os
import asyncio
import subprocess
from pathlib import Path

from loguru import logger

from yourbench_space.leaderboard_space.env import INIT_MODELS

ON_SPACES = os.environ.get("system") == "spaces"
OUTPUT_DIR = "/data" if ON_SPACES else "."  # TODO: fix the space folder


def create_eval_file(eval_ds_name: str):
    """Generate the custom lighteval task file for the given evaluation dataset."""
    task_name = eval_ds_name.replace("/", "_")
    template_path = Path("/home/user/app/yourbench_space/lighteval_task/yourbench_task.py")
    subprocess.run(["lighteval", "tasks", "create", str(template_path), task_name, eval_ds_name])
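
# For a hypothetical dataset "org/dataset", the call above amounts to running
#   lighteval tasks create /home/user/app/yourbench_space/lighteval_task/yourbench_task.py org_dataset org/dataset
# and run_evaluations below then loads the task via --custom-tasks custom_org_dataset_task.py.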


async def run_process(args: list, custom_env=None) -> dict:
    """Run a command as a subprocess and return its pid and decoded stdout/stderr."""
    process = await asyncio.create_subprocess_exec(
        *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, env=custom_env
    )
    # communicate() drains the pipes while waiting, so a verbose subprocess cannot
    # deadlock on a full pipe buffer; the whole run is capped at 3 minutes
    stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=180)
    return {"pid": process.pid, "stdout": stdout.decode(), "stderr": stderr.decode()}


async def run_evaluations(eval_ds_name: str, org: str, custom_env=None) -> str:
    """Evaluate every model in INIT_MODELS on the generated task and push results to the Hub."""
    task_name = eval_ds_name.replace("/", "_")
    tasks = []
    for model_name, provider in INIT_MODELS:
        args = [
            "lighteval",
            "endpoint",
            "inference-providers",
            f"model={model_name},provider={provider}",
            f"custom|{task_name}|0|0",
            "--custom-tasks",
            f"custom_{task_name}_task.py",
            "--max-samples",
            "30",
            "--output-dir",
            OUTPUT_DIR,
            "--save-details",
            "--results-org",
            org,
            "--push-to-hub",
        ]
        tasks.append(run_process(args, custom_env))
    # return_exceptions=True so one failed evaluation does not cancel the others
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for result in results:
        if isinstance(result, Exception):
            logger.error(f"Evaluation process raised an exception: {result}")
            continue
        logger.info("Logs for process:")
        logger.info(result["stdout"])
        logger.info(result["stderr"])
    if all(not isinstance(result, Exception) for result in results):
        return "✅"
    return "At least one model failed"
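

# Illustrative usage sketch, not part of the original module: how the helpers above
# might be wired together from an async entry point. The dataset and org names are
# hypothetical placeholders.
if __name__ == "__main__":
    async def _demo() -> None:
        eval_ds_name = "my-org/my-eval-dataset"  # hypothetical dataset
        create_eval_file(eval_ds_name)
        status = await run_evaluations(eval_ds_name, org="my-org")  # hypothetical org
        logger.info(f"Evaluation status: {status}")

    asyncio.run(_demo())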