alozowski committed · Commit 9562cba · 1 parent 8ff0670

Update Evaluation tab
yourbench_space/app.py CHANGED

@@ -9,7 +9,7 @@ from loguru import logger
 
 import gradio as gr
 from datasets import load_dataset
-from huggingface_hub import whoami
+from huggingface_hub import whoami, HfApi
 from yourbench_space import PATH
 from yourbench_space.utils import (
     STAGES,
@@ -136,23 +136,26 @@ def enable_button(files):
     return gr.update(interactive=bool(files))
 
 
-def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_name):
-    # Test dataset existence
+def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_name, config_name="lighteval"):
     eval_ds_name = f"{org_name}/{eval_name}"
-    # Test dataset existence
+    repo_id = f"{org_name}/leaderboard_yourbench_{eval_ds_name.replace('/', '_')}"
+    folder_path = str(Path(PATH) / "yourbench_space" / "leaderboard_space")
+
     try:
-        load_dataset(eval_ds_name, streaming=True, token=oauth_token.token)
+        load_dataset(eval_ds_name, name=config_name, streaming=True, token=oauth_token.token)
     except Exception as e:
-        print(f"Error while loading the dataset: {e}")
-        return
-    # Run evaluations
-    create_eval_file(eval_ds_name)
-    status = asyncio.run(run_evaluations(eval_ds_name=eval_ds_name, org=org_name))
-    # Create space
-    from huggingface_hub import HfApi
+        logger.error(f"Failed to load dataset '{eval_ds_name}': {e}")
+        return "❌ Failed: Dataset loading error"
+
+    try:
+        create_eval_file(eval_ds_name)
+        status = asyncio.run(run_evaluations(eval_ds_name=eval_ds_name, org=org_name))
+    except Exception as e:
+        logger.error(f"Evaluation error: {e}")
+        return f"❌ Failed: Evaluation error\n{e}"
 
-    repo_id = f"{org_name}/leaderboard_yourbench_{eval_ds_name.replace('/', '_')}"
     api = HfApi()
+    space_was_regenerated = False
 
     try:
         api.create_repo(
@@ -161,10 +164,30 @@ def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_name):
             space_sdk="gradio",
             token=oauth_token.token,
         )
+    except Exception as e:
+        if "409" in str(e) and "already created this space repo" in str(e):
+            logger.info(f"Space '{repo_id}' already exists. Deleting and regenerating it.")
+            try:
+                api.delete_repo(repo_id=repo_id, repo_type="space", token=oauth_token.token)
+                api.create_repo(
+                    repo_id=repo_id,
+                    repo_type="space",
+                    space_sdk="gradio",
+                    token=oauth_token.token,
+                )
+                space_was_regenerated = True
+            except Exception as delete_err:
+                logger.error(f"Failed to delete and recreate space '{repo_id}': {delete_err}")
+                return f"✅ Evaluation succeeded\n❌ Failed: Could not recreate space\n{delete_err}"
+        else:
+            logger.error(f"Space creation error: {e}")
+            return f"✅ Evaluation succeeded\n❌ Failed: Space creation error\n{e}"
+
+    try:
         api.upload_folder(
             repo_id=repo_id,
             repo_type="space",
-            folder_path="src/",
+            folder_path=folder_path,
             token=oauth_token.token,
         )
         api.add_space_secret(
@@ -176,8 +199,12 @@ def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_name):
         api.add_space_variable(repo_id=repo_id, key="TASK", value=eval_ds_name, token=oauth_token.token)
         api.add_space_variable(repo_id=repo_id, key="ORG_NAME", value=org_name, token=oauth_token.token)
     except Exception as e:
-        status = "Evaluation" + status + "\nLeaderboard creation:" + e
-        return status
+        logger.error(f"Failed during space setup: {e}")
+        return f"✅ Evaluation succeeded\n❌ Failed: Space setup error\n{e}"
+
+    if space_was_regenerated:
+        return f"✅ Evaluation succeeded\n🔁 Space '{repo_id}' was regenerated successfully"
+    return f"✅ Evaluation and Space creation completed successfully for: {repo_id}"
 
 
 def init_session(profile: gr.OAuthProfile | None):
@@ -338,11 +365,30 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
         outputs=[log_output, stages_table],
     )
 
+    # with gr.Tab("Evaluate", id=2):
+    #     with gr.Row():
+    #         btn_launch_evals = gr.Button("Launch evaluations")
+    #         status = gr.Textbox(label="Status")
+    #     btn_launch_evals.click(run_evaluation_pipeline, [hf_org_dropdown, hf_dataset_name, gr.State("lighteval")], status)
+
     with gr.Tab("Evaluate", id=2):
-        with gr.Row():
-            btn_launch_evals = gr.Button("Launch evaluations")
-            status = gr.Textbox(label="Status")
-        btn_launch_evals.click(run_evaluation_pipeline, [hf_org_dropdown, hf_dataset_name], status)
+        with gr.Column():
+            gr.Markdown("### 🧪 Run YourBench Evaluation")
+            gr.Markdown("Run the full evaluation pipeline on the uploaded dataset. This includes computing metrics, creating the leaderboard, and pushing results.")
+
+            with gr.Row():
+                btn_launch_evals = gr.Button("🚀 Launch Evaluation", variant="primary")
+                clear_status_btn = gr.Button("Clear", variant="secondary")
+
+            with gr.Accordion("Evaluation Log", open=True):
+                eval_status = gr.Textbox(label="", lines=6, interactive=False, show_label=False)
+
+            btn_launch_evals.click(
+                run_evaluation_pipeline,
+                [hf_org_dropdown, hf_dataset_name, gr.State("lighteval")],
+                eval_status,
+            )
+            clear_status_btn.click(lambda: "", outputs=eval_status)
 
     app.load(init_session, outputs=session_state)
 
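The dataset check at the top of the reworked run_evaluation_pipeline leans on datasets.load_dataset with streaming=True, which resolves the repo and config on the Hub without materializing any rows. A standalone sketch of that probe; the helper name dataset_config_exists and its signature are illustrative, not part of the commit:

from datasets import load_dataset

def dataset_config_exists(repo_id: str, config_name: str = "lighteval", token: str | None = None) -> bool:
    """Return True if the Hub dataset exposes the given config."""
    try:
        # streaming=True makes this a metadata resolution, not a download
        load_dataset(repo_id, name=config_name, streaming=True, token=token)
        return True
    except Exception:
        # mirrors the commit's broad except: any failure counts as "missing"
        return False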
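The new create_repo error handling keys off the HTTP 409 conflict text and rebuilds the Space from scratch. Distilled for illustration (ensure_fresh_space and its arguments are hypothetical; only the HfApi calls come from the commit):

from huggingface_hub import HfApi

def ensure_fresh_space(repo_id: str, token: str) -> bool:
    """Create a Gradio Space, deleting and recreating it on a name conflict.

    Returns True when an existing Space was regenerated.
    """
    api = HfApi()
    try:
        api.create_repo(repo_id=repo_id, repo_type="space", space_sdk="gradio", token=token)
        return False
    except Exception as e:
        if "409" not in str(e):
            raise  # some other failure; let the caller report it
        # conflict: the Space already exists, so wipe it and start clean
        api.delete_repo(repo_id=repo_id, repo_type="space", token=token)
        api.create_repo(repo_id=repo_id, repo_type="space", space_sdk="gradio", token=token)
        return True

create_repo also accepts exist_ok=True, which would swallow the conflict but leave stale files in place; deleting first guarantees a clean leaderboard Space at the cost of string-matching on the error message.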
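In the rebuilt Evaluate tab, the constant "lighteval" config name reaches the handler via gr.State, since .click() inputs must be components rather than raw values. A minimal runnable sketch of that wiring plus the clear-button reset; all component names here are illustrative:

import gradio as gr

def run_pipeline(dataset_name: str, config_name: str) -> str:
    # config_name arrives as the value held by the inline gr.State
    return f"Would evaluate '{dataset_name}' with config '{config_name}'"

with gr.Blocks() as demo:
    name_box = gr.Textbox(label="Dataset name")
    status_box = gr.Textbox(label="Status", lines=6, interactive=False)
    with gr.Row():
        run_btn = gr.Button("Launch", variant="primary")
        clear_btn = gr.Button("Clear", variant="secondary")
    # gr.State("lighteval") injects a fixed extra input alongside the textbox
    run_btn.click(run_pipeline, [name_box, gr.State("lighteval")], status_box)
    # returning "" from the lambda empties the status box, as in the commit
    clear_btn.click(lambda: "", outputs=status_box)

if __name__ == "__main__":
    demo.launch()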
yourbench_space/evaluation.py CHANGED

@@ -1,6 +1,7 @@
 import os
 import subprocess
 import asyncio
+from pathlib import Path
 
 from yourbench_space.leaderboard_space.env import INIT_MODELS
 
@@ -11,7 +12,8 @@ OUTPUT_DIR = "/data" if ON_SPACES else "."
 
 def create_eval_file(eval_ds_name: str):
     task_name = eval_ds_name.replace("/", "_")
-    subprocess.run(["lighteval", "tasks", "create", "examples/custom_tasks_templates/custom_yourbench_task.py", task_name, eval_ds_name])
+    template_path = Path("/home/user/app/yourbench_space/lighteval_task/yourbench_task.py")
+    subprocess.run(["lighteval", "tasks", "create", str(template_path), task_name, eval_ds_name])
 
 async def run_process(args: list) -> dict:
     process = await asyncio.create_subprocess_exec(
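The new template path is hardcoded to the Space container layout (/home/user/app/...), so create_eval_file only works inside the deployed Space. A sketch of a location-independent variant, assuming the package layout the diff implies (yourbench_space/lighteval_task/yourbench_task.py inside the installed package):

import subprocess
from pathlib import Path

import yourbench_space  # resolve the template relative to the installed package

def create_eval_file(eval_ds_name: str) -> None:
    task_name = eval_ds_name.replace("/", "_")
    template_path = Path(yourbench_space.__file__).parent / "lighteval_task" / "yourbench_task.py"
    # check=True surfaces a non-zero exit from the lighteval CLI as an exception
    subprocess.run(
        ["lighteval", "tasks", "create", str(template_path), task_name, eval_ds_name],
        check=True,
    )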