Alina Lozovskaya committed
Commit 2617bee · 1 Parent(s): e1a6c20

Apply Ruff to yourbench_space/

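For context, a cleanup of this shape (sorted imports, double quotes, collapsed call sites, trailing whitespace stripped, two blank lines between top-level definitions) is what Ruff's autofix and formatter produce. The commit does not record the exact invocation or Ruff configuration, but it was presumably along these lines:

    ruff check --fix yourbench_space/   # lint autofixes, e.g. import sorting (needs the "I" rules selected in the project's Ruff config)
    ruff format yourbench_space/        # Black-compatible formatting: quote style, line wrapping, blank lines

Both subcommands are part of Ruff's standard CLI; only the target directory is taken from the commit message.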
yourbench_space/leaderboard_space/app.py CHANGED

@@ -1,7 +1,8 @@
+from env import TASK
+from utils import run_pipeline, update_examples
+
 import gradio as gr
 
-from utils import run_pipeline, update_examples
-from env import TASK
 
 with gr.Blocks(
     title="YourBench Leaderboard",
@@ -11,12 +12,7 @@ with gr.Blocks(
     # DISPLAY TABLE AND ANALYSIS
     title = gr.Markdown(f"YourBench auto-Leaderboard for {TASK}")
     leaderboard = gr.DataFrame(label="Results", interactive=False)
-    samples_ix = gr.Number(
-        label="Example Index",
-        value=0,
-        step=1,
-        info="Navigate through different examples"
-    )
+    samples_ix = gr.Number(label="Example Index", value=0, step=1, info="Navigate through different examples")
     with gr.Tab("Hardest samples"):
         hard_samples = gr.HTML()
     with gr.Tab("Easiest samples"):
@@ -28,4 +24,4 @@ with gr.Blocks(
 
     demo.load(run_pipeline, [samples_ix], [leaderboard, easy_samples, hard_samples, all_samples])
 
-demo.launch()
+demo.launch()
yourbench_space/leaderboard_space/env.py CHANGED

@@ -1,4 +1,6 @@
 import os
+
+
 INIT_MODELS = [
     # 70B
     ("Qwen/Qwen2.5-72B-Instruct", "sambanova"),
@@ -7,10 +9,10 @@ INIT_MODELS = [
     # 20 to 30B
     ("Qwen/QwQ-32B", "sambanova"),
     ("mistralai/Mistral-Small-24B-Instruct-2501", "together"),
-    #("allenai/OLMo-2-0325-32B-Instruct", "hf-inference")
-    ]
+    # ("allenai/OLMo-2-0325-32B-Instruct", "hf-inference")
+]
 MODELS = [m[0] for m in INIT_MODELS]
 TASK = os.getenv("TASK")
 # With storage
-HF_TOKEN=os.getenv("HF_TOKEN")
+HF_TOKEN = os.getenv("HF_TOKEN")
 ORG_NAME = os.getenv("ORG_NAME")
yourbench_space/leaderboard_space/utils.py CHANGED

@@ -1,14 +1,14 @@
-from datasets import load_dataset, Dataset
-from functools import lru_cache
+import json
 from typing import Tuple
+
+from env import TASK, MODELS, ORG_NAME
+
 import gradio as gr
-import json
+from datasets import Dataset, load_dataset
 
-from env import MODELS, TASK, ORG_NAME
 
 def aggregate_results() -> list:
-    """From the path of outputs and model list, extracts the current scores and stores them in a list of dicts with model, score, time as keys
-    """
+    """From the path of outputs and model list, extracts the current scores and stores them in a list of dicts with model, score, time as keys"""
     all_results = []
     for org_model in MODELS:
         try:
@@ -16,16 +16,12 @@ def aggregate_results() -> list:
             ds = load_dataset(path, "results", split="latest")
             config = json.loads(ds["config_general"][0])
             results = json.loads(ds["results"][0])
-
+
             # Model data
             org, model = org_model.split("/")
 
-            cur_result = {
-                "Org": org,
-                "Model": model,
-                "Duration (s)": config["end_time"] - config["start_time"]
-            }
-
+            cur_result = {"Org": org, "Model": model, "Duration (s)": config["end_time"] - config["start_time"]}
+
             # Extract the task from the JSON data
             for k_metric, v_dict in results.items():
                 if k_metric != "all":
@@ -36,9 +32,9 @@ def aggregate_results() -> list:
             print(f"Error processing {model} {ORG_NAME}: {e}")
     return all_results
 
+
 def extract_dataviz() -> Tuple[list, list]:
-    """From the path of outputs and model list, extracts from the details the worst samples, best samples
-    """
+    """From the path of outputs and model list, extracts from the details the worst samples, best samples"""
     all_samples = {}
     for org_model in MODELS:
         try:
@@ -51,7 +47,6 @@ def extract_dataviz() -> Tuple[list, list]:
                 score = list(row["metrics"].values())[0]
                 prediction = row["predictions"][0]
 
-
                 # We store flattened samples in a dict
                 # ix -> ix, prompt, gold, model_score for each model, model_prediction for each model
                 # then 2 lists: model_scores and models, to aggreg more easily
@@ -62,7 +57,7 @@ def extract_dataviz() -> Tuple[list, list]:
                     "gold": gold[0] if isinstance(gold, list) else gold,
                     # A bit redundant, but put in their own boxes for simplicity of access later
                     "model_scores": [],
-                    "models": []
+                    "models": [],
                 }
                 if org_model not in all_samples[ix]["models"]:
                     all_samples[ix][f"{org_model}_score"] = row["metrics"]
@@ -73,14 +68,20 @@ def extract_dataviz() -> Tuple[list, list]:
        except Exception as e:
            print(f"Error processing {org_model}: {e}")
 
-    full_samples = sorted(list(all_samples.values()), key= lambda r: r['ix'])
-    hard_samples = sorted([sample for sample in all_samples.values() if sum(sample["model_scores"]) == 0], key= lambda r: r['ix'])
-    easy_samples = sorted([sample for sample in all_samples.values() if sum(sample["model_scores"]) == len(sample["model_scores"])], key= lambda r: r['ix'])
+    full_samples = sorted(all_samples.values(), key=lambda r: r["ix"])
+
+    hard_samples = sorted(
+        [sample for sample in all_samples.values() if sum(sample["model_scores"]) == 0], key=lambda r: r["ix"]
+    )
+    easy_samples = sorted(
+        [sample for sample in all_samples.values() if sum(sample["model_scores"]) == len(sample["model_scores"])],
+        key=lambda r: r["ix"],
+    )
     return easy_samples, hard_samples, full_samples
 
+
 def samples_to_box_display(samples: list, example_index: int = 0):
-    """Adapted from Nathan's code in https://huggingface.co/spaces/SaylorTwift/OpenEvalsModelDetails/
-    """
+    """Adapted from Nathan's code in https://huggingface.co/spaces/SaylorTwift/OpenEvalsModelDetails/"""
     if len(samples) == 0:
         return "No samples in this category!"
     outputs = []
@@ -88,21 +89,21 @@ def samples_to_box_display(samples: list, example_index: int = 0):
     for model in sample["models"]:
         try:
             outputs.append({
-                'Model': model,
-                'Prediction': sample[f'{model}_prediction'],
-                'Prompt': sample['prompt'],
-                'Metrics': sample[f'{model}_score'],
-                'Gold': sample['gold']
+                "Model": model,
+                "Prediction": sample[f"{model}_prediction"],
+                "Prompt": sample["prompt"],
+                "Metrics": sample[f"{model}_score"],
+                "Gold": sample["gold"],
             })
         except (KeyError, IndexError):
             continue
-
+
     if not outputs:
         return "No results found for the selected combination."
-
+
     # Create HTML output with all models
     html_output = "<div style='max-width: 800px; margin: 0 auto;'>\n\n"
-
+
     # Show gold answer at the top with distinct styling
     if outputs:
         html_output += "<div style='background: #e6f3e6; padding: 20px; border-radius: 10px; margin-bottom: 20px;'>\n"
@@ -111,15 +112,15 @@ def samples_to_box_display(samples: list, example_index: int = 0):
         html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{outputs[0]['Gold']}</code></pre>\n"
         html_output += "</div>\n"
         html_output += "</div>\n"
-
+
     for output in outputs:
         html_output += "<div style='background: #f5f5f5; padding: 20px; margin-bottom: 20px; border-radius: 10px;'>\n"
         html_output += f"<h2 style='margin-top: 0;'>{output['Model']}</h2>\n"
-
+
         # Format metrics as a clean table
         html_output += "<details open style='margin-bottom: 15px;'>\n"
         html_output += "<summary><h3 style='display: inline; margin: 0;'>Metrics</h3></summary>\n"
-        metrics = output['Metrics']
+        metrics = output["Metrics"]
         if isinstance(metrics, str):
             metrics = eval(metrics)
         html_output += "<div style='overflow-x: auto;'>\n"
@@ -131,17 +132,17 @@ def samples_to_box_display(samples: list, example_index: int = 0):
         html_output += "</table>\n"
         html_output += "</div>\n"
         html_output += "</details>\n\n"
-
+
         # Handle prompt formatting with better styling
         html_output += "<details style='margin-bottom: 15px;'>\n"
         html_output += "<summary><h3 style='display: inline; margin: 0;'>Prompt</h3></summary>\n"
         html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
-
-        prompt_text = output['Prompt']
+
+        prompt_text = output["Prompt"]
         if isinstance(prompt_text, list):
             for i, msg in enumerate(prompt_text):
-                if isinstance(msg, dict) and 'content' in msg:
-                    role = msg.get('role', 'message').title()
+                if isinstance(msg, dict) and "content" in msg:
+                    role = msg.get("role", "message").title()
                     html_output += "<div style='margin-bottom: 10px;'>\n"
                     html_output += f"<strong>{role}:</strong>\n"
                     html_output += "<div style='overflow-x: auto;'>\n"
@@ -156,20 +157,20 @@ def samples_to_box_display(samples: list, example_index: int = 0):
                 html_output += "</div>\n"
         else:
             html_output += "<div style='overflow-x: auto;'>\n"
-            if isinstance(prompt_text, dict) and 'content' in prompt_text:
+            if isinstance(prompt_text, dict) and "content" in prompt_text:
                 html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{prompt_text['content']}</code></pre>\n"
             else:
                 html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{prompt_text}</code></pre>\n"
             html_output += "</div>\n"
-
+
         html_output += "</div>\n"
         html_output += "</details>\n\n"
-
+
         # Style prediction output - now in a collapsible section
         html_output += "<details open style='margin-bottom: 15px;'>\n"
         html_output += "<summary><h3 style='display: inline; margin: 0;'>Prediction</h3>"
         # Add word count in a muted style
-        word_count = len(output['Prediction'].split())
+        word_count = len(output["Prediction"].split())
         html_output += f"<span style='color: #666; font-size: 0.8em; margin-left: 10px;'>({word_count} words)</span>"
         html_output += "</summary>\n"
         html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
@@ -179,20 +180,30 @@ def samples_to_box_display(samples: list, example_index: int = 0):
         html_output += "</div>\n"
         html_output += "</details>\n"
         html_output += "</div>\n\n"
-
+
     html_output += "</div>"
     return html_output
 
+
 def run_pipeline(samples_ix: int = 0):
     results = aggregate_results()
     best_samples, worst_samples, all_samples = extract_dataviz()
-    return gr.Dataframe(Dataset.from_list(results).to_pandas(), visible=True), \
-        gr.HTML(samples_to_box_display(best_samples, samples_ix), label="Easiest samples (always found)", visible=True), \
-        gr.HTML(samples_to_box_display(worst_samples, samples_ix), label="Hardest samples (always failed)", visible=True), \
-        gr.HTML(samples_to_box_display(all_samples, samples_ix), label="All samples", visible=True)
+    return (
+        gr.Dataframe(Dataset.from_list(results).to_pandas(), visible=True),
+        gr.HTML(
+            samples_to_box_display(best_samples, samples_ix), label="Easiest samples (always found)", visible=True
+        ),
+        gr.HTML(
+            samples_to_box_display(worst_samples, samples_ix), label="Hardest samples (always failed)", visible=True
+        ),
+        gr.HTML(samples_to_box_display(all_samples, samples_ix), label="All samples", visible=True),
+    )
+
 
 def update_examples(samples_ix: int = 0):
     best_samples, worst_samples, all_samples = extract_dataviz()
-    return samples_to_box_display(best_samples, samples_ix), \
-        samples_to_box_display(worst_samples, samples_ix), \
-        samples_to_box_display(all_samples, samples_ix)
+    return (
+        samples_to_box_display(best_samples, samples_ix),
+        samples_to_box_display(worst_samples, samples_ix),
+        samples_to_box_display(all_samples, samples_ix),
+    )