apsys committed
Commit 3c01baa · 1 Parent(s): d4d998a

submodule + versioning
.gitignore CHANGED
@@ -43,3 +43,9 @@ eval-queue/
 eval-results/
 eval-queue-bk/
 eval-results-bk/
+
+# Data files
+data/
+
+# Versioned leaderboard files
+data/leaderboard_v*.json
.gitmodules ADDED
@@ -0,0 +1,3 @@
+[submodule "guard-bench-submodule"]
+	path = guard-bench-submodule
+	url = https://github.com/whitecircle-ai/guard-bench.git
app.py CHANGED
@@ -51,10 +51,14 @@ logger = logging.getLogger(__name__)
 # Ensure data directory exists
 os.makedirs(DATA_PATH, exist_ok=True)
 
+# Available benchmark versions
+BENCHMARK_VERSIONS = ["v0"]
+CURRENT_VERSION = "v0"
+
 # Initialize leaderboard data
 try:
     logger.info("Initializing leaderboard data...")
-    LEADERBOARD_DF = get_leaderboard_df()
+    LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION)
     logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
 except Exception as e:
     logger.error(f"Error loading leaderboard data: {e}")
@@ -70,7 +74,7 @@ def init_leaderboard(dataframe):
         columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
         dataframe = pd.DataFrame(columns=columns)
         logger.warning("Initializing empty leaderboard")
-
+
     return Leaderboard(
         value=dataframe,
         datatype=[getattr(GUARDBENCH_COLUMN, col).type for col in DISPLAY_COLS],
@@ -79,7 +83,7 @@ def init_leaderboard(dataframe):
             cant_deselect=[getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS],
            label="Select Columns to Display:",
         ),
-        search_columns=[GUARDBENCH_COLUMN.model.name],
+        search_columns=[GUARDBENCH_COLUMN.model_name.name],
         hide_columns=[getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS],
         filter_columns=[
            ColumnFilter(GUARDBENCH_COLUMN.model_type.name, type="checkboxgroup", label="Model types"),
@@ -95,23 +99,24 @@ def submit_results(
     precision: str,
     weight_type: str,
     model_type: str,
-    submission_file: tempfile._TemporaryFileWrapper
+    submission_file: tempfile._TemporaryFileWrapper,
+    version: str
 ):
     """
     Handle submission of results with model metadata.
     """
     if submission_file is None:
         return styled_error("No submission file provided")
-
+
     if not model_name:
         return styled_error("Model name is required")
-
+
     if not model_type:
         return styled_error("Please select a model type")
-
+
     file_path = submission_file.name
     logger.info(f"Received submission for model {model_name}: {file_path}")
-
+
     # Add metadata to the submission
     metadata = {
         "model_name": model_name,
@@ -119,35 +124,46 @@ def submit_results(
         "revision": revision if revision else "main",
         "precision": precision,
         "weight_type": weight_type,
-        "model_type": model_type
+        "model_type": model_type,
+        "version": version
     }
-
+
     # Process the submission
-    result = process_submission(file_path, metadata)
-
+    result = process_submission(file_path, metadata, version=version)
+
     # Refresh the leaderboard data
     global LEADERBOARD_DF
     try:
-        logger.info("Refreshing leaderboard data after submission...")
-        LEADERBOARD_DF = get_leaderboard_df()
+        logger.info(f"Refreshing leaderboard data after submission for version {version}...")
+        LEADERBOARD_DF = get_leaderboard_df(version=version)
         logger.info("Refreshed leaderboard data after submission")
     except Exception as e:
         logger.error(f"Error refreshing leaderboard data: {e}")
-
+
     return result


-def refresh_data():
+def refresh_data(version=CURRENT_VERSION):
     """
     Refresh the leaderboard data from HuggingFace.
     """
     global LEADERBOARD_DF
     try:
-        logger.info("Performing scheduled refresh of leaderboard data...")
-        LEADERBOARD_DF = get_leaderboard_df()
+        logger.info(f"Performing scheduled refresh of leaderboard data for version {version}...")
+        LEADERBOARD_DF = get_leaderboard_df(version=version)
         logger.info("Scheduled refresh of leaderboard data completed")
     except Exception as e:
         logger.error(f"Error in scheduled refresh: {e}")
+    return LEADERBOARD_DF
+
+
+def update_leaderboards(version):
+    """
+    Update all leaderboard components with data for the selected version.
+    """
+    new_df = get_leaderboard_df(version=version)
+    category_dfs = [get_category_leaderboard_df(category, version=version) for category in CATEGORIES]
+    return [init_leaderboard(new_df)] + [init_leaderboard(df) for df in category_dfs]


 # Create Gradio app
@@ -155,43 +171,54 @@ demo = gr.Blocks(css=custom_css)
 
 with demo:
     gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
+
+    with gr.Row():
+        with gr.Column(scale=3):
+            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+        with gr.Column(scale=1):
+            version_selector = gr.Dropdown(
+                choices=BENCHMARK_VERSIONS,
+                label="Benchmark Version",
+                value=CURRENT_VERSION,
+                interactive=True,
+                elem_classes="version-selector"
+            )
+
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
             refresh_button = gr.Button("Refresh Leaderboard")
-
+
             # Create tabs for each category
             with gr.Tabs(elem_classes="category-tabs") as category_tabs:
                 # First tab for average metrics across all categories
                 with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"):
                     leaderboard = init_leaderboard(LEADERBOARD_DF)
-
+
                 # Create a tab for each category
                 for category in CATEGORIES:
                     with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"):
-                        category_df = get_category_leaderboard_df(category)
+                        category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION)
                         category_leaderboard = init_leaderboard(category_df)
-
+
             # Refresh button functionality
             refresh_button.click(
                 fn=lambda: [
-                    init_leaderboard(get_leaderboard_df()),
-                    *[init_leaderboard(get_category_leaderboard_df(category)) for category in CATEGORIES]
+                    init_leaderboard(get_leaderboard_df(version=version_selector.value)),
+                    *[init_leaderboard(get_category_leaderboard_df(category, version=version_selector.value)) for category in CATEGORIES]
                 ],
                 inputs=[],
                 outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
             )
-
+
         with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=1):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
+
         with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=2):
             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
+
            with gr.Row():
                 gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
-
+
            with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
@@ -203,33 +230,33 @@ with demo:
                         value=None,
                         interactive=True,
                     )
-
+
                 with gr.Column():
                     precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                        choices=[i.name for i in Precision if i != Precision.Unknown],
                         label="Precision",
                         multiselect=False,
                         value="float16",
                         interactive=True,
                     )
                     weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
+                        choices=[i.name for i in WeightType],
                         label="Weights type",
                         multiselect=False,
                         value="Original",
                         interactive=True,
                     )
                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
+
            with gr.Row():
                 file_input = gr.File(
-                    label="Upload JSONL Results File",
+                    label="Upload JSONL Results File",
                     file_types=[".jsonl"]
                 )
-
+
            submit_button = gr.Button("Submit Results")
            result_output = gr.Markdown()
-
+
            submit_button.click(
                 fn=submit_results,
                 inputs=[
@@ -239,11 +266,19 @@ with demo:
                     precision,
                     weight_type,
                     model_type,
-                    file_input
+                    file_input,
+                    version_selector
                 ],
                 outputs=result_output
             )
-
+
+            # Version selector functionality
+            version_selector.change(
+                fn=update_leaderboards,
+                inputs=[version_selector],
+                outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
+            )
+
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
@@ -253,29 +288,21 @@ with demo:
                elem_id="citation-button",
                show_copy_button=True,
             )
-
+
         with gr.Accordion("ℹ️ Dataset Information", open=False):
             dataset_info = gr.Markdown(f"""
             ## Dataset Information
-
+
             Results are stored in the HuggingFace dataset: [{RESULTS_DATASET_ID}](https://huggingface.co/datasets/{RESULTS_DATASET_ID})
-
+
             Last updated: {pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S UTC")}
             """)
 
-# Set up scheduler to refresh data periodically
 scheduler = BackgroundScheduler()
-scheduler.add_job(refresh_data, 'interval', minutes=30)
+scheduler.add_job(lambda: refresh_data(version=CURRENT_VERSION), 'interval', minutes=30)
 scheduler.start()
 
 # Launch the app
 if __name__ == "__main__":
-    # Set up authentication if credentials are provided
-    if not ADMIN_USERNAME or not ADMIN_PASSWORD:
-        logger.warning("Admin username or password not set. Running without authentication.")
-        auth = None
-    else:
-        auth = (ADMIN_USERNAME, ADMIN_PASSWORD)
-
-    # Launch the app
-    demo.launch(server_name="0.0.0.0", server_port=7860, auth=auth)
+
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
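A quick way to exercise the new versioned entry points outside the Gradio UI is a small script like the one below. This is only a sketch: it assumes the repository's src package is importable from the project root and that the token in src/envs.py is configured, since get_leaderboard_df triggers a dataset download before returning the DataFrame.

from src.populate import get_leaderboard_df, get_category_leaderboard_df

# Overall table for benchmark version v0 (refreshes the local cache first)
df_v0 = get_leaderboard_df(version="v0")
print(df_v0.head())

# Per-category view for the same version; the string must match an entry in CATEGORIES
cat_df = get_category_leaderboard_df("Criminal, Violent, and Terrorist Activity", version="v0")
print(f"{len(cat_df)} entries in the category view")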
guard-bench-submodule ADDED
@@ -0,0 +1 @@
+Subproject commit 0a9f48bcedd0ccb6b5cf59ff7ed1186e32a5dc17
src/display/css_html_js.py CHANGED
@@ -43,4 +43,22 @@ custom_css = """
     text-decoration: underline;
     color: #1976D2;
 }
+
+.version-selector {
+    margin-top: 10px;
+    padding: 5px;
+    border: 1px solid #e0e0e0;
+    border-radius: 5px;
+    background-color: #f9f9f9;
+}
+
+.version-selector label {
+    font-weight: bold;
+    color: #2196F3;
+}
+
+.version-selector select {
+    border-color: #2196F3;
+    border-radius: 4px;
+}
 """
src/display/utils.py CHANGED
@@ -36,12 +36,19 @@ class Precision(Enum):
     int8 = auto()
     int4 = auto()
 
+    def __str__(self):
+        """String representation of the precision type."""
+        return self.name
+
 
 class WeightType(Enum):
     """Model weight types."""
     Original = auto()
     Delta = auto()
     Adapter = auto()
+    def __str__(self):
+        """String representation of the weight type."""
+        return self.name
 
 
 @dataclass
@@ -58,19 +65,19 @@ class ColumnInfo:
 @dataclass
 class GuardBenchColumn:
     """Columns for the GuardBench leaderboard."""
-    model: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+    model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="model_name",
         display_name="Model",
         never_hidden=True,
         displayed_by_default=True
     ))
-
+
     model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="model_type",
         display_name="Type",
         displayed_by_default=True
     ))
-
+
     # Metrics for all categories
     default_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="default_prompts_f1",
@@ -78,28 +85,28 @@ class GuardBenchColumn:
         type="number",
         displayed_by_default=True
     ))
-
+
     jailbreaked_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="jailbreaked_prompts_f1",
         display_name="Jailbreaked Prompts F1",
         type="number",
         displayed_by_default=True
     ))
-
+
     default_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="default_answers_f1",
         display_name="Default Answers F1",
         type="number",
         displayed_by_default=True
     ))
-
+
     jailbreaked_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="jailbreaked_answers_f1",
         display_name="Jailbreaked Answers F1",
         type="number",
         displayed_by_default=True
     ))
-
+
     # Average metrics
     average_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="average_f1",
@@ -108,21 +115,21 @@ class GuardBenchColumn:
         displayed_by_default=True,
         never_hidden=True
     ))
-
+
     average_recall: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="average_recall",
         display_name="Average Recall",
         type="number",
         displayed_by_default=False
     ))
-
+
     average_precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="average_precision",
         display_name="Average Precision",
         type="number",
         displayed_by_default=False
     ))
-
+
     # Additional metadata
     submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="submission_date",
@@ -136,13 +143,13 @@ GUARDBENCH_COLUMN = GuardBenchColumn()
 
 # Extract column lists for different views
 COLS = [f.name for f in fields(GUARDBENCH_COLUMN)]
-DISPLAY_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
+DISPLAY_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
                 if getattr(GUARDBENCH_COLUMN, f.name).displayed_by_default]
-METRIC_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
+METRIC_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
                if getattr(GUARDBENCH_COLUMN, f.name).type == "number"]
-HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
+HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
               if getattr(GUARDBENCH_COLUMN, f.name).hidden]
-NEVER_HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
+NEVER_HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
                      if getattr(GUARDBENCH_COLUMN, f.name).never_hidden]
 
 # Categories in GuardBench
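The dropdown fix in app.py (choices=[i.name ...] instead of i.value.name) and the new __str__ methods both rely on plain Enum name access. A minimal sketch of that pattern, using hypothetical members rather than the repo's actual class bodies:

from enum import Enum, auto

class Precision(Enum):
    float16 = auto()
    bfloat16 = auto()

    def __str__(self):
        # Same behaviour the commit adds: print/format yields the member name
        return self.name

print([p.name for p in Precision])  # ['float16', 'bfloat16'] -- what the dropdown now lists
print(str(Precision.float16))       # 'float16' rather than 'Precision.float16'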
src/leaderboard/processor.py CHANGED
@@ -16,11 +16,21 @@ def load_leaderboard_data(file_path: str) -> Dict:
     Load the leaderboard data from a JSON file.
     """
     if not os.path.exists(file_path):
-        return {"entries": [], "last_updated": datetime.now().isoformat()}
-
+        version = "v0"
+        if "_v" in file_path:
+            version = file_path.split("_")[-1].split(".")[0]
+        return {"entries": [], "last_updated": datetime.now().isoformat(), "version": version}
+
     with open(file_path, 'r') as f:
         data = json.load(f)
-
+
+    # Ensure version field exists
+    if "version" not in data:
+        version = "v0"
+        if "_v" in file_path:
+            version = file_path.split("_")[-1].split(".")[0]
+        data["version"] = version
+
     return data
 
 
@@ -30,10 +40,17 @@ def save_leaderboard_data(data: Dict, file_path: str) -> None:
     """
     # Ensure the directory exists
     os.makedirs(os.path.dirname(file_path), exist_ok=True)
-
+
     # Update the last_updated timestamp
     data["last_updated"] = datetime.now().isoformat()
-
+
+    # Ensure version is set
+    if "version" not in data:
+        version = "v0"
+        if "_v" in file_path:
+            version = file_path.split("_")[-1].split(".")[0]
+        data["version"] = version
+
     with open(file_path, 'w') as f:
         json.dump(data, f, indent=2)
 
@@ -43,26 +60,32 @@ def process_submission(submission_data: List[Dict]) -> List[Dict]:
     Process submission data and convert it to leaderboard entries.
     """
     entries = []
-
+
     for item in submission_data:
         # Create a new entry for the leaderboard
         entry = {
             "model_name": item.get("model_name", "Unknown Model"),
             "per_category_metrics": {},
             "avg_metrics": {},
-            "submission_date": datetime.now().isoformat()
+            "submission_date": datetime.now().isoformat(),
+            "version": item.get("version", "v0")
         }
-
+
+        # Copy model metadata
+        for key in ["model_type", "base_model", "revision", "precision", "weight_type"]:
+            if key in item:
+                entry[key] = item[key]
+
         # Process per-category metrics
         if "per_category_metrics" in item:
             entry["per_category_metrics"] = item["per_category_metrics"]
-
+
         # Process average metrics
         if "avg_metrics" in item:
             entry["avg_metrics"] = item["avg_metrics"]
-
+
         entries.append(entry)
-
+
     return entries
 
 
@@ -71,17 +94,23 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
     Convert leaderboard data to a pandas DataFrame for display.
     """
     rows = []
-
+
     for entry in leaderboard_data.get("entries", []):
         model_name = entry.get("model_name", "Unknown Model")
-
+
         # Extract average metrics for main display
         row = {
             "model_name": model_name,
             "model_type": entry.get("model_type", "Unknown"),
-            "submission_date": entry.get("submission_date", "")
+            "submission_date": entry.get("submission_date", ""),
+            "version": entry.get("version", "v0")
         }
-
+
+        # Add additional metadata fields if present
+        for key in ["base_model", "revision", "precision", "weight_type"]:
+            if key in entry:
+                row[key] = entry[key]
+
         # Add average metrics
         avg_metrics = entry.get("avg_metrics", {})
         for test_type in TEST_TYPES:
@@ -90,12 +119,12 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
                 if metric in avg_metrics[test_type]:
                     col_name = f"{test_type}_{metric}"
                     row[col_name] = avg_metrics[test_type][metric]
-
+
         # Calculate overall averages for key metrics
         f1_values = []
         recall_values = []
         precision_values = []
-
+
         for test_type in TEST_TYPES:
             if test_type in avg_metrics and "f1_binary" in avg_metrics[test_type]:
                 f1_values.append(avg_metrics[test_type]["f1_binary"])
@@ -103,7 +132,7 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
                 recall_values.append(avg_metrics[test_type]["recall_binary"])
             if test_type in avg_metrics and "precision_binary" in avg_metrics[test_type]:
                 precision_values.append(avg_metrics[test_type]["precision_binary"])
-
+
         # Add overall averages
         if f1_values:
             row["average_f1"] = sum(f1_values) / len(f1_values)
@@ -111,7 +140,7 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
             row["average_recall"] = sum(recall_values) / len(recall_values)
         if precision_values:
             row["average_precision"] = sum(precision_values) / len(precision_values)
-
+
         # Add specific test type F1 scores for display
         if "default_prompts" in avg_metrics and "f1_binary" in avg_metrics["default_prompts"]:
             row["default_prompts_f1"] = avg_metrics["default_prompts"]["f1_binary"]
@@ -121,14 +150,14 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
             row["default_answers_f1"] = avg_metrics["default_answers"]["f1_binary"]
         if "jailbreaked_answers" in avg_metrics and "f1_binary" in avg_metrics["jailbreaked_answers"]:
             row["jailbreaked_answers_f1"] = avg_metrics["jailbreaked_answers"]["f1_binary"]
-
+
         rows.append(row)
-
+
     # Create DataFrame and sort by average F1 score
     df = pd.DataFrame(rows)
     if not df.empty and "average_f1" in df.columns:
         df = df.sort_values(by="average_f1", ascending=False)
-
+
     return df
 
 
@@ -136,25 +165,29 @@ def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict])
     """
     Add new entries to the leaderboard, replacing any with the same model name.
     """
-    # Create a mapping of existing entries by model name
-    existing_entries = {entry["model_name"]: i for i, entry in enumerate(leaderboard_data.get("entries", []))}
-
+    # Create a mapping of existing entries by model name and version
+    existing_entries = {
+        (entry["model_name"], entry.get("version", "v0")): i
+        for i, entry in enumerate(leaderboard_data.get("entries", []))
+    }
+
     # Process each new entry
     for new_entry in new_entries:
         model_name = new_entry.get("model_name")
-
-        if model_name in existing_entries:
+        version = new_entry.get("version", "v0")
+
+        if (model_name, version) in existing_entries:
             # Replace existing entry
-            leaderboard_data["entries"][existing_entries[model_name]] = new_entry
+            leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
         else:
             # Add new entry
             if "entries" not in leaderboard_data:
                 leaderboard_data["entries"] = []
             leaderboard_data["entries"].append(new_entry)
-
+
     # Update the last_updated timestamp
     leaderboard_data["last_updated"] = datetime.now().isoformat()
-
+
     return leaderboard_data
 
 
@@ -171,10 +204,10 @@ def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
                     entries.append(entry)
                 except json.JSONDecodeError as e:
                     return [], f"Invalid JSON in submission file: {e}"
-
+
         if not entries:
             return [], "Submission file is empty"
-
+
         return entries, "Successfully processed submission"
     except Exception as e:
         return [], f"Error processing submission file: {e}"
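The version fallback in load_leaderboard_data and save_leaderboard_data is driven purely by the filename suffix. A small illustration of that parsing, using hypothetical paths but the same split logic as the commit:

def version_from_path(file_path: str, default: str = "v0") -> str:
    """Mirror of the fallback used when a leaderboard file has no "version" field."""
    if "_v" in file_path:
        return file_path.split("_")[-1].split(".")[0]
    return default

print(version_from_path("data/leaderboard_v1.json"))  # -> "v1"
print(version_from_path("data/leaderboard.json"))     # -> "v0"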
src/populate.py CHANGED
@@ -17,15 +17,29 @@ from src.envs import RESULTS_DATASET_ID, TOKEN, LEADERBOARD_FILE, CACHE_PATH
 from src.leaderboard.processor import leaderboard_to_dataframe, load_leaderboard_data, save_leaderboard_data, process_jsonl_submission, add_entries_to_leaderboard
 
 
-def download_leaderboard_data() -> bool:
+def get_versioned_leaderboard_file(version="v0"):
+    """
+    Get the versioned leaderboard file path.
+    """
+    base_name, ext = os.path.splitext(LEADERBOARD_FILE)
+    return f"{base_name}_{version}{ext}"
+
+
+def download_leaderboard_data(version="v0") -> bool:
     """
     Download the latest leaderboard data from HuggingFace.
+
+    Args:
+        version: The dataset version to download
     """
     try:
         # Create a temporary directory to download the submissions
-        temp_dir = os.path.join(CACHE_PATH, "temp_submissions")
+        temp_dir = os.path.join(CACHE_PATH, f"temp_submissions_{version}")
         os.makedirs(temp_dir, exist_ok=True)
 
+        # Get the versioned leaderboard file
+        leaderboard_file = get_versioned_leaderboard_file(version)
+
         # Download the entire repository
         try:
             snapshot_path = snapshot_download(
@@ -43,25 +57,43 @@ def download_leaderboard_data() -> bool:
 
             # Look for submission files in the submissions directory
             submissions_dir = os.path.join(snapshot_path, "submissions")
+            version_submissions_dir = os.path.join(snapshot_path, f"submissions_{version}")
+
+            # Check both standard and versioned submission directories
             if os.path.exists(submissions_dir):
                 submission_files.extend(glob(os.path.join(submissions_dir, "*.jsonl")))
 
-            # Also look for any JSONL files in the root
-            submission_files.extend(glob(os.path.join(snapshot_path, "*.jsonl")))
+            if os.path.exists(version_submissions_dir):
+                submission_files.extend(glob(os.path.join(version_submissions_dir, "*.jsonl")))
+
+            # Also look for any versioned JSONL files in the root
+            submission_files.extend(glob(os.path.join(snapshot_path, f"*_{version}.jsonl")))
+
+            # If we're looking for v0 and no versioned files found, use generic ones
+            if version == "v0" and not submission_files:
+                submission_files.extend(glob(os.path.join(snapshot_path, "*.jsonl")))
 
             # Process each submission file
             for file_path in submission_files:
                 entries, _ = process_jsonl_submission(file_path)
-                all_entries.extend(entries)
+
+                # Filter entries to those that match the version or don't have version specified
+                filtered_entries = [
+                    entry for entry in entries
+                    if entry.get("version", "v0") == version or "version" not in entry
+                ]
+
+                all_entries.extend(filtered_entries)
 
             # Create leaderboard data structure
             leaderboard_data = {
                 "entries": all_entries,
-                "last_updated": pd.Timestamp.now().isoformat()
+                "last_updated": pd.Timestamp.now().isoformat(),
+                "version": version
             }
 
             # Save to local file
-            save_leaderboard_data(leaderboard_data, LEADERBOARD_FILE)
+            save_leaderboard_data(leaderboard_data, leaderboard_file)
 
             return True
         except Exception as e:
@@ -72,7 +104,14 @@ def download_leaderboard_data() -> bool:
                 api = HfApi(token=TOKEN)
                 files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
 
-                submission_files = [f for f in files if f.endswith('.jsonl')]
+                # Look for versioned and regular files
+                submission_files = [
+                    f for f in files
+                    if (f.endswith(f'_{version}.jsonl') or
+                        f.startswith(f'submissions_{version}/') or
+                        (version == "v0" and f.endswith('.jsonl')))
+                ]
+
                 all_entries = []
 
                 for file_path in submission_files:
@@ -84,49 +123,70 @@ def download_leaderboard_data() -> bool:
                             token=TOKEN
                         )
                         entries, _ = process_jsonl_submission(local_path)
-                        all_entries.extend(entries)
+
+                        # Filter entries to those that match the version or don't have version specified
+                        filtered_entries = [
+                            entry for entry in entries
+                            if entry.get("version", "v0") == version or "version" not in entry
+                        ]
+
+                        all_entries.extend(filtered_entries)
                     except Exception as file_error:
                         print(f"Error downloading file {file_path}: {file_error}")
 
                # Create leaderboard data structure
                leaderboard_data = {
                    "entries": all_entries,
-                   "last_updated": pd.Timestamp.now().isoformat()
+                   "last_updated": pd.Timestamp.now().isoformat(),
+                   "version": version
                }
 
                # Save to local file
-               save_leaderboard_data(leaderboard_data, LEADERBOARD_FILE)
+               save_leaderboard_data(leaderboard_data, leaderboard_file)
 
                return True
            except Exception as list_error:
                print(f"Error listing repository files: {list_error}")
 
                # If we can't download anything, create an empty leaderboard
-               if not os.path.exists(LEADERBOARD_FILE):
-                   empty_data = {"entries": [], "last_updated": pd.Timestamp.now().isoformat()}
-                   save_leaderboard_data(empty_data, LEADERBOARD_FILE)
+               if not os.path.exists(leaderboard_file):
+                   empty_data = {
+                       "entries": [],
+                       "last_updated": pd.Timestamp.now().isoformat(),
+                       "version": version
+                   }
+                   save_leaderboard_data(empty_data, leaderboard_file)
 
                return False
     except Exception as e:
        print(f"Error downloading leaderboard data: {e}")
 
        # Ensure we have at least an empty leaderboard file
-       if not os.path.exists(LEADERBOARD_FILE):
-           empty_data = {"entries": [], "last_updated": pd.Timestamp.now().isoformat()}
-           save_leaderboard_data(empty_data, LEADERBOARD_FILE)
+       leaderboard_file = get_versioned_leaderboard_file(version)
+       if not os.path.exists(leaderboard_file):
+           empty_data = {
+               "entries": [],
+               "last_updated": pd.Timestamp.now().isoformat(),
+               "version": version
+           }
+           save_leaderboard_data(empty_data, leaderboard_file)
 
        return False
 
 
-def get_leaderboard_df() -> pd.DataFrame:
+def get_leaderboard_df(version="v0") -> pd.DataFrame:
     """
     Get the leaderboard data as a DataFrame.
+
+    Args:
+        version: The dataset version to retrieve
     """
     # Try to download the latest data
-    download_leaderboard_data()
+    download_leaderboard_data(version=version)
 
     # Load from local file
-    leaderboard_data = load_leaderboard_data(LEADERBOARD_FILE)
+    leaderboard_file = get_versioned_leaderboard_file(version)
+    leaderboard_data = load_leaderboard_data(leaderboard_file)
 
     # Convert to DataFrame
     df = leaderboard_to_dataframe(leaderboard_data)
@@ -134,18 +194,20 @@ def get_leaderboard_df() -> pd.DataFrame:
     return df
 
 
-def get_category_leaderboard_df(category: str) -> pd.DataFrame:
+def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
     """
     Get the leaderboard data filtered by a specific category.
 
     Args:
         category: The category to filter by (e.g., "Criminal, Violent, and Terrorist Activity")
+        version: The dataset version to retrieve
 
     Returns:
         DataFrame with metrics for the specified category
     """
     # Load the leaderboard data
-    leaderboard_data = load_leaderboard_data(LEADERBOARD_FILE)
+    leaderboard_file = get_versioned_leaderboard_file(version)
+    leaderboard_data = load_leaderboard_data(leaderboard_file)
 
     # Filter entries to only include those with data for the specified category
     filtered_entries = []
@@ -158,6 +220,7 @@ def get_category_leaderboard_df(category: str) -> pd.DataFrame:
                 "model_name": entry.get("model_name", "Unknown Model"),
                 "model_type": entry.get("model_type", "Unknown"),
                 "submission_date": entry.get("submission_date", ""),
+                "version": entry.get("version", version),
             }
 
             # Extract metrics for this category
@@ -189,7 +252,8 @@
     # Create a new leaderboard data structure with the filtered entries
     filtered_leaderboard = {
         "entries": filtered_entries,
-        "last_updated": leaderboard_data.get("last_updated", pd.Timestamp.now().isoformat())
+        "last_updated": leaderboard_data.get("last_updated", pd.Timestamp.now().isoformat()),
+        "version": version
     }
 
     # Convert to DataFrame
@@ -198,14 +262,21 @@
     return df
 
 
-def get_detailed_model_data(model_name: str) -> Dict:
+def get_detailed_model_data(model_name: str, version="v0") -> Dict:
     """
     Get detailed data for a specific model.
+
+    Args:
+        model_name: The name of the model to get data for
+        version: The dataset version to retrieve
     """
-    leaderboard_data = load_leaderboard_data(LEADERBOARD_FILE)
+    leaderboard_file = get_versioned_leaderboard_file(version)
+    leaderboard_data = load_leaderboard_data(leaderboard_file)
 
     for entry in leaderboard_data.get("entries", []):
-        if entry.get("model_name") == model_name:
+        # Check both the model name and version
+        entry_version = entry.get("version", "v0")
+        if entry.get("model_name") == model_name and (entry_version == version or entry_version is None):
            return entry
 
    return {}
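get_versioned_leaderboard_file simply splices the version into LEADERBOARD_FILE, which is what makes the new data/leaderboard_v*.json ignore rule line up with the files written here. A sketch assuming LEADERBOARD_FILE = "data/leaderboard.json" (the real value lives in src/envs.py):

import os

LEADERBOARD_FILE = "data/leaderboard.json"  # assumed value for illustration

def get_versioned_leaderboard_file(version="v0"):
    base_name, ext = os.path.splitext(LEADERBOARD_FILE)
    return f"{base_name}_{version}{ext}"

print(get_versioned_leaderboard_file("v0"))  # data/leaderboard_v0.json
print(get_versioned_leaderboard_file("v1"))  # data/leaderboard_v1.json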
src/submission/submit.py CHANGED
@@ -25,33 +25,40 @@ def validate_submission(file_path: str) -> Tuple[bool, str]:
         entries, message = process_jsonl_submission(file_path)
         if not entries:
             return False, message
-
+
         # Additional validation could be added here
-
+
         return True, "Submission is valid"
     except Exception as e:
         return False, f"Error validating submission: {e}"
 
 
-def submit_to_hub(file_path: str, metadata: Dict, dataset_id: str, token: str) -> Tuple[bool, str]:
+def submit_to_hub(file_path: str, metadata: Dict, dataset_id: str, token: str, version="v0") -> Tuple[bool, str]:
     """
     Submit results to a HuggingFace dataset repository as individual files.
+
+    Args:
+        file_path: Path to the submission file
+        metadata: Metadata to include with the submission
+        dataset_id: The dataset repository ID
+        token: HuggingFace API token
+        version: The version of the benchmark used (e.g., "v0", "v1")
     """
     try:
         # Process the submission file to validate
         entries, message = process_jsonl_submission(file_path)
         if not entries:
             return False, message
-
+
         # Generate a unique submission ID
         model_name = metadata.get("model_name", "unknown")
         model_name_safe = model_name.replace("/", "_").replace(" ", "_")
         timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
         submission_id = f"{model_name_safe}_{timestamp}"
-
+
         # Create an API instance
         api = HfApi(token=token)
-
+
         # Create a temporary file with metadata added
         with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as temp_file:
             # Add metadata to each entry
@@ -59,47 +66,58 @@ def submit_to_hub(file_path: str, metadata: Dict, dataset_id: str, token: str) -
                 # If the entry already has a model_name, don't override it
                 if "model_name" not in entry:
                     entry["model_name"] = metadata.get("model_name")
-
+
                 # Add other metadata if not present
                 for key, value in metadata.items():
                     if key != "model_name" and key not in entry:
                         entry[key] = value
-
+
+                # Ensure version is set
+                entry["version"] = version
+
                 # Write to temp file
                 temp_file.write(json.dumps(entry) + "\n")
-
+
             temp_path = temp_file.name
-
-        # Upload the file directly to the repository
-        submission_path = f"submissions/{submission_id}.jsonl"
+
+        # Upload the file to the version-specific directory
+        submission_path = f"submissions_{version}/{submission_id}_{version}.jsonl" if version != "v0" else f"submissions/{submission_id}.jsonl"
         api.upload_file(
             path_or_fileobj=temp_path,
             path_in_repo=submission_path,
             repo_id=dataset_id,
             repo_type="dataset",
-            commit_message=f"Add submission for {model_name}"
+            commit_message=f"Add submission for {model_name} (version {version})"
         )
-
+
         # Clean up the temporary file
         os.unlink(temp_path)
-
-        return True, f"Successfully uploaded submission for {model_name} to {dataset_id}"
+
+        return True, f"Successfully uploaded submission for {model_name} to {dataset_id} (version {version})"
     except Exception as e:
         return False, f"Error submitting to dataset: {e}"
 
 
-def process_submission(file_path: str, metadata: Dict) -> str:
+def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
     """
     Process a submission to the GuardBench leaderboard.
+
+    Args:
+        file_path: Path to the submission file
+        metadata: Metadata to include with the submission
+        version: The version of the benchmark used (e.g., "v0", "v1")
     """
     # Validate submission file
     is_valid, validation_message = validate_submission(file_path)
     if not is_valid:
         return styled_error(validation_message)
-
+
+    # Add version to metadata
+    metadata["version"] = version
+
     # Submit to HuggingFace dataset repository
-    success, message = submit_to_hub(file_path, metadata, RESULTS_DATASET_ID, TOKEN)
+    success, message = submit_to_hub(file_path, metadata, RESULTS_DATASET_ID, TOKEN, version=version)
     if not success:
         return styled_error(message)
-
+
     return styled_message(f"Submission successful! {message}")
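The upload path convention is the part of submit_to_hub that other tooling may need to reproduce: v0 submissions keep the legacy submissions/ prefix, while later versions get their own suffixed directory and filename. A standalone sketch of just that rule (hypothetical helper name, same formatting logic as the commit):

from datetime import datetime

def submission_path_for(model_name: str, version: str) -> str:
    model_name_safe = model_name.replace("/", "_").replace(" ", "_")
    submission_id = f"{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    if version != "v0":
        return f"submissions_{version}/{submission_id}_{version}.jsonl"
    return f"submissions/{submission_id}.jsonl"

print(submission_path_for("org/guard-model", "v0"))  # submissions/org_guard-model_<timestamp>.jsonl
print(submission_path_for("org/guard-model", "v1"))  # submissions_v1/org_guard-model_<timestamp>_v1.jsonl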