apsys committed on
Commit
b1cb07d
·
1 Parent(s): dbfaa11
Files changed (46) hide show
  1. .gitignore +1 -0
  2. app.py +285 -97
  3. guard-bench-submodule +1 -1
  4. logs/guardbench_20250331_202524_f070fad2.log +19 -0
  5. logs/guardbench_20250331_203348_6e24b81a.log +19 -0
  6. logs/guardbench_20250331_204307_8d77ec17.log +23 -0
  7. logs/guardbench_20250331_204836_3421e73f.log +16 -0
  8. logs/guardbench_20250331_205152_1140353b.log +19 -0
  9. logs/guardbench_20250331_205420_d59a424e.log +61 -0
  10. logs/guardbench_20250331_205853_bf4fa85f.log +2 -0
  11. logs/guardbench_20250331_210025_ba691e37.log +2 -0
  12. logs/guardbench_20250331_210109_f46123b9.log +2 -0
  13. logs/guardbench_20250331_210119_f0e10b9b.log +2 -0
  14. logs/guardbench_20250331_210208_029d8318.log +2 -0
  15. logs/guardbench_20250331_210459_437adc64.log +2 -0
  16. logs/guardbench_20250331_210550_0358568e.log +2 -0
  17. logs/guardbench_20250331_210641_91d1d06b.log +2 -0
  18. logs/guardbench_20250331_210920_cd35c4f0.log +2 -0
  19. logs/guardbench_20250331_211215_a8cf06a3.log +2 -0
  20. logs/guardbench_20250331_211255_d32ee1a4.log +2 -0
  21. logs/guardbench_20250331_211332_24223f1a.log +2 -0
  22. logs/guardbench_20250331_211545_244674df.log +2 -0
  23. logs/guardbench_20250331_211735_6d503239.log +2 -0
  24. logs/guardbench_20250331_211916_3e96de42.log +2 -0
  25. logs/guardbench_20250331_212027_f9d450e9.log +5 -0
  26. logs/guardbench_20250331_212722_655f3190.log +2 -0
  27. logs/guardbench_20250331_213207_74f9e2de.log +5 -0
  28. logs/guardbench_20250331_213331_f72d2f6a.log +5 -0
  29. logs/guardbench_20250331_214118_0da0491f.log +50 -0
  30. logs/guardbench_20250331_214511_0a5acf8b.log +5 -0
  31. logs/guardbench_20250331_214841_4df080f3.log +5 -0
  32. logs/guardbench_20250331_215007_9c98c60a.log +5 -0
  33. logs/guardbench_20250331_215514_ad36e4b4.log +50 -0
  34. logs/guardbench_20250331_220348_0cb4d8e9.log +50 -0
  35. logs/guardbench_20250331_220638_21aa20f2.log +50 -0
  36. logs/guardbench_20250331_221124_3ffa908f.log +50 -0
  37. logs/guardbench_20250331_221755_fc667123.log +50 -0
  38. logs/guardbench_20250331_222103_90a3095d.log +50 -0
  39. logs/guardbench_20250331_222531_b0fff871.log +50 -0
  40. logs/guardbench_20250331_223148_4e22eb66.log +50 -0
  41. requirements.txt +6 -5
  42. src/display/css_html_js.py +11 -3
  43. src/display/utils.py +164 -22
  44. src/leaderboard/processor.py +57 -39
  45. src/populate.py +143 -208
  46. src/submission/submit.py +166 -68
.gitignore CHANGED
@@ -20,6 +20,7 @@ var/
20
  *.egg-info/
21
  .installed.cfg
22
  *.egg
 
23
 
24
  # Environment variables
25
  .env
 
20
  *.egg-info/
21
  .installed.cfg
22
  *.egg
23
+ .gradio/
24
 
25
  # Environment variables
26
  .env
app.py CHANGED
@@ -9,6 +9,8 @@ import logging
9
  import gradio as gr
10
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
11
  import pandas as pd
 
 
12
  from apscheduler.schedulers.background import BackgroundScheduler
13
 
14
  from src.about import (
@@ -30,7 +32,8 @@ from src.display.utils import (
30
  TEST_TYPES,
31
  ModelType,
32
  Precision,
33
- WeightType
 
34
  )
35
  from src.display.formatting import styled_message, styled_error, styled_warning
36
  from src.envs import (
@@ -41,7 +44,7 @@ from src.envs import (
41
  TOKEN,
42
  DATA_PATH
43
  )
44
- from src.populate import get_leaderboard_df, download_leaderboard_data, get_category_leaderboard_df
45
  from src.submission.submit import process_submission
46
 
47
  # Configure logging
@@ -64,6 +67,7 @@ except Exception as e:
64
  logger.error(f"Error loading leaderboard data: {e}")
65
  LEADERBOARD_DF = pd.DataFrame()
66
 
 
67
 
68
  def init_leaderboard(dataframe):
69
  """
@@ -100,7 +104,8 @@ def submit_results(
100
  weight_type: str,
101
  model_type: str,
102
  submission_file: tempfile._TemporaryFileWrapper,
103
- version: str
 
104
  ):
105
  """
106
  Handle submission of results with model metadata.
@@ -125,7 +130,8 @@ def submit_results(
125
  "precision": precision,
126
  "weight_type": weight_type,
127
  "model_type": model_type,
128
- "version": version
 
129
  }
130
 
131
  # Process the submission
@@ -150,10 +156,22 @@ def refresh_data(version=CURRENT_VERSION):
150
  global LEADERBOARD_DF
151
  try:
152
  logger.info(f"Performing scheduled refresh of leaderboard data for version {version}...")
153
- LEADERBOARD_DF = get_leaderboard_df(version=version)
154
- logger.info("Scheduled refresh of leaderboard data completed")
 
 
 
 
 
 
 
 
155
  except Exception as e:
156
  logger.error(f"Error in scheduled refresh: {e}")
 
 
 
 
157
  return LEADERBOARD_DF
158
 
159
 
@@ -166,111 +184,281 @@ def update_leaderboards(version):
166
  return [init_leaderboard(new_df)] + [init_leaderboard(df) for df in category_dfs]
167
 
168
 
169
- # Create Gradio app
170
- demo = gr.Blocks(css=custom_css)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
- with demo:
173
- gr.HTML(TITLE)
174
 
175
- with gr.Row():
176
- with gr.Column(scale=3):
177
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
178
- with gr.Column(scale=1):
179
- version_selector = gr.Dropdown(
180
- choices=BENCHMARK_VERSIONS,
181
- label="Benchmark Version",
182
- value=CURRENT_VERSION,
183
- interactive=True,
184
- elem_classes="version-selector"
185
- )
186
 
187
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
188
- with gr.TabItem("🏅 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
189
- refresh_button = gr.Button("Refresh Leaderboard")
190
-
191
- # Create tabs for each category
192
- with gr.Tabs(elem_classes="category-tabs") as category_tabs:
193
- # First tab for average metrics across all categories
194
- with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"):
195
- leaderboard = init_leaderboard(LEADERBOARD_DF)
196
-
197
- # Create a tab for each category
198
- for category in CATEGORIES:
199
- with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"):
200
- category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION)
201
- category_leaderboard = init_leaderboard(category_df)
202
-
203
- # Refresh button functionality
204
- refresh_button.click(
205
- fn=lambda: [
206
- init_leaderboard(get_leaderboard_df(version=version_selector.value)),
207
- *[init_leaderboard(get_category_leaderboard_df(category, version=version_selector.value)) for category in CATEGORIES]
208
- ],
209
- inputs=[],
210
- outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
211
- )
212
 
213
- with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=1):
214
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
215
 
216
- with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=2):
217
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
 
 
 
 
218
 
219
- with gr.Row():
220
- gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
221
 
222
- with gr.Row():
223
- with gr.Column():
224
- model_name_textbox = gr.Textbox(label="Model name")
225
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
226
- model_type = gr.Dropdown(
227
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
228
- label="Model type",
229
- multiselect=False,
230
- value=None,
231
- interactive=True,
232
- )
233
 
234
- with gr.Column():
235
- precision = gr.Dropdown(
236
- choices=[i.name for i in Precision if i != Precision.Unknown],
237
- label="Precision",
238
- multiselect=False,
239
- value="float16",
 
 
 
 
 
 
 
 
 
240
  interactive=True,
 
 
241
  )
242
- weight_type = gr.Dropdown(
243
- choices=[i.name for i in WeightType],
244
- label="Weights type",
245
- multiselect=False,
246
- value="Original",
247
- interactive=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  )
249
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
250
 
251
- with gr.Row():
252
- file_input = gr.File(
253
- label="Upload JSONL Results File",
254
- file_types=[".jsonl"]
 
255
  )
256
 
257
- submit_button = gr.Button("Submit Results")
258
- result_output = gr.Markdown()
259
-
260
- submit_button.click(
261
- fn=submit_results,
262
- inputs=[
263
- model_name_textbox,
264
- base_model_name_textbox,
265
- revision_name_textbox,
266
- precision,
267
- weight_type,
268
- model_type,
269
- file_input,
270
- version_selector
271
- ],
272
- outputs=result_output
273
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
  # Version selector functionality
276
  version_selector.change(
 
9
  import gradio as gr
10
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
11
  import pandas as pd
12
+ import plotly.express as px
13
+ import plotly.graph_objects as go
14
  from apscheduler.schedulers.background import BackgroundScheduler
15
 
16
  from src.about import (
 
32
  TEST_TYPES,
33
  ModelType,
34
  Precision,
35
+ WeightType,
36
+ GuardModelType
37
  )
38
  from src.display.formatting import styled_message, styled_error, styled_warning
39
  from src.envs import (
 
44
  TOKEN,
45
  DATA_PATH
46
  )
47
+ from src.populate import get_leaderboard_df, get_category_leaderboard_df
48
  from src.submission.submit import process_submission
49
 
50
  # Configure logging
 
67
  logger.error(f"Error loading leaderboard data: {e}")
68
  LEADERBOARD_DF = pd.DataFrame()
69
 
70
+ print(DISPLAY_COLS)
71
 
72
  def init_leaderboard(dataframe):
73
  """
 
104
  weight_type: str,
105
  model_type: str,
106
  submission_file: tempfile._TemporaryFileWrapper,
107
+ version: str,
108
+ guard_model_type: GuardModelType
109
  ):
110
  """
111
  Handle submission of results with model metadata.
 
130
  "precision": precision,
131
  "weight_type": weight_type,
132
  "model_type": model_type,
133
+ "version": version,
134
+ "guard_model_type": guard_model_type
135
  }
136
 
137
  # Process the submission
 
156
  global LEADERBOARD_DF
157
  try:
158
  logger.info(f"Performing scheduled refresh of leaderboard data for version {version}...")
159
+ new_df = get_leaderboard_df(version=version)
160
+ if new_df is not None and not new_df.empty:
161
+ LEADERBOARD_DF = new_df
162
+ logger.info("Scheduled refresh of leaderboard data completed")
163
+ else:
164
+ logger.warning("Refresh returned empty data, keeping existing data")
165
+ # If empty, create a dataframe with correct columns
166
+ if LEADERBOARD_DF is None or LEADERBOARD_DF.empty:
167
+ columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
168
+ LEADERBOARD_DF = pd.DataFrame(columns=columns)
169
  except Exception as e:
170
  logger.error(f"Error in scheduled refresh: {e}")
171
+ # Ensure we have at least an empty dataframe with correct columns
172
+ if LEADERBOARD_DF is None or LEADERBOARD_DF.empty:
173
+ columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
174
+ LEADERBOARD_DF = pd.DataFrame(columns=columns)
175
  return LEADERBOARD_DF
176
 
177
 
 
184
  return [init_leaderboard(new_df)] + [init_leaderboard(df) for df in category_dfs]
185
 
186
 
187
def create_performance_plot(selected_models, category, metric="f1_binary", version=CURRENT_VERSION):
    """
    Create a radar plot comparing model performance for selected models.

    Args:
        selected_models: Iterable of model names; one polar trace is drawn
            for each name that has a row in the loaded dataframe. ``None``
            is treated as "no models selected".
        category: ``"📊 Overall Performance"`` loads the overall leaderboard;
            any other value is forwarded to ``get_category_leaderboard_df``.
        metric: Substring used to pick the metric columns (every column whose
            name contains it becomes one axis of the radar chart).
        version: Benchmark version forwarded to the dataframe loaders.

    Returns:
        A ``plotly.graph_objects.Figure``. A bare, unstyled figure is returned
        when no data could be loaded; otherwise a dark-themed figure holding
        zero or more ``Scatterpolar`` traces.
    """
    # isin(None) raises, so normalise "nothing selected" to an empty list.
    selected_models = list(selected_models) if selected_models else []

    if category == "📊 Overall Performance":
        df = get_leaderboard_df(version=version)
    else:
        df = get_category_leaderboard_df(category, version=version)

    # The loader may hand back None as well as an empty frame.
    if df is None or df.empty:
        return go.Figure()

    # Filter for selected models
    df = df[df['model_name'].isin(selected_models)]

    # Get the relevant metric columns
    metric_cols = [col for col in df.columns if metric in col]

    # Create figure
    fig = go.Figure()

    # Custom colors for different models
    colors = ['#8FCCCC', '#C2A4B6', '#98B4A6', '#B68F7C']  # Pale Cyan, Pale Pink, Pale Green, Pale Orange

    # Without any matching metric column there is nothing to draw, and
    # closing the polygon below would index into an empty list.
    if metric_cols:
        # Axis labels are identical for every model, so build them once.
        # The first label is repeated at the end to complete the polygon.
        axis_labels = [col.replace(f'_{metric}', '') for col in metric_cols]
        axis_labels = axis_labels + [axis_labels[0]]

        # Add traces for each model
        for idx, model in enumerate(selected_models):
            model_data = df[df['model_name'] == model]
            if model_data.empty:
                continue

            values = model_data[metric_cols].values[0].tolist()
            # Add the first value again at the end to complete the polygon
            values = values + [values[0]]

            fig.add_trace(go.Scatterpolar(
                r=values,
                theta=axis_labels,
                name=model,
                # Cycle through the palette when more models than colors.
                line_color=colors[idx % len(colors)],
                fill='toself'
            ))

    # Update layout with all settings at once
    fig.update_layout(
        paper_bgcolor='#000000',
        plot_bgcolor='#000000',
        font={'color': '#ffffff'},
        title={
            'text': f'{category} - {metric.upper()} Score Comparison',
            'font': {'color': '#ffffff', 'size': 24}
        },
        polar=dict(
            bgcolor='#000000',
            radialaxis=dict(
                visible=True,
                range=[0, 1],
                gridcolor='#333333',
                linecolor='#333333',
                tickfont={'color': '#ffffff'},
            ),
            angularaxis=dict(
                gridcolor='#333333',
                linecolor='#333333',
                tickfont={'color': '#ffffff'},
            )
        ),
        height=600,
        showlegend=True,
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99,
            bgcolor='rgba(0,0,0,0.5)',
            font={'color': '#ffffff'}
        )
    )

    return fig
 
269
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
def update_model_choices(version):
    """
    Update the list of available models for the given version.

    Args:
        version: Benchmark version forwarded to ``get_leaderboard_df``.

    Returns:
        A sorted list of the unique, non-missing values in the ``model_name``
        column, or an empty list when no data (or no such column) is
        available.
    """
    # NOTE(review): this returns a plain list, which suits a Dropdown's
    # ``choices=`` at construction time. When used directly as an event
    # handler whose output is a Dropdown, Gradio presumably treats the list
    # as the new *value* rather than new choices -- confirm, and wrap the
    # result in ``gr.update(choices=...)`` at that call site if so.
    df = get_leaderboard_df(version=version)
    if df is None or df.empty or 'model_name' not in df.columns:
        return []
    # Drop missing names before sorting: NaN is a float and cannot be
    # ordered against str, which would make sorted() raise TypeError.
    return sorted(df['model_name'].dropna().unique().tolist())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
 
 
 
280
 
281
def update_visualization(selected_models, selected_category, selected_metric, version):
    """
    Rebuild the comparison plot from the current selector values.

    Returns a blank figure while no model is selected; otherwise delegates
    to ``create_performance_plot`` with the chosen category, metric and
    benchmark version.
    """
    if selected_models:
        return create_performance_plot(
            selected_models,
            selected_category,
            selected_metric,
            version,
        )
    # Nothing picked yet: show an empty canvas.
    return go.Figure()
288
 
 
 
289
 
290
+ # Create Gradio app
291
+ demo = gr.Blocks(css=custom_css)
 
 
 
 
 
 
 
 
 
292
 
293
+ with demo:
294
+ gr.HTML(TITLE)
295
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
296
+
297
+ with gr.Row():
298
+ tabs = gr.Tabs(elem_classes="tab-buttons")
299
+
300
+ with tabs:
301
+ with gr.TabItem("🏅 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
302
+ with gr.Row():
303
+ refresh_button = gr.Button("Refresh Leaderboard", scale=3)
304
+ version_selector = gr.Dropdown(
305
+ choices=BENCHMARK_VERSIONS,
306
+ label="Benchmark Version",
307
+ value=CURRENT_VERSION,
308
  interactive=True,
309
+ elem_classes="version-selector",
310
+ scale=1
311
  )
312
+
313
+ # Create tabs for each category
314
+ with gr.Tabs(elem_classes="category-tabs") as category_tabs:
315
+ # First tab for average metrics across all categories
316
+ with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"):
317
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
318
+
319
+ # Create a tab for each category
320
+ for category in CATEGORIES:
321
+ with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"):
322
+ category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION)
323
+ category_leaderboard = init_leaderboard(category_df)
324
+
325
+ # Refresh button functionality
326
+ refresh_button.click(
327
+ fn=lambda: [
328
+ init_leaderboard(get_leaderboard_df(version=version_selector.value)),
329
+ *[init_leaderboard(get_category_leaderboard_df(category, version=version_selector.value)) for category in CATEGORIES]
330
+ ],
331
+ inputs=[],
332
+ outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
333
+ )
334
+
335
+ with gr.TabItem("📊 Visualize", elem_id="guardbench-viz-tab", id=1):
336
+ with gr.Row():
337
+ with gr.Column():
338
+ viz_version_selector = gr.Dropdown(
339
+ choices=BENCHMARK_VERSIONS,
340
+ label="Benchmark Version",
341
+ value=CURRENT_VERSION,
342
+ interactive=True
343
+ )
344
+ model_selector = gr.Dropdown(
345
+ choices=update_model_choices(CURRENT_VERSION),
346
+ label="Select Models to Compare",
347
+ multiselect=True,
348
+ interactive=True
349
+ )
350
+ with gr.Column():
351
+ # Add Overall Performance to categories
352
+ viz_categories = ["📊 Overall Performance"] + CATEGORIES
353
+ category_selector = gr.Dropdown(
354
+ choices=viz_categories,
355
+ label="Select Category",
356
+ value=viz_categories[0],
357
+ interactive=True
358
+ )
359
+ metric_selector = gr.Dropdown(
360
+ choices=["f1_binary", "precision_binary", "recall_binary"],
361
+ label="Select Metric",
362
+ value="f1_binary",
363
+ interactive=True
364
+ )
365
+
366
+ plot_output = gr.Plot()
367
+
368
+ # Update visualization when any selector changes
369
+ for control in [viz_version_selector, model_selector, category_selector, metric_selector]:
370
+ control.change(
371
+ fn=update_visualization,
372
+ inputs=[model_selector, category_selector, metric_selector, viz_version_selector],
373
+ outputs=plot_output
374
  )
 
375
 
376
+ # Update model choices when version changes
377
+ viz_version_selector.change(
378
+ fn=update_model_choices,
379
+ inputs=[viz_version_selector],
380
+ outputs=[model_selector]
381
  )
382
 
383
+ with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=2):
384
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
385
+
386
+ with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=3):
387
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
388
+
389
+ with gr.Row():
390
+ with gr.Column(scale=3):
391
+ gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
392
+ with gr.Column(scale=1):
393
+ # Add version selector specifically for the submission tab
394
+ submission_version_selector = gr.Dropdown(
395
+ choices=BENCHMARK_VERSIONS,
396
+ label="Benchmark Version",
397
+ value=CURRENT_VERSION,
398
+ interactive=True,
399
+ elem_classes="version-selector"
400
+ )
401
+
402
+ with gr.Row():
403
+ with gr.Column():
404
+ model_name_textbox = gr.Textbox(label="Model name")
405
+ revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
406
+ model_type = gr.Dropdown(
407
+ choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
408
+ label="Model type",
409
+ multiselect=False,
410
+ value=None,
411
+ interactive=True,
412
+ )
413
+ guard_model_type = gr.Dropdown(
414
+ choices=[t.name for t in GuardModelType],
415
+ label="Guard model type",
416
+ multiselect=False,
417
+ value=GuardModelType.LLM_REGEXP.name,
418
+ interactive=True,
419
+ )
420
+
421
+ with gr.Column():
422
+ precision = gr.Dropdown(
423
+ choices=[i.name for i in Precision if i != Precision.Unknown],
424
+ label="Precision",
425
+ multiselect=False,
426
+ value="float16",
427
+ interactive=True,
428
+ )
429
+ weight_type = gr.Dropdown(
430
+ choices=[i.name for i in WeightType],
431
+ label="Weights type",
432
+ multiselect=False,
433
+ value="Original",
434
+ interactive=True,
435
+ )
436
+ base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
437
+
438
+ with gr.Row():
439
+ file_input = gr.File(
440
+ label="Upload JSONL Results File",
441
+ file_types=[".jsonl"]
442
+ )
443
+
444
+ submit_button = gr.Button("Submit Results")
445
+ result_output = gr.Markdown()
446
+
447
+ submit_button.click(
448
+ fn=submit_results,
449
+ inputs=[
450
+ model_name_textbox,
451
+ base_model_name_textbox,
452
+ revision_name_textbox,
453
+ precision,
454
+ weight_type,
455
+ model_type,
456
+ file_input,
457
+ submission_version_selector,
458
+ guard_model_type
459
+ ],
460
+ outputs=result_output
461
+ )
462
 
463
  # Version selector functionality
464
  version_selector.change(
guard-bench-submodule CHANGED
@@ -1 +1 @@
1
- Subproject commit 0a9f48bcedd0ccb6b5cf59ff7ed1186e32a5dc17
 
1
+ Subproject commit 34b40c5c6c766632f460ce7d7f1895881a866d83
logs/guardbench_20250331_202524_f070fad2.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 20:25:27,855 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 20:25:28,116 - __main__ - INFO - Loaded leaderboard with 0 entries
3
+ 2025-03-31 20:25:28,184 - __main__ - WARNING - Initializing empty leaderboard
4
+ 2025-03-31 20:25:28,348 - __main__ - WARNING - Initializing empty leaderboard
5
+ 2025-03-31 20:25:28,514 - __main__ - WARNING - Initializing empty leaderboard
6
+ 2025-03-31 20:25:28,624 - __main__ - WARNING - Initializing empty leaderboard
7
+ 2025-03-31 20:25:28,739 - __main__ - WARNING - Initializing empty leaderboard
8
+ 2025-03-31 20:25:28,869 - __main__ - WARNING - Initializing empty leaderboard
9
+ 2025-03-31 20:25:29,008 - __main__ - WARNING - Initializing empty leaderboard
10
+ 2025-03-31 20:25:29,568 - __main__ - WARNING - Initializing empty leaderboard
11
+ 2025-03-31 20:25:29,687 - __main__ - WARNING - Initializing empty leaderboard
12
+ 2025-03-31 20:25:29,796 - __main__ - WARNING - Initializing empty leaderboard
13
+ 2025-03-31 20:25:29,908 - __main__ - WARNING - Initializing empty leaderboard
14
+ 2025-03-31 20:25:30,148 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
15
+ 2025-03-31 20:25:30,148 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
16
+ 2025-03-31 20:25:30,148 - apscheduler.scheduler - INFO - Scheduler started
17
+ 2025-03-31 20:29:44,676 - __main__ - INFO - Received submission for model chatgpt-4o-latest (CoT): /tmp/gradio/a1f2d3a725f7b441a1fbfdac8e51dfd3bf7bbb4ab2d1c20362cfa130f4bdda6d/chatgpt-4o-latest CoT.jsonl
18
+ 2025-03-31 20:29:44,708 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
19
+ 2025-03-31 20:29:44,923 - __main__ - INFO - Refreshed leaderboard data after submission
logs/guardbench_20250331_203348_6e24b81a.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 20:33:49,499 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 20:33:49,728 - __main__ - INFO - Loaded leaderboard with 0 entries
3
+ 2025-03-31 20:33:49,795 - __main__ - WARNING - Initializing empty leaderboard
4
+ 2025-03-31 20:33:49,946 - __main__ - WARNING - Initializing empty leaderboard
5
+ 2025-03-31 20:33:50,278 - __main__ - WARNING - Initializing empty leaderboard
6
+ 2025-03-31 20:33:50,398 - __main__ - WARNING - Initializing empty leaderboard
7
+ 2025-03-31 20:33:50,624 - __main__ - WARNING - Initializing empty leaderboard
8
+ 2025-03-31 20:33:50,735 - __main__ - WARNING - Initializing empty leaderboard
9
+ 2025-03-31 20:33:50,853 - __main__ - WARNING - Initializing empty leaderboard
10
+ 2025-03-31 20:33:51,000 - __main__ - WARNING - Initializing empty leaderboard
11
+ 2025-03-31 20:33:51,211 - __main__ - WARNING - Initializing empty leaderboard
12
+ 2025-03-31 20:33:51,471 - __main__ - WARNING - Initializing empty leaderboard
13
+ 2025-03-31 20:33:51,591 - __main__ - WARNING - Initializing empty leaderboard
14
+ 2025-03-31 20:33:51,921 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
15
+ 2025-03-31 20:33:51,921 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
16
+ 2025-03-31 20:33:51,922 - apscheduler.scheduler - INFO - Scheduler started
17
+ 2025-03-31 20:37:48,565 - __main__ - INFO - Received submission for model chatgpt-4o-latest (CoT): /tmp/gradio/a1f2d3a725f7b441a1fbfdac8e51dfd3bf7bbb4ab2d1c20362cfa130f4bdda6d/chatgpt-4o-latest CoT.jsonl
18
+ 2025-03-31 20:37:48,595 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
19
+ 2025-03-31 20:37:48,765 - __main__ - INFO - Refreshed leaderboard data after submission
logs/guardbench_20250331_204307_8d77ec17.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 20:43:08,422 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 20:43:08,623 - __main__ - INFO - Loaded leaderboard with 0 entries
3
+ 2025-03-31 20:43:08,693 - __main__ - WARNING - Initializing empty leaderboard
4
+ 2025-03-31 20:43:08,832 - __main__ - WARNING - Initializing empty leaderboard
5
+ 2025-03-31 20:43:08,948 - __main__ - WARNING - Initializing empty leaderboard
6
+ 2025-03-31 20:43:09,071 - __main__ - WARNING - Initializing empty leaderboard
7
+ 2025-03-31 20:43:09,188 - __main__ - WARNING - Initializing empty leaderboard
8
+ 2025-03-31 20:43:09,383 - __main__ - WARNING - Initializing empty leaderboard
9
+ 2025-03-31 20:43:09,494 - __main__ - WARNING - Initializing empty leaderboard
10
+ 2025-03-31 20:43:09,604 - __main__ - WARNING - Initializing empty leaderboard
11
+ 2025-03-31 20:43:09,803 - __main__ - WARNING - Initializing empty leaderboard
12
+ 2025-03-31 20:43:10,013 - __main__ - WARNING - Initializing empty leaderboard
13
+ 2025-03-31 20:43:10,123 - __main__ - WARNING - Initializing empty leaderboard
14
+ 2025-03-31 20:43:10,578 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
15
+ 2025-03-31 20:43:10,579 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
16
+ 2025-03-31 20:43:10,579 - apscheduler.scheduler - INFO - Scheduler started
17
+ 2025-03-31 20:46:56,010 - __main__ - INFO - Received submission for model chatgpt-4o-latest (CoT): /tmp/gradio/a1f2d3a725f7b441a1fbfdac8e51dfd3bf7bbb4ab2d1c20362cfa130f4bdda6d/chatgpt-4o-latest CoT.jsonl
18
+ 2025-03-31 20:46:56,040 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
19
+ 2025-03-31 20:46:57,488 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
20
+ 2025-03-31 20:46:57,488 - guardbench.evaluator - INFO - Starting evaluation for model: chatgpt-4o-latest_(CoT)
21
+ 2025-03-31 20:46:57,488 - guardbench.evaluator - INFO - Using cached results for model: chatgpt-4o-latest_(CoT)
22
+ 2025-03-31 20:46:57,489 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
23
+ 2025-03-31 20:46:57,582 - __main__ - INFO - Refreshed leaderboard data after submission
logs/guardbench_20250331_204836_3421e73f.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 20:48:37,607 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 20:48:37,906 - __main__ - INFO - Loaded leaderboard with 0 entries
3
+ 2025-03-31 20:48:37,976 - __main__ - WARNING - Initializing empty leaderboard
4
+ 2025-03-31 20:48:38,156 - __main__ - WARNING - Initializing empty leaderboard
5
+ 2025-03-31 20:48:38,310 - __main__ - WARNING - Initializing empty leaderboard
6
+ 2025-03-31 20:48:38,484 - __main__ - WARNING - Initializing empty leaderboard
7
+ 2025-03-31 20:48:38,744 - __main__ - WARNING - Initializing empty leaderboard
8
+ 2025-03-31 20:48:38,898 - __main__ - WARNING - Initializing empty leaderboard
9
+ 2025-03-31 20:48:39,059 - __main__ - WARNING - Initializing empty leaderboard
10
+ 2025-03-31 20:48:39,327 - __main__ - WARNING - Initializing empty leaderboard
11
+ 2025-03-31 20:48:39,480 - __main__ - WARNING - Initializing empty leaderboard
12
+ 2025-03-31 20:48:39,670 - __main__ - WARNING - Initializing empty leaderboard
13
+ 2025-03-31 20:48:39,832 - __main__ - WARNING - Initializing empty leaderboard
14
+ 2025-03-31 20:48:40,214 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
15
+ 2025-03-31 20:48:40,214 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
16
+ 2025-03-31 20:48:40,214 - apscheduler.scheduler - INFO - Scheduler started
logs/guardbench_20250331_205152_1140353b.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 20:51:53,395 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 20:51:53,703 - __main__ - INFO - Loaded leaderboard with 0 entries
3
+ 2025-03-31 20:51:53,774 - __main__ - WARNING - Initializing empty leaderboard
4
+ 2025-03-31 20:51:54,056 - __main__ - WARNING - Initializing empty leaderboard
5
+ 2025-03-31 20:51:54,148 - __main__ - WARNING - Initializing empty leaderboard
6
+ 2025-03-31 20:51:54,235 - __main__ - WARNING - Initializing empty leaderboard
7
+ 2025-03-31 20:51:54,326 - __main__ - WARNING - Initializing empty leaderboard
8
+ 2025-03-31 20:51:54,417 - __main__ - WARNING - Initializing empty leaderboard
9
+ 2025-03-31 20:51:54,705 - __main__ - WARNING - Initializing empty leaderboard
10
+ 2025-03-31 20:51:54,797 - __main__ - WARNING - Initializing empty leaderboard
11
+ 2025-03-31 20:51:54,901 - __main__ - WARNING - Initializing empty leaderboard
12
+ 2025-03-31 20:51:54,993 - __main__ - WARNING - Initializing empty leaderboard
13
+ 2025-03-31 20:51:55,092 - __main__ - WARNING - Initializing empty leaderboard
14
+ 2025-03-31 20:51:55,407 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
15
+ 2025-03-31 20:51:55,407 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
16
+ 2025-03-31 20:51:55,407 - apscheduler.scheduler - INFO - Scheduler started
17
+ 2025-03-31 20:53:44,802 - __main__ - INFO - Received submission for model chatgpt-4o-latest (CoT): /tmp/gradio/a1f2d3a725f7b441a1fbfdac8e51dfd3bf7bbb4ab2d1c20362cfa130f4bdda6d/chatgpt-4o-latest CoT.jsonl
18
+ 2025-03-31 20:53:44,829 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
19
+ 2025-03-31 20:53:44,996 - __main__ - INFO - Refreshed leaderboard data after submission
logs/guardbench_20250331_205420_d59a424e.log ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 20:54:21,474 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 20:54:21,606 - __main__ - INFO - Loaded leaderboard with 0 entries
3
+ 2025-03-31 20:54:21,675 - __main__ - WARNING - Initializing empty leaderboard
4
+ 2025-03-31 20:54:21,785 - __main__ - WARNING - Initializing empty leaderboard
5
+ 2025-03-31 20:54:21,881 - __main__ - WARNING - Initializing empty leaderboard
6
+ 2025-03-31 20:54:21,977 - __main__ - WARNING - Initializing empty leaderboard
7
+ 2025-03-31 20:54:22,074 - __main__ - WARNING - Initializing empty leaderboard
8
+ 2025-03-31 20:54:22,169 - __main__ - WARNING - Initializing empty leaderboard
9
+ 2025-03-31 20:54:22,293 - __main__ - WARNING - Initializing empty leaderboard
10
+ 2025-03-31 20:54:22,394 - __main__ - WARNING - Initializing empty leaderboard
11
+ 2025-03-31 20:54:22,505 - __main__ - WARNING - Initializing empty leaderboard
12
+ 2025-03-31 20:54:22,594 - __main__ - WARNING - Initializing empty leaderboard
13
+ 2025-03-31 20:54:22,685 - __main__ - WARNING - Initializing empty leaderboard
14
+ 2025-03-31 20:54:22,997 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
15
+ 2025-03-31 20:54:22,997 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
16
+ 2025-03-31 20:54:22,997 - apscheduler.scheduler - INFO - Scheduler started
17
+ 2025-03-31 20:55:51,877 - __main__ - INFO - Received submission for model chatgpt-4o-latest (CoT): /tmp/gradio/a1f2d3a725f7b441a1fbfdac8e51dfd3bf7bbb4ab2d1c20362cfa130f4bdda6d/chatgpt-4o-latest CoT.jsonl
18
+ 2025-03-31 20:55:51,906 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
19
+ 2025-03-31 20:55:52,929 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
20
+ 2025-03-31 20:55:52,929 - guardbench.evaluator - INFO - Starting evaluation for model: chatgpt-4o-latest_(CoT)
21
+ 2025-03-31 20:55:52,929 - guardbench.evaluator - INFO - Using cached results for model: chatgpt-4o-latest_(CoT)
22
+ 2025-03-31 20:55:52,966 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
23
+ 2025-03-31 20:55:52,970 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
24
+ 2025-03-31 20:55:53,073 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
25
+ 2025-03-31 20:55:53,076 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
26
+ 2025-03-31 20:55:53,175 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
27
+ 2025-03-31 20:55:53,178 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
28
+ 2025-03-31 20:55:53,281 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
29
+ 2025-03-31 20:55:53,284 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
30
+ 2025-03-31 20:55:53,386 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
31
+ 2025-03-31 20:55:53,390 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
32
+ 2025-03-31 20:55:53,487 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
33
+ 2025-03-31 20:55:53,491 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
34
+ 2025-03-31 20:55:53,592 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
35
+ 2025-03-31 20:55:53,596 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
36
+ 2025-03-31 20:55:53,698 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
37
+ 2025-03-31 20:55:53,701 - guardbench.evaluator - INFO - Length Safe Prompts - 490
38
+ 2025-03-31 20:55:54,267 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
39
+ 2025-03-31 20:55:54,271 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
40
+ 2025-03-31 20:55:54,371 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
41
+ 2025-03-31 20:55:54,375 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
42
+ 2025-03-31 20:55:54,431 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
43
+ 2025-03-31 20:55:54,434 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
44
+ 2025-03-31 20:55:54,534 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
45
+ 2025-03-31 20:55:54,537 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
46
+ 2025-03-31 20:55:54,634 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
47
+ 2025-03-31 20:55:54,638 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
48
+ 2025-03-31 20:55:54,738 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
49
+ 2025-03-31 20:55:54,741 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
50
+ 2025-03-31 20:55:54,841 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
51
+ 2025-03-31 20:55:54,844 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
52
+ 2025-03-31 20:55:54,945 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
53
+ 2025-03-31 20:55:54,949 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
54
+ 2025-03-31 20:55:55,049 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
55
+ 2025-03-31 20:55:55,052 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
56
+ 2025-03-31 20:55:55,152 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
57
+ 2025-03-31 20:55:55,156 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
58
+ 2025-03-31 20:55:55,260 - guardbench.evaluator - INFO - Updated leaderboard for model: chatgpt-4o-latest_(CoT) from cached results
59
+ 2025-03-31 20:55:55,262 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: chatgpt-4o-latest_(CoT)
60
+ 2025-03-31 20:55:56,838 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
61
+ 2025-03-31 20:55:57,001 - __main__ - INFO - Refreshed leaderboard data after submission
logs/guardbench_20250331_205853_bf4fa85f.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 20:58:54,582 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 20:58:54,675 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_210025_ba691e37.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 21:00:26,667 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:00:26,793 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_210109_f46123b9.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 21:01:10,756 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:01:10,828 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_210119_f0e10b9b.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 21:01:20,172 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:01:20,242 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_210208_029d8318.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 21:02:09,399 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:02:09,500 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_210459_437adc64.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 21:04:59,995 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:05:00,111 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_210550_0358568e.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 21:05:51,686 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:05:51,759 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_210641_91d1d06b.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 21:06:42,594 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:06:42,717 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_210920_cd35c4f0.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 21:09:21,490 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:09:21,613 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_211215_a8cf06a3.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 21:12:16,593 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:12:16,704 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_211255_d32ee1a4.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 21:12:56,431 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:12:56,505 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_211332_24223f1a.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 21:13:33,061 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:13:33,189 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_211545_244674df.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 21:15:46,264 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:15:46,382 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_211735_6d503239.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 21:17:36,939 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:17:37,057 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_211916_3e96de42.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 21:19:17,112 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:19:17,224 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_212027_f9d450e9.log ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 2025-03-31 21:20:28,558 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:20:28,829 - __main__ - INFO - Loaded leaderboard with 1 entries
3
+ 2025-03-31 21:20:30,189 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
4
+ 2025-03-31 21:20:30,189 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
5
+ 2025-03-31 21:20:30,189 - apscheduler.scheduler - INFO - Scheduler started
logs/guardbench_20250331_212722_655f3190.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-03-31 21:27:23,630 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:27:23,765 - __main__ - INFO - Loaded leaderboard with 1 entries
logs/guardbench_20250331_213207_74f9e2de.log ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 2025-03-31 21:32:08,256 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:32:08,412 - __main__ - INFO - Loaded leaderboard with 1 entries
3
+ 2025-03-31 21:32:09,385 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
4
+ 2025-03-31 21:32:09,385 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
5
+ 2025-03-31 21:32:09,386 - apscheduler.scheduler - INFO - Scheduler started
logs/guardbench_20250331_213331_f72d2f6a.log ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 2025-03-31 21:33:32,956 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:33:33,112 - __main__ - INFO - Loaded leaderboard with 1 entries
3
+ 2025-03-31 21:33:34,050 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
4
+ 2025-03-31 21:33:34,050 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
5
+ 2025-03-31 21:33:34,050 - apscheduler.scheduler - INFO - Scheduler started
logs/guardbench_20250331_214118_0da0491f.log ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 21:41:19,264 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:41:19,411 - __main__ - INFO - Loaded leaderboard with 1 entries
3
+ 2025-03-31 21:41:20,492 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
4
+ 2025-03-31 21:41:20,492 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
5
+ 2025-03-31 21:41:20,492 - apscheduler.scheduler - INFO - Scheduler started
6
+ 2025-03-31 21:44:25,074 - __main__ - INFO - Received submission for model gpt-4o-mini (CoT): /tmp/gradio/35fc6ab7ba3af1e1b210ed2851ec70f52004490c3534b64bfd8e4830f5cccea0/gpt-4o-mini CoT.jsonl
7
+ 2025-03-31 21:44:25,100 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
8
+ 2025-03-31 21:44:26,183 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
9
+ 2025-03-31 21:44:26,183 - guardbench.evaluator - INFO - Starting evaluation for model: gpt-4o-mini_(CoT)
10
+ 2025-03-31 21:44:26,183 - guardbench.evaluator - INFO - Using cached results for model: gpt-4o-mini_(CoT)
11
+ 2025-03-31 21:44:26,214 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
12
+ 2025-03-31 21:44:26,218 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
13
+ 2025-03-31 21:44:26,486 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
14
+ 2025-03-31 21:44:26,490 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
15
+ 2025-03-31 21:44:26,594 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
16
+ 2025-03-31 21:44:26,597 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
17
+ 2025-03-31 21:44:26,700 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
18
+ 2025-03-31 21:44:26,703 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
19
+ 2025-03-31 21:44:26,806 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
20
+ 2025-03-31 21:44:26,810 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
21
+ 2025-03-31 21:44:26,908 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
22
+ 2025-03-31 21:44:26,912 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
23
+ 2025-03-31 21:44:27,015 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
24
+ 2025-03-31 21:44:27,018 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
25
+ 2025-03-31 21:44:27,130 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
26
+ 2025-03-31 21:44:27,134 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
27
+ 2025-03-31 21:44:27,201 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
28
+ 2025-03-31 21:44:27,205 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
29
+ 2025-03-31 21:44:27,326 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
30
+ 2025-03-31 21:44:27,330 - guardbench.evaluator - INFO - Length Safe Prompts - 490
31
+ 2025-03-31 21:44:27,962 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
32
+ 2025-03-31 21:44:27,966 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
33
+ 2025-03-31 21:44:28,070 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
34
+ 2025-03-31 21:44:28,074 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
35
+ 2025-03-31 21:44:28,175 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
36
+ 2025-03-31 21:44:28,179 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
37
+ 2025-03-31 21:44:28,282 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
38
+ 2025-03-31 21:44:28,286 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
39
+ 2025-03-31 21:44:28,386 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
40
+ 2025-03-31 21:44:28,390 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
41
+ 2025-03-31 21:44:28,489 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
42
+ 2025-03-31 21:44:28,493 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
43
+ 2025-03-31 21:44:28,594 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
44
+ 2025-03-31 21:44:28,598 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
45
+ 2025-03-31 21:44:28,702 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
46
+ 2025-03-31 21:44:28,705 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
47
+ 2025-03-31 21:44:28,813 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt-4o-mini_(CoT) from cached results
48
+ 2025-03-31 21:44:28,815 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt-4o-mini_(CoT)
49
+ 2025-03-31 21:44:30,083 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
50
+ 2025-03-31 21:44:30,284 - __main__ - INFO - Refreshed leaderboard data after submission
logs/guardbench_20250331_214511_0a5acf8b.log ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 2025-03-31 21:45:12,578 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:45:12,654 - __main__ - INFO - Loaded leaderboard with 2 entries
3
+ 2025-03-31 21:45:13,819 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
4
+ 2025-03-31 21:45:13,819 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
5
+ 2025-03-31 21:45:13,819 - apscheduler.scheduler - INFO - Scheduler started
logs/guardbench_20250331_214841_4df080f3.log ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 2025-03-31 21:48:42,942 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:48:43,058 - __main__ - INFO - Loaded leaderboard with 2 entries
3
+ 2025-03-31 21:48:43,995 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
4
+ 2025-03-31 21:48:43,995 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
5
+ 2025-03-31 21:48:43,995 - apscheduler.scheduler - INFO - Scheduler started
logs/guardbench_20250331_215007_9c98c60a.log ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 2025-03-31 21:50:08,436 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:50:08,552 - __main__ - INFO - Loaded leaderboard with 2 entries
3
+ 2025-03-31 21:50:09,654 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
4
+ 2025-03-31 21:50:09,654 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
5
+ 2025-03-31 21:50:09,655 - apscheduler.scheduler - INFO - Scheduler started
logs/guardbench_20250331_215514_ad36e4b4.log ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 21:55:15,352 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 21:55:15,454 - __main__ - INFO - Loaded leaderboard with 2 entries
3
+ 2025-03-31 21:55:16,351 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
4
+ 2025-03-31 21:55:16,352 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
5
+ 2025-03-31 21:55:16,352 - apscheduler.scheduler - INFO - Scheduler started
6
+ 2025-03-31 21:57:09,862 - __main__ - INFO - Received submission for model gpt-4o-mini: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
7
+ 2025-03-31 21:57:10,863 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
8
+ 2025-03-31 21:57:11,927 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
9
+ 2025-03-31 21:57:11,927 - guardbench.evaluator - INFO - Starting evaluation for model: gpt-4o-mini
10
+ 2025-03-31 21:57:11,927 - guardbench.evaluator - INFO - Using cached results for model: gpt-4o-mini
11
+ 2025-03-31 21:57:11,947 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
12
+ 2025-03-31 21:57:11,951 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
13
+ 2025-03-31 21:57:12,054 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
14
+ 2025-03-31 21:57:12,058 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
15
+ 2025-03-31 21:57:12,155 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
16
+ 2025-03-31 21:57:12,158 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
17
+ 2025-03-31 21:57:12,215 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
18
+ 2025-03-31 21:57:12,219 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
19
+ 2025-03-31 21:57:12,319 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
20
+ 2025-03-31 21:57:12,322 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
21
+ 2025-03-31 21:57:12,423 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
22
+ 2025-03-31 21:57:12,426 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
23
+ 2025-03-31 21:57:12,526 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
24
+ 2025-03-31 21:57:12,530 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
25
+ 2025-03-31 21:57:12,631 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
26
+ 2025-03-31 21:57:12,634 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
27
+ 2025-03-31 21:57:12,736 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
28
+ 2025-03-31 21:57:12,740 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
29
+ 2025-03-31 21:57:12,840 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
30
+ 2025-03-31 21:57:12,843 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
31
+ 2025-03-31 21:57:12,943 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
32
+ 2025-03-31 21:57:12,946 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
33
+ 2025-03-31 21:57:13,045 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
34
+ 2025-03-31 21:57:13,049 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
35
+ 2025-03-31 21:57:13,147 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
36
+ 2025-03-31 21:57:13,151 - guardbench.evaluator - INFO - Length Safe Prompts - 490
37
+ 2025-03-31 21:57:13,701 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
38
+ 2025-03-31 21:57:13,705 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
39
+ 2025-03-31 21:57:13,805 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
40
+ 2025-03-31 21:57:13,809 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
41
+ 2025-03-31 21:57:13,905 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
42
+ 2025-03-31 21:57:13,909 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
43
+ 2025-03-31 21:57:14,008 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
44
+ 2025-03-31 21:57:14,011 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
45
+ 2025-03-31 21:57:14,113 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
46
+ 2025-03-31 21:57:14,117 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
47
+ 2025-03-31 21:57:14,219 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt-4o-mini from cached results
48
+ 2025-03-31 21:57:14,220 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt-4o-mini
49
+ 2025-03-31 21:57:15,528 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
50
+ 2025-03-31 21:57:15,650 - __main__ - INFO - Refreshed leaderboard data after submission
logs/guardbench_20250331_220348_0cb4d8e9.log ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 22:03:49,677 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 22:03:49,841 - __main__ - INFO - Loaded leaderboard with 3 entries
3
+ 2025-03-31 22:03:51,379 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
4
+ 2025-03-31 22:03:51,379 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
5
+ 2025-03-31 22:03:51,379 - apscheduler.scheduler - INFO - Scheduler started
6
+ 2025-03-31 22:04:45,096 - __main__ - INFO - Received submission for model gpt4omini-TEST: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
7
+ 2025-03-31 22:04:45,574 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
8
+ 2025-03-31 22:04:46,470 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
9
+ 2025-03-31 22:04:46,470 - guardbench.evaluator - INFO - Starting evaluation for model: gpt4omini-TEST
10
+ 2025-03-31 22:04:46,470 - guardbench.evaluator - INFO - Using cached results for model: gpt4omini-TEST
11
+ 2025-03-31 22:04:46,490 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
12
+ 2025-03-31 22:04:46,493 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
13
+ 2025-03-31 22:04:46,595 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
14
+ 2025-03-31 22:04:46,600 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
15
+ 2025-03-31 22:04:46,702 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
16
+ 2025-03-31 22:04:46,706 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
17
+ 2025-03-31 22:04:46,804 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
18
+ 2025-03-31 22:04:46,808 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
19
+ 2025-03-31 22:04:46,914 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
20
+ 2025-03-31 22:04:46,917 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
21
+ 2025-03-31 22:04:47,019 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
22
+ 2025-03-31 22:04:47,023 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
23
+ 2025-03-31 22:04:47,128 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
24
+ 2025-03-31 22:04:47,132 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
25
+ 2025-03-31 22:04:47,237 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
26
+ 2025-03-31 22:04:47,240 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
27
+ 2025-03-31 22:04:47,355 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
28
+ 2025-03-31 22:04:47,359 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
29
+ 2025-03-31 22:04:47,462 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
30
+ 2025-03-31 22:04:47,466 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
31
+ 2025-03-31 22:04:47,573 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
32
+ 2025-03-31 22:04:47,576 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
33
+ 2025-03-31 22:04:47,694 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
34
+ 2025-03-31 22:04:47,697 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
35
+ 2025-03-31 22:04:47,804 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
36
+ 2025-03-31 22:04:47,808 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
37
+ 2025-03-31 22:04:47,913 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
38
+ 2025-03-31 22:04:47,917 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
39
+ 2025-03-31 22:04:48,022 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
40
+ 2025-03-31 22:04:48,026 - guardbench.evaluator - INFO - Length Safe Prompts - 490
41
+ 2025-03-31 22:04:48,605 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
42
+ 2025-03-31 22:04:48,609 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
43
+ 2025-03-31 22:04:48,667 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
44
+ 2025-03-31 22:04:48,671 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
45
+ 2025-03-31 22:04:48,771 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
46
+ 2025-03-31 22:04:48,775 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
47
+ 2025-03-31 22:04:48,882 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt4omini-TEST from cached results
48
+ 2025-03-31 22:04:48,883 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt4omini-TEST
49
+ 2025-03-31 22:04:50,345 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
50
+ 2025-03-31 22:04:50,514 - __main__ - INFO - Refreshed leaderboard data after submission
logs/guardbench_20250331_220638_21aa20f2.log ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 22:06:39,559 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 22:06:39,705 - __main__ - INFO - Loaded leaderboard with 4 entries
3
+ 2025-03-31 22:06:41,404 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
4
+ 2025-03-31 22:06:41,404 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
5
+ 2025-03-31 22:06:41,404 - apscheduler.scheduler - INFO - Scheduler started
6
+ 2025-03-31 22:07:29,463 - __main__ - INFO - Received submission for model gpt4omini-TEST2: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
7
+ 2025-03-31 22:07:30,063 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
8
+ 2025-03-31 22:07:31,596 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
9
+ 2025-03-31 22:07:31,597 - guardbench.evaluator - INFO - Starting evaluation for model: gpt4omini-TEST2
10
+ 2025-03-31 22:07:31,597 - guardbench.evaluator - INFO - Using cached results for model: gpt4omini-TEST2
11
+ 2025-03-31 22:07:31,616 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
12
+ 2025-03-31 22:07:31,620 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
13
+ 2025-03-31 22:07:31,723 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
14
+ 2025-03-31 22:07:31,727 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
15
+ 2025-03-31 22:07:31,828 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
16
+ 2025-03-31 22:07:31,831 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
17
+ 2025-03-31 22:07:31,934 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
18
+ 2025-03-31 22:07:31,937 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
19
+ 2025-03-31 22:07:32,040 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
20
+ 2025-03-31 22:07:32,043 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
21
+ 2025-03-31 22:07:32,144 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
22
+ 2025-03-31 22:07:32,148 - guardbench.evaluator - INFO - Length Safe Prompts - 490
23
+ 2025-03-31 22:07:32,703 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
24
+ 2025-03-31 22:07:32,706 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
25
+ 2025-03-31 22:07:32,808 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
26
+ 2025-03-31 22:07:32,812 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
27
+ 2025-03-31 22:07:32,912 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
28
+ 2025-03-31 22:07:32,916 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
29
+ 2025-03-31 22:07:33,018 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
30
+ 2025-03-31 22:07:33,022 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
31
+ 2025-03-31 22:07:33,123 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
32
+ 2025-03-31 22:07:33,127 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
33
+ 2025-03-31 22:07:33,231 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
34
+ 2025-03-31 22:07:33,234 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
35
+ 2025-03-31 22:07:33,336 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
36
+ 2025-03-31 22:07:33,339 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
37
+ 2025-03-31 22:07:33,438 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
38
+ 2025-03-31 22:07:33,441 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
39
+ 2025-03-31 22:07:33,545 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
40
+ 2025-03-31 22:07:33,548 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
41
+ 2025-03-31 22:07:33,647 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
42
+ 2025-03-31 22:07:33,650 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
43
+ 2025-03-31 22:07:33,707 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
44
+ 2025-03-31 22:07:33,711 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
45
+ 2025-03-31 22:07:33,807 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
46
+ 2025-03-31 22:07:33,811 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
47
+ 2025-03-31 22:07:33,915 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt4omini-TEST2 from cached results
48
+ 2025-03-31 22:07:33,917 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt4omini-TEST2
49
+ 2025-03-31 22:07:36,275 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
50
+ 2025-03-31 22:07:36,423 - __main__ - INFO - Refreshed leaderboard data after submission
logs/guardbench_20250331_221124_3ffa908f.log ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 22:11:25,101 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 22:11:25,216 - __main__ - INFO - Loaded leaderboard with 5 entries
3
+ 2025-03-31 22:11:26,084 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
4
+ 2025-03-31 22:11:26,084 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
5
+ 2025-03-31 22:11:26,085 - apscheduler.scheduler - INFO - Scheduler started
6
+ 2025-03-31 22:12:12,091 - __main__ - INFO - Received submission for model gpt4omini-TEST5: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
7
+ 2025-03-31 22:12:12,831 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
8
+ 2025-03-31 22:12:14,244 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
9
+ 2025-03-31 22:12:14,244 - guardbench.evaluator - INFO - Starting evaluation for model: gpt4omini-TEST5
10
+ 2025-03-31 22:12:14,244 - guardbench.evaluator - INFO - Using cached results for model: gpt4omini-TEST5
11
+ 2025-03-31 22:12:14,263 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
12
+ 2025-03-31 22:12:14,267 - guardbench.evaluator - INFO - Length Safe Prompts - 490
13
+ 2025-03-31 22:12:14,831 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
14
+ 2025-03-31 22:12:14,835 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
15
+ 2025-03-31 22:12:14,939 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
16
+ 2025-03-31 22:12:14,942 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
17
+ 2025-03-31 22:12:15,044 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
18
+ 2025-03-31 22:12:15,047 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
19
+ 2025-03-31 22:12:15,149 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
20
+ 2025-03-31 22:12:15,152 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
21
+ 2025-03-31 22:12:15,255 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
22
+ 2025-03-31 22:12:15,258 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
23
+ 2025-03-31 22:12:15,361 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
24
+ 2025-03-31 22:12:15,364 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
25
+ 2025-03-31 22:12:15,463 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
26
+ 2025-03-31 22:12:15,466 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
27
+ 2025-03-31 22:12:15,569 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
28
+ 2025-03-31 22:12:15,573 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
29
+ 2025-03-31 22:12:15,676 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
30
+ 2025-03-31 22:12:15,680 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
31
+ 2025-03-31 22:12:15,782 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
32
+ 2025-03-31 22:12:15,785 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
33
+ 2025-03-31 22:12:15,843 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
34
+ 2025-03-31 22:12:15,847 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
35
+ 2025-03-31 22:12:15,949 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
36
+ 2025-03-31 22:12:15,953 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
37
+ 2025-03-31 22:12:16,053 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
38
+ 2025-03-31 22:12:16,056 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
39
+ 2025-03-31 22:12:16,156 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
40
+ 2025-03-31 22:12:16,160 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
41
+ 2025-03-31 22:12:16,263 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
42
+ 2025-03-31 22:12:16,267 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
43
+ 2025-03-31 22:12:16,369 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
44
+ 2025-03-31 22:12:16,373 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
45
+ 2025-03-31 22:12:16,474 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
46
+ 2025-03-31 22:12:16,478 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
47
+ 2025-03-31 22:12:16,583 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt4omini-TEST5 from cached results
48
+ 2025-03-31 22:12:16,584 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt4omini-TEST5
49
+ 2025-03-31 22:12:18,047 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
50
+ 2025-03-31 22:12:18,174 - __main__ - INFO - Refreshed leaderboard data after submission
logs/guardbench_20250331_221755_fc667123.log ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 22:17:56,280 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 22:17:56,406 - __main__ - INFO - Loaded leaderboard with 6 entries
3
+ 2025-03-31 22:17:57,983 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
4
+ 2025-03-31 22:17:57,983 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
5
+ 2025-03-31 22:17:57,983 - apscheduler.scheduler - INFO - Scheduler started
6
+ 2025-03-31 22:18:49,068 - __main__ - INFO - Received submission for model gpt4omini-TEST27: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
7
+ 2025-03-31 22:18:49,596 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
8
+ 2025-03-31 22:18:50,543 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
9
+ 2025-03-31 22:18:50,543 - guardbench.evaluator - INFO - Starting evaluation for model: gpt4omini-TEST27
10
+ 2025-03-31 22:18:50,543 - guardbench.evaluator - INFO - Using cached results for model: gpt4omini-TEST27
11
+ 2025-03-31 22:18:50,563 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
12
+ 2025-03-31 22:18:50,567 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
13
+ 2025-03-31 22:18:50,668 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
14
+ 2025-03-31 22:18:50,672 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
15
+ 2025-03-31 22:18:50,769 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
16
+ 2025-03-31 22:18:50,772 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
17
+ 2025-03-31 22:18:50,873 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
18
+ 2025-03-31 22:18:50,877 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
19
+ 2025-03-31 22:18:50,978 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
20
+ 2025-03-31 22:18:50,982 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
21
+ 2025-03-31 22:18:51,085 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
22
+ 2025-03-31 22:18:51,088 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
23
+ 2025-03-31 22:18:51,191 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
24
+ 2025-03-31 22:18:51,195 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
25
+ 2025-03-31 22:18:51,297 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
26
+ 2025-03-31 22:18:51,300 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
27
+ 2025-03-31 22:18:51,398 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
28
+ 2025-03-31 22:18:51,401 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
29
+ 2025-03-31 22:18:51,503 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
30
+ 2025-03-31 22:18:51,507 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
31
+ 2025-03-31 22:18:51,607 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
32
+ 2025-03-31 22:18:51,611 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
33
+ 2025-03-31 22:18:51,712 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
34
+ 2025-03-31 22:18:51,716 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
35
+ 2025-03-31 22:18:51,817 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
36
+ 2025-03-31 22:18:51,821 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
37
+ 2025-03-31 22:18:51,923 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
38
+ 2025-03-31 22:18:51,926 - guardbench.evaluator - INFO - Length Safe Prompts - 490
39
+ 2025-03-31 22:18:52,476 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
40
+ 2025-03-31 22:18:52,480 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
41
+ 2025-03-31 22:18:52,582 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
42
+ 2025-03-31 22:18:52,586 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
43
+ 2025-03-31 22:18:52,686 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
44
+ 2025-03-31 22:18:52,690 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
45
+ 2025-03-31 22:18:52,747 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
46
+ 2025-03-31 22:18:52,750 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
47
+ 2025-03-31 22:18:52,851 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt4omini-TEST27 from cached results
48
+ 2025-03-31 22:18:52,852 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt4omini-TEST27
49
+ 2025-03-31 22:18:54,180 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
50
+ 2025-03-31 22:18:54,298 - __main__ - INFO - Refreshed leaderboard data after submission
logs/guardbench_20250331_222103_90a3095d.log ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 22:21:04,147 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 22:21:04,266 - __main__ - INFO - Loaded leaderboard with 7 entries
3
+ 2025-03-31 22:21:06,082 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
4
+ 2025-03-31 22:21:06,083 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
5
+ 2025-03-31 22:21:06,083 - apscheduler.scheduler - INFO - Scheduler started
6
+ 2025-03-31 22:21:41,064 - __main__ - INFO - Received submission for model gpt-4o-mini-TEST6: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
7
+ 2025-03-31 22:21:42,056 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
8
+ 2025-03-31 22:21:42,922 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
9
+ 2025-03-31 22:21:42,922 - guardbench.evaluator - INFO - Starting evaluation for model: gpt-4o-mini-TEST6
10
+ 2025-03-31 22:21:42,922 - guardbench.evaluator - INFO - Using cached results for model: gpt-4o-mini-TEST6
11
+ 2025-03-31 22:21:42,942 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
12
+ 2025-03-31 22:21:42,945 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
13
+ 2025-03-31 22:21:43,044 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
14
+ 2025-03-31 22:21:43,048 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
15
+ 2025-03-31 22:21:43,151 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
16
+ 2025-03-31 22:21:43,155 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
17
+ 2025-03-31 22:21:43,257 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
18
+ 2025-03-31 22:21:43,260 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
19
+ 2025-03-31 22:21:43,360 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
20
+ 2025-03-31 22:21:43,363 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
21
+ 2025-03-31 22:21:43,461 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
22
+ 2025-03-31 22:21:43,465 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
23
+ 2025-03-31 22:21:43,563 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
24
+ 2025-03-31 22:21:43,567 - guardbench.evaluator - INFO - Length Safe Prompts - 490
25
+ 2025-03-31 22:21:44,119 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
26
+ 2025-03-31 22:21:44,123 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
27
+ 2025-03-31 22:21:44,223 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
28
+ 2025-03-31 22:21:44,226 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
29
+ 2025-03-31 22:21:44,323 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
30
+ 2025-03-31 22:21:44,326 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
31
+ 2025-03-31 22:21:44,383 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
32
+ 2025-03-31 22:21:44,387 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
33
+ 2025-03-31 22:21:44,487 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
34
+ 2025-03-31 22:21:44,490 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
35
+ 2025-03-31 22:21:44,590 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
36
+ 2025-03-31 22:21:44,593 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
37
+ 2025-03-31 22:21:44,694 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
38
+ 2025-03-31 22:21:44,698 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
39
+ 2025-03-31 22:21:44,799 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
40
+ 2025-03-31 22:21:44,802 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
41
+ 2025-03-31 22:21:44,903 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
42
+ 2025-03-31 22:21:44,907 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
43
+ 2025-03-31 22:21:45,008 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
44
+ 2025-03-31 22:21:45,011 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
45
+ 2025-03-31 22:21:45,112 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
46
+ 2025-03-31 22:21:45,116 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
47
+ 2025-03-31 22:21:45,222 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt-4o-mini-TEST6 from cached results
48
+ 2025-03-31 22:21:45,223 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt-4o-mini-TEST6
49
+ 2025-03-31 22:21:47,120 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
50
+ 2025-03-31 22:21:47,363 - __main__ - INFO - Refreshed leaderboard data after submission
logs/guardbench_20250331_222531_b0fff871.log ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 22:25:32,811 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 22:25:32,908 - __main__ - INFO - Loaded leaderboard with 8 entries
3
+ 2025-03-31 22:25:34,603 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
4
+ 2025-03-31 22:25:34,603 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
5
+ 2025-03-31 22:25:34,603 - apscheduler.scheduler - INFO - Scheduler started
6
+ 2025-03-31 22:26:13,775 - __main__ - INFO - Received submission for model gpt4omini-TEST7: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
7
+ 2025-03-31 22:26:14,580 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
8
+ 2025-03-31 22:26:15,600 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
9
+ 2025-03-31 22:26:15,600 - guardbench.evaluator - INFO - Starting evaluation for model: gpt4omini-TEST7
10
+ 2025-03-31 22:26:15,601 - guardbench.evaluator - INFO - Using cached results for model: gpt4omini-TEST7
11
+ 2025-03-31 22:26:15,620 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
12
+ 2025-03-31 22:26:15,624 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
13
+ 2025-03-31 22:26:15,727 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
14
+ 2025-03-31 22:26:15,731 - guardbench.evaluator - INFO - Length Safe Prompts - 490
15
+ 2025-03-31 22:26:16,284 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
16
+ 2025-03-31 22:26:16,287 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
17
+ 2025-03-31 22:26:16,389 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
18
+ 2025-03-31 22:26:16,392 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
19
+ 2025-03-31 22:26:16,488 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
20
+ 2025-03-31 22:26:16,491 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
21
+ 2025-03-31 22:26:16,593 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
22
+ 2025-03-31 22:26:16,597 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
23
+ 2025-03-31 22:26:16,696 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
24
+ 2025-03-31 22:26:16,700 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
25
+ 2025-03-31 22:26:16,758 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
26
+ 2025-03-31 22:26:16,761 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
27
+ 2025-03-31 22:26:16,863 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
28
+ 2025-03-31 22:26:16,866 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
29
+ 2025-03-31 22:26:16,964 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
30
+ 2025-03-31 22:26:16,968 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
31
+ 2025-03-31 22:26:17,069 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
32
+ 2025-03-31 22:26:17,073 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
33
+ 2025-03-31 22:26:17,172 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
34
+ 2025-03-31 22:26:17,176 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
35
+ 2025-03-31 22:26:17,276 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
36
+ 2025-03-31 22:26:17,279 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
37
+ 2025-03-31 22:26:17,381 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
38
+ 2025-03-31 22:26:17,384 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
39
+ 2025-03-31 22:26:17,480 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
40
+ 2025-03-31 22:26:17,484 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
41
+ 2025-03-31 22:26:17,583 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
42
+ 2025-03-31 22:26:17,586 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
43
+ 2025-03-31 22:26:17,687 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
44
+ 2025-03-31 22:26:17,691 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
45
+ 2025-03-31 22:26:17,793 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
46
+ 2025-03-31 22:26:17,796 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
47
+ 2025-03-31 22:26:17,903 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt4omini-TEST7 from cached results
48
+ 2025-03-31 22:26:17,905 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt4omini-TEST7
49
+ 2025-03-31 22:26:19,582 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
50
+ 2025-03-31 22:26:19,716 - __main__ - INFO - Refreshed leaderboard data after submission
logs/guardbench_20250331_223148_4e22eb66.log ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 22:31:49,992 - __main__ - INFO - Initializing leaderboard data...
2
+ 2025-03-31 22:31:50,115 - __main__ - INFO - Loaded leaderboard with 9 entries
3
+ 2025-03-31 22:31:51,543 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
4
+ 2025-03-31 22:31:51,543 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
5
+ 2025-03-31 22:31:51,543 - apscheduler.scheduler - INFO - Scheduler started
6
+ 2025-03-31 22:32:19,107 - __main__ - INFO - Received submission for model got-r0mini-8: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
7
+ 2025-03-31 22:32:19,583 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
8
+ 2025-03-31 22:32:20,839 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
9
+ 2025-03-31 22:32:20,839 - guardbench.evaluator - INFO - Starting evaluation for model: got-r0mini-8
10
+ 2025-03-31 22:32:20,839 - guardbench.evaluator - INFO - Using cached results for model: got-r0mini-8
11
+ 2025-03-31 22:32:20,857 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
12
+ 2025-03-31 22:32:20,861 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
13
+ 2025-03-31 22:32:20,960 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
14
+ 2025-03-31 22:32:20,963 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
15
+ 2025-03-31 22:32:21,062 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
16
+ 2025-03-31 22:32:21,065 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
17
+ 2025-03-31 22:32:21,165 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
18
+ 2025-03-31 22:32:21,168 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
19
+ 2025-03-31 22:32:21,266 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
20
+ 2025-03-31 22:32:21,269 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
21
+ 2025-03-31 22:32:21,367 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
22
+ 2025-03-31 22:32:21,371 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
23
+ 2025-03-31 22:32:21,469 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
24
+ 2025-03-31 22:32:21,473 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
25
+ 2025-03-31 22:32:21,568 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
26
+ 2025-03-31 22:32:21,571 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
27
+ 2025-03-31 22:32:21,669 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
28
+ 2025-03-31 22:32:21,673 - guardbench.evaluator - INFO - Length Safe Prompts - 490
29
+ 2025-03-31 22:32:22,215 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
30
+ 2025-03-31 22:32:22,219 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
31
+ 2025-03-31 22:32:22,318 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
32
+ 2025-03-31 22:32:22,321 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
33
+ 2025-03-31 22:32:22,418 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
34
+ 2025-03-31 22:32:22,422 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
35
+ 2025-03-31 22:32:22,476 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
36
+ 2025-03-31 22:32:22,480 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
37
+ 2025-03-31 22:32:22,578 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
38
+ 2025-03-31 22:32:22,581 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
39
+ 2025-03-31 22:32:22,678 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
40
+ 2025-03-31 22:32:22,681 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
41
+ 2025-03-31 22:32:22,779 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
42
+ 2025-03-31 22:32:22,783 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
43
+ 2025-03-31 22:32:22,884 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
44
+ 2025-03-31 22:32:22,888 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
45
+ 2025-03-31 22:32:22,984 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
46
+ 2025-03-31 22:32:22,987 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
47
+ 2025-03-31 22:32:23,092 - guardbench.evaluator - INFO - Updated leaderboard for model: got-r0mini-8 from cached results
48
+ 2025-03-31 22:32:23,093 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: got-r0mini-8
49
+ 2025-03-31 22:32:25,575 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
50
+ 2025-03-31 22:32:25,790 - __main__ - INFO - Refreshed leaderboard data after submission
requirements.txt CHANGED
@@ -1,7 +1,8 @@
1
- gradio>=3.50.2
2
- huggingface_hub>=0.19.0
3
- datasets>=2.16.0
4
  pandas>=2.0.0
 
 
 
5
  python-dotenv>=1.0.0
6
- apscheduler>=3.10.1
7
- gradio-leaderboard
 
1
+ gradio>=4.0.0
 
 
2
  pandas>=2.0.0
3
+ huggingface_hub>=0.20.0
4
+ datasets>=2.0.0
5
+ apscheduler>=3.10.0
6
  python-dotenv>=1.0.0
7
+ plotly>=5.18.0
8
+ gradio-leaderboard>=0.1.0
src/display/css_html_js.py CHANGED
@@ -45,11 +45,9 @@ custom_css = """
45
  }
46
 
47
  .version-selector {
48
- margin-top: 10px;
49
  padding: 5px;
50
- border: 1px solid #e0e0e0;
51
  border-radius: 5px;
52
- background-color: #f9f9f9;
53
  }
54
 
55
  .version-selector label {
@@ -61,4 +59,14 @@ custom_css = """
61
  border-color: #2196F3;
62
  border-radius: 4px;
63
  }
 
 
 
 
 
 
 
 
 
 
64
  """
 
45
  }
46
 
47
  .version-selector {
48
+ margin: 0 !important;
49
  padding: 5px;
 
50
  border-radius: 5px;
 
51
  }
52
 
53
  .version-selector label {
 
59
  border-color: #2196F3;
60
  border-radius: 4px;
61
  }
62
+
63
+ /* Make sure the version selector is properly aligned with refresh button */
64
+ .version-selector > .block {
65
+ padding: 0 !important;
66
+ }
67
+
68
+ .version-selector > .block > .wrap {
69
+ position: relative;
70
+ top: -5px;
71
+ }
72
  """
src/display/utils.py CHANGED
@@ -26,6 +26,20 @@ class ModelType(Enum):
26
  return "API"
27
  return "Unknown"
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  class Precision(Enum):
31
  """Model precision types."""
@@ -65,75 +79,203 @@ class ColumnInfo:
65
  @dataclass
66
  class GuardBenchColumn:
67
  """Columns for the GuardBench leaderboard."""
 
68
  model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
69
  name="model_name",
70
  display_name="Model",
71
  never_hidden=True,
72
  displayed_by_default=True
73
  ))
74
-
75
  model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
76
  name="model_type",
77
  display_name="Type",
78
  displayed_by_default=True
79
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
- # Metrics for all categories
 
 
 
 
 
 
82
  default_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
83
  name="default_prompts_f1",
84
  display_name="Default Prompts F1",
85
  type="number",
86
  displayed_by_default=True
87
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
 
 
 
 
 
 
 
89
  jailbreaked_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
90
  name="jailbreaked_prompts_f1",
91
  display_name="Jailbreaked Prompts F1",
92
  type="number",
93
  displayed_by_default=True
94
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
 
 
 
 
 
 
 
96
  default_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
97
  name="default_answers_f1",
98
  display_name="Default Answers F1",
99
  type="number",
100
  displayed_by_default=True
101
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
 
 
 
 
 
 
 
103
  jailbreaked_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
104
  name="jailbreaked_answers_f1",
105
  display_name="Jailbreaked Answers F1",
106
  type="number",
107
  displayed_by_default=True
108
  ))
109
-
110
- # Average metrics
111
- average_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
112
- name="average_f1",
113
- display_name="Average F1",
114
  type="number",
115
- displayed_by_default=True,
116
- never_hidden=True
117
  ))
118
-
119
- average_recall: ColumnInfo = field(default_factory=lambda: ColumnInfo(
120
- name="average_recall",
121
- display_name="Average Recall",
122
  type="number",
123
  displayed_by_default=False
124
  ))
125
-
126
- average_precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
127
- name="average_precision",
128
- display_name="Average Precision",
129
  type="number",
130
  displayed_by_default=False
131
  ))
132
-
133
- # Additional metadata
134
- submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
135
- name="submission_date",
136
- display_name="Submission Date",
137
  displayed_by_default=False
138
  ))
139
 
 
26
  return "API"
27
  return "Unknown"
28
 
29
+ class GuardModelType(str, Enum):
30
+ """Guard model types for the leaderboard."""
31
+ LLAMA_GUARD = "llama_guard"
32
+ PROMPT_GUARD_CLF = "prompt_guard_clf"
33
+ ATLA_SELENE = "atla_selene"
34
+ GEMMA_SHIELD = "gemma_shield"
35
+ LLM_REGEXP = "llm_regexp"
36
+ LLM_SO = "llm_so"
37
+
38
+ def __str__(self):
39
+ """String representation of the guard model type."""
40
+ return self.name
41
+
42
+
43
 
44
  class Precision(Enum):
45
  """Model precision types."""
 
79
  @dataclass
80
  class GuardBenchColumn:
81
  """Columns for the GuardBench leaderboard."""
82
+ # Core metadata
83
  model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
84
  name="model_name",
85
  display_name="Model",
86
  never_hidden=True,
87
  displayed_by_default=True
88
  ))
 
89
  model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
90
  name="model_type",
91
  display_name="Type",
92
  displayed_by_default=True
93
  ))
94
+ submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
95
+ name="submission_date",
96
+ display_name="Submission Date",
97
+ displayed_by_default=False
98
+ ))
99
+ version: ColumnInfo = field(default_factory=lambda: ColumnInfo(
100
+ name="version",
101
+ display_name="Version",
102
+ displayed_by_default=False
103
+ ))
104
+ guard_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
105
+ name="guard_model_type",
106
+ display_name="Guard Model Type",
107
+ displayed_by_default=True
108
+ ))
109
+ base_model: ColumnInfo = field(default_factory=lambda: ColumnInfo(
110
+ name="base_model",
111
+ display_name="Base Model",
112
+ displayed_by_default=False
113
+ ))
114
+ revision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
115
+ name="revision",
116
+ display_name="Revision",
117
+ displayed_by_default=False
118
+ ))
119
+ precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
120
+ name="precision",
121
+ display_name="Precision",
122
+ displayed_by_default=False
123
+ ))
124
+ weight_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
125
+ name="weight_type",
126
+ display_name="Weight Type",
127
+ displayed_by_default=False
128
+ ))
129
 
130
+ # Default prompts metrics
131
+ default_prompts_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
132
+ name="default_prompts_f1_binary",
133
+ display_name="Default Prompts F1 Binary",
134
+ type="number",
135
+ displayed_by_default=False
136
+ ))
137
  default_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
138
  name="default_prompts_f1",
139
  display_name="Default Prompts F1",
140
  type="number",
141
  displayed_by_default=True
142
  ))
143
+ default_prompts_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
144
+ name="default_prompts_recall_binary",
145
+ display_name="Default Prompts Recall",
146
+ type="number",
147
+ displayed_by_default=False
148
+ ))
149
+ default_prompts_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
150
+ name="default_prompts_precision_binary",
151
+ display_name="Default Prompts Precision",
152
+ type="number",
153
+ displayed_by_default=False
154
+ ))
155
+ default_prompts_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
156
+ name="default_prompts_error_ratio",
157
+ display_name="Default Prompts Error Ratio",
158
+ type="number",
159
+ displayed_by_default=False
160
+ ))
161
+ default_prompts_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
162
+ name="default_prompts_avg_runtime_ms",
163
+ display_name="Default Prompts Avg Runtime (ms)",
164
+ type="number",
165
+ displayed_by_default=False
166
+ ))
167
 
168
+ # Jailbreaked prompts metrics
169
+ jailbreaked_prompts_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
170
+ name="jailbreaked_prompts_f1_binary",
171
+ display_name="Jailbreaked Prompts F1 Binary",
172
+ type="number",
173
+ displayed_by_default=False
174
+ ))
175
  jailbreaked_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
176
  name="jailbreaked_prompts_f1",
177
  display_name="Jailbreaked Prompts F1",
178
  type="number",
179
  displayed_by_default=True
180
  ))
181
+ jailbreaked_prompts_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
182
+ name="jailbreaked_prompts_recall_binary",
183
+ display_name="Jailbreaked Prompts Recall",
184
+ type="number",
185
+ displayed_by_default=False
186
+ ))
187
+ jailbreaked_prompts_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
188
+ name="jailbreaked_prompts_precision_binary",
189
+ display_name="Jailbreaked Prompts Precision",
190
+ type="number",
191
+ displayed_by_default=False
192
+ ))
193
+ jailbreaked_prompts_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
194
+ name="jailbreaked_prompts_error_ratio",
195
+ display_name="Jailbreaked Prompts Error Ratio",
196
+ type="number",
197
+ displayed_by_default=False
198
+ ))
199
+ jailbreaked_prompts_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
200
+ name="jailbreaked_prompts_avg_runtime_ms",
201
+ display_name="Jailbreaked Prompts Avg Runtime (ms)",
202
+ type="number",
203
+ displayed_by_default=False
204
+ ))
205
 
206
+ # Default answers metrics
207
+ default_answers_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
208
+ name="default_answers_f1_binary",
209
+ display_name="Default Answers F1 Binary",
210
+ type="number",
211
+ displayed_by_default=False
212
+ ))
213
  default_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
214
  name="default_answers_f1",
215
  display_name="Default Answers F1",
216
  type="number",
217
  displayed_by_default=True
218
  ))
219
+ default_answers_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
220
+ name="default_answers_recall_binary",
221
+ display_name="Default Answers Recall",
222
+ type="number",
223
+ displayed_by_default=False
224
+ ))
225
+ default_answers_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
226
+ name="default_answers_precision_binary",
227
+ display_name="Default Answers Precision",
228
+ type="number",
229
+ displayed_by_default=False
230
+ ))
231
+ default_answers_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
232
+ name="default_answers_error_ratio",
233
+ display_name="Default Answers Error Ratio",
234
+ type="number",
235
+ displayed_by_default=False
236
+ ))
237
+ default_answers_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
238
+ name="default_answers_avg_runtime_ms",
239
+ display_name="Default Answers Avg Runtime (ms)",
240
+ type="number",
241
+ displayed_by_default=False
242
+ ))
243
 
244
+ # Jailbreaked answers metrics
245
+ jailbreaked_answers_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
246
+ name="jailbreaked_answers_f1_binary",
247
+ display_name="Jailbreaked Answers F1 Binary",
248
+ type="number",
249
+ displayed_by_default=False
250
+ ))
251
  jailbreaked_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
252
  name="jailbreaked_answers_f1",
253
  display_name="Jailbreaked Answers F1",
254
  type="number",
255
  displayed_by_default=True
256
  ))
257
+ jailbreaked_answers_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
258
+ name="jailbreaked_answers_recall_binary",
259
+ display_name="Jailbreaked Answers Recall",
 
 
260
  type="number",
261
+ displayed_by_default=False
 
262
  ))
263
+ jailbreaked_answers_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
264
+ name="jailbreaked_answers_precision_binary",
265
+ display_name="Jailbreaked Answers Precision",
 
266
  type="number",
267
  displayed_by_default=False
268
  ))
269
+ jailbreaked_answers_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
270
+ name="jailbreaked_answers_error_ratio",
271
+ display_name="Jailbreaked Answers Error Ratio",
 
272
  type="number",
273
  displayed_by_default=False
274
  ))
275
+ jailbreaked_answers_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
276
+ name="jailbreaked_answers_avg_runtime_ms",
277
+ display_name="Jailbreaked Answers Avg Runtime (ms)",
278
+ type="number",
 
279
  displayed_by_default=False
280
  ))
281
 
src/leaderboard/processor.py CHANGED
@@ -103,7 +103,8 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
103
  "model_name": model_name,
104
  "model_type": entry.get("model_type", "Unknown"),
105
  "submission_date": entry.get("submission_date", ""),
106
- "version": entry.get("version", "v0")
 
107
  }
108
 
109
  # Add additional metadata fields if present
@@ -111,50 +112,67 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
111
  if key in entry:
112
  row[key] = entry[key]
113
 
114
- # Add average metrics
 
 
 
 
 
115
  avg_metrics = entry.get("avg_metrics", {})
116
- for test_type in TEST_TYPES:
117
- if test_type in avg_metrics:
118
- for metric in METRICS:
119
- if metric in avg_metrics[test_type]:
120
- col_name = f"{test_type}_{metric}"
121
- row[col_name] = avg_metrics[test_type][metric]
122
-
123
- # Calculate overall averages for key metrics
124
- f1_values = []
125
- recall_values = []
126
- precision_values = []
127
-
128
- for test_type in TEST_TYPES:
129
- if test_type in avg_metrics and "f1_binary" in avg_metrics[test_type]:
130
- f1_values.append(avg_metrics[test_type]["f1_binary"])
131
- if test_type in avg_metrics and "recall_binary" in avg_metrics[test_type]:
132
- recall_values.append(avg_metrics[test_type]["recall_binary"])
133
- if test_type in avg_metrics and "precision_binary" in avg_metrics[test_type]:
134
- precision_values.append(avg_metrics[test_type]["precision_binary"])
135
-
136
- # Add overall averages
137
- if f1_values:
138
- row["average_f1"] = sum(f1_values) / len(f1_values)
139
- if recall_values:
140
- row["average_recall"] = sum(recall_values) / len(recall_values)
141
- if precision_values:
142
- row["average_precision"] = sum(precision_values) / len(precision_values)
143
-
144
- # Add specific test type F1 scores for display
145
- if "default_prompts" in avg_metrics and "f1_binary" in avg_metrics["default_prompts"]:
146
- row["default_prompts_f1"] = avg_metrics["default_prompts"]["f1_binary"]
147
- if "jailbreaked_prompts" in avg_metrics and "f1_binary" in avg_metrics["jailbreaked_prompts"]:
148
- row["jailbreaked_prompts_f1"] = avg_metrics["jailbreaked_prompts"]["f1_binary"]
149
- if "default_answers" in avg_metrics and "f1_binary" in avg_metrics["default_answers"]:
150
- row["default_answers_f1"] = avg_metrics["default_answers"]["f1_binary"]
151
- if "jailbreaked_answers" in avg_metrics and "f1_binary" in avg_metrics["jailbreaked_answers"]:
152
- row["jailbreaked_answers_f1"] = avg_metrics["jailbreaked_answers"]["f1_binary"]
153
 
154
  rows.append(row)
155
 
156
  # Create DataFrame and sort by average F1 score
157
  df = pd.DataFrame(rows)
 
 
 
 
 
 
 
 
 
 
 
 
158
  if not df.empty and "average_f1" in df.columns:
159
  df = df.sort_values(by="average_f1", ascending=False)
160
 
 
103
  "model_name": model_name,
104
  "model_type": entry.get("model_type", "Unknown"),
105
  "submission_date": entry.get("submission_date", ""),
106
+ "version": entry.get("version", "v0"),
107
+ "guard_model_type": entry.get("guard_model_type", "llm_regexp").lower()
108
  }
109
 
110
  # Add additional metadata fields if present
 
112
  if key in entry:
113
  row[key] = entry[key]
114
 
115
+ # CASE 1: Metrics are flat in the root
116
+ for key, value in entry.items():
117
+ if any(test_type in key for test_type in TEST_TYPES) or key in ["average_f1", "average_recall", "average_precision"]:
118
+ row[key] = value
119
+
120
+ # CASE 2: Metrics are in avg_metrics structure
121
  avg_metrics = entry.get("avg_metrics", {})
122
+ if avg_metrics:
123
+ for test_type in TEST_TYPES:
124
+ if test_type in avg_metrics:
125
+ metrics = avg_metrics[test_type]
126
+ for metric in METRICS:
127
+ if metric in metrics:
128
+ col_name = f"{test_type}_{metric}"
129
+ row[col_name] = metrics[metric]
130
+
131
+ # Also add non-binary version for F1 scores
132
+ if metric == "f1_binary":
133
+ row[f"{test_type}_f1"] = metrics[metric]
134
+
135
+ # Calculate averages if not present
136
+ if "average_f1" not in row:
137
+ f1_values = []
138
+ for test_type in TEST_TYPES:
139
+ if test_type in avg_metrics and "f1_binary" in avg_metrics[test_type]:
140
+ f1_values.append(avg_metrics[test_type]["f1_binary"])
141
+ if f1_values:
142
+ row["average_f1"] = sum(f1_values) / len(f1_values)
143
+
144
+ if "average_recall" not in row:
145
+ recall_values = []
146
+ for test_type in TEST_TYPES:
147
+ if test_type in avg_metrics and "recall_binary" in avg_metrics[test_type]:
148
+ recall_values.append(avg_metrics[test_type]["recall_binary"])
149
+ if recall_values:
150
+ row["average_recall"] = sum(recall_values) / len(recall_values)
151
+
152
+ if "average_precision" not in row:
153
+ precision_values = []
154
+ for test_type in TEST_TYPES:
155
+ if test_type in avg_metrics and "precision_binary" in avg_metrics[test_type]:
156
+ precision_values.append(avg_metrics[test_type]["precision_binary"])
157
+ if precision_values:
158
+ row["average_precision"] = sum(precision_values) / len(precision_values)
159
 
160
  rows.append(row)
161
 
162
  # Create DataFrame and sort by average F1 score
163
  df = pd.DataFrame(rows)
164
+
165
+ # Ensure all expected columns exist
166
+ for test_type in TEST_TYPES:
167
+ if f"{test_type}_f1" not in df.columns:
168
+ df[f"{test_type}_f1"] = None
169
+ if f"{test_type}_f1_binary" not in df.columns:
170
+ df[f"{test_type}_f1_binary"] = None
171
+ if f"{test_type}_recall_binary" not in df.columns:
172
+ df[f"{test_type}_recall_binary"] = None
173
+ if f"{test_type}_precision_binary" not in df.columns:
174
+ df[f"{test_type}_precision_binary"] = None
175
+
176
  if not df.empty and "average_f1" in df.columns:
177
  df = df.sort_values(by="average_f1", ascending=False)
178
 
src/populate.py CHANGED
@@ -6,277 +6,212 @@ import json
6
  import os
7
  import pandas as pd
8
  import tempfile
9
- from typing import Dict, Tuple, List
10
- from glob import glob
11
 
12
- from huggingface_hub import snapshot_download, hf_hub_download, HfApi
13
  from datasets import load_dataset
14
 
15
  from src.display.utils import GUARDBENCH_COLUMN, DISPLAY_COLS, CATEGORIES
16
- from src.envs import RESULTS_DATASET_ID, TOKEN, LEADERBOARD_FILE, CACHE_PATH
17
- from src.leaderboard.processor import leaderboard_to_dataframe, load_leaderboard_data, save_leaderboard_data, process_jsonl_submission, add_entries_to_leaderboard
18
 
19
 
20
- def get_versioned_leaderboard_file(version="v0"):
21
  """
22
- Get the versioned leaderboard file path.
23
  """
24
- base_name, ext = os.path.splitext(LEADERBOARD_FILE)
25
- return f"{base_name}_{version}{ext}"
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
 
28
- def download_leaderboard_data(version="v0") -> bool:
29
  """
30
- Download the latest leaderboard data from HuggingFace.
31
-
32
- Args:
33
- version: The dataset version to download
34
  """
35
  try:
36
- # Create a temporary directory to download the submissions
37
- temp_dir = os.path.join(CACHE_PATH, f"temp_submissions_{version}")
38
- os.makedirs(temp_dir, exist_ok=True)
39
-
40
- # Get the versioned leaderboard file
41
- leaderboard_file = get_versioned_leaderboard_file(version)
42
-
43
- # Download the entire repository
44
- try:
45
- snapshot_path = snapshot_download(
46
- repo_id=RESULTS_DATASET_ID,
47
- repo_type="dataset",
48
- local_dir=temp_dir,
49
- token=TOKEN,
50
- ignore_patterns=["*.md", ".*"],
51
- etag_timeout=30
52
- )
53
-
54
- # Process all submission files
55
- all_entries = []
56
- submission_files = []
57
-
58
- # Look for submission files in the submissions directory
59
- submissions_dir = os.path.join(snapshot_path, "submissions")
60
- version_submissions_dir = os.path.join(snapshot_path, f"submissions_{version}")
61
-
62
- # Check both standard and versioned submission directories
63
- if os.path.exists(submissions_dir):
64
- submission_files.extend(glob(os.path.join(submissions_dir, "*.jsonl")))
65
-
66
- if os.path.exists(version_submissions_dir):
67
- submission_files.extend(glob(os.path.join(version_submissions_dir, "*.jsonl")))
68
-
69
- # Also look for any versioned JSONL files in the root
70
- submission_files.extend(glob(os.path.join(snapshot_path, f"*_{version}.jsonl")))
71
-
72
- # If we're looking for v0 and no versioned files found, use generic ones
73
- if version == "v0" and not submission_files:
74
- submission_files.extend(glob(os.path.join(snapshot_path, "*.jsonl")))
75
-
76
- # Process each submission file
77
- for file_path in submission_files:
78
- entries, _ = process_jsonl_submission(file_path)
79
-
80
- # Filter entries to those that match the version or don't have version specified
81
- filtered_entries = [
82
- entry for entry in entries
83
- if entry.get("version", "v0") == version or "version" not in entry
84
- ]
85
-
86
- all_entries.extend(filtered_entries)
87
-
88
- # Create leaderboard data structure
89
- leaderboard_data = {
90
- "entries": all_entries,
91
- "last_updated": pd.Timestamp.now().isoformat(),
92
- "version": version
93
- }
94
 
95
- # Save to local file
96
- save_leaderboard_data(leaderboard_data, leaderboard_file)
97
 
98
- return True
99
- except Exception as e:
100
- print(f"Error downloading repository: {e}")
 
 
 
 
 
101
 
102
- # If we can't download the repository, try to download individual files
 
103
  try:
104
- api = HfApi(token=TOKEN)
105
- files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
106
-
107
- # Look for versioned and regular files
108
- submission_files = [
109
- f for f in files
110
- if (f.endswith(f'_{version}.jsonl') or
111
- f.startswith(f'submissions_{version}/') or
112
- (version == "v0" and f.endswith('.jsonl')))
113
- ]
114
-
115
- all_entries = []
116
-
117
- for file_path in submission_files:
118
- try:
119
- local_path = hf_hub_download(
120
- repo_id=RESULTS_DATASET_ID,
121
- filename=file_path,
122
- repo_type="dataset",
123
- token=TOKEN
124
- )
125
- entries, _ = process_jsonl_submission(local_path)
126
-
127
- # Filter entries to those that match the version or don't have version specified
128
- filtered_entries = [
129
- entry for entry in entries
130
- if entry.get("version", "v0") == version or "version" not in entry
131
- ]
132
-
133
- all_entries.extend(filtered_entries)
134
- except Exception as file_error:
135
- print(f"Error downloading file {file_path}: {file_error}")
136
-
137
- # Create leaderboard data structure
138
- leaderboard_data = {
139
- "entries": all_entries,
140
- "last_updated": pd.Timestamp.now().isoformat(),
141
- "version": version
142
- }
143
-
144
- # Save to local file
145
- save_leaderboard_data(leaderboard_data, leaderboard_file)
146
-
147
- return True
148
- except Exception as list_error:
149
- print(f"Error listing repository files: {list_error}")
150
-
151
- # If we can't download anything, create an empty leaderboard
152
- if not os.path.exists(leaderboard_file):
153
- empty_data = {
154
- "entries": [],
155
- "last_updated": pd.Timestamp.now().isoformat(),
156
- "version": version
157
- }
158
- save_leaderboard_data(empty_data, leaderboard_file)
159
-
160
- return False
161
  except Exception as e:
162
- print(f"Error downloading leaderboard data: {e}")
163
-
164
- # Ensure we have at least an empty leaderboard file
165
- leaderboard_file = get_versioned_leaderboard_file(version)
166
- if not os.path.exists(leaderboard_file):
167
- empty_data = {
168
- "entries": [],
169
- "last_updated": pd.Timestamp.now().isoformat(),
170
- "version": version
171
- }
172
- save_leaderboard_data(empty_data, leaderboard_file)
173
-
174
- return False
175
 
176
 
177
  def get_leaderboard_df(version="v0") -> pd.DataFrame:
178
  """
179
  Get the leaderboard data as a DataFrame.
180
-
181
- Args:
182
- version: The dataset version to retrieve
183
  """
184
- # Try to download the latest data
185
- download_leaderboard_data(version=version)
186
 
187
- # Load from local file
188
- leaderboard_file = get_versioned_leaderboard_file(version)
189
- leaderboard_data = load_leaderboard_data(leaderboard_file)
 
 
 
 
 
 
 
 
 
190
 
191
  # Convert to DataFrame
192
- df = leaderboard_to_dataframe(leaderboard_data)
193
-
194
- return df
195
 
196
 
197
def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
    """
    Build a leaderboard DataFrame restricted to a single category.

    Args:
        category: The category to filter by (e.g., "Criminal, Violent, and Terrorist Activity")
        version: The dataset version to retrieve

    Returns:
        DataFrame with metrics for the specified category; entries without
        data for that category are left out.
    """
    source_file = get_versioned_leaderboard_file(version)
    leaderboard_data = load_leaderboard_data(source_file)

    display_test_types = ("default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers")
    rows = []

    for entry in leaderboard_data.get("entries", []):
        # Skip entries that carry nothing for the requested category
        if "per_category_metrics" not in entry or category not in entry["per_category_metrics"]:
            continue

        category_metrics = entry["per_category_metrics"][category]

        row = {
            "model_name": entry.get("model_name", "Unknown Model"),
            "model_type": entry.get("model_type", "Unknown"),
            "submission_date": entry.get("submission_date", ""),
            "version": entry.get("version", version),
        }

        # Flatten every metric into a "<test_type>_<metric>" column
        for test_type in category_metrics:
            metrics = category_metrics[test_type]
            if not test_type or not isinstance(metrics, dict):
                continue
            for metric, value in metrics.items():
                row[f"{test_type}_{metric}"] = value

        # Average the binary F1 over every test type that reports one
        f1_values = [
            category_metrics[test_type]["f1_binary"]
            for test_type in category_metrics
            if test_type
            and isinstance(category_metrics[test_type], dict)
            and "f1_binary" in category_metrics[test_type]
        ]
        if f1_values:
            row["average_f1"] = sum(f1_values) / len(f1_values)

        # Short "<test_type>_f1" aliases for the test types shown in the UI
        for test_type in display_test_types:
            if test_type in category_metrics and "f1_binary" in category_metrics[test_type]:
                row[f"{test_type}_f1"] = category_metrics[test_type]["f1_binary"]

        rows.append(row)

    # Same envelope shape as the full leaderboard, but with the filtered rows
    filtered_leaderboard = {
        "entries": rows,
        "last_updated": leaderboard_data.get("last_updated", pd.Timestamp.now().isoformat()),
        "version": version
    }

    return leaderboard_to_dataframe(filtered_leaderboard)
263
 
264
 
265
def get_detailed_model_data(model_name: str, version="v0") -> Dict:
    """
    Look up the full leaderboard entry of a single model.

    Args:
        model_name: The name of the model to get data for
        version: The dataset version to retrieve

    Returns:
        The first entry whose name matches and whose version equals
        ``version`` (or is None); an empty dict when nothing matches.
    """
    leaderboard_data = load_leaderboard_data(get_versioned_leaderboard_file(version))

    def _is_match(candidate) -> bool:
        # Entries without a "version" key are treated as "v0"
        candidate_version = candidate.get("version", "v0")
        if candidate.get("model_name") != model_name:
            return False
        return candidate_version == version or candidate_version is None

    return next(
        (candidate for candidate in leaderboard_data.get("entries", []) if _is_match(candidate)),
        {},
    )
 
6
  import os
7
  import pandas as pd
8
  import tempfile
9
+ from typing import Dict, List, Optional
10
+ from datetime import datetime
11
 
12
+ from huggingface_hub import hf_hub_download, HfApi
13
  from datasets import load_dataset
14
 
15
  from src.display.utils import GUARDBENCH_COLUMN, DISPLAY_COLS, CATEGORIES
16
+ from src.envs import RESULTS_DATASET_ID, TOKEN, CACHE_PATH
17
+ from src.leaderboard.processor import leaderboard_to_dataframe
18
 
19
 
20
def get_latest_leaderboard(version="v0") -> Optional[Dict]:
    """
    Fetch the published leaderboard file for ``version`` from the results dataset.

    Returns the parsed JSON content, or None when downloading or parsing
    fails for any reason (the error is printed rather than raised).
    """
    try:
        local_copy = hf_hub_download(
            repo_id=RESULTS_DATASET_ID,
            filename=f"leaderboards/leaderboard_{version}.json",
            repo_type="dataset",
            token=TOKEN,
        )
        with open(local_copy, 'r') as handle:
            payload = json.load(handle)
    except Exception as e:
        print(f"Error downloading leaderboard: {e}")
        return None
    return payload
38
 
39
 
40
def get_model_entry(model_name: str, version="v0") -> Optional[Dict]:
    """
    Fetch one model's entry file from the ``entries/`` folder of the results dataset.

    The file name is derived from ``model_name`` with "/" and " " replaced by
    "_". Returns the parsed JSON content, or None when downloading or parsing
    fails for any reason (the error is printed rather than raised).
    """
    try:
        safe_name = model_name.replace("/", "_").replace(" ", "_")
        remote_file = f"entries/entry_{safe_name}_{version}.json"
        local_copy = hf_hub_download(
            repo_id=RESULTS_DATASET_ID,
            filename=remote_file,
            repo_type="dataset",
            token=TOKEN,
        )
        with open(local_copy, 'r') as handle:
            payload = json.load(handle)
    except Exception as e:
        print(f"Error downloading model entry: {e}")
        return None
    return payload
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
 
 
59
 
60
def get_all_entries(version="v0") -> List[Dict]:
    """
    Load every entry file of ``version`` from the ``entries/`` folder of the results dataset.

    An entry file that cannot be downloaded or parsed is reported and skipped.
    If the repository listing itself fails, an empty list is returned.
    """
    try:
        hub = HfApi(token=TOKEN)
        wanted_suffix = f"_{version}.json"
        # Resolve the full list of matching files before downloading anything
        matching_files = [
            name
            for name in hub.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
            if name.startswith("entries/") and name.endswith(wanted_suffix)
        ]

        loaded = []
        for entry_file in matching_files:
            try:
                local_copy = hf_hub_download(
                    repo_id=RESULTS_DATASET_ID,
                    filename=entry_file,
                    repo_type="dataset",
                    token=TOKEN,
                )
                with open(local_copy, 'r') as handle:
                    loaded.append(json.load(handle))
            except Exception as e:
                # One broken entry must not hide the others
                print(f"Error loading entry {entry_file}: {e}")
    except Exception as e:
        print(f"Error listing entries: {e}")
        return []
    return loaded
 
 
 
 
 
 
 
 
 
 
 
88
 
89
 
90
def get_leaderboard_df(version="v0") -> pd.DataFrame:
    """
    Build the leaderboard DataFrame for ``version``.

    The published leaderboard file is preferred. When it is unavailable the
    leaderboard is assembled from the individual entry files, and when those
    are missing too an empty DataFrame with the display columns is returned.
    """
    leaderboard_data = get_latest_leaderboard(version)

    if not leaderboard_data:
        # No published leaderboard: fall back to the individual entry files
        entries = get_all_entries(version)
        if not entries:
            return pd.DataFrame(columns=DISPLAY_COLS)
        leaderboard_data = dict(
            entries=entries,
            last_updated=datetime.now().isoformat(),
            version=version,
        )

    return leaderboard_to_dataframe(leaderboard_data)
 
 
112
 
113
 
114
def _category_entry(entry: Dict, category: str, version: str) -> Optional[Dict]:
    """
    Flatten one leaderboard entry to its metrics for ``category``.

    Returns None when the entry has no data for that category.
    """
    if "per_category_metrics" not in entry or category not in entry["per_category_metrics"]:
        return None

    category_metrics = entry["per_category_metrics"][category]

    # Base fields copied from the entry, with defaults for missing keys
    flat_entry = {
        "model_name": entry.get("model_name", "Unknown Model"),
        "model_type": entry.get("model_type", "Unknown"),
        "guard_model_type": entry.get("guard_model_type", "Unknown"),
        "submission_date": entry.get("submission_date", ""),
        "version": entry.get("version", version),
        "base_model": entry.get("base_model", ""),
        "revision": entry.get("revision", ""),
        "precision": entry.get("precision", ""),
        "weight_type": entry.get("weight_type", "")
    }

    # Every metric becomes a "<test_type>_<metric>" column
    for test_type, metrics in category_metrics.items():
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                flat_entry[f"{test_type}_{metric}"] = value

                # Also expose the F1 score under the shorter "<test_type>_f1" name
                if metric == "f1_binary":
                    flat_entry[f"{test_type}_f1"] = value

    # Averages are taken over these four test types only, and only over
    # those that actually report the metric
    averaged_test_types = ("default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers")
    for short_name in ("f1", "recall", "precision"):
        metric_key = f"{short_name}_binary"
        values = [
            category_metrics[test_type][metric_key]
            for test_type in averaged_test_types
            if test_type in category_metrics
            and isinstance(category_metrics[test_type], dict)
            and metric_key in category_metrics[test_type]
        ]
        if values:
            flat_entry[f"average_{short_name}"] = sum(values) / len(values)

    return flat_entry


def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
    """
    Get the leaderboard data filtered by a specific category.

    The published leaderboard file is preferred. When it is unavailable the
    leaderboard is assembled from the individual entry files, and when those
    are missing too an empty DataFrame with the display columns is returned.
    Entries without data for ``category`` are left out of the result.
    """
    # Get latest leaderboard data
    leaderboard_data = get_latest_leaderboard(version)

    if not leaderboard_data:
        # If no leaderboard exists, try to build it from entries
        entries = get_all_entries(version)
        if entries:
            leaderboard_data = {
                "entries": entries,
                "last_updated": datetime.now().isoformat(),
                "version": version
            }
        else:
            # Return empty DataFrame if no data available
            return pd.DataFrame(columns=DISPLAY_COLS)

    # Keep only the entries that have data for the specified category
    filtered_entries = []
    for entry in leaderboard_data.get("entries", []):
        flat_entry = _category_entry(entry, category, version)
        if flat_entry is not None:
            filtered_entries.append(flat_entry)

    # Same envelope shape as the full leaderboard, but with the filtered entries.
    # (The former debug print of this whole structure was removed: it dumped
    # the entire leaderboard to stdout on every call.)
    filtered_leaderboard = {
        "entries": filtered_entries,
        "last_updated": leaderboard_data.get("last_updated", datetime.now().isoformat()),
        "version": version
    }

    # Convert to DataFrame
    return leaderboard_to_dataframe(filtered_leaderboard)
 
 
199
 
200
 
201
def get_detailed_model_data(model_name: str, version="v0") -> Dict:
    """
    Return the detailed entry of a single model.

    The model's own entry file is consulted first. If that yields nothing,
    the published leaderboard is searched for the first entry with the same
    model name. An empty dict is returned when neither source has the model.
    """
    direct_entry = get_model_entry(model_name, version)
    if direct_entry:
        return direct_entry

    # No usable entry file: fall back to the published leaderboard
    leaderboard_data = get_latest_leaderboard(version)
    if not leaderboard_data:
        return {}

    return next(
        (
            candidate
            for candidate in leaderboard_data.get("entries", [])
            if candidate.get("model_name") == model_name
        ),
        {},
    )
src/submission/submit.py CHANGED
@@ -5,16 +5,19 @@ Handle submissions to the GuardBench leaderboard.
5
  import json
6
  import os
7
  import tempfile
8
- import uuid
9
  from datetime import datetime
10
  from typing import Dict, List, Tuple
 
11
 
12
  from huggingface_hub import HfApi
13
- from datasets import load_dataset, Dataset
14
 
15
- from src.display.formatting import styled_error, styled_message, styled_warning
16
- from src.envs import API, RESULTS_DATASET_ID, TOKEN
17
- from src.leaderboard.processor import process_jsonl_submission, add_entries_to_leaderboard, load_leaderboard_data
 
 
 
18
 
19
 
20
  def validate_submission(file_path: str) -> Tuple[bool, str]:
@@ -25,99 +28,194 @@ def validate_submission(file_path: str) -> Tuple[bool, str]:
25
  entries, message = process_jsonl_submission(file_path)
26
  if not entries:
27
  return False, message
28
-
29
- # Additional validation could be added here
30
-
31
  return True, "Submission is valid"
32
  except Exception as e:
33
  return False, f"Error validating submission: {e}"
34
 
35
 
36
def submit_to_hub(file_path: str, metadata: Dict, dataset_id: str, token: str, version="v0") -> Tuple[bool, str]:
    """
    Submit results to a HuggingFace dataset repository as individual files.

    Args:
        file_path: Path to the submission file
        metadata: Metadata to include with the submission
        dataset_id: The dataset repository ID
        token: HuggingFace API token
        version: The version of the benchmark used (e.g., "v0", "v1")

    Returns:
        (True, success message) when the upload went through, otherwise
        (False, error message). Exceptions are reported through the message
        and not raised.
    """
    # Tracked outside the try so the temporary file is removed on every path
    temp_path = None
    try:
        # Process the submission file to validate
        entries, message = process_jsonl_submission(file_path)
        if not entries:
            return False, message

        # Generate a unique submission ID
        model_name = metadata.get("model_name", "unknown")
        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        submission_id = f"{model_name_safe}_{timestamp}"

        # Create an API instance
        api = HfApi(token=token)

        # Create a temporary file with metadata added
        with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as temp_file:
            temp_path = temp_file.name
            for entry in entries:
                # If the entry already has a model_name, don't override it
                if "model_name" not in entry:
                    entry["model_name"] = metadata.get("model_name")

                # Add other metadata if not present
                for key, value in metadata.items():
                    if key != "model_name" and key not in entry:
                        entry[key] = value

                # Ensure version is set
                entry["version"] = version

                temp_file.write(json.dumps(entry) + "\n")

        # Upload the file to the version-specific directory ("v0" keeps the
        # unversioned "submissions/" folder)
        if version != "v0":
            submission_path = f"submissions_{version}/{submission_id}_{version}.jsonl"
        else:
            submission_path = f"submissions/{submission_id}.jsonl"
        api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=submission_path,
            repo_id=dataset_id,
            repo_type="dataset",
            commit_message=f"Add submission for {model_name} (version {version})"
        )

        return True, f"Successfully uploaded submission for {model_name} to {dataset_id} (version {version})"
    except Exception as e:
        return False, f"Error submitting to dataset: {e}"
    finally:
        # delete=False leaves removal to us; do it even when the upload failed
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                print(f"Error removing temporary file {temp_path}: {cleanup_error}")
99
 
100
 
101
def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
    """
    Validate a submission file and push it to the results dataset.

    Args:
        file_path: Path to the submission file
        metadata: Metadata to include with the submission; its "version" key
            is overwritten with ``version``
        version: The version of the benchmark used (e.g., "v0", "v1")

    Returns:
        The styled error message when validation or upload fails, otherwise
        the styled success message.
    """
    is_valid, validation_message = validate_submission(file_path)
    if not is_valid:
        return styled_error(validation_message)

    # The version travels with the metadata into every uploaded entry
    metadata["version"] = version

    uploaded, hub_message = submit_to_hub(file_path, metadata, RESULTS_DATASET_ID, TOKEN, version=version)
    if uploaded:
        return styled_message(f"Submission successful! {hub_message}")
    return styled_error(hub_message)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import json
6
  import os
7
  import tempfile
 
8
  from datetime import datetime
9
  from typing import Dict, List, Tuple
10
+ import shutil
11
 
12
  from huggingface_hub import HfApi
13
+ from datasets import load_dataset
14
 
15
+ from src.display.formatting import styled_error, styled_message
16
+ from src.envs import RESULTS_DATASET_ID, TOKEN
17
+ from src.leaderboard.processor import process_jsonl_submission
18
+ from guardbench.evaluator import Evaluator
19
+ from guardbench.context import GuardbenchContext
20
+ from guardbench.models_config import ModelType
21
 
22
 
23
  def validate_submission(file_path: str) -> Tuple[bool, str]:
 
28
  entries, message = process_jsonl_submission(file_path)
29
  if not entries:
30
  return False, message
 
 
 
31
  return True, "Submission is valid"
32
  except Exception as e:
33
  return False, f"Error validating submission: {e}"
34
 
35
 
36
def submit_entry_to_hub(entry: Dict, model_name: str, version="v0") -> Tuple[bool, str]:
    """
    Submit a model's evaluation entry to the HuggingFace dataset.

    The entry is written as JSON to ``entries/entry_<model>_<version>.json``
    in the results dataset, where ``<model>`` is ``model_name`` with "/" and
    " " replaced by "_".

    Returns:
        (True, success message) when the upload went through, otherwise
        (False, error message). Exceptions are reported through the message
        and not raised.
    """
    # Tracked outside the try so the temporary file is removed on every path
    temp_path = None
    try:
        # Create safe model name for file path
        model_name_safe = model_name.replace("/", "_").replace(" ", "_")

        # Create entry path in entries folder
        entry_path = f"entries/entry_{model_name_safe}_{version}.json"

        # Save entry to temporary file
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
            # Record the name first: a failing json.dump must not orphan the file
            temp_path = temp_file.name
            json.dump(entry, temp_file, indent=2)

        # Upload file
        api = HfApi(token=TOKEN)
        api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=entry_path,
            repo_id=RESULTS_DATASET_ID,
            repo_type="dataset",
            commit_message=f"Add evaluation entry for {model_name} (version {version})"
        )

        return True, f"Successfully uploaded evaluation entry for {model_name}"
    except Exception as e:
        return False, f"Error submitting entry to dataset: {e}"
    finally:
        # delete=False leaves removal to us; do it even when the upload failed
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                print(f"Error removing temporary file {temp_path}: {cleanup_error}")
66
 
 
 
67
 
68
def submit_leaderboard_to_hub(entries: List[Dict], version="v0") -> Tuple[bool, str]:
    """
    Submit updated leaderboard to the HuggingFace dataset.

    The entries are wrapped together with the current timestamp and the
    version, then uploaded as ``leaderboards/leaderboard_<version>.json``.

    Returns:
        (True, success message) when the upload went through, otherwise
        (False, error message). Exceptions are reported through the message
        and not raised.
    """
    # Tracked outside the try so the temporary file is removed on every path
    temp_path = None
    try:
        # Create leaderboard data
        leaderboard_data = {
            "entries": entries,
            "last_updated": datetime.now().isoformat(),
            "version": version
        }

        # Save to temporary file
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
            # Record the name first: a failing json.dump must not orphan the file
            temp_path = temp_file.name
            json.dump(leaderboard_data, temp_file, indent=2)

        # Upload file
        api = HfApi(token=TOKEN)
        api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=f"leaderboards/leaderboard_{version}.json",
            repo_id=RESULTS_DATASET_ID,
            repo_type="dataset",
            commit_message=f"Update leaderboard for version {version}"
        )

        return True, "Leaderboard updated successfully"
    except Exception as e:
        return False, f"Error updating leaderboard: {e}"
    finally:
        # delete=False leaves removal to us; do it even when the upload failed
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                print(f"Error removing temporary file {temp_path}: {cleanup_error}")
99
 
100
 
101
def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
    """
    Process a submission to the GuardBench leaderboard.

    Steps, in order:
      1. validate the submission file;
      2. upload the raw file to ``submissions_<version>/`` in the results dataset;
      3. copy it into the GuardBench results directory and run the evaluator;
      4. read the model's entry from the evaluator's ``leaderboard.json``,
         attach the submission metadata and upload it to ``entries/``;
      5. rebuild the leaderboard from all entry files of this version and upload it.

    Both the submitted file and its copy in the results directory are removed
    before returning, whatever the outcome.

    Args:
        file_path: Path to the submitted JSONL file (deleted on return)
        metadata: Submission metadata ("model_name", "guard_model_type", ...)
        version: The version of the benchmark used (e.g., "v0", "v1")

    Returns:
        The styled error message on failure, otherwise the styled success message.
    """
    bench_name = "guardbench_dataset_1k_public"
    # Bound before the try so the cleanup below never meets an unbound name
    # (validation can return before the target path is computed).
    target_file = None

    try:
        # Validate submission
        is_valid, validation_message = validate_submission(file_path)
        if not is_valid:
            return styled_error(validation_message)

        # Get GuardBench results directory path (three levels up from this file)
        repo_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
        guardbench_dir = os.path.join(repo_root, "guard-bench-submodule")
        results_dir = os.path.join(guardbench_dir, "results")
        bench_results_dir = os.path.join(results_dir, bench_name)
        # Create the benchmark sub-directory too: the submission is copied
        # into it, and copy2 fails if it does not exist yet.
        os.makedirs(bench_results_dir, exist_ok=True)

        # Copy submission to GuardBench results directory
        model_name = metadata.get("model_name", "unknown")
        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
        guard_model_type = metadata.get("guard_model_type", "unknown")
        target_file = os.path.join(bench_results_dir, f"{model_name_safe}.jsonl")

        # Upload raw submission file
        api = HfApi(token=TOKEN)
        submission_path = f"submissions_{version}/{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=submission_path,
            repo_id=RESULTS_DATASET_ID,
            repo_type="dataset",
            commit_message=f"Add raw submission for {model_name}"
        )

        shutil.copy2(file_path, target_file)

        try:
            # Initialize GuardBench context
            ctx = GuardbenchContext()
            # Set results directory
            ctx.results_dir = results_dir
            # Set bench name from the results directory
            ctx.bench_name = bench_name
            # Load dataset
            ctx.load_dataset("whitecircle-ai/guardbench_dataset_1k_public")
            # Mark as initialized
            ctx.is_initialized = True

            evaluator = Evaluator(ctx, force=True, using_cached=True)

            # Run evaluation; the evaluator is addressed by the file-safe name
            evaluator.evaluate_model(model_name_safe, str(guard_model_type).lower())

            # Get the entry from results
            with open(os.path.join(results_dir, ctx.bench_name, "leaderboard.json"), 'r') as f:
                results_data = json.load(f)
            model_entry = next(
                (entry for entry in results_data.get("entries", [])
                 if entry.get("model_name") == model_name_safe),
                None
            )

            if not model_entry:
                return styled_error("No evaluation results found")

            # Add metadata to entry
            model_entry.update({
                "model_name": metadata.get("model_name"),  # Use original model name
                "model_type": metadata.get("model_type"),
                "guard_model_type": str(metadata.get("guard_model_type")).lower(),
                "base_model": metadata.get("base_model"),
                "revision": metadata.get("revision"),
                "precision": metadata.get("precision"),
                "weight_type": metadata.get("weight_type"),
                "version": version,
                "submission_date": datetime.now().isoformat()
            })

            # Submit entry to entries folder
            success, message = submit_entry_to_hub(model_entry, model_name, version)
            if not success:
                return styled_error(message)

            # Get all entries from HF dataset
            api = HfApi(token=TOKEN)
            files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
            entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]

            all_entries = []
            for entry_file in entry_files:
                try:
                    entry_path = api.hf_hub_download(
                        repo_id=RESULTS_DATASET_ID,
                        filename=entry_file,
                        repo_type="dataset",
                    )
                    with open(entry_path, 'r') as f:
                        all_entries.append(json.load(f))
                except Exception as e:
                    # One unreadable entry must not block the leaderboard update
                    print(f"Error loading entry {entry_file}: {e}")

            # Update leaderboard with all entries
            success, message = submit_leaderboard_to_hub(all_entries, version)
            if not success:
                return styled_error(message)

            return styled_message("Submission successful! Model evaluated and leaderboard updated.")

        except Exception as eval_error:
            return styled_error(f"Error during evaluation: {eval_error}")

    except Exception as e:
        return styled_error(f"Error processing submission: {e}")
    finally:
        # Best-effort cleanup. Each path is handled on its own so a failure
        # on one does not leave the other behind.
        for leftover in (file_path, target_file):
            if not leftover:
                continue
            try:
                if os.path.exists(leftover):
                    os.remove(leftover)
            except OSError as cleanup_error:
                print(f"Error cleaning up {leftover}: {cleanup_error}")