apsys committed
Commit a17bcda · 1 Parent(s): 37c2f94

categories rename

app.py CHANGED

@@ -32,6 +32,7 @@ from src.display.utils import (
     CATEGORIES,
     TEST_TYPES,
     ModelType,
+    Mode,
     Precision,
     WeightType,
     GuardModelType,
@@ -394,6 +395,7 @@ def submit_results(
     precision: str,
     weight_type: str,
     model_type: str,
+    mode: str,
     submission_file: tempfile._TemporaryFileWrapper,
     version: str,
     guard_model_type: GuardModelType
@@ -410,6 +412,9 @@ def submit_results(
     if not model_type:
         return styled_error("Please select a model type")
 
+    if not mode:
+        return styled_error("Please select an inference mode")
+
     file_path = submission_file.name
     logger.info(f"Received submission for model {model_name}: {file_path}")
 
@@ -421,6 +426,7 @@ def submit_results(
         "precision": precision,
         "weight_type": weight_type,
         "model_type": model_type,
+        "mode": mode,
         "version": version,
         "guard_model_type": guard_model_type
     }
@@ -809,8 +815,8 @@ with demo:
             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
             with gr.Row():
-                with gr.Column(scale=3):
-                    gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
+                # with gr.Column(scale=3):
+                #     gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
                 with gr.Column(scale=1):
                     # Add version selector specifically for the submission tab
                     submission_version_selector = gr.Dropdown(
@@ -825,6 +831,13 @@ with demo:
             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
+                    mode_selector = gr.Dropdown(
+                        choices=[m.name for m in Mode],
+                        label="Mode",
+                        multiselect=False,
+                        value=None,
+                        interactive=True,
+                    )
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                     model_type = gr.Dropdown(
                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
@@ -876,6 +889,7 @@ with demo:
                     precision,
                     weight_type,
                     model_type,
+                    mode_selector,
                     file_input,
                     submission_version_selector,
                     guard_model_type
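The app.py changes thread one new value, `mode`, from a dropdown in the submission form through `submit_results` and into the metadata dict. Below is a minimal, self-contained sketch of that wiring, assuming the `Mode` enum added in src/display/utils.py further down; the button label, output component, and return strings are illustrative, not from the repo.

```python
# Minimal sketch of the submission-form wiring introduced above.
# Assumes the Mode enum from src/display/utils.py; everything else is illustrative.
import gradio as gr
from enum import Enum, auto

class Mode(Enum):
    CoT = auto()     # Chain of Thought
    Strict = auto()

def submit_results(model_name: str, mode: str) -> str:
    # Mirrors the new guard clause in app.py: reject submissions without a mode.
    if not mode:
        return "Please select an inference mode"
    metadata = {"model_name": model_name, "mode": mode}
    return f"Received: {metadata}"

with gr.Blocks() as demo:
    model_name_textbox = gr.Textbox(label="Model name")
    mode_selector = gr.Dropdown(
        choices=[m.name for m in Mode],   # ["CoT", "Strict"]
        label="Mode",
        multiselect=False,
        value=None,
        interactive=True,
    )
    output = gr.Markdown()
    gr.Button("Submit").click(submit_results, [model_name_textbox, mode_selector], output)

# demo.launch()
```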
src/about.py CHANGED

@@ -20,8 +20,6 @@ across multiple categories and test scenarios.
 """
 
 LLM_BENCHMARKS_TEXT = """
-## GuardBench evaluation methodology
-
 GuardBench checks how well models handle safety challenges — from misinformation and self-harm to sexual content and corruption.
 
 Models are tested with regular and adversarial prompts to see if they can avoid saying harmful things.
@@ -30,15 +28,15 @@ We track how accurate they are, how often they make mistakes, and how fast they
 """
 
 EVALUATION_QUEUE_TEXT = """
-## Submission Process
+## Submit Your Model
 
-To submit your model results to the GuardBench leaderboard:
+To add your model to the GuardBench leaderboard:
 
-1. Evaluate your model using the [GuardBench framework](https://github.com/huggingface/guard-bench)
-2. Format your results as a JSONL file according to our schema
-3. Submit your results using the submission form with your authorized token
+Run your evaluation using the GuardBench framework at https://github.com/whitecircle-ai/guard-bench
+Upload your run results in .jsonl format using this form
+Once validated, your model will appear on the leaderboard.
 
-Results will be processed and added to the leaderboard once validated.
+✉️✨ Ready? Upload your results below!
 """
 
 CITATION_BUTTON_LABEL = "Cite GuardBench"
src/display/utils.py CHANGED

@@ -7,6 +7,16 @@ from enum import Enum, auto
 from typing import List, Optional
 
 
+class Mode(Enum):
+    """Inference mode for the guard model."""
+    CoT = auto()  # Chain of Thought
+    Strict = auto()
+
+    def __str__(self):
+        """String representation of the mode."""
+        return self.name
+
+
 class ModelType(Enum):
     """Model types for the leaderboard."""
     Unknown = auto()
@@ -86,6 +96,11 @@ class GuardBenchColumn:
         never_hidden=True,
         displayed_by_default=True
     ))
+    mode: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="mode",
+        display_name="Mode",
+        displayed_by_default=True
+    ))
     model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="model_type",
         display_name="Type",
@@ -333,6 +348,17 @@ GUARDBENCH_COLUMN = GuardBenchColumn()
 COLS = [f.name for f in fields(GUARDBENCH_COLUMN)]
 DISPLAY_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
                 if getattr(GUARDBENCH_COLUMN, f.name).displayed_by_default]
+
+# Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
+def reorder_display_cols():
+    cols = DISPLAY_COLS
+    if 'model_name' in cols and 'mode' in cols:
+        cols.remove('mode')
+        model_name_index = cols.index('model_name')
+        cols.insert(model_name_index + 1, 'mode')
+    return cols
+DISPLAY_COLS = reorder_display_cols()
+
 METRIC_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
                if getattr(GUARDBENCH_COLUMN, f.name).type == "number"]
 HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
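Two details in this file are easy to miss: `__str__` makes a `Mode` value render as its bare member name, which is what the dropdown and the leaderboard cell rely on, and `reorder_display_cols` mutates `DISPLAY_COLS` in place (`cols = DISPLAY_COLS` aliases the list rather than copying it), so the final rebinding is belt-and-braces. A standalone sketch; the column names in `cols` are hypothetical:

```python
# Standalone sketch of the Mode enum and the column reorder above.
# Only the logic mirrors the diff; the column names here are hypothetical.
from enum import Enum, auto

class Mode(Enum):
    CoT = auto()     # Chain of Thought
    Strict = auto()

    def __str__(self):
        return self.name

print(str(Mode.CoT))             # "CoT" -- what a leaderboard cell would show
print([m.name for m in Mode])    # ["CoT", "Strict"] -- the dropdown choices

# reorder_display_cols() moves 'mode' directly after 'model_name':
cols = ["model_name", "model_type", "mode", "average_f1"]
if "model_name" in cols and "mode" in cols:
    cols.remove("mode")
    cols.insert(cols.index("model_name") + 1, "mode")
print(cols)  # ['model_name', 'mode', 'model_type', 'average_f1']
```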
src/leaderboard/processor.py CHANGED

@@ -175,7 +175,7 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
         }
 
         # Add additional metadata fields if present
-        for key in ["base_model", "revision", "precision", "weight_type"]:
+        for key in ["base_model", "revision", "precision", "weight_type", "mode"]:
             if key in entry:
                 row[key] = entry[key]
 
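With "mode" added to the key loop, `leaderboard_to_dataframe` carries the field through only when the stored entry actually has it. A toy illustration (the entry values are made up):

```python
# Toy illustration of the extended metadata loop; the entry is made up.
entry = {"model_name": "example-guard", "precision": "bf16", "mode": "CoT"}
row = {}
for key in ["base_model", "revision", "precision", "weight_type", "mode"]:
    if key in entry:
        row[key] = entry[key]
print(row)  # {'precision': 'bf16', 'mode': 'CoT'}
```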
src/populate.py CHANGED

@@ -141,6 +141,7 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
             "model_name": entry.get("model_name", "Unknown Model"),
             "model_type": entry.get("model_type", "Unknown"),
             "guard_model_type": entry.get("guard_model_type", "Unknown"),
+            "mode": entry.get("mode", "Strict"),
             "submission_date": entry.get("submission_date", ""),
             "version": entry.get("version", version),
             "base_model": entry.get("base_model", ""),
src/submission/submit.py CHANGED

@@ -175,6 +175,7 @@ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
         "model_name": metadata.get("model_name"),  # Use original model name
         "model_type": metadata.get("model_type"),
         "guard_model_type": str(metadata.get("guard_model_type")).lower(),
+        "mode": metadata.get("mode"),
         "base_model": metadata.get("base_model"),
         "revision": metadata.get("revision"),
         "precision": metadata.get("precision"),