categories rename

- app.py +16 -2
- src/about.py +6 -8
- src/display/utils.py +26 -0
- src/leaderboard/processor.py +1 -1
- src/populate.py +1 -0
- src/submission/submit.py +1 -0
app.py
CHANGED

@@ -32,6 +32,7 @@ from src.display.utils import (
     CATEGORIES,
     TEST_TYPES,
     ModelType,
+    Mode,
     Precision,
     WeightType,
     GuardModelType,

@@ -394,6 +395,7 @@ def submit_results(
     precision: str,
     weight_type: str,
     model_type: str,
+    mode: str,
     submission_file: tempfile._TemporaryFileWrapper,
     version: str,
     guard_model_type: GuardModelType

@@ -410,6 +412,9 @@ def submit_results(
     if not model_type:
         return styled_error("Please select a model type")

+    if not mode:
+        return styled_error("Please select an inference mode")
+
     file_path = submission_file.name
     logger.info(f"Received submission for model {model_name}: {file_path}")

@@ -421,6 +426,7 @@ def submit_results(
         "precision": precision,
         "weight_type": weight_type,
         "model_type": model_type,
+        "mode": mode,
         "version": version,
         "guard_model_type": guard_model_type
     }

@@ -809,8 +815,8 @@ with demo:
             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

             with gr.Row():
-                with gr.Column(scale=3):
-                    gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
+                # with gr.Column(scale=3):
+                #     gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
                 with gr.Column(scale=1):
                     # Add version selector specifically for the submission tab
                     submission_version_selector = gr.Dropdown(

@@ -825,6 +831,13 @@ with demo:
             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
+                    mode_selector = gr.Dropdown(
+                        choices=[m.name for m in Mode],
+                        label="Mode",
+                        multiselect=False,
+                        value=None,
+                        interactive=True,
+                    )
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                     model_type = gr.Dropdown(
                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],

@@ -876,6 +889,7 @@ with demo:
                 precision,
                 weight_type,
                 model_type,
+                mode_selector,
                 file_input,
                 submission_version_selector,
                 guard_model_type
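Taken together, the app.py changes thread a new mode value from the submission form, through validation, into the saved metadata. A minimal sketch of that flow, assuming simplified signatures (the real submit_results takes many more parameters, and styled_error here is a stand-in for the app's helper):

from enum import Enum, auto


class Mode(Enum):
    CoT = auto()     # Chain of Thought
    Strict = auto()


def styled_error(msg):
    # Stand-in for the app's styled_error helper.
    return f"ERROR: {msg}"


def submit_results_sketch(model_type, mode):
    # Mirrors the new validation order: model_type first, then mode.
    if not model_type:
        return styled_error("Please select a model type")
    if not mode:
        return styled_error("Please select an inference mode")
    # The dropdown submits Mode member names, so mode arrives as a plain string.
    return {"model_type": model_type, "mode": mode}


print([m.name for m in Mode])                 # dropdown choices: ['CoT', 'Strict']
print(submit_results_sketch("guard", ""))     # ERROR: Please select an inference mode
print(submit_results_sketch("guard", "CoT"))  # {'model_type': 'guard', 'mode': 'CoT'}

Leaving value=None on the new dropdown forces an explicit choice, which is what makes the `if not mode` guard reachable.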
src/about.py
CHANGED

@@ -20,8 +20,6 @@ across multiple categories and test scenarios.
 """

 LLM_BENCHMARKS_TEXT = """
-## GuardBench evaluation methodology
-
 GuardBench checks how well models handle safety challenges — from misinformation and self-harm to sexual content and corruption.

 Models are tested with regular and adversarial prompts to see if they can avoid saying harmful things.

@@ -30,15 +28,15 @@ We track how accurate they are, how often they make mistakes, and how fast they
 """

 EVALUATION_QUEUE_TEXT = """
-##
+## Submit Your Model

-To
+To add your model to the GuardBench leaderboard:

-
-
-
+Run your evaluation using the GuardBench framework at https://github.com/whitecircle-ai/guard-bench
+Upload your run results in .jsonl format using this form
+Once validated, your model will appear on the leaderboard.

-
+✉️✨ Ready? Upload your results below!
 """

 CITATION_BUTTON_LABEL = "Cite GuardBench"
src/display/utils.py
CHANGED

@@ -7,6 +7,16 @@ from enum import Enum, auto
 from typing import List, Optional


+class Mode(Enum):
+    """Inference mode for the guard model."""
+    CoT = auto()  # Chain of Thought
+    Strict = auto()
+
+    def __str__(self):
+        """String representation of the mode."""
+        return self.name
+
+
 class ModelType(Enum):
     """Model types for the leaderboard."""
     Unknown = auto()

@@ -86,6 +96,11 @@ class GuardBenchColumn:
         never_hidden=True,
         displayed_by_default=True
     ))
+    mode: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="mode",
+        display_name="Mode",
+        displayed_by_default=True
+    ))
     model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="model_type",
         display_name="Type",

@@ -333,6 +348,17 @@ GUARDBENCH_COLUMN = GuardBenchColumn()
 COLS = [f.name for f in fields(GUARDBENCH_COLUMN)]
 DISPLAY_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
                 if getattr(GUARDBENCH_COLUMN, f.name).displayed_by_default]
+
+# Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
+def reorder_display_cols():
+    cols = DISPLAY_COLS
+    if 'model_name' in cols and 'mode' in cols:
+        cols.remove('mode')
+        model_name_index = cols.index('model_name')
+        cols.insert(model_name_index + 1, 'mode')
+    return cols
+DISPLAY_COLS = reorder_display_cols()
+
 METRIC_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
               if getattr(GUARDBENCH_COLUMN, f.name).type == "number"]
 HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
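One subtlety in reorder_display_cols: `cols = DISPLAY_COLS` binds the same list object, so the remove/insert calls mutate the module-level list in place and the final reassignment is effectively a no-op. A standalone check of the resulting order, using illustrative column names:

# Illustrative column names; the real list is derived from GuardBenchColumn.
DISPLAY_COLS = ["model_name", "model_type", "mode", "f1"]


def reorder_display_cols():
    cols = DISPLAY_COLS  # same list object, mutated in place
    if 'model_name' in cols and 'mode' in cols:
        cols.remove('mode')
        cols.insert(cols.index('model_name') + 1, 'mode')
    return cols


DISPLAY_COLS = reorder_display_cols()
print(DISPLAY_COLS)  # ['model_name', 'mode', 'model_type', 'f1']

Copying first (cols = list(DISPLAY_COLS)) would make the helper side-effect free; the in-place version behaves the same here only because the result is assigned straight back.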
src/leaderboard/processor.py
CHANGED

@@ -175,7 +175,7 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
     }

     # Add additional metadata fields if present
-    for key in ["base_model", "revision", "precision", "weight_type"]:
+    for key in ["base_model", "revision", "precision", "weight_type", "mode"]:
         if key in entry:
             row[key] = entry[key]

src/populate.py
CHANGED

@@ -141,6 +141,7 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
         "model_name": entry.get("model_name", "Unknown Model"),
         "model_type": entry.get("model_type", "Unknown"),
         "guard_model_type": entry.get("guard_model_type", "Unknown"),
+        "mode": entry.get("mode", "Strict"),
         "submission_date": entry.get("submission_date", ""),
         "version": entry.get("version", version),
         "base_model": entry.get("base_model", ""),
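Note the fallback in entry.get("mode", "Strict"): entries submitted before this change have no mode key, so they surface as Strict rather than blank. A quick illustration with made-up entries:

# Made-up entries: one pre-change (no "mode" key), one post-change.
entries = [
    {"model_name": "legacy-guard"},
    {"model_name": "fresh-guard", "mode": "CoT"},
]

for entry in entries:
    print(entry["model_name"], "->", entry.get("mode", "Strict"))
# legacy-guard -> Strict
# fresh-guard -> CoT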
src/submission/submit.py
CHANGED

@@ -175,6 +175,7 @@ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
         "model_name": metadata.get("model_name"),  # Use original model name
         "model_type": metadata.get("model_type"),
         "guard_model_type": str(metadata.get("guard_model_type")).lower(),
+        "mode": metadata.get("mode"),
         "base_model": metadata.get("base_model"),
         "revision": metadata.get("revision"),
         "precision": metadata.get("precision"),
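One caveat on how this interacts with the populate.py default above: dict.get falls back only when the key is absent, not when it is present with a None value. Since this entry stores metadata.get("mode") verbatim, a submission whose metadata lacks mode would be saved with "mode": None, and the "Strict" display fallback would not fire for it:

entries = [
    {},               # key absent  -> default fires
    {"mode": None},   # key present -> default does not fire
]
for entry in entries:
    print(entry.get("mode", "Strict"))
# Strict
# None

In practice the new `if not mode` guard in app.py should keep mode from ever being empty, so this path matters mainly for entries written by other tooling.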