submodule + versioning

Files changed:
- .gitignore +6 -0
- .gitmodules +3 -0
- app.py +80 -53
- guard-bench-submodule +1 -0
- src/display/css_html_js.py +18 -0
- src/display/utils.py +21 -14
- src/leaderboard/processor.py +65 -32
- src/populate.py +97 -26
- src/submission/submit.py +38 -20
.gitignore
CHANGED
@@ -43,3 +43,9 @@ eval-queue/
 eval-results/
 eval-queue-bk/
 eval-results-bk/
+
+# Data files
+data/
+
+# Versioned leaderboard files
+data/leaderboard_v*.json
.gitmodules
ADDED
@@ -0,0 +1,3 @@
+[submodule "guard-bench-submodule"]
+	path = guard-bench-submodule
+	url = https://github.com/whitecircle-ai/guard-bench.git
app.py
CHANGED
@@ -51,10 +51,14 @@ logger = logging.getLogger(__name__)
 # Ensure data directory exists
 os.makedirs(DATA_PATH, exist_ok=True)

+# Available benchmark versions
+BENCHMARK_VERSIONS = ["v0"]
+CURRENT_VERSION = "v0"
+
 # Initialize leaderboard data
 try:
     logger.info("Initializing leaderboard data...")
-    LEADERBOARD_DF = get_leaderboard_df()
+    LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION)
     logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
 except Exception as e:
     logger.error(f"Error loading leaderboard data: {e}")
@@ -70,7 +74,7 @@ def init_leaderboard(dataframe):
         columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
         dataframe = pd.DataFrame(columns=columns)
         logger.warning("Initializing empty leaderboard")
-
+
     return Leaderboard(
         value=dataframe,
         datatype=[getattr(GUARDBENCH_COLUMN, col).type for col in DISPLAY_COLS],
@@ -79,7 +83,7 @@ def init_leaderboard(dataframe):
             cant_deselect=[getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS],
             label="Select Columns to Display:",
         ),
-        search_columns=[GUARDBENCH_COLUMN.
+        search_columns=[GUARDBENCH_COLUMN.model_name.name],
         hide_columns=[getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS],
         filter_columns=[
             ColumnFilter(GUARDBENCH_COLUMN.model_type.name, type="checkboxgroup", label="Model types"),
@@ -95,23 +99,24 @@ def submit_results(
     precision: str,
     weight_type: str,
     model_type: str,
-    submission_file: tempfile._TemporaryFileWrapper
+    submission_file: tempfile._TemporaryFileWrapper,
+    version: str
 ):
     """
     Handle submission of results with model metadata.
     """
     if submission_file is None:
         return styled_error("No submission file provided")
-
+
     if not model_name:
         return styled_error("Model name is required")
-
+
     if not model_type:
         return styled_error("Please select a model type")
-
+
     file_path = submission_file.name
     logger.info(f"Received submission for model {model_name}: {file_path}")
-
+
     # Add metadata to the submission
     metadata = {
         "model_name": model_name,
@@ -119,35 +124,46 @@ def submit_results(
         "revision": revision if revision else "main",
         "precision": precision,
         "weight_type": weight_type,
-        "model_type": model_type
+        "model_type": model_type,
+        "version": version
     }
-
+
     # Process the submission
-    result = process_submission(file_path, metadata)
-
+    result = process_submission(file_path, metadata, version=version)
+
     # Refresh the leaderboard data
     global LEADERBOARD_DF
     try:
-        logger.info("Refreshing leaderboard data after submission...")
-        LEADERBOARD_DF = get_leaderboard_df()
+        logger.info(f"Refreshing leaderboard data after submission for version {version}...")
+        LEADERBOARD_DF = get_leaderboard_df(version=version)
        logger.info("Refreshed leaderboard data after submission")
     except Exception as e:
         logger.error(f"Error refreshing leaderboard data: {e}")
-
+
     return result


-def refresh_data():
+def refresh_data(version=CURRENT_VERSION):
     """
     Refresh the leaderboard data from HuggingFace.
     """
     global LEADERBOARD_DF
     try:
-        logger.info("Performing scheduled refresh of leaderboard data...")
-        LEADERBOARD_DF = get_leaderboard_df()
+        logger.info(f"Performing scheduled refresh of leaderboard data for version {version}...")
+        LEADERBOARD_DF = get_leaderboard_df(version=version)
         logger.info("Scheduled refresh of leaderboard data completed")
     except Exception as e:
         logger.error(f"Error in scheduled refresh: {e}")
+    return LEADERBOARD_DF
+
+
+def update_leaderboards(version):
+    """
+    Update all leaderboard components with data for the selected version.
+    """
+    new_df = get_leaderboard_df(version=version)
+    category_dfs = [get_category_leaderboard_df(category, version=version) for category in CATEGORIES]
+    return [init_leaderboard(new_df)] + [init_leaderboard(df) for df in category_dfs]


 # Create Gradio app
@@ -155,43 +171,54 @@ demo = gr.Blocks(css=custom_css)

 with demo:
     gr.HTML(TITLE)
-
-
+
+    with gr.Row():
+        with gr.Column(scale=3):
+            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+        with gr.Column(scale=1):
+            version_selector = gr.Dropdown(
+                choices=BENCHMARK_VERSIONS,
+                label="Benchmark Version",
+                value=CURRENT_VERSION,
+                interactive=True,
+                elem_classes="version-selector"
+            )
+
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
             refresh_button = gr.Button("Refresh Leaderboard")
-
+
             # Create tabs for each category
             with gr.Tabs(elem_classes="category-tabs") as category_tabs:
                 # First tab for average metrics across all categories
                 with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"):
                     leaderboard = init_leaderboard(LEADERBOARD_DF)
-
+
                 # Create a tab for each category
                 for category in CATEGORIES:
                     with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"):
-                        category_df = get_category_leaderboard_df(category)
+                        category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION)
                        category_leaderboard = init_leaderboard(category_df)
-
+
             # Refresh button functionality
             refresh_button.click(
                 fn=lambda: [
-                    init_leaderboard(get_leaderboard_df()),
-                    *[init_leaderboard(get_category_leaderboard_df(category)) for category in CATEGORIES]
+                    init_leaderboard(get_leaderboard_df(version=version_selector.value)),
+                    *[init_leaderboard(get_category_leaderboard_df(category, version=version_selector.value)) for category in CATEGORIES]
                 ],
                 inputs=[],
                 outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
             )
-
+
         with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=1):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
+
         with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=2):
             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
+
             with gr.Row():
                 gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
-
+
             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
@@ -203,33 +230,33 @@ with demo:
                         value=None,
                         interactive=True,
                     )
-
+
                 with gr.Column():
                     precision = gr.Dropdown(
-                        choices=[i.
+                        choices=[i.name for i in Precision if i != Precision.Unknown],
                         label="Precision",
                         multiselect=False,
                         value="float16",
                         interactive=True,
                     )
                     weight_type = gr.Dropdown(
-                        choices=[i.
+                        choices=[i.name for i in WeightType],
                         label="Weights type",
                         multiselect=False,
                         value="Original",
                         interactive=True,
                     )
                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
+
             with gr.Row():
                 file_input = gr.File(
-                    label="Upload JSONL Results File",
+                    label="Upload JSONL Results File",
                     file_types=[".jsonl"]
                 )
-
+
             submit_button = gr.Button("Submit Results")
             result_output = gr.Markdown()
-
+
             submit_button.click(
                 fn=submit_results,
                 inputs=[
@@ -239,11 +266,19 @@ with demo:
                     precision,
                     weight_type,
                     model_type,
-                    file_input
+                    file_input,
+                    version_selector
                 ],
                 outputs=result_output
             )
-
+
+    # Version selector functionality
+    version_selector.change(
+        fn=update_leaderboards,
+        inputs=[version_selector],
+        outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
+    )
+
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
@@ -253,29 +288,21 @@ with demo:
                 elem_id="citation-button",
                 show_copy_button=True,
             )
-
+
         with gr.Accordion("ℹ️ Dataset Information", open=False):
             dataset_info = gr.Markdown(f"""
             ## Dataset Information
-
+
             Results are stored in the HuggingFace dataset: [{RESULTS_DATASET_ID}](https://huggingface.co/datasets/{RESULTS_DATASET_ID})
-
+
             Last updated: {pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S UTC")}
             """)

-# Set up scheduler to refresh data periodically
 scheduler = BackgroundScheduler()
-scheduler.add_job(refresh_data, 'interval', minutes=30)
+scheduler.add_job(lambda: refresh_data(version=CURRENT_VERSION), 'interval', minutes=30)
 scheduler.start()

 # Launch the app
 if __name__ == "__main__":
-
-
-        logger.warning("Admin username or password not set. Running without authentication.")
-        auth = None
-    else:
-        auth = (ADMIN_USERNAME, ADMIN_PASSWORD)
-
-    # Launch the app
-    demo.launch(server_name="0.0.0.0", server_port=7860, auth=auth)
+
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
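
For orientation, a minimal sketch (not part of this commit) of how the version-aware helpers introduced in app.py and src/populate.py could be exercised outside the Gradio UI; the import paths follow the repository layout shown in this diff, and the preview_version wrapper itself is hypothetical:

from src.display.utils import CATEGORIES
from src.populate import get_category_leaderboard_df, get_leaderboard_df


def preview_version(version: str = "v0") -> None:
    # Overall table for the requested benchmark version.
    overall = get_leaderboard_df(version=version)
    print(f"{version}: {len(overall)} overall entries")

    # One filtered table per category, mirroring the category tabs in the UI.
    for category in CATEGORIES:
        category_df = get_category_leaderboard_df(category, version=version)
        print(f"  {category}: {len(category_df)} entries")


if __name__ == "__main__":
    preview_version("v0")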
guard-bench-submodule
ADDED
@@ -0,0 +1 @@
+Subproject commit 0a9f48bcedd0ccb6b5cf59ff7ed1186e32a5dc17
src/display/css_html_js.py
CHANGED
@@ -43,4 +43,22 @@ custom_css = """
     text-decoration: underline;
     color: #1976D2;
 }
+
+.version-selector {
+    margin-top: 10px;
+    padding: 5px;
+    border: 1px solid #e0e0e0;
+    border-radius: 5px;
+    background-color: #f9f9f9;
+}
+
+.version-selector label {
+    font-weight: bold;
+    color: #2196F3;
+}
+
+.version-selector select {
+    border-color: #2196F3;
+    border-radius: 4px;
+}
 """
src/display/utils.py
CHANGED
@@ -36,12 +36,19 @@ class Precision(Enum):
     int8 = auto()
     int4 = auto()

+    def __str__(self):
+        """String representation of the precision type."""
+        return self.name
+

 class WeightType(Enum):
     """Model weight types."""
     Original = auto()
     Delta = auto()
     Adapter = auto()
+    def __str__(self):
+        """String representation of the weight type."""
+        return self.name


 @dataclass
@@ -58,19 +65,19 @@ class ColumnInfo:
 @dataclass
 class GuardBenchColumn:
     """Columns for the GuardBench leaderboard."""
-
+    model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="model_name",
         display_name="Model",
         never_hidden=True,
         displayed_by_default=True
     ))
-
+
     model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="model_type",
         display_name="Type",
         displayed_by_default=True
     ))
-
+
     # Metrics for all categories
     default_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="default_prompts_f1",
@@ -78,28 +85,28 @@ class GuardBenchColumn:
         type="number",
         displayed_by_default=True
     ))
-
+
     jailbreaked_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="jailbreaked_prompts_f1",
         display_name="Jailbreaked Prompts F1",
         type="number",
         displayed_by_default=True
     ))
-
+
     default_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="default_answers_f1",
         display_name="Default Answers F1",
         type="number",
         displayed_by_default=True
     ))
-
+
     jailbreaked_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="jailbreaked_answers_f1",
         display_name="Jailbreaked Answers F1",
         type="number",
         displayed_by_default=True
     ))
-
+
     # Average metrics
     average_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="average_f1",
@@ -108,21 +115,21 @@ class GuardBenchColumn:
         displayed_by_default=True,
         never_hidden=True
     ))
-
+
     average_recall: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="average_recall",
         display_name="Average Recall",
         type="number",
         displayed_by_default=False
     ))
-
+
     average_precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="average_precision",
         display_name="Average Precision",
         type="number",
         displayed_by_default=False
     ))
-
+
     # Additional metadata
     submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="submission_date",
@@ -136,13 +143,13 @@ GUARDBENCH_COLUMN = GuardBenchColumn()

 # Extract column lists for different views
 COLS = [f.name for f in fields(GUARDBENCH_COLUMN)]
-DISPLAY_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
+DISPLAY_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
                 if getattr(GUARDBENCH_COLUMN, f.name).displayed_by_default]
-METRIC_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
+METRIC_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
                if getattr(GUARDBENCH_COLUMN, f.name).type == "number"]
-HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
+HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
                if getattr(GUARDBENCH_COLUMN, f.name).hidden]
-NEVER_HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
+NEVER_HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
                      if getattr(GUARDBENCH_COLUMN, f.name).never_hidden]

 # Categories in GuardBench
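
As a small, illustrative check (not part of the diff) of what the new __str__ overrides and the dropdown choices expressions in app.py evaluate to; Precision.Unknown and the float16 member are assumptions inferred from their use in app.py:

from src.display.utils import Precision, WeightType

# str() now returns the member name, so str(member) and member.name agree.
assert str(WeightType.Original) == WeightType.Original.name == "Original"
assert str(Precision.float16) == "float16"  # assumed member, per app.py default

# The dropdown choices built in app.py reduce to plain member names,
# e.g. ["Original", "Delta", "Adapter"] for WeightType.
precision_choices = [i.name for i in Precision if i != Precision.Unknown]
weight_type_choices = [i.name for i in WeightType]
print(precision_choices)
print(weight_type_choices)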
src/leaderboard/processor.py
CHANGED
@@ -16,11 +16,21 @@ def load_leaderboard_data(file_path: str) -> Dict:
     Load the leaderboard data from a JSON file.
     """
     if not os.path.exists(file_path):
-
-
+        version = "v0"
+        if "_v" in file_path:
+            version = file_path.split("_")[-1].split(".")[0]
+        return {"entries": [], "last_updated": datetime.now().isoformat(), "version": version}
+
     with open(file_path, 'r') as f:
         data = json.load(f)
-
+
+    # Ensure version field exists
+    if "version" not in data:
+        version = "v0"
+        if "_v" in file_path:
+            version = file_path.split("_")[-1].split(".")[0]
+        data["version"] = version
+
     return data


@@ -30,10 +40,17 @@ def save_leaderboard_data(data: Dict, file_path: str) -> None:
     """
     # Ensure the directory exists
     os.makedirs(os.path.dirname(file_path), exist_ok=True)
-
+
     # Update the last_updated timestamp
     data["last_updated"] = datetime.now().isoformat()
-
+
+    # Ensure version is set
+    if "version" not in data:
+        version = "v0"
+        if "_v" in file_path:
+            version = file_path.split("_")[-1].split(".")[0]
+        data["version"] = version
+
     with open(file_path, 'w') as f:
         json.dump(data, f, indent=2)

@@ -43,26 +60,32 @@ def process_submission(submission_data: List[Dict]) -> List[Dict]:
     Process submission data and convert it to leaderboard entries.
     """
     entries = []
-
+
     for item in submission_data:
         # Create a new entry for the leaderboard
         entry = {
             "model_name": item.get("model_name", "Unknown Model"),
             "per_category_metrics": {},
             "avg_metrics": {},
-            "submission_date": datetime.now().isoformat()
+            "submission_date": datetime.now().isoformat(),
+            "version": item.get("version", "v0")
         }
-
+
+        # Copy model metadata
+        for key in ["model_type", "base_model", "revision", "precision", "weight_type"]:
+            if key in item:
+                entry[key] = item[key]
+
         # Process per-category metrics
         if "per_category_metrics" in item:
             entry["per_category_metrics"] = item["per_category_metrics"]
-
+
         # Process average metrics
         if "avg_metrics" in item:
             entry["avg_metrics"] = item["avg_metrics"]
-
+
         entries.append(entry)
-
+
     return entries


@@ -71,17 +94,23 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
     Convert leaderboard data to a pandas DataFrame for display.
     """
     rows = []
-
+
     for entry in leaderboard_data.get("entries", []):
         model_name = entry.get("model_name", "Unknown Model")
-
+
         # Extract average metrics for main display
         row = {
             "model_name": model_name,
             "model_type": entry.get("model_type", "Unknown"),
-            "submission_date": entry.get("submission_date", "")
+            "submission_date": entry.get("submission_date", ""),
+            "version": entry.get("version", "v0")
         }
-
+
+        # Add additional metadata fields if present
+        for key in ["base_model", "revision", "precision", "weight_type"]:
+            if key in entry:
+                row[key] = entry[key]
+
         # Add average metrics
         avg_metrics = entry.get("avg_metrics", {})
         for test_type in TEST_TYPES:
@@ -90,12 +119,12 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
                 if metric in avg_metrics[test_type]:
                     col_name = f"{test_type}_{metric}"
                     row[col_name] = avg_metrics[test_type][metric]
-
+
         # Calculate overall averages for key metrics
         f1_values = []
         recall_values = []
         precision_values = []
-
+
         for test_type in TEST_TYPES:
             if test_type in avg_metrics and "f1_binary" in avg_metrics[test_type]:
                 f1_values.append(avg_metrics[test_type]["f1_binary"])
@@ -103,7 +132,7 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
                 recall_values.append(avg_metrics[test_type]["recall_binary"])
             if test_type in avg_metrics and "precision_binary" in avg_metrics[test_type]:
                 precision_values.append(avg_metrics[test_type]["precision_binary"])
-
+
         # Add overall averages
         if f1_values:
             row["average_f1"] = sum(f1_values) / len(f1_values)
@@ -111,7 +140,7 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
             row["average_recall"] = sum(recall_values) / len(recall_values)
         if precision_values:
             row["average_precision"] = sum(precision_values) / len(precision_values)
-
+
         # Add specific test type F1 scores for display
         if "default_prompts" in avg_metrics and "f1_binary" in avg_metrics["default_prompts"]:
             row["default_prompts_f1"] = avg_metrics["default_prompts"]["f1_binary"]
@@ -121,14 +150,14 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
             row["default_answers_f1"] = avg_metrics["default_answers"]["f1_binary"]
         if "jailbreaked_answers" in avg_metrics and "f1_binary" in avg_metrics["jailbreaked_answers"]:
             row["jailbreaked_answers_f1"] = avg_metrics["jailbreaked_answers"]["f1_binary"]
-
+
         rows.append(row)
-
+
     # Create DataFrame and sort by average F1 score
     df = pd.DataFrame(rows)
     if not df.empty and "average_f1" in df.columns:
         df = df.sort_values(by="average_f1", ascending=False)
-
+
     return df


@@ -136,25 +165,29 @@ def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict])
     """
     Add new entries to the leaderboard, replacing any with the same model name.
     """
-    # Create a mapping of existing entries by model name
-    existing_entries = {
-
+    # Create a mapping of existing entries by model name and version
+    existing_entries = {
+        (entry["model_name"], entry.get("version", "v0")): i
+        for i, entry in enumerate(leaderboard_data.get("entries", []))
+    }
+
     # Process each new entry
     for new_entry in new_entries:
         model_name = new_entry.get("model_name")
-
-
+        version = new_entry.get("version", "v0")
+
+        if (model_name, version) in existing_entries:
             # Replace existing entry
-            leaderboard_data["entries"][existing_entries[model_name]] = new_entry
+            leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
         else:
             # Add new entry
             if "entries" not in leaderboard_data:
                 leaderboard_data["entries"] = []
             leaderboard_data["entries"].append(new_entry)
-
+
     # Update the last_updated timestamp
     leaderboard_data["last_updated"] = datetime.now().isoformat()
-
+
     return leaderboard_data


@@ -171,10 +204,10 @@ def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
                 entries.append(entry)
             except json.JSONDecodeError as e:
                 return [], f"Invalid JSON in submission file: {e}"
-
+
         if not entries:
             return [], "Submission file is empty"
-
+
         return entries, "Successfully processed submission"
     except Exception as e:
         return [], f"Error processing submission file: {e}"
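
A quick illustration (not from the source) of the filename-based version inference now used in load_leaderboard_data and save_leaderboard_data; the helper below just restates the split logic from the diff so its behaviour is easy to see:

def infer_version(file_path: str) -> str:
    # Same rule as in src/leaderboard/processor.py: take the last "_"-separated
    # chunk of the filename and strip the extension, defaulting to "v0".
    version = "v0"
    if "_v" in file_path:
        version = file_path.split("_")[-1].split(".")[0]
    return version


assert infer_version("data/leaderboard_v0.json") == "v0"
assert infer_version("data/leaderboard_v1.json") == "v1"
assert infer_version("data/leaderboard.json") == "v0"   # no version suffix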
src/populate.py
CHANGED
@@ -17,15 +17,29 @@ from src.envs import RESULTS_DATASET_ID, TOKEN, LEADERBOARD_FILE, CACHE_PATH
 from src.leaderboard.processor import leaderboard_to_dataframe, load_leaderboard_data, save_leaderboard_data, process_jsonl_submission, add_entries_to_leaderboard


-def
+def get_versioned_leaderboard_file(version="v0"):
+    """
+    Get the versioned leaderboard file path.
+    """
+    base_name, ext = os.path.splitext(LEADERBOARD_FILE)
+    return f"{base_name}_{version}{ext}"
+
+
+def download_leaderboard_data(version="v0") -> bool:
     """
     Download the latest leaderboard data from HuggingFace.
+
+    Args:
+        version: The dataset version to download
     """
     try:
         # Create a temporary directory to download the submissions
-        temp_dir = os.path.join(CACHE_PATH, "
+        temp_dir = os.path.join(CACHE_PATH, f"temp_submissions_{version}")
         os.makedirs(temp_dir, exist_ok=True)

+        # Get the versioned leaderboard file
+        leaderboard_file = get_versioned_leaderboard_file(version)
+
         # Download the entire repository
         try:
             snapshot_path = snapshot_download(
@@ -43,25 +57,43 @@ def download_leaderboard_data() -> bool:

             # Look for submission files in the submissions directory
             submissions_dir = os.path.join(snapshot_path, "submissions")
+            version_submissions_dir = os.path.join(snapshot_path, f"submissions_{version}")
+
+            # Check both standard and versioned submission directories
             if os.path.exists(submissions_dir):
                 submission_files.extend(glob(os.path.join(submissions_dir, "*.jsonl")))

-
-
+            if os.path.exists(version_submissions_dir):
+                submission_files.extend(glob(os.path.join(version_submissions_dir, "*.jsonl")))
+
+            # Also look for any versioned JSONL files in the root
+            submission_files.extend(glob(os.path.join(snapshot_path, f"*_{version}.jsonl")))
+
+            # If we're looking for v0 and no versioned files found, use generic ones
+            if version == "v0" and not submission_files:
+                submission_files.extend(glob(os.path.join(snapshot_path, "*.jsonl")))

             # Process each submission file
             for file_path in submission_files:
                 entries, _ = process_jsonl_submission(file_path)
-
+
+                # Filter entries to those that match the version or don't have version specified
+                filtered_entries = [
+                    entry for entry in entries
+                    if entry.get("version", "v0") == version or "version" not in entry
+                ]
+
+                all_entries.extend(filtered_entries)

             # Create leaderboard data structure
             leaderboard_data = {
                 "entries": all_entries,
-                "last_updated": pd.Timestamp.now().isoformat()
+                "last_updated": pd.Timestamp.now().isoformat(),
+                "version": version
             }

             # Save to local file
-            save_leaderboard_data(leaderboard_data,
+            save_leaderboard_data(leaderboard_data, leaderboard_file)

             return True
         except Exception as e:
@@ -72,7 +104,14 @@ def download_leaderboard_data() -> bool:
             api = HfApi(token=TOKEN)
             files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")

-
+            # Look for versioned and regular files
+            submission_files = [
+                f for f in files
+                if (f.endswith(f'_{version}.jsonl') or
+                    f.startswith(f'submissions_{version}/') or
+                    (version == "v0" and f.endswith('.jsonl')))
+            ]
+
             all_entries = []

             for file_path in submission_files:
@@ -84,49 +123,70 @@ def download_leaderboard_data() -> bool:
                         token=TOKEN
                     )
                     entries, _ = process_jsonl_submission(local_path)
-
+
+                    # Filter entries to those that match the version or don't have version specified
+                    filtered_entries = [
+                        entry for entry in entries
+                        if entry.get("version", "v0") == version or "version" not in entry
+                    ]
+
+                    all_entries.extend(filtered_entries)
                 except Exception as file_error:
                     print(f"Error downloading file {file_path}: {file_error}")

             # Create leaderboard data structure
             leaderboard_data = {
                 "entries": all_entries,
-                "last_updated": pd.Timestamp.now().isoformat()
+                "last_updated": pd.Timestamp.now().isoformat(),
+                "version": version
             }

             # Save to local file
-            save_leaderboard_data(leaderboard_data,
+            save_leaderboard_data(leaderboard_data, leaderboard_file)

             return True
         except Exception as list_error:
             print(f"Error listing repository files: {list_error}")

         # If we can't download anything, create an empty leaderboard
-        if not os.path.exists(
-        empty_data = {
-
+        if not os.path.exists(leaderboard_file):
+            empty_data = {
+                "entries": [],
+                "last_updated": pd.Timestamp.now().isoformat(),
+                "version": version
+            }
+            save_leaderboard_data(empty_data, leaderboard_file)

         return False
     except Exception as e:
         print(f"Error downloading leaderboard data: {e}")

         # Ensure we have at least an empty leaderboard file
-
-
-
+        leaderboard_file = get_versioned_leaderboard_file(version)
+        if not os.path.exists(leaderboard_file):
+            empty_data = {
+                "entries": [],
+                "last_updated": pd.Timestamp.now().isoformat(),
+                "version": version
+            }
+            save_leaderboard_data(empty_data, leaderboard_file)

         return False


-def get_leaderboard_df() -> pd.DataFrame:
+def get_leaderboard_df(version="v0") -> pd.DataFrame:
     """
     Get the leaderboard data as a DataFrame.
+
+    Args:
+        version: The dataset version to retrieve
     """
     # Try to download the latest data
-    download_leaderboard_data()
+    download_leaderboard_data(version=version)

     # Load from local file
-
+    leaderboard_file = get_versioned_leaderboard_file(version)
+    leaderboard_data = load_leaderboard_data(leaderboard_file)

     # Convert to DataFrame
     df = leaderboard_to_dataframe(leaderboard_data)
@@ -134,18 +194,20 @@ def get_leaderboard_df() -> pd.DataFrame:
     return df


-def get_category_leaderboard_df(category: str) -> pd.DataFrame:
+def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
     """
     Get the leaderboard data filtered by a specific category.

     Args:
         category: The category to filter by (e.g., "Criminal, Violent, and Terrorist Activity")
+        version: The dataset version to retrieve

     Returns:
         DataFrame with metrics for the specified category
     """
     # Load the leaderboard data
-
+    leaderboard_file = get_versioned_leaderboard_file(version)
+    leaderboard_data = load_leaderboard_data(leaderboard_file)

     # Filter entries to only include those with data for the specified category
     filtered_entries = []
@@ -158,6 +220,7 @@ def get_category_leaderboard_df(category: str) -> pd.DataFrame:
                 "model_name": entry.get("model_name", "Unknown Model"),
                 "model_type": entry.get("model_type", "Unknown"),
                 "submission_date": entry.get("submission_date", ""),
+                "version": entry.get("version", version),
             }

             # Extract metrics for this category
@@ -189,7 +252,8 @@ def get_category_leaderboard_df(category: str) -> pd.DataFrame:
     # Create a new leaderboard data structure with the filtered entries
     filtered_leaderboard = {
         "entries": filtered_entries,
-        "last_updated": leaderboard_data.get("last_updated", pd.Timestamp.now().isoformat())
+        "last_updated": leaderboard_data.get("last_updated", pd.Timestamp.now().isoformat()),
+        "version": version
     }

     # Convert to DataFrame
@@ -198,14 +262,21 @@ def get_category_leaderboard_df(category: str) -> pd.DataFrame:
     return df


-def get_detailed_model_data(model_name: str) -> Dict:
+def get_detailed_model_data(model_name: str, version="v0") -> Dict:
     """
     Get detailed data for a specific model.
+
+    Args:
+        model_name: The name of the model to get data for
+        version: The dataset version to retrieve
     """
-
+    leaderboard_file = get_versioned_leaderboard_file(version)
+    leaderboard_data = load_leaderboard_data(leaderboard_file)

     for entry in leaderboard_data.get("entries", []):
-
+        # Check both the model name and version
+        entry_version = entry.get("version", "v0")
+        if entry.get("model_name") == model_name and (entry_version == version or entry_version is None):
             return entry

     return {}
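
For clarity, a standalone sketch of the path produced by get_versioned_leaderboard_file; the LEADERBOARD_FILE value below is a hypothetical stand-in (the real one comes from src.envs), chosen to line up with the data/leaderboard_v*.json pattern added to .gitignore:

import os

LEADERBOARD_FILE = "data/leaderboard.json"  # assumed value, for illustration only


def get_versioned_leaderboard_file(version: str = "v0") -> str:
    # Mirrors src/populate.py: insert the version tag before the file extension.
    base_name, ext = os.path.splitext(LEADERBOARD_FILE)
    return f"{base_name}_{version}{ext}"


print(get_versioned_leaderboard_file("v0"))  # data/leaderboard_v0.json
print(get_versioned_leaderboard_file("v1"))  # data/leaderboard_v1.json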
src/submission/submit.py
CHANGED
@@ -25,33 +25,40 @@ def validate_submission(file_path: str) -> Tuple[bool, str]:
         entries, message = process_jsonl_submission(file_path)
         if not entries:
             return False, message
-
+
         # Additional validation could be added here
-
+
         return True, "Submission is valid"
     except Exception as e:
         return False, f"Error validating submission: {e}"


-def submit_to_hub(file_path: str, metadata: Dict, dataset_id: str, token: str) -> Tuple[bool, str]:
+def submit_to_hub(file_path: str, metadata: Dict, dataset_id: str, token: str, version="v0") -> Tuple[bool, str]:
     """
     Submit results to a HuggingFace dataset repository as individual files.
+
+    Args:
+        file_path: Path to the submission file
+        metadata: Metadata to include with the submission
+        dataset_id: The dataset repository ID
+        token: HuggingFace API token
+        version: The version of the benchmark used (e.g., "v0", "v1")
     """
     try:
         # Process the submission file to validate
         entries, message = process_jsonl_submission(file_path)
         if not entries:
             return False, message
-
+
         # Generate a unique submission ID
         model_name = metadata.get("model_name", "unknown")
         model_name_safe = model_name.replace("/", "_").replace(" ", "_")
         timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
         submission_id = f"{model_name_safe}_{timestamp}"
-
+
         # Create an API instance
         api = HfApi(token=token)
-
+
         # Create a temporary file with metadata added
         with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as temp_file:
             # Add metadata to each entry
@@ -59,47 +66,58 @@ def submit_to_hub(file_path: str, metadata: Dict, dataset_id: str, token: str) -
                 # If the entry already has a model_name, don't override it
                 if "model_name" not in entry:
                     entry["model_name"] = metadata.get("model_name")
-
+
                 # Add other metadata if not present
                 for key, value in metadata.items():
                     if key != "model_name" and key not in entry:
                         entry[key] = value
-
+
+                # Ensure version is set
+                entry["version"] = version
+
                 # Write to temp file
                 temp_file.write(json.dumps(entry) + "\n")
-
+
             temp_path = temp_file.name
-
-        # Upload the file
-        submission_path = f"submissions/{submission_id}.jsonl"
+
+        # Upload the file to the version-specific directory
+        submission_path = f"submissions_{version}/{submission_id}_{version}.jsonl" if version != "v0" else f"submissions/{submission_id}.jsonl"
         api.upload_file(
             path_or_fileobj=temp_path,
             path_in_repo=submission_path,
             repo_id=dataset_id,
             repo_type="dataset",
-            commit_message=f"Add submission for {model_name}"
+            commit_message=f"Add submission for {model_name} (version {version})"
         )
-
+
         # Clean up the temporary file
         os.unlink(temp_path)
-
-        return True, f"Successfully uploaded submission for {model_name} to {dataset_id}"
+
+        return True, f"Successfully uploaded submission for {model_name} to {dataset_id} (version {version})"
     except Exception as e:
         return False, f"Error submitting to dataset: {e}"


-def process_submission(file_path: str, metadata: Dict) -> str:
+def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
     """
     Process a submission to the GuardBench leaderboard.
+
+    Args:
+        file_path: Path to the submission file
+        metadata: Metadata to include with the submission
+        version: The version of the benchmark used (e.g., "v0", "v1")
     """
     # Validate submission file
     is_valid, validation_message = validate_submission(file_path)
     if not is_valid:
         return styled_error(validation_message)
-
+
+    # Add version to metadata
+    metadata["version"] = version
+
     # Submit to HuggingFace dataset repository
-    success, message = submit_to_hub(file_path, metadata, RESULTS_DATASET_ID, TOKEN)
+    success, message = submit_to_hub(file_path, metadata, RESULTS_DATASET_ID, TOKEN, version=version)
     if not success:
         return styled_error(message)
-
+
     return styled_message(f"Submission successful! {message}")
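
To make the upload-path rule in submit_to_hub concrete, a small sketch that reuses the naming scheme shown in the diff; the model name in the example is made up:

from datetime import datetime


def submission_path_for(model_name: str, version: str = "v0") -> str:
    # Same scheme as submit_to_hub: sanitize the model name, add a timestamp,
    # then route non-v0 submissions into a version-specific directory.
    model_name_safe = model_name.replace("/", "_").replace(" ", "_")
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    submission_id = f"{model_name_safe}_{timestamp}"
    if version != "v0":
        return f"submissions_{version}/{submission_id}_{version}.jsonl"
    return f"submissions/{submission_id}.jsonl"


# e.g. submissions/acme_guard-7b_20250101_120000.jsonl       for v0
#      submissions_v1/acme_guard-7b_20250101_120000_v1.jsonl for v1
print(submission_path_for("acme/guard-7b", "v0"))
print(submission_path_for("acme/guard-7b", "v1"))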