rounding

src/populate.py  (+31 −6)
@@ -166,15 +166,24 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
             f1_values = []
             recall_values = []
             precision_values = []
+            accuracy_values = []
+            category_recall_values = []
+            total_samples = 0

             for test_type in ["default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers"]:
                 if test_type in category_metrics and isinstance(category_metrics[test_type], dict):
-
-
-
-
-
-
+                    test_metrics = category_metrics[test_type]
+                    if "f1_binary" in test_metrics and pd.notna(test_metrics["f1_binary"]):
+                        f1_values.append(test_metrics["f1_binary"])
+                    if "recall_binary" in test_metrics and pd.notna(test_metrics["recall_binary"]):
+                        recall_values.append(test_metrics["recall_binary"])
+                        category_recall_values.append(test_metrics["recall_binary"])
+                    if "precision_binary" in test_metrics and pd.notna(test_metrics["precision_binary"]):
+                        precision_values.append(test_metrics["precision_binary"])
+                    if "accuracy" in test_metrics and pd.notna(test_metrics["accuracy"]):
+                        accuracy_values.append(test_metrics["accuracy"])
+                    if "sample_count" in test_metrics and pd.notna(test_metrics["sample_count"]):
+                        total_samples += test_metrics["sample_count"]

             # Add overall averages
             if f1_values:
@@ -184,6 +193,22 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
             if precision_values:
                 filtered_entry["average_precision"] = sum(precision_values) / len(precision_values)

+            # Add category-specific values to standard macro metric keys
+            if accuracy_values:
+                filtered_entry["macro_accuracy"] = sum(accuracy_values) / len(accuracy_values)
+            else:
+                filtered_entry["macro_accuracy"] = pd.NA
+
+            if category_recall_values:
+                filtered_entry["macro_recall"] = sum(category_recall_values) / len(category_recall_values)
+            else:
+                filtered_entry["macro_recall"] = pd.NA
+
+            if total_samples > 0:
+                filtered_entry["total_evals_count"] = total_samples
+            else:
+                filtered_entry["total_evals_count"] = pd.NA
+
             filtered_entries.append(filtered_entry)

             # Create a new leaderboard data structure with the filtered entries
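To see the added aggregation in isolation, here is a minimal sketch of what the new lines compute, assuming category_metrics has the per-test-type shape the loop expects. The helper name and the example numbers are hypothetical and not part of the commit; the metric keys ("accuracy", "recall_binary", "sample_count") and the resulting entry keys (macro_accuracy, macro_recall, total_evals_count) come straight from the diff.

import pandas as pd

def compute_category_macros(category_metrics: dict) -> dict:
    # Mirrors the added lines: collect per-test-type accuracy, binary recall,
    # and sample counts, then expose unweighted means (pd.NA when nothing was reported).
    accuracy_values = []
    category_recall_values = []
    total_samples = 0

    for test_type in ["default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers"]:
        test_metrics = category_metrics.get(test_type)
        if not isinstance(test_metrics, dict):
            continue
        if "accuracy" in test_metrics and pd.notna(test_metrics["accuracy"]):
            accuracy_values.append(test_metrics["accuracy"])
        if "recall_binary" in test_metrics and pd.notna(test_metrics["recall_binary"]):
            category_recall_values.append(test_metrics["recall_binary"])
        if "sample_count" in test_metrics and pd.notna(test_metrics["sample_count"]):
            total_samples += test_metrics["sample_count"]

    return {
        "macro_accuracy": sum(accuracy_values) / len(accuracy_values) if accuracy_values else pd.NA,
        "macro_recall": sum(category_recall_values) / len(category_recall_values) if category_recall_values else pd.NA,
        "total_evals_count": total_samples if total_samples > 0 else pd.NA,
    }

# Hypothetical input / output:
# compute_category_macros({
#     "default_prompts":     {"accuracy": 0.75, "recall_binary": 1.0, "sample_count": 100},
#     "jailbreaked_prompts": {"accuracy": 0.50, "recall_binary": 0.5, "sample_count": 150},
# })
# -> {"macro_accuracy": 0.625, "macro_recall": 0.75, "total_evals_count": 250}

Note that the averages are unweighted across test types (a test type with 10 samples counts the same as one with 1,000), and a missing metric simply drops out of the denominator instead of counting as zero.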