apsys committed
Commit b4647b5 · 1 Parent(s): 2de7cd0
Files changed (1)
  1. src/populate.py +31 -6
src/populate.py CHANGED
@@ -166,15 +166,24 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
         f1_values = []
         recall_values = []
         precision_values = []
+        accuracy_values = []
+        category_recall_values = []
+        total_samples = 0
 
         for test_type in ["default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers"]:
             if test_type in category_metrics and isinstance(category_metrics[test_type], dict):
-                if "f1_binary" in category_metrics[test_type]:
-                    f1_values.append(category_metrics[test_type]["f1_binary"])
-                if "recall_binary" in category_metrics[test_type]:
-                    recall_values.append(category_metrics[test_type]["recall_binary"])
-                if "precision_binary" in category_metrics[test_type]:
-                    precision_values.append(category_metrics[test_type]["precision_binary"])
+                test_metrics = category_metrics[test_type]
+                if "f1_binary" in test_metrics and pd.notna(test_metrics["f1_binary"]):
+                    f1_values.append(test_metrics["f1_binary"])
+                if "recall_binary" in test_metrics and pd.notna(test_metrics["recall_binary"]):
+                    recall_values.append(test_metrics["recall_binary"])
+                    category_recall_values.append(test_metrics["recall_binary"])
+                if "precision_binary" in test_metrics and pd.notna(test_metrics["precision_binary"]):
+                    precision_values.append(test_metrics["precision_binary"])
+                if "accuracy" in test_metrics and pd.notna(test_metrics["accuracy"]):
+                    accuracy_values.append(test_metrics["accuracy"])
+                if "sample_count" in test_metrics and pd.notna(test_metrics["sample_count"]):
+                    total_samples += test_metrics["sample_count"]
 
         # Add overall averages
         if f1_values:
@@ -184,6 +193,22 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
         if precision_values:
             filtered_entry["average_precision"] = sum(precision_values) / len(precision_values)
 
+        # Add category-specific values to standard macro metric keys
+        if accuracy_values:
+            filtered_entry["macro_accuracy"] = sum(accuracy_values) / len(accuracy_values)
+        else:
+            filtered_entry["macro_accuracy"] = pd.NA
+
+        if category_recall_values:
+            filtered_entry["macro_recall"] = sum(category_recall_values) / len(category_recall_values)
+        else:
+            filtered_entry["macro_recall"] = pd.NA
+
+        if total_samples > 0:
+            filtered_entry["total_evals_count"] = total_samples
+        else:
+            filtered_entry["total_evals_count"] = pd.NA
+
         filtered_entries.append(filtered_entry)
 
     # Create a new leaderboard data structure with the filtered entries
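
For context, a minimal runnable sketch (not part of the commit) of what the new aggregation does for one category entry. The category_metrics sample values and the bare filtered_entry dict are illustrative assumptions; only the pd.notna() filtering, the averaging, and the pd.NA fallbacks mirror the diff above.

# Sketch only: hypothetical input, same collection/fallback logic as the commit.
import pandas as pd

category_metrics = {
    "default_prompts":     {"f1_binary": 0.82, "recall_binary": 0.80, "precision_binary": 0.84,
                            "accuracy": 0.81, "sample_count": 200},
    "jailbreaked_prompts": {"f1_binary": 0.74, "recall_binary": 0.70, "precision_binary": 0.78,
                            "accuracy": pd.NA, "sample_count": 150},  # NA accuracy is skipped by pd.notna
    "default_answers":     {},  # empty dict contributes nothing
    # "jailbreaked_answers" absent entirely -> also contributes nothing
}

accuracy_values, category_recall_values = [], []
total_samples = 0

for test_type in ["default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers"]:
    if test_type in category_metrics and isinstance(category_metrics[test_type], dict):
        test_metrics = category_metrics[test_type]
        if "recall_binary" in test_metrics and pd.notna(test_metrics["recall_binary"]):
            category_recall_values.append(test_metrics["recall_binary"])
        if "accuracy" in test_metrics and pd.notna(test_metrics["accuracy"]):
            accuracy_values.append(test_metrics["accuracy"])
        if "sample_count" in test_metrics and pd.notna(test_metrics["sample_count"]):
            total_samples += test_metrics["sample_count"]

filtered_entry = {}
# Each macro key falls back to pd.NA when no test type reported the metric.
filtered_entry["macro_accuracy"] = sum(accuracy_values) / len(accuracy_values) if accuracy_values else pd.NA
filtered_entry["macro_recall"] = sum(category_recall_values) / len(category_recall_values) if category_recall_values else pd.NA
filtered_entry["total_evals_count"] = total_samples if total_samples > 0 else pd.NA

print(filtered_entry)
# {'macro_accuracy': 0.81, 'macro_recall': 0.75, 'total_evals_count': 350}

The practical effect of the change is that missing or NA metrics no longer drag averages down or raise KeyErrors: they are simply excluded, and a category with no usable values surfaces as pd.NA in the leaderboard instead of 0.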