categories fix
- src/leaderboard/processor.py +8 -6
- src/populate.py +2 -2
- src/submission/submit.py +5 -5
src/leaderboard/processor.py
CHANGED
@@ -29,10 +29,10 @@ def calculate_integral_score(row: pd.Series) -> float:
     for test_type in TEST_TYPES:
         metric_col = f"{test_type}_accuracy"
         if metric_col in row and pd.notna(row[metric_col]):
-            print(f"Found accuracy metric for {test_type}: {row[metric_col]}")
+            # print(f"Found accuracy metric for {test_type}: {row[metric_col]}")
             integral_score *= row[metric_col]
             metric_count += 1
-    print(f"Metric count: {metric_count}")
+    # print(f"Metric count: {metric_count}")

     # If no accuracy metrics were found at all, the score remains 1.0 before penalties.
     # The library returns 0.0 in this case (`return integral_score if count > 0 else 0.0`)
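Note: for orientation, the block below is a minimal, self-contained sketch of the multiplicative score this hunk touches. It is an illustration, not the repository's exact function: the TEST_TYPES values and the plain-dict signature are invented here, and the zero-metric fallback follows the inline comment in the hunk.

# Illustration only: standalone version of the multiplicative accuracy score.
from typing import Dict, Optional

TEST_TYPES = ["prompt", "response"]  # hypothetical test types, not the real list

def integral_score_sketch(row: Dict[str, Optional[float]]) -> float:
    """Multiply together every available <test_type>_accuracy value."""
    integral_score = 1.0
    metric_count = 0
    for test_type in TEST_TYPES:
        value = row.get(f"{test_type}_accuracy")
        if value is not None:
            integral_score *= value
            metric_count += 1
    # Per the comment in the hunk: if no accuracy metric was found,
    # return 0.0 instead of the untouched 1.0.
    return integral_score if metric_count > 0 else 0.0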
@@ -181,7 +181,9 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:

         # CASE 1: Metrics are flat in the root
         for key, value in entry.items():
-            if any(test_type in key for test_type in TEST_TYPES) or
+            if any(test_type in key for test_type in TEST_TYPES) or \
+               key in ["average_f1", "average_recall", "average_precision",
+                       "macro_accuracy", "macro_recall", "total_evals_count"]:
                 row[key] = value

         # CASE 2: Metrics are in avg_metrics structure
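Note: to make the widened CASE 1 condition concrete, the snippet below runs it against an invented entry. The "jailbreak" test type and the field values are placeholders; the whitelist of keys is the one added in this hunk.

# Which keys the fixed condition copies (illustrative data only).
TEST_TYPES = ["jailbreak"]  # hypothetical test type

entry = {
    "model_name": "example-model",  # handled elsewhere, not by CASE 1
    "jailbreak_accuracy": 0.91,     # copied: key contains a test type
    "macro_accuracy": 0.88,         # copied: whitelisted key
    "average_f1": 0.85,             # copied: whitelisted key
    "notes": "free text",           # skipped
}

row = {}
for key, value in entry.items():
    if any(test_type in key for test_type in TEST_TYPES) or \
       key in ["average_f1", "average_recall", "average_precision",
               "macro_accuracy", "macro_recall", "total_evals_count"]:
        row[key] = value

# row == {"jailbreak_accuracy": 0.91, "macro_accuracy": 0.88, "average_f1": 0.85}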
@@ -285,9 +287,9 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
     for col in old_avg_cols:
         if col in df.columns:
             df = df.drop(columns=[col])
-    print("--- DataFrame before returning from leaderboard_to_dataframe ---")
-    print(df[['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']].head())
-    print("-------------------------------------------------------------")
+    # print("--- DataFrame before returning from leaderboard_to_dataframe ---")
+    # print(df[['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']].head())
+    # print("-------------------------------------------------------------")
     return df

src/populate.py
CHANGED
@@ -185,8 +185,8 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
            if "sample_count" in test_metrics and pd.notna(test_metrics["sample_count"]):
                total_samples += test_metrics["sample_count"]

-    print(f"F1 values: {f1_values}")
-    print(f1_values, recall_values, precision_values, accuracy_values, total_samples)
+    # print(f"F1 values: {f1_values}")
+    # print(f1_values, recall_values, precision_values, accuracy_values, total_samples)


     # Add overall averages
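Note: the lists referenced by the commented-out prints feed the "Add overall averages" step that follows this hunk. The sketch below shows one plausible aggregation; the list and key names come from this diff and from the CASE 1 whitelist in processor.py, but the exact mapping (plain means, sample count passed through) is an assumption.

# Plausible aggregation of the accumulated per-test metrics (assumption,
# not the repository's exact code).
f1_values = [0.81, 0.77]
recall_values = [0.84, 0.80]
precision_values = [0.79, 0.75]
accuracy_values = [0.88, 0.86]
total_samples = 1000

def mean(values):
    return sum(values) / len(values) if values else None

overall = {
    "average_f1": mean(f1_values),
    "average_recall": mean(recall_values),
    "average_precision": mean(precision_values),
    "macro_accuracy": mean(accuracy_values),  # assumed to be a plain mean
    "total_evals_count": total_samples,       # assumed pass-through of the count
}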
src/submission/submit.py
CHANGED
@@ -132,14 +132,14 @@ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
     )
     os.makedirs(results_dir + "/guardbench_dataset_1k_public", exist_ok=True)

-
-    print(f"Target file: {target_file}")
-
+    # (f"Submission path: {submission_path}")
+    # print(f"Target file: {target_file}")
+    # printprint(f"Results dir: {results_dir}")


     shutil.copy2(file_path, target_file)
-    print(f"Copied file to target file: {target_file}")
-    print(f" ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/: {subprocess.check_output('ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/', shell=True).decode('utf-8')}")
+    # print(f"Copied file to target file: {target_file}")
+    # print(f" ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/: {subprocess.check_output('ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/', shell=True).decode('utf-8')}")

     try:
         # Initialize GuardBench context
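Note: this hunk surrounds the step that stages the uploaded results file into the public-dataset folder before the GuardBench context is initialized. The sketch below is a compact, hypothetical version of that staging step; the target filename scheme is not visible in the diff, so deriving it from the source file's basename is an assumption.

# Hypothetical staging step (filename scheme assumed).
import os
import shutil

def stage_submission(file_path: str, results_dir: str) -> str:
    dataset_dir = os.path.join(results_dir, "guardbench_dataset_1k_public")
    os.makedirs(dataset_dir, exist_ok=True)  # mirrors the makedirs call in the hunk
    target_file = os.path.join(dataset_dir, os.path.basename(file_path))  # assumed naming
    shutil.copy2(file_path, target_file)     # copy, preserving metadata
    return target_file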