apsys committed
Commit 3f01f81 · 1 Parent(s): 5113e4c

categories fix

src/leaderboard/processor.py CHANGED
@@ -29,10 +29,10 @@ def calculate_integral_score(row: pd.Series) -> float:
     for test_type in TEST_TYPES:
         metric_col = f"{test_type}_accuracy"
         if metric_col in row and pd.notna(row[metric_col]):
-            print(f"Found accuracy metric for {test_type}: {row[metric_col]}")
+            # print(f"Found accuracy metric for {test_type}: {row[metric_col]}")
             integral_score *= row[metric_col]
             metric_count += 1
-    print(f"Metric count: {metric_count}")
+    # print(f"Metric count: {metric_count}")
 
     # If no accuracy metrics were found at all, the score remains 1.0 before penalties.
     # The library returns 0.0 in this case (`return integral_score if count > 0 else 0.0`)
@@ -181,7 +181,9 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
 
         # CASE 1: Metrics are flat in the root
         for key, value in entry.items():
-            if any(test_type in key for test_type in TEST_TYPES) or key in ["average_f1", "average_recall", "average_precision"]:
+            if any(test_type in key for test_type in TEST_TYPES) or \
+               key in ["average_f1", "average_recall", "average_precision",
+                       "macro_accuracy", "macro_recall", "total_evals_count"]:
                 row[key] = value
 
         # CASE 2: Metrics are in avg_metrics structure
@@ -285,9 +287,9 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
     for col in old_avg_cols:
         if col in df.columns:
             df = df.drop(columns=[col])
-    print("--- DataFrame before returning from leaderboard_to_dataframe ---")
-    print(df[['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']].head())
-    print("-------------------------------------------------------------")
+    # print("--- DataFrame before returning from leaderboard_to_dataframe ---")
+    # print(df[['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']].head())
+    # print("-------------------------------------------------------------")
     return df
 
 
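For context, a minimal sketch of the scoring path touched by the first hunk. The TEST_TYPES values here are hypothetical, and the return statement only mirrors the behaviour quoted in the inline comments; it is not the library's exact implementation.

import pandas as pd

TEST_TYPES = ["prompt", "response"]  # hypothetical values, for illustration only

def calculate_integral_score(row: pd.Series) -> float:
    """Multiply the per-test accuracies into one score (debug prints removed)."""
    integral_score = 1.0
    metric_count = 0
    for test_type in TEST_TYPES:
        metric_col = f"{test_type}_accuracy"
        if metric_col in row and pd.notna(row[metric_col]):
            integral_score *= row[metric_col]
            metric_count += 1
    # Mirror the comment quoted above: return 0.0 when no accuracy metric was found.
    return integral_score if metric_count > 0 else 0.0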
src/populate.py CHANGED
@@ -185,8 +185,8 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
             if "sample_count" in test_metrics and pd.notna(test_metrics["sample_count"]):
                 total_samples += test_metrics["sample_count"]
 
-    print(f"F1 values: {f1_values}")
-    print(f1_values, recall_values, precision_values, accuracy_values, total_samples)
+    # print(f"F1 values: {f1_values}")
+    # print(f1_values, recall_values, precision_values, accuracy_values, total_samples)
 
 
     # Add overall averages
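The commented-out prints sit right before the "# Add overall averages" step. A minimal sketch of that aggregation, assuming the collected lists hold per-test metric values and reusing the column names seen in processor.py; the real get_category_leaderboard_df may compute these differently.

from typing import Dict, List

def add_overall_averages(row: Dict, f1_values: List[float], recall_values: List[float],
                         precision_values: List[float], accuracy_values: List[float],
                         total_samples: int) -> Dict:
    # Hypothetical helper: guard against empty lists so a model with no evals does not crash.
    def _avg(values: List[float]) -> float:
        return sum(values) / len(values) if values else 0.0

    row["average_f1"] = _avg(f1_values)
    row["average_recall"] = _avg(recall_values)
    row["average_precision"] = _avg(precision_values)
    row["macro_accuracy"] = _avg(accuracy_values)  # assumption: macro = mean over tests
    row["total_evals_count"] = total_samples
    return row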
src/submission/submit.py CHANGED
@@ -132,14 +132,14 @@ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
     )
     os.makedirs(results_dir + "/guardbench_dataset_1k_public", exist_ok=True)
 
-    print(f"Submission path: {submission_path}")
-    print(f"Target file: {target_file}")
-    print(f"Results dir: {results_dir}")
+    # print(f"Submission path: {submission_path}")
+    # print(f"Target file: {target_file}")
+    # print(f"Results dir: {results_dir}")
 
 
     shutil.copy2(file_path, target_file)
-    print(f"Copied file to target file: {target_file}")
-    print(f" ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/: {subprocess.check_output('ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/', shell=True).decode('utf-8')}")
+    # print(f"Copied file to target file: {target_file}")
+    # print(f" ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/: {subprocess.check_output('ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/', shell=True).decode('utf-8')}")
 
     try:
         # Initialize GuardBench context
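With the shell-based `ls` debug call commented out, what remains of this hunk is the file-staging step. A minimal sketch of that step, assuming the target file is named after the upload's basename (the actual naming in process_submission may differ).

import os
import shutil

def stage_submission(file_path: str, results_dir: str) -> str:
    # Same directory layout as in the diff: results_dir/guardbench_dataset_1k_public
    dataset_dir = os.path.join(results_dir, "guardbench_dataset_1k_public")
    os.makedirs(dataset_dir, exist_ok=True)
    # copy2 preserves file metadata (timestamps) along with the contents.
    target_file = os.path.join(dataset_dir, os.path.basename(file_path))
    shutil.copy2(file_path, target_file)
    return target_file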