categories fix
- src/leaderboard/processor.py +8 -6
- src/populate.py +2 -2
- src/submission/submit.py +5 -5
src/leaderboard/processor.py
CHANGED
@@ -29,10 +29,10 @@ def calculate_integral_score(row: pd.Series) -> float:
     for test_type in TEST_TYPES:
         metric_col = f"{test_type}_accuracy"
         if metric_col in row and pd.notna(row[metric_col]):
-            print(f"Found accuracy metric for {test_type}: {row[metric_col]}")
+            # print(f"Found accuracy metric for {test_type}: {row[metric_col]}")
             integral_score *= row[metric_col]
             metric_count += 1
-    print(f"Metric count: {metric_count}")
+    # print(f"Metric count: {metric_count}")

     # If no accuracy metrics were found at all, the score remains 1.0 before penalties.
     # The library returns 0.0 in this case (`return integral_score if count > 0 else 0.0`)
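Note: for orientation, the block below is a minimal, self-contained sketch of the multiplicative score this hunk touches. It is an illustration, not the repository's exact function: the TEST_TYPES values and the plain-dict signature are invented here, and the zero-metric fallback follows the inline comment in the hunk.

# Illustration only: standalone version of the multiplicative accuracy score.
from typing import Dict, Optional

TEST_TYPES = ["prompt", "response"]  # hypothetical test types, not the real list

def integral_score_sketch(row: Dict[str, Optional[float]]) -> float:
    """Multiply together every available <test_type>_accuracy value."""
    integral_score = 1.0
    metric_count = 0
    for test_type in TEST_TYPES:
        value = row.get(f"{test_type}_accuracy")
        if value is not None:
            integral_score *= value
            metric_count += 1
    # Per the comment in the hunk: if no accuracy metric was found,
    # return 0.0 instead of the untouched 1.0.
    return integral_score if metric_count > 0 else 0.0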
@@ -181,7 +181,9 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:

         # CASE 1: Metrics are flat in the root
         for key, value in entry.items():
-            if any(test_type in key for test_type in TEST_TYPES) or
+            if any(test_type in key for test_type in TEST_TYPES) or \
+               key in ["average_f1", "average_recall", "average_precision",
+                       "macro_accuracy", "macro_recall", "total_evals_count"]:
                 row[key] = value

         # CASE 2: Metrics are in avg_metrics structure
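Note: to make the widened CASE 1 condition concrete, the snippet below runs it against an invented entry. The "jailbreak" test type and the field values are placeholders; the whitelist of keys is the one added in this hunk.

# Which keys the fixed condition copies (illustrative data only).
TEST_TYPES = ["jailbreak"]  # hypothetical test type

entry = {
    "model_name": "example-model",  # handled elsewhere, not by CASE 1
    "jailbreak_accuracy": 0.91,     # copied: key contains a test type
    "macro_accuracy": 0.88,         # copied: whitelisted key
    "average_f1": 0.85,             # copied: whitelisted key
    "notes": "free text",           # skipped
}

row = {}
for key, value in entry.items():
    if any(test_type in key for test_type in TEST_TYPES) or \
       key in ["average_f1", "average_recall", "average_precision",
               "macro_accuracy", "macro_recall", "total_evals_count"]:
        row[key] = value

# row == {"jailbreak_accuracy": 0.91, "macro_accuracy": 0.88, "average_f1": 0.85}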
@@ -285,9 +287,9 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
     for col in old_avg_cols:
         if col in df.columns:
             df = df.drop(columns=[col])
-    print("--- DataFrame before returning from leaderboard_to_dataframe ---")
-    print(df[['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']].head())
-    print("-------------------------------------------------------------")
+    # print("--- DataFrame before returning from leaderboard_to_dataframe ---")
+    # print(df[['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']].head())
+    # print("-------------------------------------------------------------")
     return df

src/populate.py
CHANGED
@@ -185,8 +185,8 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
            if "sample_count" in test_metrics and pd.notna(test_metrics["sample_count"]):
                total_samples += test_metrics["sample_count"]

-    print(f"F1 values: {f1_values}")
-    print(f1_values, recall_values, precision_values, accuracy_values, total_samples)
+    # print(f"F1 values: {f1_values}")
+    # print(f1_values, recall_values, precision_values, accuracy_values, total_samples)


     # Add overall averages
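Note: the lists referenced by the commented-out prints feed the "Add overall averages" step that follows this hunk. The sketch below shows one plausible aggregation; the list and key names come from this diff and from the CASE 1 whitelist in processor.py, but the exact mapping (plain means, sample count passed through) is an assumption.

# Plausible aggregation of the accumulated per-test metrics (assumption,
# not the repository's exact code).
f1_values = [0.81, 0.77]
recall_values = [0.84, 0.80]
precision_values = [0.79, 0.75]
accuracy_values = [0.88, 0.86]
total_samples = 1000

def mean(values):
    return sum(values) / len(values) if values else None

overall = {
    "average_f1": mean(f1_values),
    "average_recall": mean(recall_values),
    "average_precision": mean(precision_values),
    "macro_accuracy": mean(accuracy_values),  # assumed to be a plain mean
    "total_evals_count": total_samples,       # assumed pass-through of the count
}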
src/submission/submit.py
CHANGED
@@ -132,14 +132,14 @@ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
     )
     os.makedirs(results_dir + "/guardbench_dataset_1k_public", exist_ok=True)

-
-    print(f"Target file: {target_file}")
-
+    # (f"Submission path: {submission_path}")
+    # print(f"Target file: {target_file}")
+    # printprint(f"Results dir: {results_dir}")


     shutil.copy2(file_path, target_file)
-    print(f"Copied file to target file: {target_file}")
-    print(f" ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/: {subprocess.check_output('ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/', shell=True).decode('utf-8')}")
+    # print(f"Copied file to target file: {target_file}")
+    # print(f" ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/: {subprocess.check_output('ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/', shell=True).decode('utf-8')}")

     try:
         # Initialize GuardBench context
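Note: this hunk surrounds the step that stages the uploaded results file into the public-dataset folder before the GuardBench context is initialized. The sketch below is a compact, hypothetical version of that staging step; the target filename scheme is not visible in the diff, so deriving it from the source file's basename is an assumption.

# Hypothetical staging step (filename scheme assumed).
import os
import shutil

def stage_submission(file_path: str, results_dir: str) -> str:
    dataset_dir = os.path.join(results_dir, "guardbench_dataset_1k_public")
    os.makedirs(dataset_dir, exist_ok=True)  # mirrors the makedirs call in the hunk
    target_file = os.path.join(dataset_dir, os.path.basename(file_path))  # assumed naming
    shutil.copy2(file_path, target_file)     # copy, preserving metadata
    return target_file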