"""
Handle submissions to the GuardBench leaderboard.
"""
import json
import os
import shutil
import tempfile
import threading
import time
from datetime import datetime
from typing import Dict, List, Tuple

from huggingface_hub import HfApi

from circleguardbench.context import GuardbenchContext
from circleguardbench.evaluator import Evaluator

from src.display.formatting import styled_error, styled_message
from src.envs import REPO_ID, RESULTS_DATASET_ID, TOKEN
from src.leaderboard.processor import process_jsonl_submission


def validate_submission(file_path: str) -> Tuple[bool, str]:
"""
Validate a submission file.
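
    Expected usage (hypothetical path):

        is_valid, message = validate_submission("/tmp/upload.jsonl")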
"""
try:
entries, message = process_jsonl_submission(file_path)
if not entries:
return False, message
return True, "Submission is valid"
except Exception as e:
return False, f"Error validating submission: {e}"


def submit_entry_to_hub(entry: Dict, model_name: str, mode: str, version="v0") -> Tuple[bool, str]:
"""
Submit a model's evaluation entry to the HuggingFace dataset. The entry is uniquely identified by model_name, mode, and version.
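
    Example (hypothetical model and mode):

        ok, msg = submit_entry_to_hub(entry, "org/guard-model", "classification")
        # uploads entries/entry_org_guard-model_classification_v0.json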
"""
try:
# Create safe model name for file path
model_name_safe = model_name.replace("/", "_").replace(" ", "_")
mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
# Create entry path in entries folder
entry_path = f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json"
        # Write the entry to a temporary file, upload it, then clean up
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
            json.dump(entry, temp_file, indent=2)
            temp_path = temp_file.name
        try:
            # Upload file
            api = HfApi(token=TOKEN)
            api.upload_file(
                path_or_fileobj=temp_path,
                path_in_repo=entry_path,
                repo_id=RESULTS_DATASET_ID,
                repo_type="dataset",
                commit_message=f"Add evaluation entry for {model_name} (mode {mode}, version {version})"
            )
        finally:
            # Remove the temporary file even if the upload fails
            os.unlink(temp_path)
return True, f"Successfully uploaded evaluation entry for {model_name} (mode {mode})"
except Exception as e:
return False, f"Error submitting entry to dataset: {e}"


def submit_leaderboard_to_hub(entries: List[Dict], version="v0") -> Tuple[bool, str]:
"""
Submit updated leaderboard to the HuggingFace dataset.
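
    The uploaded JSON has the shape
    {"entries": [...], "last_updated": <ISO timestamp>, "version": version}.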
"""
try:
# Create leaderboard data
leaderboard_data = {
"entries": entries,
"last_updated": datetime.now().isoformat(),
"version": version
}
        # Write the leaderboard to a temporary file, upload it, then clean up
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
            json.dump(leaderboard_data, temp_file, indent=2)
            temp_path = temp_file.name
        try:
            # Upload file
            api = HfApi(token=TOKEN)
            api.upload_file(
                path_or_fileobj=temp_path,
                path_in_repo=f"leaderboards/leaderboard_{version}.json",
                repo_id=RESULTS_DATASET_ID,
                repo_type="dataset",
                commit_message=f"Update leaderboard for version {version}"
            )
        finally:
            # Remove the temporary file even if the upload fails
            os.unlink(temp_path)
return True, "Leaderboard updated successfully"
except Exception as e:
return False, f"Error updating leaderboard: {e}"


def restart_space_after_delay(delay_seconds: int = 2) -> None:
"""
Restart the Hugging Face Space after a delay.
"""
def _restart_space():
time.sleep(delay_seconds)
try:
api = HfApi(token=TOKEN)
api.restart_space(repo_id=REPO_ID)
except Exception as e:
print(f"Error restarting space: {e}")
# Start the restart in a separate thread
thread = threading.Thread(target=_restart_space)
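    # Daemon thread: a still-pending restart never blocks interpreter shutdown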
thread.daemon = True
thread.start()


def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
    """
    Process a submission to the GuardBench leaderboard: validate the file,
    archive the raw upload, run the evaluator, publish the entry, and rebuild
    the leaderboard from all stored entries.
    """
    target_file = None  # set once the results path is known; checked during cleanup
    try:
# Validate submission
is_valid, validation_message = validate_submission(file_path)
if not is_valid:
return styled_error(validation_message)
# Get GuardBench results directory path
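        # Three dirname() hops climb from this module to the repository root
        # (this file is assumed to sit two directories below it, e.g. src/submission/),
        # where the guard-bench-submodule checkout lives.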
guardbench_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "guard-bench-submodule")
results_dir = os.path.join(guardbench_dir, "results")
os.makedirs(results_dir, exist_ok=True)
        # Copy the submission into the GuardBench results directory
        model_name = metadata.get("model_name", "unknown")
        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
        guard_model_type = metadata.get("guard_model_type", "unknown")
        bench_dir = os.path.join(results_dir, "circleguardbench_public")
        target_file = os.path.join(bench_dir, f"{model_name_safe}.jsonl")

        # Archive the raw submission file in the results dataset
        api = HfApi(token=TOKEN)
        submission_path = f"submissions_{version}/{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=submission_path,
            repo_id=RESULTS_DATASET_ID,
            repo_type="dataset",
            commit_message=f"Add raw submission for {model_name}"
        )

        # Place the raw outputs where the evaluator expects to find them
        os.makedirs(bench_dir, exist_ok=True)
        shutil.copy2(file_path, target_file)
try:
# Initialize GuardBench context
ctx = GuardbenchContext()
# Set results directory
ctx.results_dir = results_dir
# Set bench name from the results directory
ctx.bench_name = "circleguardbench_public"
# Load dataset
ctx.load_dataset("whitecircle-ai/circleguardbench_public")
# Mark as initialized
ctx.is_initialized = True
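            # force=True overwrites any stale entry for this model; using_cached=True
            # is assumed to make the evaluator score the pre-computed outputs copied
            # above rather than querying the model itself.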
evaluator = Evaluator(ctx, force=True, using_cached=True)
# Run evaluation and get entry
evaluator.evaluate_model(model_name_safe, str(guard_model_type).lower())
# Get the entry from results
            with open(os.path.join(results_dir, ctx.bench_name, "leaderboard.json"), 'r') as f:
results_data = json.load(f)
model_entry = next(
(entry for entry in results_data.get("entries", [])
if entry.get("model_name") == model_name_safe),
None
)
            if not model_entry:
                return styled_error(f"No evaluation results found for {model_name}")
# Add metadata to entry
model_entry.update({
"model_name": metadata.get("model_name"), # Use original model name
"model_type": metadata.get("model_type"),
"guard_model_type": str(metadata.get("guard_model_type")).lower(),
"mode": metadata.get("mode"),
"base_model": metadata.get("base_model"),
"revision": metadata.get("revision"),
"precision": metadata.get("precision"),
"weight_type": metadata.get("weight_type"),
"version": version,
"submission_date": datetime.now().isoformat()
})
# Submit entry to entries folder
success, message = submit_entry_to_hub(model_entry, model_name, metadata.get("mode"), version)
if not success:
return styled_error(message)
# Get all entries from HF dataset
api = HfApi(token=TOKEN)
files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
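            # Rebuild the leaderboard from every stored entry file so earlier
            # submissions survive each update instead of being overwritten.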
all_entries = []
for entry_file in entry_files:
try:
entry_path = api.hf_hub_download(
repo_id=RESULTS_DATASET_ID,
filename=entry_file,
repo_type="dataset",
)
with open(entry_path, 'r') as f:
entry_data = json.load(f)
all_entries.append(entry_data)
except Exception as e:
print(f"Error loading entry {entry_file}: {e}")
# Update leaderboard with all entries
success, message = submit_leaderboard_to_hub(all_entries, version)
if not success:
return styled_error(message)
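            # Restart the Space shortly after returning so the UI reloads the
            # refreshed leaderboard data.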
restart_space_after_delay(5)
return styled_message("Submission successful! Model evaluated and leaderboard updated.")
except Exception as eval_error:
return styled_error(f"Error during evaluation: {eval_error}")
except Exception as e:
return styled_error(f"Error processing submission: {e}")
    finally:
        # Clean up temporary files; target_file stays None if we failed early
        try:
            if os.path.exists(file_path):
                os.remove(file_path)
            if target_file and os.path.exists(target_file):
                os.remove(target_file)
        except OSError:
            pass