"""
Handle submissions to the GuardBench leaderboard.
"""

import json
import os
import shutil
import subprocess
import tempfile
import threading
import time
from datetime import datetime
from typing import Dict, List, Tuple

from huggingface_hub import HfApi
from datasets import load_dataset

from src.display.formatting import styled_error, styled_message
from src.envs import RESULTS_DATASET_ID, TOKEN, REPO_ID
from src.leaderboard.processor import process_jsonl_submission
from circleguardbench.evaluator import Evaluator
from circleguardbench.context import GuardbenchContext
from circleguardbench.models_config import ModelType
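
# Note: the `circleguardbench` package is assumed to be provided by the bundled
# guard-bench-submodule checkout used in process_submission below; that link is
# inferred from the paths in this module, not declared anywhere in this file.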


def validate_submission(file_path: str) -> Tuple[bool, str]:
    """
    Validate a submission file.
    """
    try:
        entries, message = process_jsonl_submission(file_path)
        if not entries:
            return False, message
        return True, "Submission is valid"
    except Exception as e:
        return False, f"Error validating submission: {e}"


def submit_entry_to_hub(entry: Dict, model_name: str, mode: str, version="v0") -> Tuple[bool, str]:
    """
    Submit a model's evaluation entry to the HuggingFace dataset.
    The entry is uniquely identified by model_name, mode, and version.
    """
    try:
        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
        mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()

        entry_path = f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json"

        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
            json.dump(entry, temp_file, indent=2)
            temp_path = temp_file.name

        api = HfApi(token=TOKEN)
        api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=entry_path,
            repo_id=RESULTS_DATASET_ID,
            repo_type="dataset",
            commit_message=f"Add evaluation entry for {model_name} (mode {mode}, version {version})"
        )

        os.unlink(temp_path)
        return True, f"Successfully uploaded evaluation entry for {model_name} (mode {mode})"
    except Exception as e:
        return False, f"Error submitting entry to dataset: {e}"


def submit_leaderboard_to_hub(entries: List[Dict], version="v0") -> Tuple[bool, str]:
    """
    Submit updated leaderboard to the HuggingFace dataset.
    """
    try:
        leaderboard_data = {
            "entries": entries,
            "last_updated": datetime.now().isoformat(),
            "version": version
        }

        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
            json.dump(leaderboard_data, temp_file, indent=2)
            temp_path = temp_file.name

        api = HfApi(token=TOKEN)
        api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=f"leaderboards/leaderboard_{version}.json",
            repo_id=RESULTS_DATASET_ID,
            repo_type="dataset",
            commit_message=f"Update leaderboard for version {version}"
        )

        os.unlink(temp_path)
        return True, "Leaderboard updated successfully"
    except Exception as e:
        return False, f"Error updating leaderboard: {e}"


def restart_space_after_delay(delay_seconds: int = 2) -> None:
    """
    Restart the Hugging Face Space after a delay.
    """
    def _restart_space():
        time.sleep(delay_seconds)
        try:
            api = HfApi(token=TOKEN)
            api.restart_space(repo_id=REPO_ID)
        except Exception as e:
            print(f"Error restarting space: {e}")

    thread = threading.Thread(target=_restart_space)
    thread.daemon = True
    thread.start()


def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
    """
    Process a submission to the GuardBench leaderboard.
    """
    # Initialized up front so the cleanup in the finally block below is safe
    # even if an error occurs before the evaluation files are created.
    target_file = None

    try:
        is_valid, validation_message = validate_submission(file_path)
        if not is_valid:
            return styled_error(validation_message)

        # Work inside the guard-bench submodule's results directory.
        guardbench_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
            "guard-bench-submodule",
        )
        results_dir = os.path.join(guardbench_dir, "results")
        os.makedirs(results_dir, exist_ok=True)

        model_name = metadata.get("model_name", "unknown")
        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
        guard_model_type = metadata.get("guard_model_type", "unknown")
        target_file = os.path.join(results_dir, "circleguardbench_public", f"{model_name_safe}.jsonl")

        # Archive the raw submission in the results dataset before evaluating it.
        api = HfApi(token=TOKEN)
        submission_path = f"submissions_{version}/{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=submission_path,
            repo_id=RESULTS_DATASET_ID,
            repo_type="dataset",
            commit_message=f"Add raw submission for {model_name}"
        )

        os.makedirs(os.path.join(results_dir, "circleguardbench_public"), exist_ok=True)
        shutil.copy2(file_path, target_file)
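
        # The evaluator below is expected to pick the copied file up from
        # results/circleguardbench_public/<model_name_safe>.jsonl; this is
        # inferred from how the context is configured, not from documentation.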
        try:
            # Configure the CircleGuardBench evaluation context against the
            # public benchmark dataset.
            ctx = GuardbenchContext()
            ctx.results_dir = results_dir
            ctx.bench_name = "circleguardbench_public"
            ctx.load_dataset("whitecircle-ai/circleguardbench_public")
            ctx.is_initialized = True

            evaluator = Evaluator(ctx, force=True, using_cached=True)
            evaluator.evaluate_model(model_name_safe, str(guard_model_type).lower())
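
            # The evaluator is assumed to write an updated leaderboard.json into
            # the bench results directory; the model's entry is read back from it.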
            with open(os.path.join(results_dir, ctx.bench_name, "leaderboard.json"), 'r') as f:
                results_data = json.load(f)
            model_entry = next(
                (entry for entry in results_data.get("entries", [])
                 if entry.get("model_name") == model_name_safe),
                None
            )

            if not model_entry:
                return styled_error("No evaluation results found")

            # Enrich the computed entry with the submitter-provided metadata.
            model_entry.update({
                "model_name": metadata.get("model_name"),
                "model_type": metadata.get("model_type"),
                "guard_model_type": str(metadata.get("guard_model_type")).lower(),
                "mode": metadata.get("mode"),
                "base_model": metadata.get("base_model"),
                "revision": metadata.get("revision"),
                "precision": metadata.get("precision"),
                "weight_type": metadata.get("weight_type"),
                "version": version,
                "submission_date": datetime.now().isoformat()
            })

            success, message = submit_entry_to_hub(model_entry, model_name, metadata.get("mode"), version)
            if not success:
                return styled_error(message)
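
            # Rebuild the aggregated leaderboard from every per-model entry file
            # stored under entries/ for this benchmark version.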
            api = HfApi(token=TOKEN)
            files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
            entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]

            all_entries = []
            for entry_file in entry_files:
                try:
                    entry_path = api.hf_hub_download(
                        repo_id=RESULTS_DATASET_ID,
                        filename=entry_file,
                        repo_type="dataset",
                    )
                    with open(entry_path, 'r') as f:
                        entry_data = json.load(f)
                    all_entries.append(entry_data)
                except Exception as e:
                    print(f"Error loading entry {entry_file}: {e}")

            success, message = submit_leaderboard_to_hub(all_entries, version)
            if not success:
                return styled_error(message)

            # Restart the Space so the UI reloads the updated leaderboard.
            restart_space_after_delay(5)

            return styled_message("Submission successful! Model evaluated and leaderboard updated.")

        except Exception as eval_error:
            return styled_error(f"Error during evaluation: {eval_error}")

    except Exception as e:
        return styled_error(f"Error processing submission: {e}")
    finally:
        # Best-effort cleanup of the uploaded file and its copy in the results directory.
        try:
            if os.path.exists(file_path):
                os.remove(file_path)
            if target_file and os.path.exists(target_file):
                os.remove(target_file)
        except Exception:
            pass
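

# Example usage (illustrative only; the metadata values below are hypothetical,
# the keys are the ones read by process_submission above):
#
#   result_html = process_submission(
#       "/tmp/upload.jsonl",
#       {
#           "model_name": "org/guard-model",
#           "model_type": "fine-tuned",
#           "guard_model_type": "llm",
#           "mode": "strict",
#           "base_model": "org/base-model",
#           "revision": "main",
#           "precision": "bfloat16",
#           "weight_type": "Original",
#       },
#       version="v0",
#   )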