"""
GuardBench Leaderboard Application.
"""

import json
import logging
import os
import tempfile

import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    GUARDBENCH_COLUMN,
    DISPLAY_COLS,
    METRIC_COLS,
    HIDDEN_COLS,
    NEVER_HIDDEN_COLS,
    CATEGORIES,
    TEST_TYPES,
    ModelType,
    Precision,
    WeightType,
)
from src.display.formatting import styled_message, styled_error, styled_warning
from src.envs import (
    ADMIN_USERNAME,
    ADMIN_PASSWORD,
    RESULTS_DATASET_ID,
    SUBMITTER_TOKEN,
    TOKEN,
    DATA_PATH,
)
from src.populate import get_leaderboard_df, download_leaderboard_data, get_category_leaderboard_df
from src.submission.submit import process_submission
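
# App layout: a benchmark-version selector, a tabbed leaderboard (overall plus
# one tab per category), an About tab, and a Submit tab. Results live in the
# HuggingFace dataset identified by RESULTS_DATASET_ID and are cached in the
# module-level LEADERBOARD_DF.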

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Ensure the local data directory exists before anything reads or writes it.
os.makedirs(DATA_PATH, exist_ok=True)

# Benchmark versions offered by the version selector.
BENCHMARK_VERSIONS = ["v0"]
CURRENT_VERSION = "v0"

# Load the leaderboard once at startup. Fall back to an empty DataFrame on
# failure so the UI can still render and be refreshed later.
try:
    logger.info("Initializing leaderboard data...")
    LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION)
    logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
except Exception as e:
    logger.error(f"Error loading leaderboard data: {e}")
    LEADERBOARD_DF = pd.DataFrame()
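
# A rough sketch of what get_leaderboard_df is assumed to return: one row per
# submitted model, with the columns declared in DISPLAY_COLS. The column names
# below are illustrative, not the actual schema:
#
#     model_name       model_type     ...metric columns...
#     "guard-model-a"  "open-source"  0.87 ...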


def init_leaderboard(dataframe):
    """
    Build the Leaderboard component for the given results DataFrame.

    Falls back to an empty frame with the expected display columns so the
    component still renders when no data is available.
    """
    if dataframe is None or dataframe.empty:
        columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
        dataframe = pd.DataFrame(columns=columns)
        logger.warning("Initializing empty leaderboard")

    return Leaderboard(
        value=dataframe,
        datatype=[getattr(GUARDBENCH_COLUMN, col).type for col in DISPLAY_COLS],
        select_columns=SelectColumns(
            default_selection=[getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS],
            cant_deselect=[getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS],
            label="Select Columns to Display:",
        ),
        search_columns=[GUARDBENCH_COLUMN.model_name.name],
        hide_columns=[getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS],
        filter_columns=[
            ColumnFilter(GUARDBENCH_COLUMN.model_type.name, type="checkboxgroup", label="Model types"),
        ],
        interactive=False,
    )
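
# The handlers below (refresh button, version selector) rebuild leaderboards by
# calling init_leaderboard again and returning the fresh components as outputs,
# which Gradio swaps in place of the existing ones.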


def submit_results(
    model_name: str,
    base_model: str,
    revision: str,
    precision: str,
    weight_type: str,
    model_type: str,
    submission_file: tempfile._TemporaryFileWrapper,
    version: str,
):
    """
    Validate a results upload, pass it to the submission pipeline, and
    refresh the cached leaderboard for the submitted version.
    """
    if submission_file is None:
        return styled_error("No submission file provided")

    if not model_name:
        return styled_error("Model name is required")

    if not model_type:
        return styled_error("Please select a model type")

    file_path = submission_file.name
    logger.info(f"Received submission for model {model_name}: {file_path}")

    # Metadata that travels with the uploaded JSONL file through the pipeline.
    metadata = {
        "model_name": model_name,
        "base_model": base_model,
        "revision": revision if revision else "main",
        "precision": precision,
        "weight_type": weight_type,
        "model_type": model_type,
        "version": version,
    }

    result = process_submission(file_path, metadata, version=version)

    # Refresh the cached leaderboard so the new entry shows up immediately.
    global LEADERBOARD_DF
    try:
        logger.info(f"Refreshing leaderboard data after submission for version {version}...")
        LEADERBOARD_DF = get_leaderboard_df(version=version)
        logger.info("Refreshed leaderboard data after submission")
    except Exception as e:
        logger.error(f"Error refreshing leaderboard data: {e}")

    return result
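
# process_submission is assumed to return a styled status string (built with
# the helpers in src.display.formatting), which is rendered into the
# result_output Markdown component wired up below.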


def refresh_data(version=CURRENT_VERSION):
    """
    Refresh the leaderboard data from HuggingFace.
    """
    global LEADERBOARD_DF
    try:
        logger.info(f"Performing scheduled refresh of leaderboard data for version {version}...")
        LEADERBOARD_DF = get_leaderboard_df(version=version)
        logger.info("Scheduled refresh of leaderboard data completed")
    except Exception as e:
        logger.error(f"Error in scheduled refresh: {e}")
    return LEADERBOARD_DF


def update_leaderboards(version):
    """
    Update all leaderboard components with data for the selected version.
    """
    new_df = get_leaderboard_df(version=version)
    category_dfs = [get_category_leaderboard_df(category, version=version) for category in CATEGORIES]
    return [init_leaderboard(new_df)] + [init_leaderboard(df) for df in category_dfs]
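
# Note: the return order (overall leaderboard first, then one leaderboard per
# category in CATEGORIES order) must match the outputs lists passed to the
# click and change handlers below.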


demo = gr.Blocks(css=custom_css)

with demo:
    gr.HTML(TITLE)

    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        with gr.Column(scale=1):
            version_selector = gr.Dropdown(
                choices=BENCHMARK_VERSIONS,
                label="Benchmark Version",
                value=CURRENT_VERSION,
                interactive=True,
                elem_classes="version-selector",
            )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
            refresh_button = gr.Button("Refresh Leaderboard")

            with gr.Tabs(elem_classes="category-tabs") as category_tabs:
                with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"):
                    leaderboard = init_leaderboard(LEADERBOARD_DF)

                # One tab per category. The components are collected in a list
                # so the handlers can target them directly instead of walking
                # category_tabs.children.
                category_leaderboards = []
                for category in CATEGORIES:
                    with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"):
                        category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION)
                        category_leaderboards.append(init_leaderboard(category_df))

            # Rebuild every leaderboard for the currently selected version.
            # The selector is passed as an input so the handler receives the
            # live dropdown value; reading version_selector.value inside a
            # lambda would only ever see the value set at build time.
            refresh_button.click(
                fn=update_leaderboards,
                inputs=[version_selector],
                outputs=[leaderboard] + category_leaderboards,
            )

        with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=1):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=2):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=[i.name for i in Precision if i != Precision.Unknown],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=[i.name for i in WeightType],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            with gr.Row():
                file_input = gr.File(
                    label="Upload JSONL Results File",
                    file_types=[".jsonl"],
                )

            submit_button = gr.Button("Submit Results")
            result_output = gr.Markdown()

            submit_button.click(
                fn=submit_results,
                inputs=[
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                    file_input,
                    version_selector,
                ],
                outputs=result_output,
            )

    # Switching the benchmark version swaps the data behind every tab.
    version_selector.change(
        fn=update_leaderboards,
        inputs=[version_selector],
        outputs=[leaderboard] + category_leaderboards,
    )
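
    # The refresh button and the version selector share update_leaderboards,
    # so manual refreshes and version switches follow the same code path.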

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=10,
                elem_id="citation-button",
                show_copy_button=True,
            )

    with gr.Accordion("ℹ️ Dataset Information", open=False):
        dataset_info = gr.Markdown(f"""
## Dataset Information

Results are stored in the HuggingFace dataset: [{RESULTS_DATASET_ID}](https://huggingface.co/datasets/{RESULTS_DATASET_ID})

Last updated: {pd.Timestamp.now(tz="UTC").strftime("%Y-%m-%d %H:%M:%S UTC")}
""")


# Refresh the cached leaderboard in the background every 30 minutes.
scheduler = BackgroundScheduler()
scheduler.add_job(lambda: refresh_data(version=CURRENT_VERSION), "interval", minutes=30)
scheduler.start()


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)