"""
GuardBench Leaderboard Application
"""
import os
import json
import tempfile
import logging
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    GUARDBENCH_COLUMN,
    DISPLAY_COLS,
    METRIC_COLS,
    HIDDEN_COLS,
    NEVER_HIDDEN_COLS,
    CATEGORIES,
    TEST_TYPES,
    ModelType,
    Precision,
    WeightType,
)
from src.display.formatting import styled_message, styled_error, styled_warning
from src.envs import (
    ADMIN_USERNAME,
    ADMIN_PASSWORD,
    RESULTS_DATASET_ID,
    SUBMITTER_TOKEN,
    TOKEN,
    DATA_PATH,
)
from src.populate import get_leaderboard_df, download_leaderboard_data, get_category_leaderboard_df
from src.submission.submit import process_submission
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Ensure data directory exists
os.makedirs(DATA_PATH, exist_ok=True)
# Available benchmark versions
BENCHMARK_VERSIONS = ["v0"]
CURRENT_VERSION = "v0"
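# New benchmark releases can be added here (e.g. "v1"); the version dropdown
# below is populated from BENCHMARK_VERSIONS and defaults to CURRENT_VERSION.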
# Initialize leaderboard data
try:
    logger.info("Initializing leaderboard data...")
    LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION)
    logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
except Exception as e:
    logger.error(f"Error loading leaderboard data: {e}")
    LEADERBOARD_DF = pd.DataFrame()


def init_leaderboard(dataframe):
    """
    Initialize the leaderboard component.
    """
    if dataframe is None or dataframe.empty:
        # Create an empty dataframe with the right columns
        columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
        dataframe = pd.DataFrame(columns=columns)
        logger.warning("Initializing empty leaderboard")

    return Leaderboard(
        value=dataframe,
        datatype=[getattr(GUARDBENCH_COLUMN, col).type for col in DISPLAY_COLS],
        select_columns=SelectColumns(
            default_selection=[getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS],
            cant_deselect=[getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS],
            label="Select Columns to Display:",
        ),
        search_columns=[GUARDBENCH_COLUMN.model_name.name],
        hide_columns=[getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS],
        filter_columns=[
            ColumnFilter(GUARDBENCH_COLUMN.model_type.name, type="checkboxgroup", label="Model types"),
        ],
        interactive=False,
    )


def submit_results(
    model_name: str,
    base_model: str,
    revision: str,
    precision: str,
    weight_type: str,
    model_type: str,
    submission_file: tempfile._TemporaryFileWrapper,
    version: str,
):
    """
    Handle submission of results with model metadata.
    """
    if submission_file is None:
        return styled_error("No submission file provided")
    if not model_name:
        return styled_error("Model name is required")
    if not model_type:
        return styled_error("Please select a model type")

    file_path = submission_file.name
    logger.info(f"Received submission for model {model_name}: {file_path}")

    # Add metadata to the submission
    metadata = {
        "model_name": model_name,
        "base_model": base_model,
        "revision": revision if revision else "main",
        "precision": precision,
        "weight_type": weight_type,
        "model_type": model_type,
        "version": version,
    }

    # Process the submission
    result = process_submission(file_path, metadata, version=version)

    # Refresh the leaderboard data
    global LEADERBOARD_DF
    try:
        logger.info(f"Refreshing leaderboard data after submission for version {version}...")
        LEADERBOARD_DF = get_leaderboard_df(version=version)
        logger.info("Refreshed leaderboard data after submission")
    except Exception as e:
        logger.error(f"Error refreshing leaderboard data: {e}")

    return result


def refresh_data(version=CURRENT_VERSION):
    """
    Refresh the leaderboard data from HuggingFace.
    """
    global LEADERBOARD_DF
    try:
        logger.info(f"Performing scheduled refresh of leaderboard data for version {version}...")
        LEADERBOARD_DF = get_leaderboard_df(version=version)
        logger.info("Scheduled refresh of leaderboard data completed")
    except Exception as e:
        logger.error(f"Error in scheduled refresh: {e}")
    return LEADERBOARD_DF


def update_leaderboards(version):
    """
    Update all leaderboard components with data for the selected version.
    """
    new_df = get_leaderboard_df(version=version)
    category_dfs = [get_category_leaderboard_df(category, version=version) for category in CATEGORIES]
    return [init_leaderboard(new_df)] + [init_leaderboard(df) for df in category_dfs]
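
# Note: update_leaderboards returns components in a fixed order — the overall
# leaderboard first, then one leaderboard per entry in CATEGORIES — and the
# event wiring below lists its `outputs` in the same order.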
# Create Gradio app
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)

    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        with gr.Column(scale=1):
            version_selector = gr.Dropdown(
                choices=BENCHMARK_VERSIONS,
                label="Benchmark Version",
                value=CURRENT_VERSION,
                interactive=True,
                elem_classes="version-selector",
            )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
            refresh_button = gr.Button("Refresh Leaderboard")

            # Create tabs for each category
            with gr.Tabs(elem_classes="category-tabs") as category_tabs:
                # First tab for average metrics across all categories
                with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"):
                    leaderboard = init_leaderboard(LEADERBOARD_DF)

                # Create a tab for each category, collecting the leaderboard
                # components so event handlers can update them later
                category_leaderboards = []
                for category in CATEGORIES:
                    with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"):
                        category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION)
                        category_leaderboards.append(init_leaderboard(category_df))

            # Refresh button: rebuild the overall and per-category leaderboards
            # for the currently selected benchmark version
            refresh_button.click(
                fn=update_leaderboards,
                inputs=[version_selector],
                outputs=[leaderboard] + category_leaderboards,
            )
with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=1):
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=2):
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.Row():
gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
with gr.Row():
with gr.Column():
model_name_textbox = gr.Textbox(label="Model name")
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
model_type = gr.Dropdown(
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
label="Model type",
multiselect=False,
value=None,
interactive=True,
)
with gr.Column():
precision = gr.Dropdown(
choices=[i.name for i in Precision if i != Precision.Unknown],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
weight_type = gr.Dropdown(
choices=[i.name for i in WeightType],
label="Weights type",
multiselect=False,
value="Original",
interactive=True,
)
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
with gr.Row():
file_input = gr.File(
label="Upload JSONL Results File",
file_types=[".jsonl"]
)
submit_button = gr.Button("Submit Results")
result_output = gr.Markdown()
submit_button.click(
fn=submit_results,
inputs=[
model_name_textbox,
base_model_name_textbox,
revision_name_textbox,
precision,
weight_type,
model_type,
file_input,
version_selector
],
outputs=result_output
)
    # Version selector functionality: refresh every leaderboard tab when the
    # benchmark version changes
    version_selector.change(
        fn=update_leaderboards,
        inputs=[version_selector],
        outputs=[leaderboard] + category_leaderboards,
    )
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=10,
                elem_id="citation-button",
                show_copy_button=True,
            )
        with gr.Accordion("ℹ️ Dataset Information", open=False):
            dataset_info = gr.Markdown(f"""
            ## Dataset Information

            Results are stored in the HuggingFace dataset: [{RESULTS_DATASET_ID}](https://huggingface.co/datasets/{RESULTS_DATASET_ID})

            Last updated: {pd.Timestamp.now(tz="UTC").strftime("%Y-%m-%d %H:%M:%S UTC")}
            """)

scheduler = BackgroundScheduler()
scheduler.add_job(lambda: refresh_data(version=CURRENT_VERSION), 'interval', minutes=30)
scheduler.start()
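
# Optional sketch (an assumption, not part of the original wiring): stop the
# scheduler's background thread cleanly when the process exits normally.
import atexit  # would normally sit with the other imports at the top of the file

atexit.register(lambda: scheduler.shutdown(wait=False))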
# Launch the app
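# Binding to 0.0.0.0 on port 7860 matches what Hugging Face Spaces expects;
# `share=True` only matters when running locally (Spaces ignores it).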
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)