|
import json |
|
from pathlib import Path |
|
|
|
import gradio as gr |
|
import pandas as pd |
|
|
|
from texts import TITLE, DESCRIPTION, ABOUT |
|
from process_data import load_average_data, load_hard_data, load_easy_data, load_detailed_success_rate_data, load_detailed_action_counts_data |
|
from display import custom_css |
|
# Presumably the benchmarks to hide from the tables; it is empty and is not
# referenced anywhere in the visible part of this file.
BENCHMARKS_TO_SKIP: list[str] = []

# Hex colour for each model-type label.
color_map = {"Pretrained": "#7497db", "RL": "#E8ECF2", "Finetuned": "#ffcd75"}
|
|
|
# Internal model id -> "<organisation>/<model>" display name.
model_name_map = {
    # Qwen
    "qwen2.5-3b-instruct": "Qwen/Qwen2.5-3B-Instruct",
    "qwen2.5-7b-instruct": "Qwen/Qwen2.5-7B-Instruct",
    "qwen2.5-14b-instruct": "Qwen/Qwen2.5-14B-Instruct",
    "qwen2.5-32b-instruct": "Qwen/Qwen2.5-32B-Instruct",
    "qwen2.5-72b-instruct": "Qwen/Qwen2.5-72B-Instruct",
    # Llama
    "llama-3.1-8b-instruct": "Meta-Llama/Llama-3.1-8B-Instruct",
    "llama-3.1-70b-instruct": "Meta-Llama/Llama-3.1-70B-Instruct",
    "llama-3.2-3b-instruct": "Meta-Llama/Llama-3.2-3B-Instruct",
    "llama-3.3-70b-instruct": "Meta-Llama/Llama-3.3-70B-Instruct",
    # Mistral / Gemma
    "mistral-large-instruct-2411": "Mistral/Mistral-Large-2411",
    "gemma-2-27b-it": "google/gemma-2-27b-it",
    "gemma-2-9b-it": "google/gemma-2-9b-it",
    # DeepSeek / QwQ / Yi
    "deepseek-v3": "deepseek-ai/DeepSeek-V3",
    "deepseek-r1": "deepseek-ai/DeepSeek-R1",
    "qwq-32b": "Qwen/QwQ-32B",
    "yi-lightning": "Yi/Yi-Lightning",
    # OpenAI
    "gpt-3.5-turbo": "openai/gpt-3.5-turbo",
    "gpt-4o": "openai/gpt-4o",
    "gpt-4o-mini": "openai/gpt-4o-mini",
    "o1-mini": "openai/o1-mini",
    # Anthropic
    "claude-3.5-haiku": "anthropic/claude-3.5-haiku",
    "claude-3.5-sonnet": "anthropic/claude-3.5-sonnet",
}


def map_model_name(model_id):
    """Return the display name for *model_id*.

    Ids that have no entry in ``model_name_map`` are returned unchanged.
    """
    return model_name_map.get(model_id, model_id)
|
|
|
|
|
def model_hyperlink(link, model_name) -> str:
    """Return a Markdown link that shows *model_name* and points at *link*."""
    return "[{}]({})".format(model_name, link)
|
|
|
def make_clickable_model(model_name):
    """Return a Markdown link to ``https://huggingface.co/<model_name>``.

    The link text is *model_name* itself; formatting is delegated to
    ``model_hyperlink``.
    """
    return model_hyperlink(f"https://huggingface.co/{model_name}", model_name)
|
|
|
# Model ids that are labelled "RL"; every other id is labelled "Pretrained".
rl_models = ["deepseek-r1", "o1-mini"]


def map_model_type(model_name):
    """Return ``"RL"`` when *model_name* is listed in ``rl_models``, else ``"Pretrained"``."""
    return "RL" if model_name in rl_models else "Pretrained"
|
|
|
|
|
def prep_leaderboard_df():
    """Assemble the overview leaderboard table.

    The easy, hard and average frames are placed side by side (in that
    order), a ``Model`` column and a ``Model Type`` column derived from the
    index are prepended, and the result is rounded to two decimals.
    """
    # Loaders are invoked in the same order as before (average, hard, easy);
    # only the column order of the combined table is easy -> hard -> average.
    average_scores = load_average_data()
    hard_scores = load_hard_data()
    easy_scores = load_easy_data()
    table = pd.concat((easy_scores, hard_scores, average_scores), axis=1)

    # The index is assumed to hold the internal model ids — TODO confirm
    # against process_data.
    model_ids = list(table.index)
    table.insert(0, "Model", list(map(map_model_name, model_ids)))
    table.insert(1, "Model Type", list(map(map_model_type, model_ids)))

    return table.round(2)
|
|
|
def prep_detailed_success_rate_df():
    """Build the detailed success-rate table.

    Takes the frame returned by ``load_detailed_success_rate_data()``,
    prepends ``Model`` (index entries passed through ``map_model_name``) and
    ``Model Type`` (index entries passed through ``map_model_type``), and
    returns the result rounded to two decimals.
    """
    table = load_detailed_success_rate_data()

    model_ids = list(table.index)
    # insert() modifies the loaded frame in place; round() returns a new one.
    table.insert(0, "Model", list(map(map_model_name, model_ids)))
    table.insert(1, "Model Type", list(map(map_model_type, model_ids)))

    return table.round(2)
|
|
|
def prep_detailed_action_counts_df():
    """Build the detailed action-count table.

    Takes the frame returned by ``load_detailed_action_counts_data()``,
    prepends ``Model`` (index entries passed through ``map_model_name``) and
    ``Model Type`` (index entries passed through ``map_model_type``), and
    returns the result rounded to two decimals.
    """
    table = load_detailed_action_counts_data()

    model_ids = list(table.index)
    # insert() modifies the loaded frame in place; round() returns a new one.
    table.insert(0, "Model", list(map(map_model_name, model_ids)))
    table.insert(1, "Model Type", list(map(map_model_type, model_ids)))

    return table.round(2)
|
|
|
# Tables are built once at import time; the UI and the filter callbacks read
# these module-level frames.
leaderboard_df: pd.DataFrame = prep_leaderboard_df()
detailed_success_rate_df: pd.DataFrame = prep_detailed_success_rate_df()
detailed_action_counts_df: pd.DataFrame = prep_detailed_action_counts_df()
|
|
|
|
|
def _filter_and_sort(df: pd.DataFrame, cols: list[str], search_query: str) -> pd.DataFrame:
    """Apply the model-name search and the column selection to *df*.

    Shared implementation behind both detailed-table callbacks.

    Args:
        df: Table with a ``"Model"`` column. It is never modified in place.
        cols: Names of the metric columns to keep and sort by (may be empty).
        search_query: ``;``-separated search terms. They are OR-ed together
            and matched case-insensitively against ``"Model"``. Terms are
            treated as regular expressions; if the combined pattern is not a
            valid regex, the terms are matched literally instead.

    Returns:
        The filtered table. With a non-empty *cols*, only ``"Model"`` plus
        the selected columns remain, rows lacking a value in every selected
        column are dropped, rows are sorted descending on the selected
        columns (NaN last) and those columns are rendered as strings.
        With an empty *cols* and an active search, columns that are entirely
        NaN for the matching rows are dropped.
    """
    import re  # local import so this block does not rely on a file-level import

    # Blank terms (e.g. from a trailing ';') are discarded: an empty regex
    # alternative would match every row.
    terms = [term.strip().lower() for term in (search_query or "").split(";")]
    terms = [term for term in terms if term]

    if terms:
        pattern = "|".join(terms)
        try:
            re.compile(pattern)
        except re.error:
            # Fall back to literal matching rather than failing the callback
            # on input such as "model (".
            pattern = "|".join(re.escape(term) for term in terms)
        matches = df["Model"].str.lower().str.contains(pattern, regex=True, na=False)
        df = df[matches]
        # Only prune empty columns when the user has not chosen columns:
        # pruning first could remove a selected column and make the
        # selection below raise KeyError.
        if not cols:
            df = df.dropna(how="all", axis=1)

    if cols:
        df = df[["Model", *cols]].dropna(how="all", axis=0, subset=cols).copy()
        df[cols] = df[cols].apply(pd.to_numeric, errors="coerce")
        df = df.sort_values(by=cols, ascending=False, na_position="last")
        df[cols] = df[cols].astype(str)

    return df


def filter_and_search_success_rate(cols: list[str], search_query: str, agg: str = ""):
    """Callback for the detailed success-rate tab.

    Args:
        cols: Columns ticked in the checkbox group.
        search_query: Text from the search box (``;`` separates alternatives).
        agg: Unused. It is optional because the UI wires this callback with
            only two inputs; a required third parameter made every call fail.

    Returns:
        ``detailed_success_rate_df`` filtered and sorted as described in
        ``_filter_and_sort``.
    """
    return _filter_and_sort(detailed_success_rate_df, cols, search_query)


def filter_and_search_action_counts(cols: list[str], search_query: str, agg: str = ""):
    """Callback for the detailed action-count tab.

    Args:
        cols: Columns ticked in the checkbox group.
        search_query: Text from the search box (``;`` separates alternatives).
        agg: Unused. It is optional because the UI wires this callback with
            only two inputs; a required third parameter made every call fail.

    Returns:
        ``detailed_action_counts_df`` filtered and sorted as described in
        ``_filter_and_sort``.
    """
    return _filter_and_sort(detailed_action_counts_df, cols, search_query)
|
|
|
|
|
# Top-level Gradio container for the whole app, styled with custom_css.
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been lost — confirm which container holds each component
# against the deployed layout.
demo = gr.Blocks(css=custom_css)

with demo:
    # Page header: title banner followed by the description text.
    gr.HTML(TITLE)
    with gr.Row():
        with gr.Column():
            gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Tab 1: static overview table (no search or column controls).
        with gr.TabItem("🏆 Leaderboard"):
            with gr.Row():
                with gr.Group():
                    leaderboard_table = gr.Dataframe(
                        value=leaderboard_df,
                        wrap=True,
                        # Fixed widths for the first two columns; every other
                        # column gets a base width plus its header length.
                        column_widths=[250, 120] + [(60 + len(c)) for c in leaderboard_df.columns[2:]],
                    )

        # Tab 2: success rates with a search box and a column picker.
        with gr.TabItem("Success Rates - Detailed"):
            with gr.Column():
                with gr.Row():
                    search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)

                with gr.Row():
                    cols_bar = gr.CheckboxGroup(
                        # Every column after the first two, except "Average".
                        choices=[c for c in detailed_success_rate_df.columns[2:] if c != "Average"],
                        show_label=False,
                    )
                detailed_success_rate_table = gr.Dataframe(
                    value=detailed_success_rate_df,
                    wrap=True,
                    column_widths=[350, 120] + [(150 + len(c)) for c in detailed_success_rate_df.columns[2:]],
                )
                # Recompute the table when the column selection changes or the
                # search box is submitted; the callback receives
                # (selected columns, search text).
                cols_bar.change(filter_and_search_success_rate, inputs=[cols_bar, search_bar], outputs=[detailed_success_rate_table])
                search_bar.submit(filter_and_search_success_rate, inputs=[cols_bar, search_bar], outputs=[detailed_success_rate_table])

        # Tab 3: action counts, same controls as the success-rate tab.
        with gr.TabItem("Action Counts - Detailed"):
            with gr.Column():
                with gr.Row():
                    search_bar_1 = gr.Textbox(placeholder="Search for your model...", show_label=False)

                with gr.Row():
                    cols_bar_1 = gr.CheckboxGroup(
                        # Every column after the first two, except "Average".
                        choices=[c for c in detailed_action_counts_df.columns[2:] if c != "Average"],
                        show_label=False,
                    )
                detailed_action_counts_table = gr.Dataframe(
                    value=detailed_action_counts_df,
                    wrap=True,
                    column_widths=[350, 120] + [(100 + len(c)) for c in detailed_action_counts_df.columns[2:]],
                )
                # Same wiring as above: (selected columns, search text) in,
                # refreshed table out.
                cols_bar_1.change(filter_and_search_action_counts, inputs=[cols_bar_1, search_bar_1], outputs=[detailed_action_counts_table])
                search_bar_1.submit(filter_and_search_action_counts, inputs=[cols_bar_1, search_bar_1], outputs=[detailed_action_counts_table])

        # Tab 4: static "About" text.
        with gr.TabItem("About"):
            gr.Markdown(ABOUT)

    # Collapsible citation box with a copy button.
    with gr.Row():
        with gr.Accordion("📚 Citation", open=False):
            citation_button = gr.Textbox(
                value=r"""@article{lin2025generative,
title={Generative Evaluation of Complex Reasoning in Large Language Models},
author={Lin, Haowei and Wang, Xiangyu and Yan, Ruilin and Huang, Baizhou and Ye, Haotian and Zhu, Jianhua and Wang, Zihao and Zou, James and Ma, Jianzhu and Liang, Yitao},
journal={arXiv preprint arXiv:2504.02810},
year={2025}
}""",
                lines=7,
                label="Copy the following to cite these results.",
                elem_id="citation-button",
                show_copy_button=True,
            )

# Start the web server; this runs at import time (there is no __main__ guard).
demo.launch()