zhwang4ai's picture
update tabs
70c8dc9
raw
history blame
7.01 kB
import json
from pathlib import Path
import gradio as gr
import pandas as pd
from texts import TITLE, DESCRIPTION, ABOUT
from process_data import load_average_data, load_hard_data, load_easy_data, load_detailed_success_rate_data, load_detailed_action_counts_data
from display import custom_css
# Names of benchmarks to skip; the empty list means nothing is skipped.
# NOTE(review): not referenced anywhere in the visible part of this file.
BENCHMARKS_TO_SKIP = []

# Hex colour assigned to each model-type label.
# NOTE(review): not referenced anywhere in the visible part of this file.
color_map = {
    "Pretrained": "#7497db",
    "RL": "#E8ECF2",
    "Finetuned": "#ffcd75",
    # "DPO": "#75809c",
}
# Lookup table from the short model identifiers (the values found in the
# DataFrame indexes) to the names displayed in the "Model" column.
model_name_map = {
    "qwen2.5-3b-instruct": "Qwen/Qwen2.5-3B-Instruct",
    "qwen2.5-7b-instruct": "Qwen/Qwen2.5-7B-Instruct",
    "qwen2.5-14b-instruct": "Qwen/Qwen2.5-14B-Instruct",
    "qwen2.5-32b-instruct": "Qwen/Qwen2.5-32B-Instruct",
    "qwen2.5-72b-instruct": "Qwen/Qwen2.5-72B-Instruct",
    "llama-3.1-8b-instruct": "Meta-Llama/Llama-3.1-8B-Instruct",
    "llama-3.1-70b-instruct": "Meta-Llama/Llama-3.1-70B-Instruct",
    "llama-3.2-3b-instruct": "Meta-Llama/Llama-3.2-3B-Instruct",
    "llama-3.3-70b-instruct": "Meta-Llama/Llama-3.3-70B-Instruct",
    "mistral-large-instruct-2411": "Mistral/Mistral-Large-2411",
    "gemma-2-27b-it": "google/gemma-2-27b-it",
    "gemma-2-9b-it": "google/gemma-2-9b-it",
    "deepseek-v3": "deepseek-ai/DeepSeek-V3",
    "deepseek-r1": "deepseek-ai/DeepSeek-R1",
    "qwq-32b": "Qwen/QwQ-32B",
    "yi-lightning": "Yi/Yi-Lightning",
    "gpt-3.5-turbo": "openai/gpt-3.5-turbo",
    "gpt-4o": "openai/gpt-4o",
    "gpt-4o-mini": "openai/gpt-4o-mini",
    "o1-mini": "openai/o1-mini",
    "claude-3.5-haiku": "anthropic/claude-3.5-haiku",
    "claude-3.5-sonnet": "anthropic/claude-3.5-sonnet",
}


def map_model_name(model_id):
    """Return the display name registered for *model_id*.

    Identifiers that have no entry in ``model_name_map`` are returned
    unchanged, so unknown models still show up under their raw id.
    """
    # One ``dict.get`` replaces the ``.keys()`` membership test followed by a
    # second lookup.
    return model_name_map.get(model_id, model_id)
def model_hyperlink(link, model_name):
    """Render *model_name* as a Markdown link that points at *link*."""
    # Earlier HTML-anchor rendering, kept for reference:
    # return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
    return "[{}]({})".format(model_name, link)
def make_clickable_model(model_name):
    """Link *model_name* to ``https://huggingface.co/<model_name>``.

    The link text is the model name itself; rendering is delegated to
    ``model_hyperlink``.
    """
    return model_hyperlink(f"https://huggingface.co/{model_name}", model_name)
# Model ids labelled "RL"; every other id is labelled "Pretrained".
rl_models = ['deepseek-r1', 'o1-mini']


def map_model_type(model_name):
    """Return the "Model Type" label for *model_name*.

    The label is ``"RL"`` when the name is listed in ``rl_models`` and
    ``"Pretrained"`` otherwise.
    """
    return "RL" if model_name in rl_models else "Pretrained"
def prep_leaderboard_df():
    """Build the main leaderboard table.

    The average, hard and easy tables are loaded (in that order) and joined
    side by side as easy | hard | average. Two leading columns are then
    inserted: "Model" (display name) and "Model Type", both derived from the
    index values of the joined table. The result is rounded to 2 decimals.
    """
    avg_scores = load_average_data()
    hard_scores = load_hard_data()
    easy_scores = load_easy_data()

    frames = [easy_scores, hard_scores, avg_scores]
    combined = pd.concat(frames, axis=1)

    # The index values are treated as model ids.
    model_ids = list(combined.index)
    combined.insert(0, "Model", list(map(map_model_name, model_ids)))
    combined.insert(1, "Model Type", list(map(map_model_type, model_ids)))

    # Turning the "Model" column into links is currently disabled:
    # df['Model'] = df['Model'].apply(make_clickable_model)
    return combined.round(2)
def prep_detailed_success_rate_df():
    """Build the detailed success-rate table.

    Loads the data, inserts the leading "Model" and "Model Type" columns
    (both derived from the index values) and rounds to 2 decimals.
    """
    rates = load_detailed_success_rate_data()
    model_ids = list(rates.index)
    rates.insert(0, "Model", list(map(map_model_name, model_ids)))
    rates.insert(1, "Model Type", list(map(map_model_type, model_ids)))
    return rates.round(2)
def prep_detailed_action_counts_df():
    """Build the detailed action-count table.

    Loads the data, inserts the leading "Model" and "Model Type" columns
    (both derived from the index values) and rounds to 2 decimals.
    """
    counts = load_detailed_action_counts_data()
    model_ids = list(counts.index)
    counts.insert(0, "Model", list(map(map_model_name, model_ids)))
    counts.insert(1, "Model Type", list(map(map_model_type, model_ids)))
    return counts.round(2)
# Build the three tables once, at import time. They are the values shown by
# the Dataframe components below, and leaderboard_df is also the table read
# by filter_and_search.
leaderboard_df = prep_leaderboard_df()
detailed_success_rate_df = prep_detailed_success_rate_df()
detailed_action_counts_df = prep_detailed_action_counts_df()
# Function to update the table based on search query
def filter_and_search(cols: list[str], search_query: str, agg: str):
print("filter")
df = leaderboard_df
search_terms = "Model"
if len(search_query) > 0:
search_terms = search_query.split(";")
search_terms = [term.strip().lower() for term in search_terms]
pattern = "|".join(search_terms)
df = df[df["Model"].str.lower().str.contains(pattern, regex=True)]
# Drop any columns which are all NaN
df = df.dropna(how="all", axis=1)
if len(cols) > 0:
index_cols = list(leaderboard_df.columns[:1])
new_cols = index_cols + cols
df = df.copy()[new_cols]
df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
df = df.sort_values(by=cols, ascending=False, na_position='last')
df[cols] = df[cols].astype(str)
return df
# Assemble the Gradio UI: title, description, four tabs ("🏆 Leaderboard",
# "Success Rates - Detailed", "Action Counts - Detailed", "About") and a
# citation box, then launch the app.
# NOTE(review): indentation is not visible in this copy of the file, so the
# exact nesting of the `with` blocks below must be confirmed against the
# original source.
demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE)
with gr.Row():
with gr.Column():
gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
# Tab displaying the combined leaderboard table (leaderboard_df).
with gr.TabItem("🏆 Leaderboard"):
with gr.Row():
# The search box and column picker are currently disabled, together with
# the filter_and_search callbacks commented out further down.
# search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
# cols_bar = gr.CheckboxGroup(
# choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
# show_label=False,
# # info="Select columns to display",
# )
with gr.Group():
leaderboard_table = gr.Dataframe(
value=leaderboard_df,
wrap=True,
# column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
)
#cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
# search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
# Tab displaying detailed_success_rate_df.
with gr.TabItem("Success Rates - Detailed"):
with gr.Row():
detailed_success_rate_table = gr.Dataframe(
value=detailed_success_rate_df,
wrap=True,
)
# Tab displaying detailed_action_counts_df.
with gr.TabItem("Action Counts - Detailed"):
with gr.Row():
detailed_action_counts_table = gr.Dataframe(
value=detailed_action_counts_df,
wrap=True,
)
# Tab rendering the ABOUT text imported from texts.
with gr.TabItem("About"):
gr.Markdown(ABOUT)
with gr.Row():
# Accordion (closed by default) holding the BibTeX entry in a textbox with
# a copy button.
with gr.Accordion("📚 Citation", open=False):
citation_button = gr.Textbox(
value=r"""@article{lin2025generative,
title={Generative Evaluation of Complex Reasoning in Large Language Models},
author={Lin, Haowei and Wang, Xiangyu and Yan, Ruilin and Huang, Baizhou and Ye, Haotian and Zhu, Jianhua and Wang, Zihao and Zou, James and Ma, Jianzhu and Liang, Yitao},
journal={arXiv preprint arXiv:2504.02810},
year={2025}
}""",
lines=7,
label="Copy the following to cite these results.",
elem_id="citation-button",
show_copy_button=True,
)
# Start the Gradio app; this runs whenever the module is executed or imported.
demo.launch()