Spaces:

iberbench
/

leaderboard

Running

File size: 31,849 Bytes

c2f297a
77175ac
c2f297a
 
1cf89c1
c2f297a
77175ac
c2f297a
 
1cf89c1
ed1f9e1
1cf89c1
c2f297a
1cf89c1
 
 
77175ac
c2f297a
1cf89c1
77175ac
1cf89c1
 
 
ed1f9e1
 
 
 
 
 
77175ac
1cf89c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2f297a
 
77175ac
1cf89c1
b7f9bcb
1cf89c1
ed1f9e1
 
 
 
 
 
 
 
 
 
1cf89c1
 
6aaf516
c2f297a
00090df
c2f297a
 
 
1053127
c2f297a
 
 
77175ac
ed1f9e1
c2f297a
 
 
 
 
77175ac
ed1f9e1
bec5baa
ed1f9e1
 
 
 
 
 
 
 
 
77175ac
 
bec5baa
77175ac
ed1f9e1
 
 
f30b2f5
bec5baa
ed1f9e1
bec5baa
ed1f9e1
77175ac
 
1cf89c1
77175ac
 
ed1f9e1
1cf89c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77175ac
b7f9bcb
1cf89c1
ed1f9e1
 
 
 
b7f9bcb
6aaf516
 
b7f9bcb
ed1f9e1
6aaf516
 
 
ed1f9e1
 
 
 
 
 
1cf89c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6aaf516
 
1cf89c1
 
ed1f9e1
 
 
 
1cf89c1
ed1f9e1
 
 
1cf89c1
 
 
 
 
 
6aaf516
1cf89c1
 
 
 
 
 
 
 
 
 
 
 
ed1f9e1
1cf89c1
 
 
 
 
 
 
 
 
 
ed1f9e1
1cf89c1
 
ed1f9e1
 
 
 
1cf89c1
 
 
ed1f9e1
 
 
1cf89c1
ed1f9e1
 
 
 
 
1cf89c1
 
ed1f9e1
 
 
b7f9bcb
 
1cf89c1
 
 
 
ed1f9e1
 
 
 
 
 
 
1cf89c1
b7f9bcb
77175ac
1cf89c1
 
ed1f9e1
 
 
 
 
 
 
 
1cf89c1
77175ac
1cf89c1
 
 
 
 
 
ed1f9e1
 
 
 
 
1cf89c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed1f9e1
 
 
1cf89c1
 
 
 
 
 
 
 
 
 
 
 
 
ed1f9e1
 
 
 
 
 
 
 
 
 
1cf89c1
 
 
 
 
 
 
 
 
 
ed1f9e1
 
1cf89c1
 
 
 
ed1f9e1
1cf89c1
 
 
 
 
ed1f9e1
 
 
1cf89c1
 
 
 
ed1f9e1
1cf89c1
 
 
ed1f9e1
 
 
 
 
1cf89c1
ed1f9e1
 
 
1cf89c1
 
 
 
 
 
 
ed1f9e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1cf89c1
 
 
 
 
 
ed1f9e1
1cf89c1
 
 
 
ed1f9e1
 
 
1cf89c1
 
 
 
 
 
 
ed1f9e1
 
 
 
 
 
1cf89c1
ed1f9e1
 
 
 
 
 
 
 
77175ac
ed1f9e1
1cf89c1
 
 
 
 
ed1f9e1
 
 
1cf89c1
 
 
 
 
 
 
ed1f9e1
 
 
 
 
 
1cf89c1
ed1f9e1
 
 
 
 
1cf89c1
ed1f9e1
 
 
 
1cf89c1
 
 
ed1f9e1
 
 
 
1cf89c1
 
ed1f9e1
 
 
1cf89c1
 
 
 
 
 
 
 
 
 
 
 
 
ed1f9e1
1cf89c1
 
 
 
 
 
 
 
 
 
ed1f9e1
 
 
1cf89c1
 
 
ed1f9e1
 
 
1cf89c1
ed1f9e1
 
 
1cf89c1
 
ed1f9e1
 
 
1cf89c1
 
 
ed1f9e1
 
 
1cf89c1
 
 
ed1f9e1
 
 
1cf89c1
b7f9bcb
1cf89c1
 
 
 
 
 
 
 
 
ed1f9e1
 
 
1cf89c1
6aaf516
b7f9bcb
1cf89c1
 
 
 
 
 
ed1f9e1
 
 
 
 
 
 
 
1cf89c1
 
ed1f9e1
1cf89c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed1f9e1
 
 
 
 
 
1cf89c1
 
 
 
 
 
 
 
 
 
ed1f9e1
1cf89c1
 
 
77175ac
ed1f9e1
1cf89c1
77175ac
1cf89c1
ed1f9e1
1cf89c1
 
 
 
 
 
 
ed1f9e1
 
 
 
 
 
 
 
 
1cf89c1
ed1f9e1
1cf89c1
 
 
ed1f9e1
1cf89c1
 
 
 
 
ed1f9e1
 
 
 
 
 
 
 
 
 
 
 
1cf89c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed1f9e1
 
1cf89c1
 
ed1f9e1
 
 
1cf89c1
ed1f9e1
 
 
 
 
1cf89c1
 
 
 
 
 
 
 
ed1f9e1
 
 
 
1cf89c1
 
 
ed1f9e1
 
1cf89c1
 
 
 
 
 
 
 
 
 
 
77175ac
1cf89c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed1f9e1
 
 
1cf89c1
 
 
 
 
 
 
 
 
 
 
 
77175ac
 
1cf89c1
 
77175ac
c2f297a
1cf89c1
 
 
 
 
 
 
 
 
 
 
 
 
ed1f9e1
 
 
 
1cf89c1
 
 
c2f297a
1cf89c1
c2f297a
 
 
1cf89c1
 
c2f297a
ed1f9e1
 
 
 
77175ac
1cf89c1
 
77175ac
c2f297a
77175ac
 
1cf89c1
 
77175ac
c2f297a
 
ed1f9e1
 
 
 
 
77175ac
1cf89c1
 
 
77175ac
1cf89c1
77175ac
 
c2f297a
ed1f9e1
 
 
 
 
c2f297a
77175ac
 
 
1cf89c1
77175ac
c2f297a
 
 
 
 
 
 
 
 
 
 
1cf89c1
c2f297a
ed1f9e1
 
 
1cf89c1
 
ed1f9e1
 
 
 
1cf89c1
ed1f9e1
 
 
 
 
 
 
 
 
 
 
 
 
 
1cf89c1
 
 
ed1f9e1
1cf89c1
 
 
ed1f9e1
 
 
 
 
 
1cf89c1
ed1f9e1
 
 
 
 
 
1cf89c1
ed1f9e1
 
 
 
 
 
1cf89c1
 
ed1f9e1

import json
import os
import re
import uuid
import random
from pathlib import Path

import pandas as pd
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go

from datasets import load_dataset
from huggingface_hub import CommitScheduler, hf_hub_download
from huggingface_hub.utils import RepositoryNotFoundError
from yaml import safe_load as yaml_load

from src.check_validity import validate_model
from src.task_mappings import professional_mapping, semantic_categories

# -----------------------------------------------------------------------------
# Page configuration and global CSS styles for modern look and improved UX
# -----------------------------------------------------------------------------
# Page-level Streamlit settings: browser-tab title and icon, wide layout, and
# a sidebar that starts expanded.
st.set_page_config(
    page_title="IberBench",
    layout="wide",
    initial_sidebar_state="expanded",
    page_icon="🌍",
)

# Inject a global <style> block; unsafe_allow_html=True is passed so the markup
# is emitted as HTML. The `.main-header` class defined here is the one used by
# the page headers rendered further down in this file.
# NOTE(review): `.css-1d391kg` / `.css-1qimj2v` look like auto-generated
# Streamlit class names and may not match other Streamlit versions — confirm.
st.markdown(
    """
    <style>
    /* General page styling */
    body {
        background-color: #f7f7f7;
        font-family: 'Segoe UI', sans-serif;
    }
    /* Sidebar styling */
    .css-1d391kg {
        background-color: #ffffff;
        border-right: 2px solid #eaeaea;
    }
    /* Header styling */
    .main-header {
        text-align: center;
        padding: 2rem 0;
        background: linear-gradient(90deg, #007BFF, #00BFFF);
        color: white;
        border-radius: 10px 10px 10px 10px;
    }
    /* Tab styling */
    .stTabs > .css-1qimj2v { 
        background: #fff;
    }
    /* Form styling */
    .stButton>button {
        background-color: #007BFF;
        color: white;
        border: none;
        border-radius: 5px;
    }
    </style>
    """,
    unsafe_allow_html=True,
)

# -----------------------------------------------------------------------------
# Global variables and helper functions
# -----------------------------------------------------------------------------
# Local JSON-lines file that receives model submissions (see log_submission);
# the uuid4 suffix gives every execution of this module its own file name
# inside user_request/.
request_file = Path("user_request/") / f"data_{uuid.uuid4()}.json"
# Folder handed to the commit scheduler below.
request_folder = request_file.parent

# YAML file with the per-language settings (opened by load_languages_set).
LANGUAGES_SETTINGS = Path("etc/languages_settings.yml")

# Metadata fields looked up in each task's task_metadata.json (see
# load_dataset_card); the list also fixes the column order of the dataset
# information table.
dataset_columns = [
    "workshop",
    "shared_task",
    "year",
    "task_type",
    "language",
    "url",
    "language_variety",
    "problem_type",
    "num_labels",
    "labels",
]
# Columns that describe a model; load_data treats every other results column
# as a task score.
model_columns = ["model_name", "model_type", "num_parameters"]

# Scheduler that pushes the contents of request_folder to the private dataset
# repo "iberbench/user-requests", under the "data" path.
# `every=10`: per the huggingface_hub documentation this interval is in minutes.
# NOTE(review): Streamlit re-executes the script on reruns, so this presumably
# builds a new scheduler (and a new request_file name) each time — consider
# caching it (e.g. st.cache_resource); confirm before changing.
scheduler = CommitScheduler(
    repo_id="iberbench/user-requests",
    repo_type="dataset",
    private=True,
    folder_path=request_folder,
    token=st.secrets["HF_TOKEN"],
    path_in_repo="data",
    every=10,
)


def log_submission(input_dict: dict) -> None:
    """Append one submission to the local request file as a JSON line.

    The scheduler's lock is held for the duration of the write.

    Args:
        input_dict: JSON-serializable submission payload.
    """
    with scheduler.lock, request_file.open("a") as handle:
        handle.write(json.dumps(input_dict) + "\n")


def get_lang_columns(columns: list, lang: str):
    """Select the column names that belong to a language.

    Args:
        columns: Candidate column names.
        lang: Language name as displayed, e.g. with spaces and capitals.

    Returns:
        For a name containing "Mixed": the columns that end with the first
        lower-cased word of ``lang`` (so columns carrying a variety suffix
        after the language are left out). Otherwise: the columns that contain
        ``lang`` lower-cased with spaces replaced by underscores.
    """
    if "Mixed" not in lang:
        needle = lang.lower().replace(" ", "_")
        return [name for name in columns if needle in name]

    suffix = lang.lower().split(" ")[0]
    return [name for name in columns if name.endswith(suffix)]


@st.cache_data
def load_data(lang) -> pd.DataFrame:
    """Load the evaluation results restricted to one language.

    Reads the "train" split of "iberbench/lm-eval-results", multiplies every
    non-model column by 100 and keeps the model columns plus the task columns
    selected by ``get_lang_columns`` for ``lang``.

    Args:
        lang: Language name used to select the task columns.

    Returns:
        The filtered results, or an empty DataFrame (after showing an error in
        the UI) when a FileNotFoundError is raised.
    """
    try:
        hub_dataset = load_dataset(
            "iberbench/lm-eval-results", token=st.secrets["HF_TOKEN"]
        )
        results = hub_dataset["train"].to_pandas()
        score_columns = [
            name for name in results.columns if name not in model_columns
        ]
        language_columns = get_lang_columns(score_columns, lang)
        # Scale the scores to percentages.
        results[score_columns] = results[score_columns] * 100
        return results[model_columns + language_columns]
    except FileNotFoundError:
        st.error("iberbench/lm-eval-results was not found in the hub 😕")
        return pd.DataFrame()


def load_dataset_card(task) -> list:
    """Fetch the metadata row of one task dataset.

    Downloads ``task_metadata.json`` from the dataset repo ``iberbench/<task>``
    and returns its values in ``dataset_columns`` order.

    Args:
        task: Task name, without the "iberbench/" prefix.

    Returns:
        One value per entry of ``dataset_columns``. A field missing from the
        metadata becomes ``[]`` for "labels" and "-" otherwise. If the repo
        does not exist, an error is shown and every field is "-".
    """
    repo_id = "iberbench/" + task
    try:
        metadata_path = hf_hub_download(
            repo_id=repo_id,
            filename="task_metadata.json",
            repo_type="dataset",
        )
        with open(metadata_path, "r") as handle:
            metadata = json.load(handle)

        row = []
        for field in dataset_columns:
            placeholder = [] if field == "labels" else "-"
            row.append(metadata[field] if field in metadata else placeholder)
        return row
    except RepositoryNotFoundError:
        st.error(task + ": dataset was not found in the hub 🚫")
        return ["-"] * len(dataset_columns)


def active_data(lang) -> pd.DataFrame:
    """Return a copy of the leaderboard rows of ``lang`` whose "Active" is True."""
    board = st.session_state[f"leaderboard_data_{lang}"]
    selected = st.session_state[f"leaderboard_data_{lang}"]["Active"] == True
    return board[selected].copy()


def get_index(lang, row) -> pd.Series:
    """Return the index label of the ``row``-th active row of ``lang``.

    NOTE(review): the annotation says ``pd.Series``, but the value returned is
    the ``.name`` of the selected row, i.e. its index label.
    """
    visible_rows = active_data(lang)
    picked = visible_rows.iloc[row]
    return picked.name


def commit(lang) -> None:
    """Write the edits stored in session state back into the leaderboard data.

    For every entry of ``edited_data_<lang>["edited_rows"]`` (row position ->
    {column: value}), the row position is translated into an index label via
    ``get_index`` and each changed cell is written into
    ``leaderboard_data_<lang>``.
    """
    board_key = f"leaderboard_data_{lang}"
    edits_key = f"edited_data_{lang}"
    for position in st.session_state[edits_key]["edited_rows"]:
        label = get_index(lang, position)
        changes = st.session_state[edits_key]["edited_rows"][position]
        for column, new_value in changes.items():
            st.session_state[board_key].at[label, column] = new_value


# -----------------------------------------------------------------------------
# Visualization helper functions
# -----------------------------------------------------------------------------
def create_table_results(df_mean: pd.DataFrame):
    """Render the ranking table built from a summary frame.

    A "Rank" column (dense rank of "Mean", best first; ranks 1-3 get a medal
    emoji) is inserted in place as the first column of ``df_mean``, and the
    rows are displayed sorted by "Mean" in descending order.

    Args:
        df_mean: Summary frame with a "Mean" column; it is modified in place.
    """
    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
    dense_ranks = (
        df_mean["Mean"].rank(method="dense", ascending=False).astype(int)
    )
    rank_labels = [
        f"{place} {medals[place]}" if place in medals else str(place)
        for place in dense_ranks
    ]
    df_mean.insert(0, "Rank", rank_labels)

    # Friendly headers for the model metadata columns.
    headers = {
        "model_name": st.column_config.TextColumn("Model 🧠"),
        "model_type": st.column_config.TextColumn("Type 📌"),
        "num_parameters": st.column_config.NumberColumn("Model Size 🔢"),
    }
    st.dataframe(
        df_mean.sort_values("Mean", ascending=False),
        hide_index=True,
        use_container_width=True,
        column_config=headers,
    )


def create_table_all_results(aggregated_df: pd.DataFrame):
    """Render the general ranking table with one extra column per language.

    The per-language "Mean" of every model is pivoted into one column per
    language and added to ``aggregated_df``; a "Rank" column (dense rank of
    "Mean", best first; ranks 1-3 get a medal emoji) is inserted as the first
    column; the rows are displayed sorted by "Mean" in descending order.

    Args:
        aggregated_df: One row per model with at least "model_name" and
            "Mean". It is modified in place (language columns and "Rank").
    """
    combined_df = create_data_results_per_language()
    # The helper returns None (after showing a warning) when it has nothing to
    # combine; in that case the table is shown without language columns.
    if combined_df is not None and not combined_df.empty:
        df_lang = combined_df.pivot(
            index="model_name", columns="language", values="Mean"
        )
        # Align on model_name instead of relying on both frames having the
        # same rows in the same order; models without a value get NaN.
        aligned = df_lang.reindex(aggregated_df["model_name"])
        aggregated_df[df_lang.columns] = aligned[df_lang.columns].values

    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
    dense_ranks = (
        aggregated_df["Mean"].rank(method="dense", ascending=False).astype(int)
    )
    rank_value = [
        f"{place} {medals[place]}" if place in medals else str(place)
        for place in dense_ranks
    ]
    aggregated_df.insert(0, "Rank", rank_value)
    df_final = aggregated_df.sort_values("Mean", ascending=False)
    st.dataframe(
        df_final,
        hide_index=True,
        use_container_width=True,
        column_config={
            "model_name": st.column_config.TextColumn("Model 🧠"),
            "model_type": st.column_config.TextColumn("Type 📌"),
            "num_parameters": st.column_config.NumberColumn("Model Size 🔢"),
        },
    )


def create_scatter_chart(df: pd.DataFrame, id_: str):
    """Plot "Mean" against "num_parameters", one coloured bubble per model.

    Args:
        df: Frame with "num_parameters", "Mean", "model_name", "model_type".
        id_: Prefix of the Streamlit element key.
    """
    scatter_options = {
        "x": "num_parameters",
        "y": "Mean",
        "color": "model_name",
        "size": "num_parameters",
        "hover_data": ["model_type"],
        "labels": {"num_parameters": "Num parameters"},
    }
    figure = px.scatter(df, **scatter_options)
    figure.update_layout(template="plotly_white")
    # A random suffix makes the element key different on every call.
    chart_key = id_ + str(random.random())
    st.plotly_chart(figure, use_container_width=True, key=chart_key)


def create_radar_chart(df: pd.DataFrame, id_: str):
    """Draw a filled polar line chart of the ten best models by "Mean".

    Args:
        df: Frame with "Mean" and "model_name" columns.
        id_: Prefix of the Streamlit element key.
    """
    top_ten = df.sort_values(by="Mean", ascending=False).head(10)
    polar_data = pd.DataFrame(
        {"r": top_ten["Mean"], "theta": top_ten["model_name"]}
    )
    figure = px.line_polar(
        polar_data,
        r="r",
        theta="theta",
        line_close=True,
        markers=True,
    )
    figure.update_traces(fill="toself")
    # A random suffix makes the element key different on every call.
    chart_key = id_ + str(random.random())
    st.plotly_chart(figure, use_container_width=True, key=chart_key)


def create_pie_chart(df: pd.DataFrame, id_: str):
    """Draw a pie chart of how many rows of ``df`` fall into each model type.

    Args:
        df: Frame with a "model_type" column.
        id_: Prefix of the Streamlit element key.
    """
    type_counts = df["model_type"].value_counts().reset_index()
    type_counts.columns = ["model_type", "count"]
    pie_options = {
        "values": "count",
        "names": "model_type",
        "labels": {"model_type": "Model type"},
    }
    figure = px.pie(type_counts, **pie_options)
    # A random suffix makes the element key different on every call.
    chart_key = id_ + str(random.random())
    st.plotly_chart(figure, use_container_width=True, key=chart_key)


def create_box_plot(df: pd.DataFrame, id_: str):
    """Draw a box plot of "Mean" per model type, showing every point.

    Args:
        df: Frame with "model_type" and "Mean" columns.
        id_: Prefix of the Streamlit element key.
    """
    box_options = {
        "x": "model_type",
        "y": "Mean",
        "points": "all",
        "labels": {"model_type": "Model type"},
    }
    figure = px.box(df, **box_options)
    # A random suffix makes the element key different on every call.
    chart_key = id_ + str(random.random())
    st.plotly_chart(figure, use_container_width=True, key=chart_key)


def get_summary_df(lang: str, task_types: list) -> pd.DataFrame:
    """Build the per-category summary of one language's leaderboard.

    For each category in ``task_types`` the scores of the leaderboard columns
    listed under that category in ``semantic_categories`` are averaged per
    model (rounded to 2 decimals). A "Mean" column holding the average of all
    category columns is inserted right after the model columns.

    Args:
        lang: Language whose ``leaderboard_data_<lang>`` entry is summarised.
        task_types: Keys of ``semantic_categories`` to include.

    Returns:
        A new frame with the model columns, "Mean", and one column per
        category that has at least one matching task. When the leaderboard
        data is empty only the model columns are returned. When no category
        matches, "Mean" is NaN.
    """
    data = st.session_state[f"leaderboard_data_{lang}"]
    df = data[model_columns].copy()
    if data.empty:
        return df

    for t in task_types:
        task_list = semantic_categories[t]
        cols = [col for col in data.columns if "iberbench/" + col in task_list]
        if cols:
            df[t] = data[cols].mean(axis=1).round(2)

    # Average every category column. The previous slice `3:-1` left the last
    # category out of the mean whenever two or more categories were present,
    # and `iloc[:, 3]` raised when there was none.
    first_category = len(model_columns)
    df.insert(
        first_category,
        "Mean",
        df.iloc[:, first_category:].mean(axis=1).round(2),
    )
    return df


def get_all_languages_summary_df() -> pd.DataFrame:
    """Stack the per-language summaries of every leaderboard in session state.

    Each ``leaderboard_data_<lang>`` entry is summarised with
    ``get_summary_df`` and tagged with a "language" column.

    NOTE: a second definition with the same name appears later in this module
    and replaces this one once the module has been executed.
    """
    prefix = "leaderboard_data_"
    stacked = pd.DataFrame()
    for key in st.session_state:
        if not key.startswith(prefix):
            continue
        lang = key.split(prefix)[1]
        per_language = get_summary_df(lang, select_task_per_language(lang))
        per_language["language"] = lang
        stacked = pd.concat([stacked, per_language], ignore_index=True)
    return stacked


def create_results_visualization_lang(lang: str):
    """Render the ranking table and the tabbed plots of a single language.

    Args:
        lang: Language whose ``leaderboard_data_<lang>`` entry is displayed.
    """
    summary_df = get_summary_df(lang, select_task_per_language(lang))
    tasks_df = st.session_state[f"leaderboard_data_{lang}"].copy()

    # Results table first, then the plots.
    create_table_results(summary_df)
    st.markdown("### Language plots 📊")

    # One (title, plotting function, data, key suffix) entry per tab, in
    # display order. The summary frame feeds the per-model plots; the raw task
    # scores feed the two category plots.
    tab_specs = [
        ("Top 10 performance 🥇", create_radar_chart, summary_df, "in_radar"),
        (
            "Performance vs. size 📏",
            create_scatter_chart,
            summary_df,
            "in_scatter",
        ),
        ("Performance per type 💡", create_box_plot, summary_df, "in_box"),
        (
            "Fundamental vs industry ⚖️",
            create_box_plot_per_task_category,
            tasks_df,
            "in_box_task_cat",
        ),
        (
            "Performance per task category 📈",
            create_box_plot_per_semantic_category,
            tasks_df,
            "in_box_sem_cat",
        ),
    ]
    tabs = st.tabs([title for title, _, _, _ in tab_specs])
    for tab, (_, draw, frame, suffix) in zip(tabs, tab_specs):
        with tab:
            draw(frame, lang + suffix)


# -----------------------------------------------------------------------------
# Functions for other visualization sections
# -----------------------------------------------------------------------------


def select_task_per_language(lang: str):
    """List the semantic categories that have a task column for ``lang``.

    A category is included when at least one of its tasks (with the
    "iberbench/" prefix removed) is a column of ``leaderboard_data_<lang>``.
    Categories are returned in ``semantic_categories`` order, without repeats.
    """
    board_key = f"leaderboard_data_{lang}"
    found = []
    for category, tasks in semantic_categories.items():
        for task in tasks:
            short_name = task.split("iberbench/")[1]
            is_present = short_name in list(st.session_state[board_key].columns)
            if is_present and category not in found:
                found.append(category)
    return found


def create_dataset_info_per_language(lang: str):
    """Show the metadata table of every task evaluated for ``lang``.

    One row is built per task column of ``leaderboard_data_<lang>`` via
    ``load_dataset_card``. When the leaderboard data is empty a short message
    is written instead.

    Args:
        lang: Language whose task datasets are listed.
    """
    data = st.session_state[f"leaderboard_data_{lang}"]
    if data.empty:
        st.write("No data found to display on leaderboard 😔.")
        return

    # Every column that is neither model metadata nor the "Active" flag is a
    # task. The flag is excluded by name: the previous `cols[:-1]` slice
    # assumed it was the last column and dropped the last task when the flag
    # was absent, and `cols[0]` raised when there were no task columns.
    non_task_columns = set(model_columns) | {"Active"}
    task_columns = [
        col for col in data.columns if col not in non_task_columns
    ]
    all_values = [load_dataset_card(task) for task in task_columns]

    df = pd.DataFrame(all_values, columns=dataset_columns)
    st.dataframe(
        df,
        column_config={
            "workshop": st.column_config.TextColumn(
                "Workshop 🏫", help="Workshop to belong to the shared task"
            ),
            "shared_task": st.column_config.TextColumn(
                "Shared Task 📋", help="Shared Task name"
            ),
            "year": st.column_config.TextColumn(
                "Year 📅", help="Year of the shared task"
            ),
            "task_type": st.column_config.TextColumn(
                "Task Type 🔖", help="Shared Task type"
            ),
            "language": st.column_config.TextColumn(
                "Language 🌐", help="Shared Task language"
            ),
            "url": st.column_config.ListColumn(
                "Task URL 🔗", help="Shared Task url"
            ),
            "language_variety": st.column_config.TextColumn(
                "Language Variety 🗣️", help="Shared Task language variety"
            ),
            "problem_type": st.column_config.TextColumn(
                "Problem Type ❓", help="Shared Task problem type"
            ),
            "num_labels": st.column_config.NumberColumn(
                "Number of Labels 🔢", help="Shared Task number of labels"
            ),
            "labels": st.column_config.ListColumn(
                "Labels 🏷️", help="Shared Task labels"
            ),
        },
        hide_index=True,
    )


def create_box_plot_per_task_category(df: pd.DataFrame, id_: str):
    """Draw a box plot of model performance per professional category.

    For every category of ``professional_mapping`` that has at least one
    matching task column in ``df``, the matching scores are averaged per row
    (rounded to 2 decimals); the plot shows one box per such category.

    Args:
        df: Frame with the model columns and the task score columns (task
            names without the "iberbench/" prefix); may also carry a
            "language" column. It is not modified.
        id_: Prefix of the Streamlit element key.
    """
    # Work on a copy so the category columns added below do not leak into the
    # caller's frame, which is reused for other plots.
    df = df.copy()

    # Categories are kept in mapping order; dict keys are already unique, and
    # going through a set would make the box order depend on string hashing.
    melt_vars = []
    for category, tasks in professional_mapping.items():
        relevant_cols = [
            col for col in df.columns if "iberbench/" + col in tasks
        ]
        if relevant_cols:
            df[category] = df[relevant_cols].mean(axis=1).round(2)
            melt_vars.append(category)

    id_vars = model_columns.copy()
    if "language" in df.columns:
        id_vars.append("language")
    # Long format: one row per (model, category) pair.
    df_melt = df.melt(
        id_vars=id_vars,
        value_vars=melt_vars,
        var_name="Task Category",
        value_name="Performance",
    )
    fig = px.box(
        df_melt,
        x="Task Category",
        y="Performance",
        points="all",
        labels={"Performance": "Performance (%)"},
    )
    # A random suffix makes the element key different on every call.
    st.plotly_chart(
        fig, use_container_width=True, key=id_ + str(random.random())
    )


def create_box_plot_per_semantic_category(df: pd.DataFrame, id_: str):
    """Draw a box plot of model performance per semantic category.

    For every category of ``semantic_categories`` that has at least one
    matching task column in ``df``, the matching scores are averaged per row
    (rounded to 2 decimals); the plot shows one box per such category.

    Args:
        df: Frame with the model columns and the task score columns (task
            names without the "iberbench/" prefix); may also carry a
            "language" column. It is not modified.
        id_: Prefix of the Streamlit element key.
    """
    # Work on a copy so the category columns added below do not leak into the
    # caller's frame, which is reused for other plots.
    df = df.copy()

    # Categories are kept in mapping order; dict keys are already unique, and
    # going through a set would make the box order depend on string hashing.
    melt_vars = []
    for category, tasks in semantic_categories.items():
        relevant_cols = [
            col for col in df.columns if "iberbench/" + col in tasks
        ]
        if relevant_cols:
            df[category] = df[relevant_cols].mean(axis=1).round(2)
            melt_vars.append(category)

    id_vars = model_columns.copy()
    if "language" in df.columns:
        id_vars.append("language")
    # Long format: one row per (model, category) pair.
    df_melt = df.melt(
        id_vars=id_vars,
        value_vars=melt_vars,
        var_name="Task Category",
        value_name="Performance",
    )
    fig = px.box(
        df_melt,
        x="Task Category",
        y="Performance",
        points="all",
        labels={"Performance": "Performance (%)"},
    )
    # A random suffix makes the element key different on every call.
    st.plotly_chart(
        fig, use_container_width=True, key=id_ + str(random.random())
    )


def create_histogram(df: pd.DataFrame, id_: str):
    """Draw a 20-bin histogram of "num_parameters".

    Args:
        df: Frame with a "num_parameters" column.
        id_: Prefix of the Streamlit element key.
    """
    axis_labels = {"num_parameters": "Num parameters", "count": "Count"}
    figure = px.histogram(
        df, x="num_parameters", nbins=20, labels=axis_labels
    )
    figure.update_layout(template="plotly_white")
    # A random suffix makes the element key different on every call.
    chart_key = id_ + str(random.random())
    st.plotly_chart(figure, use_container_width=True, key=chart_key)


def create_data_results_per_language() -> pd.DataFrame:
    """Stack the raw leaderboards of all languages and ensure a "Mean" column.

    Every ``leaderboard_data_<lang>`` entry of the session state is copied,
    tagged with a "language" column when it has none, and concatenated. If the
    result has no "Mean" column, one is computed as the row-wise mean (rounded
    to 2 decimals) of the numeric columns other than the model columns,
    "language" and "Active".

    Returns:
        The combined frame. ``None`` is returned, after showing a warning,
        when there is no data at all or when no numeric score column is
        available; callers have to handle that case.
    """
    prefix = "leaderboard_data_"
    combined_df = pd.DataFrame()
    for key in st.session_state.keys():
        if not key.startswith(prefix):
            continue
        frame = st.session_state[key].copy()
        # Fall back to the language encoded in the session-state key.
        if "language" not in frame.columns:
            frame["language"] = key.split(prefix)[1]
        combined_df = pd.concat([combined_df, frame], ignore_index=True)

    if combined_df.empty:
        st.warning("No data available for any language ⚠️.")
        return

    if "Mean" in combined_df.columns:
        return combined_df

    # Columns that must not take part in the performance average.
    excluded = ["model_name", "model_type", "num_parameters"] + [
        "language",
        "Active",
    ]
    performance_cols = []
    for col in combined_df.columns:
        if col in excluded:
            continue
        if pd.api.types.is_numeric_dtype(combined_df[col]):
            performance_cols.append(col)

    if not performance_cols:
        st.warning(
            "No numeric task performance columns available to compute 'Mean' ⚠️."
        )
        return

    combined_df["Mean"] = combined_df[performance_cols].mean(axis=1).round(2)
    return combined_df


def create_box_plot_per_language(id_: str):
    """Draw a box plot of "Mean" performance per language.

    The data comes from ``create_data_results_per_language``. When that
    helper has nothing to return (it shows its own warning and returns
    ``None``), no chart is drawn.

    Args:
        id_: Prefix of the Streamlit element key.
    """
    combined_df = create_data_results_per_language()
    # Guard against the helper's "no data" result instead of handing None to
    # plotly, which cannot resolve the column names without a data frame.
    if combined_df is None or combined_df.empty:
        return

    fig = px.box(
        combined_df,
        x="language",
        y="Mean",
        points="all",
        labels={"language": "Language", "Mean": "Performance (%)"},
    )
    # A random suffix makes the element key different on every call.
    st.plotly_chart(
        fig, use_container_width=True, key=id_ + str(random.random())
    )


def get_all_languages_summary_df() -> pd.DataFrame:
    """Stack the per-language summaries of every leaderboard in session state.

    Each ``leaderboard_data_<lang>`` entry is summarised with
    ``get_summary_df`` and tagged with a "language" column.

    NOTE: a function with the same name is also defined earlier in this
    module; this later definition is the one that stays bound to the name.
    """
    key_prefix = "leaderboard_data_"
    summaries = pd.DataFrame()
    language_keys = (
        key for key in st.session_state if key.startswith(key_prefix)
    )
    for key in language_keys:
        lang = key.split(key_prefix)[1]
        categories = select_task_per_language(lang)
        language_summary = get_summary_df(lang, categories)
        language_summary["language"] = lang
        summaries = pd.concat(
            [summaries, language_summary], ignore_index=True
        )
    return summaries


def get_all_languages_aggregated_summary_df() -> pd.DataFrame:
    """Collapse the all-language summary to one row per model.

    Rows of ``get_all_languages_summary_df`` are grouped by "model_name":
    "model_type" keeps its first value, "num_parameters" and "Mean" are
    averaged, and "Mean" is rounded to 2 decimals. The result feeds the
    general table and the radar, scatter, pie, box and histogram plots.
    """
    per_language = get_all_languages_summary_df()
    how_to_aggregate = {
        "model_type": "first",
        "num_parameters": "mean",
        "Mean": "mean",
    }
    per_model = per_language.groupby("model_name", as_index=False).agg(
        how_to_aggregate
    )
    per_model["Mean"] = per_model["Mean"].round(2)
    return per_model


def get_all_languages_raw_df() -> pd.DataFrame:
    """Stack the raw leaderboard data of all languages.

    Every ``leaderboard_data_<lang>`` entry is copied and its "language"
    column is set to ``<lang>``. The original task columns are kept, which is
    what the category plots rely on.
    """
    prefix = "leaderboard_data_"
    stacked = pd.DataFrame()
    for key in st.session_state:
        if not key.startswith(prefix):
            continue
        frame = st.session_state[key].copy()
        frame["language"] = key.split(prefix)[1]
        stacked = pd.concat([stacked, frame], ignore_index=True)
    return stacked


# -----------------------------------------------------------------------------
# Sidebar for Navigation and Global Settings
# -----------------------------------------------------------------------------
# Sidebar: centred app title, the navigation radio whose selection (`menu`)
# decides which page is rendered below, a divider, and a short tagline.
st.sidebar.markdown(
    "<h2 style='text-align: center;'>IberBench 🌍</h2>", unsafe_allow_html=True
)
# NOTE(review): the radio is created with an empty label; Streamlit's docs
# suggest a non-empty label hidden via label_visibility — confirm for the
# installed version.
menu = st.sidebar.radio(
    "", ["Leaderboard 📊", "Submit Model 🚀", "Datasets 📚", "About ℹ️"]
)
st.sidebar.markdown("---")
st.sidebar.markdown(
    """
    <p style="font-size:0.9rem; text-align:center;">
    A leaderboard of LLMs on languages from the Iberian Peninsula and Ibero-America
    </p>
    """,
    unsafe_allow_html=True,
)


def load_languages_set():
    """Parse the YAML file at ``LANGUAGES_SETTINGS`` and return its content."""
    with open(LANGUAGES_SETTINGS, "r") as settings_file:
        settings = yaml_load(settings_file)
    return settings


# Parsed language settings: keys are the language names passed to load_data,
# and each value carries a "category" entry used by the pages below.
lang_set = load_languages_set()

# Seed the session state with every language's leaderboard data. An entry that
# already exists is left untouched, so the stored frame persists across reruns.
for lang in lang_set.keys():
    data = load_data(lang)
    if f"leaderboard_data_{lang}" not in st.session_state:
        st.session_state[f"leaderboard_data_{lang}"] = data

# -----------------------------------------------------------------------------
# Main Content based on Navigation
# -----------------------------------------------------------------------------
# Render the page selected in the sidebar radio.
if menu == "Leaderboard 📊":
    st.markdown(
        "<div class='main-header'><h1>Leaderboard 📊</h1></div>",
        unsafe_allow_html=True,
    )
    # Languages offered in the selector: settings entries whose category is
    # "Iberian Peninsula languages".
    lang_iber = [
        k
        for k, v in lang_set.items()
        if v["category"] == "Iberian Peninsula languages"
    ]
    st.markdown("### General ranking 🏆")

    # ---------------------------
    # All-language plots section
    # ---------------------------
    # Use aggregated data for plots where each model must appear once with averaged values.
    aggregated_df = get_all_languages_aggregated_summary_df()
    create_table_all_results(aggregated_df)
    st.markdown("### General plots 📊")
    # Use raw data for Fundamental vs Professional and Task Category plots.
    raw_all_df = get_all_languages_raw_df()
    all_lang_tabs = st.tabs(
        [
            "Top 10 performance 🥇",
            "Performance vs. size 📏",
            "Type distribution 🎨",
            "Performance per type 💡",
            "Distribution of sizes 📊",
            "Fundamental vs industry ⚖️",
            "Performance per task category 📈",
            "Performance per language 🌐",
        ]
    )
    with all_lang_tabs[0]:
        create_radar_chart(aggregated_df, "all_radar")
    with all_lang_tabs[1]:
        create_scatter_chart(aggregated_df, "all_scatter")
    with all_lang_tabs[2]:
        create_pie_chart(aggregated_df, "all_pie")
    with all_lang_tabs[3]:
        create_box_plot(aggregated_df, "all_box")
    with all_lang_tabs[4]:
        create_histogram(aggregated_df, "all_hist")
    with all_lang_tabs[5]:
        # Use the raw combined data so that professional task columns are available.
        create_box_plot_per_task_category(raw_all_df, "all_box_task_cat")
    with all_lang_tabs[6]:
        create_box_plot_per_semantic_category(raw_all_df, "all_box_sem_cat")
    with all_lang_tabs[7]:
        create_box_plot_per_language("all_box_language")

    # Results per language
    st.markdown("---")
    st.markdown("### Language ranking 🏆")
    lang_choice = st.selectbox(
        "Select a language 🌐:", list(lang_iber), key="lang_leaderboard"
    )
    if lang_choice == "Spanish":
        # Spanish is shown as one tab per entry of the
        # "Spanish Variations languages" category.
        variations = [
            k
            for k, v in lang_set.items()
            if v["category"] in ["Spanish Variations languages"]
        ]
        tabs_var = st.tabs(variations)
        for var, tab in zip(variations, tabs_var):
            with tab:
                create_results_visualization_lang(var)
    else:
        create_results_visualization_lang(lang_choice)

elif menu == "Submit Model 🚀":
    st.markdown(
        "<div class='main-header'><h1>Submit Your Model 🚀</h1></div>",
        unsafe_allow_html=True,
    )
    st.markdown("## How to submit a model 📤")

    # CSS
    st.markdown(
        """
        <style>
            .card-container {
                max-width: 300px;
                margin: auto;
                text-align: left;
                font-size: 1rem;
                padding: 0.5rem;
                box-sizing: border-box;
            }
            .id-container {
                display: flex;
                align-items: center; 
                margin-bottom: 1rem;
            }
            .id-circle {
                width: 32px;
                height: 32px;
                border-radius: 50%;
                display: flex;
                align-items: center;
                justify-content: center;
                border: 1px solid #007BFF;
                color: #007BFF;
                font-size: 0.875rem;
                font-weight: 600;
                background-color: transparent;
                margin-right: 8px; 
            }
            .guide-content {
                word-wrap: break-word;
            }
            .guide-title {
                font-weight: bold;
                font-size: 1rem;
                margin-left: 8px;
            }
        </style>
    """,
        unsafe_allow_html=True,
    )

    def render_card(content):
        """Wrap an HTML fragment in the card markup styled by the CSS above."""
        html = f"""
        <div class="card-container">
            <div class="guide-content">
                {content}
            </div>
        </div>
        """
        return html

    # Load your HTML content from files
    # (file names are sorted, so the cards appear in name order)
    guide_info_list = []
    html_path = "assets/html"
    filenames = sorted(os.listdir(html_path))
    for filename in filenames:
        file_path = os.path.join(html_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            raw_html = file.read()
            guide_info_list.append(raw_html)

    # Create the grid
    # (cards fill the grid row by row; cells without a card stay empty)
    num_columns = 3
    num_rows = 2

    for row in range(num_rows):
        cols = st.columns(num_columns)
        for col in range(num_columns):
            index = row * num_columns + col
            if index < len(guide_info_list):
                with cols[col]:
                    st.markdown(
                        render_card(guide_info_list[index]),
                        unsafe_allow_html=True,
                    )

    st.markdown("## Submission form 📝")
    with st.form("submit_model_form", clear_on_submit=True):
        model_name = st.text_input(
            "Model Name (format: user_name/model_name) 🧩",
            help="Your model should be public on the Hub and follow the username/model-id format (e.g. mistralai/Mistral-7B-v0.1).",
        )
        description = st.text_area(
            "Description ✍️",
            help="Add a description of the proposed model for the evaluation to help prioritize its evaluation.",
        )
        user_contact = st.text_input(
            "Your Contact Email 📧",
            help="User e-mail to contact when there are updates.",
        )
        precision_option = st.selectbox(
            "Choose precision format 🔢:",
            help="Size limits vary by precision. Choose carefully as incorrect precision can cause evaluation errors.",
            options=["float16", "bfloat16", "8bit", "4bit", "GPTQ"],
            index=0,
        )
        weight_type_option = st.selectbox(
            "Select weight type ⚖️:",
            help="Original: Complete model weights. Delta: Differences from base model. Adapter: Lightweight fine-tuning layers.",
            options=["Original", "Adapter", "Delta"],
            index=0,
        )
        base_model_name = st.text_input(
            "Base model (if applicable) 🏗️",
            help="Required for delta weights or adapters. This helps calculate total parameter count.",
            value="",
        )
        model_type = st.selectbox(
            "Choose model type 🔍:",
            help="🟢 Pretrained: Base models, 🔶 Fine-tuned: Domain-specific, 💬 Chat: Conversational, 🤝 Merge: Combined weights.",
            options=["🟢 Pretrained", "🔶 Fine-tuned", "💬 Chat", "🤝 Merge"],
        )
        submit_button = st.form_submit_button("Submit Request 🚀")
        if submit_button:
            # The chat template is requested only for the chat model type.
            use_chat_template = model_type == "💬 Chat"
            validation_error = validate_model(
                model_name,
                precision_option,
                base_model_name,
                weight_type_option,
                use_chat_template,
            )
            if validation_error is not None:
                st.error(validation_error)
            # fullmatch: the whole input must look like local@domain.tld. A
            # prefix-only match would accept strings such as "a@b.c@d".
            elif not re.fullmatch(r"[^@]+@[^@]+\.[^@]+", user_contact):
                st.error("Invalid email address ⚠️.")
            else:
                input_dict = {
                    "model_name": model_name,
                    "description": description,
                    "user_contact": user_contact,
                    "precision_option": precision_option,
                    "weight_type_option": weight_type_option,
                    "base_model_name": base_model_name,
                    "model_type": model_type,
                }
                # Any failure while logging is reported to the user instead of
                # crashing the page.
                try:
                    log_submission(input_dict)
                    st.success("Your request has been sent successfully 🎉.")
                except Exception as e:
                    st.error(
                        f"Failed to send your request: {e}. Please try again later."
                    )

elif menu == "Datasets 📚":
    st.markdown(
        "<div class='main-header'><h1>Dataset Information 📚</h1></div>",
        unsafe_allow_html=True,
    )
    st.markdown("### Check the datasets 🔍")
    lang_iber = [
        k
        for k, v in lang_set.items()
        if v["category"] == "Iberian Peninsula languages"
    ]
    lang_choice = st.selectbox(
        "Select a language 🌐:", list(lang_iber), key="lang_dataset"
    )
    if lang_choice in ["Spanish"]:
        # Spanish is shown as one tab per entry of the
        # "Spanish Variations languages" category.
        variations = [
            k
            for k, v in lang_set.items()
            if v["category"] in ["Spanish Variations languages"]
        ]
        tabs_var = st.tabs(variations)
        for var, tab in zip(variations, tabs_var):
            with tab:
                create_dataset_info_per_language(var)
    else:
        create_dataset_info_per_language(lang_choice)
    st.markdown("### Task mappings 🔄")
    st.markdown(
        "For the sake of completeness, here we show the mappings we use in the leaderboard to aggregate tasks."
    )
    tab1, tab2 = st.tabs(
        ["Semantic categories 🗂️", "Fundamental vs. Industry ⚖️"]
    )
    # Both mappings are displayed with the "iberbench/" prefix stripped from
    # the task names.
    with tab1:
        st.json(
            {
                category: [task.removeprefix("iberbench/") for task in tasks]
                for category, tasks in semantic_categories.items()
            }
        )
    with tab2:
        st.json(
            {
                category: [task.removeprefix("iberbench/") for task in tasks]
                for category, tasks in professional_mapping.items()
            }
        )

elif menu == "About ℹ️":
    st.markdown(
        "<div class='main-header'><h1>About ℹ️</h1></div>",
        unsafe_allow_html=True,
    )
    # Read as UTF-8 explicitly, like the HTML assets of the submission page,
    # so the result does not depend on the platform's default encoding.
    with open("./assets/md/about.md", "r", encoding="utf-8") as fr:
        st.markdown(fr.read(), unsafe_allow_html=True)