# Source: ai-phone-leaderboard / src/components/device_comparison.py
# Page residue (not code): "agh123's picture" · "chore: order charts in Device Duel Arena" · 2443343
import streamlit as st
import pandas as pd
import plotly.graph_objects as go
from typing import List, Optional
from ..core.glicko2_ranking import analyze_device_glicko2_matches
from ..components.visualizations import clean_device_id
def _win_percentages(wins_1, total_matches):
    """Return ``(device 1 %, device 2 %)`` for one metric; both 0.0 when there are no matches."""
    if total_matches <= 0:
        return 0.0, 0.0
    pct_1 = wins_1 / total_matches * 100
    return pct_1, 100 - pct_1


def create_head_to_head_battle_chart(
    device1: str,
    device2: str,
    device1_display: str,
    device2_display: str,
    token_wins_1: int,
    prompt_wins_1: int,
    combined_wins_1: int,
    total_matches: int,
):
    """Create a diverging horizontal bar chart of head-to-head win rates.

    Device 1's win rates extend to the right of a central zero line and
    device 2's to the left, one row each for "Token Gen", "Prompt Proc" and
    "Combined".

    Args:
        device1: Identifier of the first device (unused here; kept so the
            signature stays stable for callers).
        device2: Identifier of the second device (unused here; kept so the
            signature stays stable for callers).
        device1_display: Label shown for the first device.
        device2_display: Label shown for the second device.
        token_wins_1: Matches device 1 won on token generation.
        prompt_wins_1: Matches device 1 won on prompt processing.
        combined_wins_1: Matches device 1 won on the combined metric.
        total_matches: Number of matches played. Every match device 1 did not
            win is attributed to device 2.

    Returns:
        The assembled ``go.Figure``. When ``total_matches`` is zero or
        negative all bars have zero length instead of raising
        ``ZeroDivisionError``.
    """
    categories = ["Token Gen", "Prompt Proc", "Combined"]
    hover_labels = ["Token Wins", "Prompt Wins", "Combined Wins"]

    wins_1 = [token_wins_1, prompt_wins_1, combined_wins_1]
    wins_2 = [total_matches - wins for wins in wins_1]
    shares = [_win_percentages(wins, total_matches) for wins in wins_1]
    pcts_1 = [share[0] for share in shares]
    pcts_2 = [share[1] for share in shares]

    # (label, win counts, win percentages, bar direction, RGB colour)
    # Device 2 is plotted with negative x values so its bars grow leftwards.
    sides = [
        (device1_display, wins_1, pcts_1, 1, "58, 71, 180"),
        (device2_display, wins_2, pcts_2, -1, "231, 99, 99"),
    ]

    fig = go.Figure()
    for display, wins, pcts, direction, rgb in sides:
        fig.add_trace(
            go.Bar(
                y=categories,
                x=[direction * pct for pct in pcts],
                name=display,
                orientation="h",
                marker=dict(
                    color=f"rgba({rgb}, 0.8)",
                    line=dict(color=f"rgba({rgb}, 1.0)", width=2),
                ),
                # Bar text shows the unsigned percentage on both sides.
                text=[f"{pct:.1f}%" for pct in pcts],
                textposition="inside",
                insidetextanchor="middle",
                hoverinfo="text",
                hovertext=[
                    f"{display}<br>{label}: {won} ({pct:.1f}%)"
                    for label, won, pct in zip(hover_labels, wins, pcts)
                ],
                width=0.5,
            )
        )

    # Solid centre line separating the two devices.
    fig.add_shape(
        type="line",
        x0=0,
        y0=-0.5,
        x1=0,
        y1=2.5,
        line=dict(color="black", width=2, dash="solid"),
    )

    fig.update_layout(
        title=dict(
            text=f"⚔️ {device1_display} vs {device2_display} ⚔️",
            font=dict(size=24, family="Arial Black"),
            x=0.5,
        ),
        barmode="overlay",
        bargap=0.15,
        bargroupgap=0.1,
        legend=dict(x=0.5, y=1.05, xanchor="center", orientation="h"),
        xaxis=dict(
            title="Win Rate (%)",
            range=[-100, 100],
            tickvals=[-100, -75, -50, -25, 0, 25, 50, 75, 100],
            # Tick labels are unsigned because the left side is also a win rate.
            ticktext=["100%", "75%", "50%", "25%", "0%", "25%", "50%", "75%", "100%"],
            zeroline=True,
            zerolinewidth=2,
            zerolinecolor="black",
        ),
        yaxis=dict(title="", autorange="reversed"),
        plot_bgcolor="rgba(240, 240, 240, 0.8)",
        height=400,
        margin=dict(l=20, r=20, t=80, b=20),
    )
    return fig
# Badge markup; placeholders are filled in by create_victory_badge().
_VICTORY_BADGE_TEMPLATE = """
<div style="display: flex; justify-content: center; margin: 20px 0;">
<div style="
background: linear-gradient(135deg, {badge_color} 0%, #FFFFFF 50%, {badge_color} 100%);
border-radius: 16px;
padding: 20px;
box-shadow: 0 4px 8px rgba(0,0,0,0.2);
text-align: center;
border: 2px solid {badge_color};
max-width: 90%;
">
<div style="font-size: 24px; font-weight: bold; margin-bottom: 8px; font-family: 'Arial Black', sans-serif;">
🏆 {badge_text} 🏆
</div>
<div style="font-size: 18px; font-weight: bold; color: #333;">
{winner_device}
</div>
<div style="font-size: 14px; margin: 8px 0;">
defeated
</div>
<div style="font-size: 16px; color: #555;">
{loser_device}
</div>
<div style="font-size: 20px; font-weight: bold; margin-top: 8px; color: #333;">
{win_percentage:.1f}% Win Rate
</div>
</div>
</div>
"""


def create_victory_badge(winner_device: str, loser_device: str, win_percentage: float):
    """Create the HTML for a stylized victory badge.

    The badge tier depends on ``win_percentage``:

    * ``>= 75`` -> gold, "DOMINANT VICTORY"
    * ``>= 50`` -> silver, "CLEAR WINNER"
    * otherwise -> bronze, "NARROW VICTORY"

    Args:
        winner_device: Display name of the winning device.
        loser_device: Display name of the losing device.
        win_percentage: Winner's win rate in percent (0-100).

    Returns:
        An HTML snippet as a string. Both device names are HTML-escaped so
        that names containing ``<``, ``>`` or ``&`` cannot inject markup when
        the snippet is rendered as raw HTML.
    """
    from html import escape  # local import: keeps the block self-contained

    if win_percentage >= 75:
        badge_color, badge_text = "#FFD700", "DOMINANT VICTORY"
    elif win_percentage >= 50:
        badge_color, badge_text = "#C0C0C0", "CLEAR WINNER"
    else:
        badge_color, badge_text = "#CD7F32", "NARROW VICTORY"

    return _VICTORY_BADGE_TEMPLATE.format(
        badge_color=badge_color,
        badge_text=badge_text,
        winner_device=escape(str(winner_device)),
        loser_device=escape(str(loser_device)),
        win_percentage=win_percentage,
    )
def create_model_performance_chart(
    matches_df: pd.DataFrame,
    device1: str,
    device2: str,
    device1_display: str,
    device2_display: str,
    top_n: int = 8,
):
    """Create a per-model performance chart comparing two devices.

    Models are listed on the y axis. The left panel shows mean token
    generation speed and the right panel mean prompt processing speed, each
    with one bar per device.

    Args:
        matches_df: Match data. Must contain the columns "Model",
            "Token Generation 1", "Token Generation 2", "Prompt Processing 1"
            and "Prompt Processing 2". "Model File Size" is optional and only
            used for ordering.
        device1: Identifier of the first device (unused; kept for interface
            stability).
        device2: Identifier of the second device (unused; kept for interface
            stability).
        device1_display: Label used for the first device's traces.
        device2_display: Label used for the second device's traces.
        top_n: Maximum number of models to show.

    Returns:
        A ``go.Figure``, or ``None`` when a required column is missing or
        there is no data to plot.
    """
    metric_cols = [
        "Token Generation 1",
        "Token Generation 2",
        "Prompt Processing 1",
        "Prompt Processing 2",
    ]
    if any(col not in matches_df.columns for col in ["Model", *metric_cols]):
        return None
    if matches_df.empty:
        return None

    # Average each metric per model; carry the file size along when present.
    agg_spec = {col: "mean" for col in metric_cols}
    has_file_size = "Model File Size" in matches_df.columns
    if has_file_size:
        agg_spec["Model File Size"] = "first"

    grouped = matches_df.groupby("Model").agg(agg_spec).reset_index()
    if grouped.empty:
        return None

    # Largest models first. Without a size column the groupby order
    # (sorted by model name) is kept.
    if has_file_size:
        grouped = grouped.sort_values("Model File Size", ascending=False)
    grouped = grouped.head(top_n)

    models = grouped["Model"].tolist()

    # (column prefix, legend suffix, marker opacity, x axis, legend entry shown?)
    # Only the token generation traces get legend entries; the prompt
    # processing traces share their legend group.
    metrics = [
        ("Token Generation", "Token Gen", 0.8, "x", True),
        ("Prompt Processing", "Prompt Proc", 0.4, "x2", False),
    ]
    # (column suffix, display name, RGB colour, legend group, offset group)
    devices = [
        ("1", device1_display, "58, 71, 180", "device1", 1),
        ("2", device2_display, "231, 99, 99", "device2", 2),
    ]

    fig = go.Figure()
    for column_prefix, short_name, opacity, axis, in_legend in metrics:
        for suffix, display, rgb, legend_group, offset_group in devices:
            extra = {} if in_legend else {"showlegend": False}
            fig.add_trace(
                go.Bar(
                    x=grouped[f"{column_prefix} {suffix}"].tolist(),
                    y=models,
                    name=f"{display} {short_name}",
                    orientation="h",
                    marker=dict(color=f"rgba({rgb}, {opacity})"),
                    hovertemplate="%{y}<br>%{x:.2f} tokens/sec<extra></extra>",
                    legendgroup=legend_group,
                    offsetgroup=offset_group,
                    xaxis=axis,
                    **extra,
                )
            )

    # Two side-by-side x axes sharing one y axis of model names.
    fig.update_layout(
        title_text="📊 Performance Breakdown by Model",
        grid=dict(rows=1, columns=2, pattern="independent"),
        legend=dict(orientation="h", yanchor="bottom", y=1.12, xanchor="right", x=1),
        # Height grows with the number of models shown.
        height=max(350, 50 * len(models) + 120),
        margin=dict(l=20, r=20, t=80, b=50),
        xaxis=dict(
            title="Token Generation (tokens/sec)", side="bottom", domain=[0, 0.48]
        ),
        xaxis2=dict(
            title="Prompt Processing (tokens/sec)", side="bottom", domain=[0.52, 1]
        ),
        yaxis=dict(title="", autorange="reversed"),
    )

    # Dashed divider between the two panels (paper coordinates).
    fig.add_shape(
        type="line",
        x0=0.5,
        y0=0,
        x1=0.5,
        y1=1,
        xref="paper",
        yref="paper",
        line=dict(color="rgba(0,0,0,0.2)", width=1, dash="dash"),
    )

    # Panel headers above each half.
    headers = [
        (0.4, "right", "Token Generation", "rgba(58, 71, 180, 1.0)"),
        (0.6, "left", "Prompt Processing", "rgba(231, 99, 99, 1.0)"),
    ]
    for x_pos, anchor, label, color in headers:
        fig.add_annotation(
            x=x_pos,
            y=1.08,
            xanchor=anchor,
            xref="paper",
            yref="paper",
            text=label,
            showarrow=False,
            font=dict(
                size=14,
                color=color,
                family="Arial, sans-serif",
                weight="bold",
            ),
        )

    fig.update_yaxes(
        tickfont=dict(size=12, family="Arial, sans-serif"), gridcolor="rgba(0,0,0,0.05)"
    )
    return fig
# Static markup used by render_device_comparison().
_ARENA_INTRO_HTML = """
<div style="text-align: center; padding: 10px; margin-bottom: 20px;
background: linear-gradient(135deg, #f6f8fa 0%, #e9ecef 100%);
border-radius: 10px; border: 1px solid #dee2e6;">
<p style="font-size: 16px; font-style: italic; color: #495057;">
Welcome to the arena where devices face off in direct comparison!
Choose any two and see how they stack up.
</p>
</div>
"""

_DEVICE_SELECT_CSS = """
<style>
.device-select-header {
font-weight: bold;
font-size: 18px;
margin-bottom: 10px;
text-align: center;
padding: 5px;
border-radius: 5px;
}
.device1-header {
background-color: rgba(58, 71, 180, 0.2);
border-left: 4px solid rgba(58, 71, 180, 1.0);
}
.device2-header {
background-color: rgba(231, 99, 99, 0.2);
border-left: 4px solid rgba(231, 99, 99, 1.0);
}
</style>
"""

_VS_HTML = """
<div style="display: flex; height: 100%; align-items: center; justify-content: center;">
<div style="font-size: 24px; font-weight: bold; color: #555;">VS</div>
</div>
"""

_BATTLE_RESULTS_HTML = """
<div style="text-align: center; margin: 20px 0;">
<div style="font-size: 24px; font-weight: bold; color: #333;">⚔️ BATTLE RESULTS ⚔️</div>
<div style="height: 4px; background: linear-gradient(90deg, rgba(58,71,180,1) 0%, rgba(231,99,99,1) 100%); margin: 10px 0;"></div>
</div>
"""

_TOTAL_MATCHES_TEMPLATE = """
<div style="text-align: center; padding: 10px; background-color: #f8f9fa;
border-radius: 5px; margin: 10px 0; border: 1px solid #dee2e6;">
<span style="font-size: 16px; font-weight: bold;">Total Matches: {total_matches}</span>
</div>
"""

# Columns that must be present before any win statistics are computed.
_WINNER_COLUMNS = ("Token Winner", "Prompt Winner", "Combined Winner")

# Columns shown in the detailed match table, in display order.
_MATCH_TABLE_COLUMNS = [
    "Model",
    "Token Generation 1",
    "Token Generation 2",
    "Token Winner",
    "Token Win Prob",
    "Prompt Processing 1",
    "Prompt Processing 2",
    "Prompt Winner",
    "Prompt Win Prob",
    "Combined Winner",
    "Combined Win Prob",
    "Platform 1",
    "Platform 2",
]


def _render_match_table(matches_df, display1, display2):
    """Render the collapsible table of per-match results."""
    with st.expander("View Detailed Match Results", expanded=False):
        st.markdown("#### All Match Data")

        valid_cols = [col for col in _MATCH_TABLE_COLUMNS if col in matches_df.columns]
        if not valid_cols:
            st.warning("No valid columns found for display in the match data.")
            return

        matches_display = matches_df[valid_cols].copy()

        # Round floats BEFORE renaming: two devices may share a display name,
        # which would make the renamed column labels ambiguous.
        float_cols = list(
            matches_display.select_dtypes(include=["float64", "float32"]).columns
        )
        if float_cols:
            matches_display[float_cols] = matches_display[float_cols].round(2)

        rename_mapping = {
            "Token Generation 1": f"{display1} Token Gen",
            "Token Generation 2": f"{display2} Token Gen",
            "Prompt Processing 1": f"{display1} Prompt Proc",
            "Prompt Processing 2": f"{display2} Prompt Proc",
            "Platform 1": f"{display1} Platform",
            "Platform 2": f"{display2} Platform",
            "Token Win Prob": "Device 1 Token Win Prob",
            "Prompt Win Prob": "Device 1 Prompt Win Prob",
            "Combined Win Prob": "Device 1 Combined Win Prob",
        }
        # Only rename columns that actually exist in the table.
        matches_display = matches_display.rename(
            columns={
                old: new
                for old, new in rename_mapping.items()
                if old in matches_display.columns
            }
        )

        st.dataframe(matches_display, use_container_width=True, height=400)


def _render_battle_results(matches_df, device1, device2, display1, display2):
    """Render summary, badge (or draw notice), charts and table for a non-empty match set."""
    if any(col not in matches_df.columns for col in _WINNER_COLUMNS):
        st.warning("Winner information is missing from the match data.")
        return

    total_matches = len(matches_df)
    token_wins_1 = int((matches_df["Token Winner"] == device1).sum())
    prompt_wins_1 = int((matches_df["Prompt Winner"] == device1).sum())
    combined_wins_1 = int((matches_df["Combined Winner"] == device1).sum())
    # Every match not won by device 1 is attributed to device 2, matching the
    # assumption made by the head-to-head chart.
    combined_wins_2 = total_matches - combined_wins_1

    st.markdown(
        _TOTAL_MATCHES_TEMPLATE.format(total_matches=total_matches),
        unsafe_allow_html=True,
    )

    if combined_wins_1 == combined_wins_2:
        # An even split has no winner; do not hand the badge to either side.
        st.info(
            f"🤝 It's a draw! {display1} and {display2} each won "
            f"{combined_wins_1} of {total_matches} matches."
        )
    else:
        if combined_wins_1 > combined_wins_2:
            winner_display, loser_display = display1, display2
            winner_wins = combined_wins_1
        else:
            winner_display, loser_display = display2, display1
            winner_wins = combined_wins_2
        win_percentage = winner_wins / total_matches * 100
        st.markdown(
            create_victory_badge(winner_display, loser_display, win_percentage),
            unsafe_allow_html=True,
        )

    battle_fig = create_head_to_head_battle_chart(
        device1,
        device2,
        display1,
        display2,
        token_wins_1,
        prompt_wins_1,
        combined_wins_1,
        total_matches,
    )
    st.plotly_chart(battle_fig, use_container_width=True)

    model_performance_chart = create_model_performance_chart(
        matches_df, device1, device2, display1, display2
    )
    if model_performance_chart is not None:
        st.plotly_chart(model_performance_chart, use_container_width=True)

    _render_match_table(matches_df, display1, display2)


def render_device_comparison(df: pd.DataFrame, normalized_device_ids: List[str]):
    """
    Render a component for comparing two devices and analyzing their matches.

    The user picks two devices and presses the button. The matches between
    them are then analyzed with ``analyze_device_glicko2_matches`` and shown
    as a victory badge (or a draw notice on an even split), a head-to-head
    chart, a per-model chart and a detailed table.

    Args:
        df: DataFrame containing benchmark data
        normalized_device_ids: List of normalized device IDs to select from
    """
    st.title("⚔️ Device Duel Arena")
    st.markdown(_ARENA_INTRO_HTML, unsafe_allow_html=True)

    # Map normalized IDs to display names and sort the options by display name.
    device_display_names = {
        device_id: clean_device_id(device_id) for device_id in normalized_device_ids
    }
    sorted_device_ids = sorted(
        normalized_device_ids, key=lambda x: device_display_names[x].lower()
    )

    st.markdown(_DEVICE_SELECT_CSS, unsafe_allow_html=True)

    col1, vs_col, col2 = st.columns([0.45, 0.1, 0.45])
    with vs_col:
        st.markdown(_VS_HTML, unsafe_allow_html=True)
    with col1:
        st.markdown(
            '<div class="device-select-header device1-header">CHALLENGER</div>',
            unsafe_allow_html=True,
        )
        device1 = st.selectbox(
            "First Device",
            options=sorted_device_ids,
            format_func=lambda x: device_display_names[x],
            key="device_compare_1",
            index=None,
            placeholder="Select a device ...",
        )
    with col2:
        st.markdown(
            '<div class="device-select-header device2-header">OPPONENT</div>',
            unsafe_allow_html=True,
        )
        device2 = st.selectbox(
            "Second Device",
            options=sorted_device_ids,
            format_func=lambda x: device_display_names[x],
            key="device_compare_2",
            index=None,
            placeholder="Select a device ...",
        )

    # Centre the button in the middle of three columns.
    _, button_col, _ = st.columns([0.3, 0.4, 0.3])
    with button_col:
        duel_button = st.button(
            # NOTE(review): the label starts with U+FE0F (variation selector),
            # apparently left over from a removed emoji; kept unchanged.
            "\ufe0fStart",
            key="analyze_matches_btn",
            use_container_width=True,
        )

    if not duel_button:
        return

    # Validate device selection
    if not device1 or not device2:
        st.error("Please select two devices to battle!")
        return
    if device1 == device2:
        st.error("Please select two different devices to compare.")
        return

    display1 = device_display_names[device1]
    display2 = device_display_names[device2]

    st.markdown(_BATTLE_RESULTS_HTML, unsafe_allow_html=True)

    with st.spinner(f"⚔️ Battle in progress between {display1} and {display2}..."):
        # Broad catch is deliberate: this is the UI boundary, and any failure
        # in analysis or rendering is shown to the user instead of crashing.
        try:
            matches_df = analyze_device_glicko2_matches(df, device1, device2)
            if matches_df.empty:
                st.error(f"No matches found between {display1} and {display2}.")
                st.info(
                    "Try selecting different devices or checking if they both have benchmark data for the same models."
                )
            else:
                _render_battle_results(
                    matches_df, device1, device2, display1, display2
                )
        except Exception as e:
            st.error(f"An error occurred during match analysis: {str(e)}")
            st.info("Please try with different devices.")