Spaces:

a-ghorbani
/

ai-phone-leaderboard

Running

App Files Files Community

ai-phone-leaderboard / src /components /device_comparison.py

agh123

chore: order charts in Device Duel Arena

2443343 about 1 month ago

raw

history blame contribute delete

25.3 kB

	import streamlit as st
	import pandas as pd
	import plotly.graph_objects as go
	from typing import List, Optional

	from ..core.glicko2_ranking import analyze_device_glicko2_matches
	from ..components.visualizations import clean_device_id


	def create_head_to_head_battle_chart(
	device1: str,
	device2: str,
	device1_display: str,
	device2_display: str,
	token_wins_1: int,
	prompt_wins_1: int,
	combined_wins_1: int,
	total_matches: int,
	):
	"""Create an engaging head-to-head battle visualization."""

	# Calculate win percentages for both devices
	token_pct_1 = token_wins_1 / total_matches * 100
	token_pct_2 = 100 - token_pct_1

	prompt_pct_1 = prompt_wins_1 / total_matches * 100
	prompt_pct_2 = 100 - prompt_pct_1

	combined_pct_1 = combined_wins_1 / total_matches * 100
	combined_pct_2 = 100 - combined_pct_1

	# Create figure
	fig = go.Figure()

	# Add bars for device 1
	fig.add_trace(
	go.Bar(
	y=["Token Gen", "Prompt Proc", "Combined"],
	x=[token_pct_1, prompt_pct_1, combined_pct_1],
	name=device1_display,
	orientation="h",
	marker=dict(
	color="rgba(58, 71, 180, 0.8)",
	line=dict(color="rgba(58, 71, 180, 1.0)", width=2),
	),
	text=[
	f"{token_pct_1:.1f}%",
	f"{prompt_pct_1:.1f}%",
	f"{combined_pct_1:.1f}%",
	],
	textposition="inside",
	insidetextanchor="middle",
	hoverinfo="text",
	hovertext=[
	f"{device1_display}<br>Token Wins: {token_wins_1} ({token_pct_1:.1f}%)",
	f"{device1_display}<br>Prompt Wins: {prompt_wins_1} ({prompt_pct_1:.1f}%)",
	f"{device1_display}<br>Combined Wins: {combined_wins_1} ({combined_pct_1:.1f}%)",
	],
	width=0.5,
	)
	)

	# Add bars for device 2
	token_wins_2 = total_matches - token_wins_1
	prompt_wins_2 = total_matches - prompt_wins_1
	combined_wins_2 = total_matches - combined_wins_1

	fig.add_trace(
	go.Bar(
	y=["Token Gen", "Prompt Proc", "Combined"],
	x=[-token_pct_2, -prompt_pct_2, -combined_pct_2], # Negative to go left
	name=device2_display,
	orientation="h",
	marker=dict(
	color="rgba(231, 99, 99, 0.8)",
	line=dict(color="rgba(231, 99, 99, 1.0)", width=2),
	),
	text=[
	f"{token_pct_2:.1f}%",
	f"{prompt_pct_2:.1f}%",
	f"{combined_pct_2:.1f}%",
	],
	textposition="inside",
	insidetextanchor="middle",
	hoverinfo="text",
	hovertext=[
	f"{device2_display}<br>Token Wins: {token_wins_2} ({token_pct_2:.1f}%)",
	f"{device2_display}<br>Prompt Wins: {prompt_wins_2} ({prompt_pct_2:.1f}%)",
	f"{device2_display}<br>Combined Wins: {combined_wins_2} ({combined_pct_2:.1f}%)",
	],
	width=0.5,
	)
	)

	# Design: Add center line and decorations
	fig.add_shape(
	type="line",
	x0=0,
	y0=-0.5,
	x1=0,
	y1=2.5,
	line=dict(color="black", width=2, dash="solid"),
	)

	# VS label in the middle
	# fig.add_annotation(
	# x=0,
	# y=1.5,
	# text="VS",
	# showarrow=False,
	# font=dict(size=20, color="black", family="Arial Black"),
	# bgcolor="rgba(255, 255, 255, 0.8)",
	# bordercolor="black",
	# borderwidth=2,
	# borderpad=4,
	# width=50,
	# height=30,
	# )

	# Update layout for a battle-like appearance
	fig.update_layout(
	title=dict(
	text=f"⚔️ {device1_display} vs {device2_display} ⚔️",
	font=dict(size=24, family="Arial Black"),
	x=0.5,
	),
	barmode="overlay",
	bargap=0.15,
	bargroupgap=0.1,
	legend=dict(x=0.5, y=1.05, xanchor="center", orientation="h"),
	xaxis=dict(
	title="Win Rate (%)",
	range=[-100, 100],
	tickvals=[-100, -75, -50, -25, 0, 25, 50, 75, 100],
	ticktext=["100%", "75%", "50%", "25%", "0%", "25%", "50%", "75%", "100%"],
	zeroline=True,
	zerolinewidth=2,
	zerolinecolor="black",
	),
	yaxis=dict(title="", autorange="reversed"),
	plot_bgcolor="rgba(240, 240, 240, 0.8)",
	height=400,
	margin=dict(l=20, r=20, t=80, b=20),
	# annotations=[
	# dict(
	# x=-50,
	# y="Token Gen",
	# text=device2_display,
	# showarrow=False,
	# font=dict(
	# size=14, color="rgba(231, 99, 99, 1.0)", family="Arial Black"
	# ),
	# align="center",
	# xanchor="center",
	# ),
	# dict(
	# x=50,
	# y="Token Gen",
	# text=device1_display,
	# showarrow=False,
	# font=dict(
	# size=14, color="rgba(58, 71, 180, 1.0)", family="Arial Black"
	# ),
	# align="center",
	# xanchor="center",
	# ),
	# ],
	)

	return fig


	def create_victory_badge(winner_device: str, loser_device: str, win_percentage: float):
	"""Create a stylized victory badge."""
	badge_color = (
	"#FFD700"
	if win_percentage >= 75
	else "#C0C0C0" if win_percentage >= 50 else "#CD7F32"
	)
	badge_text = (
	"DOMINANT VICTORY"
	if win_percentage >= 75
	else "CLEAR WINNER" if win_percentage >= 50 else "NARROW VICTORY"
	)

	html = f"""
	<div style="display: flex; justify-content: center; margin: 20px 0;">
	<div style="
	background: linear-gradient(135deg, {badge_color} 0%, #FFFFFF 50%, {badge_color} 100%);
	border-radius: 16px;
	padding: 20px;
	box-shadow: 0 4px 8px rgba(0,0,0,0.2);
	text-align: center;
	border: 2px solid {badge_color};
	max-width: 90%;
	">
	<div style="font-size: 24px; font-weight: bold; margin-bottom: 8px; font-family: 'Arial Black', sans-serif;">
	🏆 {badge_text} 🏆
	</div>
	<div style="font-size: 18px; font-weight: bold; color: #333;">
	{winner_device}
	</div>
	<div style="font-size: 14px; margin: 8px 0;">
	defeated
	</div>
	<div style="font-size: 16px; color: #555;">
	{loser_device}
	</div>
	<div style="font-size: 20px; font-weight: bold; margin-top: 8px; color: #333;">
	{win_percentage:.1f}% Win Rate
	</div>
	</div>
	</div>
	"""
	return html


	def create_model_performance_chart(
	matches_df, device1, device2, device1_display, device2_display, top_n=8
	):
	"""Create an improved model performance comparison chart with vertical models and side-by-side bars."""
	# Group by model and calculate mean for both devices
	token_cols = ["Model", "Token Generation 1", "Token Generation 2"]
	prompt_cols = ["Model", "Prompt Processing 1", "Prompt Processing 2"]

	# Ensure all required columns exist
	if not all(col in matches_df.columns for col in token_cols + prompt_cols[1:]):
	return None

	# Prepare data with basic metrics
	agg_dict = {
	"Token Generation 1": "mean",
	"Token Generation 2": "mean",
	"Prompt Processing 1": "mean",
	"Prompt Processing 2": "mean",
	"Model File Size": "first",
	}

	# Group by model and aggregate
	grouped = matches_df.groupby("Model").agg(agg_dict).reset_index()

	# Sort by model name alphabetically
	grouped = grouped.sort_values("Model File Size", ascending=False)

	# Take first top_n models
	if len(grouped) > top_n:
	grouped = grouped.head(top_n)

	# Create figure
	fig = go.Figure()

	# Use model names directly
	models = grouped["Model"].tolist()

	token_gen_1 = grouped["Token Generation 1"].tolist()
	token_gen_2 = grouped["Token Generation 2"].tolist()
	prompt_proc_1 = grouped["Prompt Processing 1"].tolist()
	prompt_proc_2 = grouped["Prompt Processing 2"].tolist()

	# Add Token Generation traces
	fig.add_trace(
	go.Bar(
	x=token_gen_1,
	y=models,
	name=f"{device1_display} Token Gen",
	orientation="h",
	marker=dict(color="rgba(58, 71, 180, 0.8)"),
	hovertemplate="%{y}<br>%{x:.2f} tokens/sec<extra></extra>",
	legendgroup="device1",
	offsetgroup=1,
	xaxis="x",
	)
	)

	fig.add_trace(
	go.Bar(
	x=token_gen_2,
	y=models,
	name=f"{device2_display} Token Gen",
	orientation="h",
	marker=dict(color="rgba(231, 99, 99, 0.8)"),
	hovertemplate="%{y}<br>%{x:.2f} tokens/sec<extra></extra>",
	legendgroup="device2",
	offsetgroup=2,
	xaxis="x",
	)
	)

	# Add Prompt Processing traces
	fig.add_trace(
	go.Bar(
	x=prompt_proc_1,
	y=models,
	name=f"{device1_display} Prompt Proc",
	orientation="h",
	marker=dict(color="rgba(58, 71, 180, 0.4)"),
	hovertemplate="%{y}<br>%{x:.2f} tokens/sec<extra></extra>",
	legendgroup="device1",
	offsetgroup=1,
	xaxis="x2",
	showlegend=False,
	)
	)

	fig.add_trace(
	go.Bar(
	x=prompt_proc_2,
	y=models,
	name=f"{device2_display} Prompt Proc",
	orientation="h",
	marker=dict(color="rgba(231, 99, 99, 0.4)"),
	hovertemplate="%{y}<br>%{x:.2f} tokens/sec<extra></extra>",
	legendgroup="device2",
	offsetgroup=2,
	xaxis="x2",
	showlegend=False,
	)
	)

	# Create layout with two x-axes
	fig.update_layout(
	title_text="📊 Performance Breakdown by Model",
	grid=dict(rows=1, columns=2, pattern="independent"),
	legend=dict(orientation="h", yanchor="bottom", y=1.12, xanchor="right", x=1),
	height=max(
	350, 50 * len(models) + 120
	), # Dynamic height based on number of models
	margin=dict(l=20, r=20, t=80, b=50),
	xaxis=dict(
	title="Token Generation (tokens/sec)", side="bottom", domain=[0, 0.48]
	),
	xaxis2=dict(
	title="Prompt Processing (tokens/sec)", side="bottom", domain=[0.52, 1]
	),
	yaxis=dict(title="", autorange="reversed"),
	)

	# Add a center divider
	fig.add_shape(
	type="line",
	x0=0.5,
	y0=0,
	x1=0.5,
	y1=1,
	xref="paper",
	yref="paper",
	line=dict(color="rgba(0,0,0,0.2)", width=1, dash="dash"),
	)

	# Add headers for each section
	fig.add_annotation(
	x=0.4,
	y=1.08,
	xanchor="right",
	xref="paper",
	yref="paper",
	text="Token Generation",
	showarrow=False,
	font=dict(
	size=14,
	color="rgba(58, 71, 180, 1.0)",
	family="Arial, sans-serif",
	weight="bold",
	),
	)

	fig.add_annotation(
	x=0.6,
	y=1.08,
	xanchor="left",
	xref="paper",
	yref="paper",
	text="Prompt Processing",
	showarrow=False,
	font=dict(
	size=14,
	color="rgba(231, 99, 99, 1.0)",
	family="Arial, sans-serif",
	weight="bold",
	),
	)

	# Better styling for the model names
	fig.update_yaxes(
	tickfont=dict(size=12, family="Arial, sans-serif"), gridcolor="rgba(0,0,0,0.05)"
	)

	return fig


	def render_device_comparison(df: pd.DataFrame, normalized_device_ids: List[str]):
	"""
	Render a component for comparing two devices and analyzing their matches.

	Args:
	df: DataFrame containing benchmark data
	normalized_device_ids: List of normalized device IDs to select from
	"""
	st.title("⚔️ Device Duel Arena")

	# Add dramatic introduction with some CSS styling
	st.markdown(
	"""
	<div style="text-align: center; padding: 10px; margin-bottom: 20px;
	background: linear-gradient(135deg, #f6f8fa 0%, #e9ecef 100%);
	border-radius: 10px; border: 1px solid #dee2e6;">
	<p style="font-size: 16px; font-style: italic; color: #495057;">
	Welcome to the arena where devices face off in direct comparison!
	Choose any two and see how they stack up.
	</p>
	</div>
	""",
	unsafe_allow_html=True,
	)

	# Create mapping of normalized IDs to display names
	device_display_names = {
	device_id: clean_device_id(device_id) for device_id in normalized_device_ids
	}

	# Sort device IDs alphabetically by their display names
	sorted_device_ids = sorted(
	normalized_device_ids, key=lambda x: device_display_names[x].lower()
	)

	# Create two columns for device selection with battle-themed styling
	st.markdown(
	"""
	<style>
	.device-select-header {
	font-weight: bold;
	font-size: 18px;
	margin-bottom: 10px;
	text-align: center;
	padding: 5px;
	border-radius: 5px;
	}
	.device1-header {
	background-color: rgba(58, 71, 180, 0.2);
	border-left: 4px solid rgba(58, 71, 180, 1.0);
	}
	.device2-header {
	background-color: rgba(231, 99, 99, 0.2);
	border-left: 4px solid rgba(231, 99, 99, 1.0);
	}
	</style>
	""",
	unsafe_allow_html=True,
	)

	col1, vs_col, col2 = st.columns([0.45, 0.1, 0.45])

	with vs_col:
	st.markdown(
	"""
	<div style="display: flex; height: 100%; align-items: center; justify-content: center;">
	<div style="font-size: 24px; font-weight: bold; color: #555;">VS</div>
	</div>
	""",
	unsafe_allow_html=True,
	)

	with col1:
	st.markdown(
	'<div class="device-select-header device1-header">CHALLENGER</div>',
	unsafe_allow_html=True,
	)
	device1 = st.selectbox(
	"First Device",
	options=sorted_device_ids,
	format_func=lambda x: device_display_names[x],
	key="device_compare_1",
	index=None,
	placeholder="Select a device ...",
	)

	with col2:
	st.markdown(
	'<div class="device-select-header device2-header">OPPONENT</div>',
	unsafe_allow_html=True,
	)
	device2 = st.selectbox(
	"Second Device",
	options=sorted_device_ids,
	format_func=lambda x: device_display_names[x],
	key="device_compare_2",
	index=None,
	placeholder="Select a device ...",
	)

	# Button to analyze matches with a more exciting style
	button_col1, button_col2, button_col3 = st.columns([0.3, 0.4, 0.3])
	with button_col2:
	duel_button = st.button(
	"️Start",
	key="analyze_matches_btn",
	use_container_width=True,
	)

	if duel_button:
	# Validate device selection
	if not device1 or not device2:
	st.error("Please select two devices to battle!")
	return
	elif device1 == device2:
	st.error("Please select two different devices to compare.")
	return

	# Create dramatic divider
	st.markdown(
	"""
	<div style="text-align: center; margin: 20px 0;">
	<div style="font-size: 24px; font-weight: bold; color: #333;">⚔️ BATTLE RESULTS ⚔️</div>
	<div style="height: 4px; background: linear-gradient(90deg, rgba(58,71,180,1) 0%, rgba(231,99,99,1) 100%); margin: 10px 0;"></div>
	</div>
	""",
	unsafe_allow_html=True,
	)

	with st.spinner(
	f"⚔️ Battle in progress between {device_display_names[device1]} and {device_display_names[device2]}..."
	):
	try:
	# Analyze matches using Glicko-2
	matches_df = analyze_device_glicko2_matches(df, device1, device2)

	if not matches_df.empty:
	# Show summary statistics
	total_matches = len(matches_df)

	# Check for required columns before calculating metrics
	if (
	"Token Winner" in matches_df.columns
	and "Prompt Winner" in matches_df.columns
	and "Combined Winner" in matches_df.columns
	):
	token_wins_1 = sum(matches_df["Token Winner"] == device1)
	prompt_wins_1 = sum(matches_df["Prompt Winner"] == device1)
	combined_wins_1 = sum(matches_df["Combined Winner"] == device1)

	# Display total matches info
	st.markdown(
	f"""
	<div style="text-align: center; padding: 10px; background-color: #f8f9fa;
	border-radius: 5px; margin: 10px 0; border: 1px solid #dee2e6;">
	<span style="font-size: 16px; font-weight: bold;">Total Matches: {total_matches}</span>
	</div>
	""",
	unsafe_allow_html=True,
	)

	# Show victory badge for the overall winner
	winner_device = (
	device1 if combined_wins_1 > total_matches / 2 else device2
	)
	loser_device = device2 if winner_device == device1 else device1

	winner_display = device_display_names[winner_device]
	loser_display = device_display_names[loser_device]

	win_percentage = (
	(combined_wins_1 / total_matches * 100)
	if winner_device == device1
	else (
	(total_matches - combined_wins_1) / total_matches * 100
	)
	)

	st.markdown(
	create_victory_badge(
	winner_display, loser_display, win_percentage
	),
	unsafe_allow_html=True,
	)

	# Create battle visualization
	battle_fig = create_head_to_head_battle_chart(
	device1,
	device2,
	device_display_names[device1],
	device_display_names[device2],
	token_wins_1,
	prompt_wins_1,
	combined_wins_1,
	total_matches,
	)

	st.plotly_chart(battle_fig, use_container_width=True)

	# Replace the model-specific charts with the new integrated version
	model_performance_chart = create_model_performance_chart(
	matches_df,
	device1,
	device2,
	device_display_names[device1],
	device_display_names[device2],
	)

	if model_performance_chart:
	st.plotly_chart(
	model_performance_chart, use_container_width=True
	)

	# Show the detailed match table
	with st.expander("View Detailed Match Results", expanded=False):
	st.markdown("#### All Match Data")

	# Define display columns for Glicko-2
	display_cols = [
	"Model",
	"Token Generation 1",
	"Token Generation 2",
	"Token Winner",
	"Token Win Prob",
	"Prompt Processing 1",
	"Prompt Processing 2",
	"Prompt Winner",
	"Prompt Win Prob",
	"Combined Winner",
	"Combined Win Prob",
	"Platform 1",
	"Platform 2",
	]

	# Ensure all columns exist in the dataframe
	valid_cols = [
	col for col in display_cols if col in matches_df.columns
	]

	if valid_cols:
	# Rename some columns for better display
	matches_display = matches_df[valid_cols].copy()

	# Define a rename mapping but only apply for columns that exist
	rename_mapping = {
	"Token Generation 1": f"{device_display_names[device1]} Token Gen",
	"Token Generation 2": f"{device_display_names[device2]} Token Gen",
	"Prompt Processing 1": f"{device_display_names[device1]} Prompt Proc",
	"Prompt Processing 2": f"{device_display_names[device2]} Prompt Proc",
	"Platform 1": f"{device_display_names[device1]} Platform",
	"Platform 2": f"{device_display_names[device2]} Platform",
	"Token Win Prob": "Device 1 Token Win Prob",
	"Prompt Win Prob": "Device 1 Prompt Win Prob",
	"Combined Win Prob": "Device 1 Combined Win Prob",
	}

	# Only rename columns that exist in the dataframe
	rename_filtered = {
	k: v
	for k, v in rename_mapping.items()
	if k in matches_display.columns
	}
	matches_display = matches_display.rename(
	columns=rename_filtered
	)

	# Round any numeric columns for better display
	for col in matches_display.columns:
	if matches_display[col].dtype in [
	"float64",
	"float32",
	]:
	matches_display[col] = matches_display[
	col
	].round(2)

	st.dataframe(
	matches_display,
	use_container_width=True,
	height=400,
	)
	else:
	st.warning(
	"No valid columns found for display in the match data."
	)

	# # Platform breakdown if available
	# if "Platform 2" in matches_df.columns:
	# with st.expander("Platform Distribution", expanded=False):
	# platform_counts = matches_df[
	# "Platform 2"
	# ].value_counts()
	# st.bar_chart(platform_counts)
	else:
	st.warning("Winner information is missing from the match data.")
	else:
	st.error(
	f"No matches found between {device_display_names[device1]} and {device_display_names[device2]}."
	)
	st.info(
	"Try selecting different devices or checking if they both have benchmark data for the same models."
	)
	except Exception as e:
	st.error(f"An error occurred during match analysis: {str(e)}")
	st.info("Please try with different devices.")