|
""" |
|
Glicko-2 Ranking System for Device Performance Comparison |
|
|
|
This module implements a Glicko-2 based ranking system for comparing device performance |
|
in benchmark tests. Glicko-2 is an improvement over the original Glicko system and Elo, |
|
providing better handling of rating uncertainty and volatility. |
|
|
|
The system: |
|
1. Filters out emulators and iOS devices with insufficient GPU layers |
|
2. Normalizes scores within each model group |
|
3. Computes Glicko-2 ratings for devices based on their performance |
|
4. Provides uncertainty metrics alongside ratings |
|
5. Supports both combined and separate analysis of Token Generation and Prompt Processing |
|
""" |
|
|
|
import numpy as np |
|
import pandas as pd |
|
from collections import defaultdict |
|
from typing import Tuple, Dict, List, Optional |
|
import glicko2 |
|
import streamlit as st |
|
|
|
|
|
# Lifetime of cached ranking results, passed as ``ttl`` to ``st.cache_data``
# on ``analyze_glicko2_rankings``. Streamlit reads a numeric ttl as seconds,
# so this is ten hours.
CACHE_DURATION = 10 * 60 * 60
|
|
|
|
|
def preprocess_benchmark_data(
    df: pd.DataFrame,
    min_gpu_layers: int = 20,
    pp_config: int = 512,
    tg_config: int = 128,
) -> pd.DataFrame:
    """
    Filter benchmark rows down to comparable runs on real devices.

    A row is kept only when all of the following hold:

    * its "Normalized Device ID" does not contain "Emulator" (case-insensitive;
      missing IDs count as non-emulators),
    * its "Platform" is not "iOS", or its "n_gpu_layers" is at least
      ``min_gpu_layers``,
    * its "PP Config" equals ``pp_config`` and its "TG Config" equals
      ``tg_config``.

    A per-filter summary of how many distinct devices disappeared is printed
    to stdout.

    Args:
        df: DataFrame containing benchmark data
        min_gpu_layers: Minimum number of GPU layers required for iOS devices
        pp_config: Prompt Processing configuration to filter for
        tg_config: Token Generation configuration to filter for

    Returns:
        Filtered DataFrame (a copy) containing only valid benchmark entries
    """
    device_ids = df["Normalized Device ID"]

    # Row-level masks, one per filter, so each can also be reported separately.
    is_emulator = device_ids.str.contains("Emulator", case=False, na=False)
    gpu_ok = (df["Platform"] != "iOS") | (df["n_gpu_layers"] >= min_gpu_layers)
    config_ok = (df["PP Config"] == pp_config) & (df["TG Config"] == tg_config)

    filtered_df = df[gpu_ok & ~is_emulator & config_ok].copy()

    # Device-level funnel: emulators first, then devices with no row passing
    # the GPU-layer rule, then devices whose remaining rows all miss the
    # requested PP/TG configuration. Attributing every non-emulator removal to
    # the GPU-layer rule would over-count whenever the config filter drops a
    # device.
    all_ids = set(device_ids.dropna())
    emulator_ids = set(device_ids[is_emulator])
    gpu_ok_ids = set(device_ids[gpu_ok & ~is_emulator].dropna())
    kept_ids = set(filtered_df["Normalized Device ID"].dropna())
    low_gpu_ids = all_ids - emulator_ids - gpu_ok_ids
    config_mismatch_ids = gpu_ok_ids - kept_ids

    print("Filtering Statistics:")
    print(f"Original devices: {len(all_ids)}")
    print(f"Emulator devices removed: {len(emulator_ids)}")
    print(
        f"iOS devices with insufficient GPU layers removed: "
        f"{len(low_gpu_ids)}"
    )
    print(
        f"Devices without a matching PP/TG config removed: "
        f"{len(config_mismatch_ids)}"
    )
    print(f"Final device count: {len(kept_ids)}")

    print(
        f"Removed {set(df['Normalized Device ID'].unique()) - set(filtered_df['Normalized Device ID'].unique())} "
    )

    return filtered_df
|
|
|
|
|
def compute_glicko2_rankings(
    df: pd.DataFrame, token_weight: float = 0.6
) -> pd.DataFrame:
    """
    Compute device rankings using Glicko-2 rating system.

    Every "Model ID" group is treated as one rating period. Inside a group,
    "Token Generation" and "Prompt Processing" are min-max normalized and
    blended with ``token_weight``; every pair of devices then plays one match
    decided by the blended score (equal scores are a draw). All matches of a
    period are collected against the opponents' ratings as they stood before
    the period, and only then is each player updated.

    When a device has several rows for the same model, the first row is the
    one that represents it; the normalization bounds are still taken over all
    rows of the group.

    Args:
        df: DataFrame containing benchmark data. Must provide the columns
            "Normalized Device ID", "Model ID", "Token Generation",
            "Prompt Processing" and "Platform". Rows without a device ID are
            ignored.
        token_weight: Weight for Token Generation in combined score (0.0 to 1.0)

    Returns:
        DataFrame indexed by "device" with the columns rating, rd, volatility,
        matches, wins, losses, conserv_rating (rating - 2 * rd), win_rate and
        Platform. Devices that never played a match are omitted; if nobody
        played, an empty frame with those columns is returned.
    """
    id_column = "Normalized Device ID"
    result_columns = [
        "rating",
        "rd",
        "volatility",
        "matches",
        "wins",
        "losses",
        "conserv_rating",
        "win_rate",
        "Platform",
    ]

    def create_glicko2_rating():
        return glicko2.Player(rating=1500, rd=350, vol=0.06)

    def min_max(values: pd.Series) -> pd.Series:
        """Scale values to [0, 1]; a constant (or all-NaN) column maps to 0."""
        low = values.min()
        high = values.max()
        if high > low:
            return (values - low) / (high - low)
        return pd.Series(0.0, index=values.index)

    def normalize_scores(group: pd.DataFrame) -> pd.Series:
        """Normalize and combine scores within a model group"""
        token_norm = min_max(group["Token Generation"])
        prompt_norm = min_max(group["Prompt Processing"])
        return token_weight * token_norm + (1 - token_weight) * prompt_norm

    # A missing device ID cannot be matched to a player, so drop those rows.
    known = df[df[id_column].notna()]

    ratings = {
        device: create_glicko2_rating() for device in known[id_column].unique()
    }
    match_counts = defaultdict(int)
    win_counts = defaultdict(int)
    loss_counts = defaultdict(int)

    def record(period, device, opponent, outcome):
        """Queue one result for ``device`` using the opponent's current rating."""
        entry = period[device]
        entry["opponent_ratings"].append(ratings[opponent].rating)
        entry["opponent_rds"].append(ratings[opponent].rd)
        entry["outcomes"].append(outcome)

    for _, group in known.groupby("Model ID"):
        # Computed on the side instead of being written into the groupby slice.
        combined = normalize_scores(group)

        # One score per device (its first row), looked up in O(1) per pair.
        first_rows = ~group[id_column].duplicated()
        device_scores = dict(
            zip(group.loc[first_rows, id_column], combined[first_rows])
        )
        devices = list(device_scores)

        period = defaultdict(
            lambda: {"opponent_ratings": [], "opponent_rds": [], "outcomes": []}
        )

        for position, device1 in enumerate(devices):
            for device2 in devices[position + 1:]:
                score1 = device_scores[device1]
                score2 = device_scores[device2]

                match_counts[device1] += 1
                match_counts[device2] += 1

                if score1 > score2:
                    outcome = 1
                    win_counts[device1] += 1
                    loss_counts[device2] += 1
                elif score1 < score2:
                    outcome = 0
                    win_counts[device2] += 1
                    loss_counts[device1] += 1
                else:
                    # Draw: counts as a match but as neither a win nor a loss.
                    outcome = 0.5

                record(period, device1, device2, outcome)
                record(period, device2, device1, 1 - outcome)

        # Apply the updates only after the whole period has been collected so
        # that every result is scored against pre-period opponent ratings.
        for device, matches in period.items():
            ratings[device].update_player(
                matches["opponent_ratings"],
                matches["opponent_rds"],
                matches["outcomes"],
            )

    ranking_data = [
        {
            "device": device,
            "rating": rating.rating,
            "rd": rating.rd,
            "volatility": rating.vol,
            "matches": match_counts[device],
            "wins": win_counts[device],
            "losses": loss_counts[device],
            "conserv_rating": rating.rating - (2 * rating.rd),
        }
        for device, rating in ratings.items()
        if match_counts[device] > 0
    ]

    if not ranking_data:
        return pd.DataFrame(columns=result_columns).rename_axis("device")

    ranking_df = pd.DataFrame(ranking_data).set_index("device")
    ranking_df["win_rate"] = ranking_df["wins"] / ranking_df["matches"]

    # Platform of the first row seen for each device. It is assigned as a
    # plain list in index order: assigning a device-keyed Series to a frame
    # that still has a positional index would align on labels and yield NaN.
    first_seen = known.drop_duplicates(id_column)
    platforms = dict(zip(first_seen[id_column], first_seen["Platform"]))
    ranking_df["Platform"] = [platforms[device] for device in ranking_df.index]

    return ranking_df
|
|
|
|
|
@st.cache_data(ttl=CACHE_DURATION)
def analyze_glicko2_rankings(
    df: pd.DataFrame,
    min_matches: int = 5,
    min_gpu_layers: int = 20,
    pp_config: int = 512,
    tg_config: int = 128,
    token_weight: float = 0.6,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Build combined, token-only and prompt-only Glicko-2 rankings in one table.

    The data is first filtered with ``preprocess_benchmark_data``; rankings
    are then computed three times with ``compute_glicko2_rankings`` (blended
    with ``token_weight``, Token Generation only, Prompt Processing only) and
    joined on the device index. The per-variant columns are named
    ``<variant>_rating``, ``_rd``, ``_vol``, ``_conserv``, ``_wins``,
    ``_losses`` and ``_win_rate`` with ``<variant>`` being ``combined``,
    ``token`` or ``prompt``; ``matches`` and ``Platform`` appear once.

    The result is cached through ``st.cache_data`` with a ttl of
    ``CACHE_DURATION``.

    Args:
        df: DataFrame containing benchmark data
        min_matches: Minimum number of matches required for confident rankings
        min_gpu_layers: Minimum number of GPU layers required for iOS devices
        pp_config: Prompt Processing configuration to filter for
        tg_config: Token Generation configuration to filter for
        token_weight: Weight for Token Generation in the combined ranking
            (0.0 to 1.0)

    Returns:
        Tuple of (all rankings DataFrame, confident rankings DataFrame). The
        confident frame holds the devices with at least ``min_matches``
        matches, sorted by ``combined_rating`` descending. Both frames are
        empty when no device took part in any match.
    """
    # Source column in a single ranking frame -> suffix used in the joined table.
    metric_suffixes = {
        "rating": "rating",
        "rd": "rd",
        "volatility": "vol",
        "conserv_rating": "conserv",
        "wins": "wins",
        "losses": "losses",
        "win_rate": "win_rate",
    }

    def with_prefix(frame: pd.DataFrame, prefix: str) -> pd.DataFrame:
        """Rename the per-variant metric columns to ``<prefix>_<suffix>``."""
        return frame.rename(
            columns={
                source: f"{prefix}_{suffix}"
                for source, suffix in metric_suffixes.items()
            }
        )

    filtered_df = preprocess_benchmark_data(df, min_gpu_layers, pp_config, tg_config)

    rankings = with_prefix(
        compute_glicko2_rankings(filtered_df, token_weight=token_weight),
        "combined",
    )

    # Nothing to join or filter when no matches were played; returning early
    # avoids a KeyError on the missing metric columns.
    if rankings.empty:
        return rankings, rankings.copy()

    for prefix, weight in (("token", 1.0), ("prompt", 0.0)):
        variant = with_prefix(
            compute_glicko2_rankings(filtered_df, token_weight=weight), prefix
        )
        # Column assignment aligns on the shared device index.
        for suffix in metric_suffixes.values():
            column = f"{prefix}_{suffix}"
            rankings[column] = variant[column]

    confident_rankings = rankings[rankings["matches"] >= min_matches].sort_values(
        "combined_rating", ascending=False
    )

    return rankings, confident_rankings
|
|
|
|
|
def analyze_device_glicko2_matches(
    df: pd.DataFrame,
    device_id1: str,
    device_id2: Optional[str] = None,
    token_weight: float = 0.6,
) -> pd.DataFrame:
    """
    List the head-to-head matches of one device, model by model.

    For every "Model ID" in which ``device_id1`` appears, each device is
    reduced to a single row (maximum "Token Generation" and "Prompt
    Processing", first "n_gpu_layers", "Platform" and "Model File Size").
    Both metrics are min-max normalized across those rows and blended with
    ``token_weight``. ``device_id1`` is then compared with ``device_id2``, or
    with every other device of that model when ``device_id2`` is None.

    The reported win probabilities are ``1 / (1 + 10 ** (-6 * advantage))``
    where ``advantage`` is the difference of the normalized scores. They do
    not depend on any Glicko-2 rating.

    Args:
        df: DataFrame containing benchmark data
        device_id1: First device ID to analyze
        device_id2: Optional second device ID to compare against
        token_weight: Weight for Token Generation in combined score (0.0 to 1.0)

    Returns:
        DataFrame with one row per (model, opponent) holding the raw values,
        the winners and the win probabilities of ``device_id1``. When there is
        no match a message is printed and an empty frame with the same columns
        is returned.
    """
    id_column = "Normalized Device ID"
    result_columns = [
        "Model",
        "Device 1",
        "Device 2",
        "n_gpu_layers 1",
        "n_gpu_layers 2",
        "Token Generation 1",
        "Token Generation 2",
        "Token Winner",
        "Token Win Prob",
        "Prompt Processing 1",
        "Prompt Processing 2",
        "Prompt Winner",
        "Prompt Win Prob",
        "Combined Winner",
        "Combined Win Prob",
        "Platform 1",
        "Platform 2",
        "Model File Size",
    ]

    def min_max(values: pd.Series) -> pd.Series:
        """Scale values to [0, 1]; a constant (or all-NaN) column maps to 0."""
        low = values.min()
        span = values.max() - low
        if span > 0:
            return (values - low) / span
        return pd.Series(0.0, index=values.index)

    def win_probability(advantage: float) -> float:
        """Base-10 logistic of the normalized score difference (0.5 at a tie)."""
        return 1 / (1 + 10 ** (-6 * advantage))

    def pick_winner(value1, value2, other_device):
        """Name the device with the larger value, or "Tie" when neither is."""
        if value1 > value2:
            return device_id1
        if value2 > value1:
            return other_device
        return "Tie"

    matches = []

    for model, group in df.groupby("Model ID"):
        # Cheap membership test first so unrelated models skip the aggregation.
        if device_id1 not in group[id_column].values:
            continue

        # One row per device, indexed by device ID for direct lookups.
        device_agg = group.groupby(id_column).agg(
            {
                "Token Generation": "max",
                "Prompt Processing": "max",
                "n_gpu_layers": "first",
                "Platform": "first",
                "Model File Size": "first",
            }
        )

        if device_id2 is not None:
            if device_id2 not in device_agg.index:
                continue
            devices_to_compare = [device_id2]
        else:
            devices_to_compare = [d for d in device_agg.index if d != device_id1]

        token_norm = min_max(device_agg["Token Generation"])
        prompt_norm = min_max(device_agg["Prompt Processing"])
        combined = token_weight * token_norm + (1 - token_weight) * prompt_norm

        device1_data = device_agg.loc[device_id1]

        for other_device in devices_to_compare:
            device2_data = device_agg.loc[other_device]

            token_advantage = token_norm.at[device_id1] - token_norm.at[other_device]
            prompt_advantage = (
                prompt_norm.at[device_id1] - prompt_norm.at[other_device]
            )
            combined_advantage = combined.at[device_id1] - combined.at[other_device]

            matches.append(
                {
                    "Model": model,
                    "Device 1": device_id1,
                    "Device 2": other_device,
                    "n_gpu_layers 1": device1_data["n_gpu_layers"],
                    "n_gpu_layers 2": device2_data["n_gpu_layers"],
                    "Token Generation 1": device1_data["Token Generation"],
                    "Token Generation 2": device2_data["Token Generation"],
                    # Token/prompt winners are decided on the raw values,
                    # the combined winner on the blended normalized score.
                    "Token Winner": pick_winner(
                        device1_data["Token Generation"],
                        device2_data["Token Generation"],
                        other_device,
                    ),
                    "Token Win Prob": win_probability(token_advantage),
                    "Prompt Processing 1": device1_data["Prompt Processing"],
                    "Prompt Processing 2": device2_data["Prompt Processing"],
                    "Prompt Winner": pick_winner(
                        device1_data["Prompt Processing"],
                        device2_data["Prompt Processing"],
                        other_device,
                    ),
                    "Prompt Win Prob": win_probability(prompt_advantage),
                    "Combined Winner": pick_winner(
                        combined.at[device_id1],
                        combined.at[other_device],
                        other_device,
                    ),
                    "Combined Win Prob": win_probability(combined_advantage),
                    "Platform 1": device1_data["Platform"],
                    "Platform 2": device2_data["Platform"],
                    "Model File Size": device1_data["Model File Size"],
                }
            )

    if not matches:
        print(
            f"No matches found for device {device_id1}"
            + (f" against {device_id2}" if device_id2 else "")
        )

    return pd.DataFrame(matches, columns=result_columns)
|
|
|
|
|
if __name__ == "__main__":
    # Running the file directly only prints a short usage hint; the module is
    # meant to be imported.
    usage_lines = (
        "This module provides Glicko-2 ranking for device performance.",
        "Import and use the functions in your own code.",
        "Example:",
        "  from glicko2_ranking import analyze_glicko2_rankings",
        "  rankings, confident_rankings = analyze_glicko2_rankings(df)",
    )
    for usage_line in usage_lines:
        print(usage_line)
|
|