# History: commit 157b914 — chore: filter out anomalies (z_threshold=9.0)
import pandas as pd
def add_model_size_groups(df, group_size=0.5, max_size=15):
    """
    Add a 'Size Group' column categorizing model file sizes into fixed-width bins.

    Args:
        df (pandas.DataFrame): DataFrame containing model benchmark data. Must
            contain either a 'Model Size GB' column or a 'Model File Size'
            column (bytes; converted via division by 1024**3).
        group_size (float): Width of each size bin in GB (default: 0.5).
        max_size (int): Sizes at or above this value (GB) are lumped into a
            single '>{max_size} GB' bucket (default: 15).

    Returns:
        pandas.DataFrame: Copy of the input with an additional 'Size Group'
        column; the input DataFrame is not modified. The input is returned
        unchanged if it is None or empty.

    Raises:
        ValueError: If neither 'Model Size GB' nor 'Model File Size' exists.
    """
    # Hoisted out of assign_size_group so the import executes once per call,
    # not once per row.
    import math

    if df is None or df.empty:
        return df
    result_df = df.copy()
    if "Model Size GB" not in result_df.columns:
        # Derive the GB column from the raw byte count when it is absent.
        if "Model File Size" not in result_df.columns:
            raise ValueError("DataFrame must contain 'Model File Size' column")
        result_df["Model Size GB"] = result_df["Model File Size"] / 1024**3

    def assign_size_group(size):
        # Map a single size (in GB) to its bin label.
        if size is None or pd.isna(size):
            return "Unknown"
        if size >= max_size:
            return f">{max_size} GB"
        group_index = math.floor(size / group_size)
        lower_bound = group_index * group_size
        upper_bound = lower_bound + group_size
        # Round away float noise. Ten decimal places (rather than one) keeps
        # bounds exact for fine-grained group_size values such as 0.25, while
        # producing identical labels for the default 0.5 GB bins.
        lower_bound = round(lower_bound, 10)
        upper_bound = round(upper_bound, 10)
        return f"{lower_bound}-{upper_bound} GB"

    result_df["Size Group"] = result_df["Model Size GB"].apply(assign_size_group)
    return result_df
def detect_anomalies(df, z_threshold=6.0, min_samples=5):
    """
    Detect per-size-group statistical outliers in benchmark data.

    For each metric ('Prompt Processing', 'Token Generation') and each
    'Size Group', entries whose absolute z-score against the group mean
    exceeds z_threshold are flagged.

    Args:
        df (pd.DataFrame): DataFrame containing benchmark data. Expected to
            contain the metric columns plus 'Model', 'Device', 'Device ID',
            'Platform', 'Benchmark' and 'Submission ID'.
        z_threshold (float): Z-score threshold for anomaly detection (default: 6.0).
        min_samples (int): Minimum number of samples a group needs before
            statistics are computed for it (default: 5).

    Returns:
        pd.DataFrame: One row per flagged metric value, sorted by descending
        z-score. Empty (column-less) DataFrame if the input is None/empty or
        nothing is flagged.
    """
    if df is None or df.empty:
        return pd.DataFrame()
    # Ensure we have a Size Group column to group by.
    if "Size Group" not in df.columns:
        df = add_model_size_groups(df)
    # Group once; the grouping is loop-invariant and reused for every metric.
    size_groups = df.groupby("Size Group")
    anomalies = []
    for metric in ["Prompt Processing", "Token Generation"]:
        for size_group, group_df in size_groups:
            # Only process groups with enough samples for stable statistics.
            if len(group_df) < min_samples:
                continue
            mean_value = group_df[metric].mean()
            std_value = group_df[metric].std()
            # Skip degenerate groups: all-NaN metric (std is NaN, which the
            # "< 0.001" comparison alone would not catch) or near-constant
            # values, where z-scores are undefined or meaningless.
            if pd.isna(std_value) or std_value < 0.001:
                continue
            # Calculate z-scores for each entry in the group.
            for _, row in group_df.iterrows():
                value = row[metric]
                if pd.isna(value):
                    continue
                z_score = abs((value - mean_value) / std_value)
                # Flag as anomaly if the z-score exceeds the threshold.
                if z_score > z_threshold:
                    anomaly_data = {
                        "Size Group": size_group,
                        "Model": row["Model"],
                        "Device": row["Device"],
                        "Device ID": row["Device ID"],
                        "Platform": row["Platform"],
                        "Metric": metric,
                        "Value": value,
                        "Mean": mean_value,
                        "Std": std_value,
                        "Z-Score": z_score,
                        "Times Faster/Slower": value / mean_value,
                        "Benchmark": row["Benchmark"],
                        "Submission ID": row["Submission ID"],
                    }
                    anomalies.append(anomaly_data)
    anomaly_df = pd.DataFrame(anomalies)
    if not anomaly_df.empty:
        anomaly_df = anomaly_df.sort_values(by="Z-Score", ascending=False)
    return anomaly_df
def filter_anomalies(df, z_threshold=9.0, min_samples=5):
    """
    Remove anomalous submissions from a benchmark DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing benchmark data.
        z_threshold (float): Z-score threshold for anomaly detection (default: 9.0).
        min_samples (int): Minimum number of samples a group needs before
            statistics are computed for it.

    Returns:
        tuple: (filtered_df, anomalies_df) — the DataFrame with anomalous
        submissions dropped, and the DataFrame of detected anomalies.
    """
    if df is None or df.empty:
        return df, pd.DataFrame()
    # Detect first; if nothing is flagged there is nothing to drop.
    flagged = detect_anomalies(df, z_threshold, min_samples)
    if flagged.empty:
        return df, flagged
    # Drop every row whose submission produced at least one flagged metric.
    bad_ids = set(flagged["Submission ID"].dropna().unique())
    keep_mask = ~df["Submission ID"].isin(bad_ids)
    return df[keep_mask], flagged