# History: commit 157b914 — chore: filter out anomalies (z_threshold=9.0)
import pandas as pd
def add_model_size_groups(df, group_size=0.5, max_size=15):
    """
    Add a 'Size Group' column categorizing model file sizes into fixed-width bins.

    Args:
        df (pandas.DataFrame): DataFrame containing model benchmark data. Must
            contain either a 'Model Size GB' column or a 'Model File Size'
            column (bytes; converted via division by 1024**3).
        group_size (float): Width of each size bin in GB (default: 0.5).
        max_size (int): Sizes at or above this value (GB) are lumped into a
            single '>{max_size} GB' bucket (default: 15).

    Returns:
        pandas.DataFrame: Copy of the input with an additional 'Size Group'
        column; the input DataFrame is not modified. The input is returned
        unchanged if it is None or empty.

    Raises:
        ValueError: If neither 'Model Size GB' nor 'Model File Size' exists.
    """
    # Hoisted out of assign_size_group so the import executes once per call,
    # not once per row.
    import math

    if df is None or df.empty:
        return df
    result_df = df.copy()
    if "Model Size GB" not in result_df.columns:
        # Derive the GB column from the raw byte count when it is absent.
        if "Model File Size" not in result_df.columns:
            raise ValueError("DataFrame must contain 'Model File Size' column")
        result_df["Model Size GB"] = result_df["Model File Size"] / 1024**3

    def assign_size_group(size):
        # Map a single size (in GB) to its bin label.
        if size is None or pd.isna(size):
            return "Unknown"
        if size >= max_size:
            return f">{max_size} GB"
        group_index = math.floor(size / group_size)
        lower_bound = group_index * group_size
        upper_bound = lower_bound + group_size
        # Round away float noise. Ten decimal places (rather than one) keeps
        # bounds exact for fine-grained group_size values such as 0.25, while
        # producing identical labels for the default 0.5 GB bins.
        lower_bound = round(lower_bound, 10)
        upper_bound = round(upper_bound, 10)
        return f"{lower_bound}-{upper_bound} GB"

    result_df["Size Group"] = result_df["Model Size GB"].apply(assign_size_group)
    return result_df
def detect_anomalies(df, z_threshold=6.0, min_samples=5):
    """
    Detect per-size-group statistical outliers in benchmark data.

    For each metric ('Prompt Processing', 'Token Generation') and each
    'Size Group', entries whose absolute z-score against the group mean
    exceeds z_threshold are flagged.

    Args:
        df (pd.DataFrame): DataFrame containing benchmark data. Expected to
            contain the metric columns plus 'Model', 'Device', 'Device ID',
            'Platform', 'Benchmark' and 'Submission ID'.
        z_threshold (float): Z-score threshold for anomaly detection (default: 6.0).
        min_samples (int): Minimum number of samples a group needs before
            statistics are computed for it (default: 5).

    Returns:
        pd.DataFrame: One row per flagged metric value, sorted by descending
        z-score. Empty (column-less) DataFrame if the input is None/empty or
        nothing is flagged.
    """
    if df is None or df.empty:
        return pd.DataFrame()
    # Ensure we have a Size Group column to group by.
    if "Size Group" not in df.columns:
        df = add_model_size_groups(df)
    # Group once; the grouping is loop-invariant and reused for every metric.
    size_groups = df.groupby("Size Group")
    anomalies = []
    for metric in ["Prompt Processing", "Token Generation"]:
        for size_group, group_df in size_groups:
            # Only process groups with enough samples for stable statistics.
            if len(group_df) < min_samples:
                continue
            mean_value = group_df[metric].mean()
            std_value = group_df[metric].std()
            # Skip degenerate groups: all-NaN metric (std is NaN, which the
            # "< 0.001" comparison alone would not catch) or near-constant
            # values, where z-scores are undefined or meaningless.
            if pd.isna(std_value) or std_value < 0.001:
                continue
            # Calculate z-scores for each entry in the group.
            for _, row in group_df.iterrows():
                value = row[metric]
                if pd.isna(value):
                    continue
                z_score = abs((value - mean_value) / std_value)
                # Flag as anomaly if the z-score exceeds the threshold.
                if z_score > z_threshold:
                    anomaly_data = {
                        "Size Group": size_group,
                        "Model": row["Model"],
                        "Device": row["Device"],
                        "Device ID": row["Device ID"],
                        "Platform": row["Platform"],
                        "Metric": metric,
                        "Value": value,
                        "Mean": mean_value,
                        "Std": std_value,
                        "Z-Score": z_score,
                        "Times Faster/Slower": value / mean_value,
                        "Benchmark": row["Benchmark"],
                        "Submission ID": row["Submission ID"],
                    }
                    anomalies.append(anomaly_data)
    anomaly_df = pd.DataFrame(anomalies)
    if not anomaly_df.empty:
        anomaly_df = anomaly_df.sort_values(by="Z-Score", ascending=False)
    return anomaly_df
def filter_anomalies(df, z_threshold=9.0, min_samples=5):
    """
    Remove anomalous submissions from a benchmark DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing benchmark data.
        z_threshold (float): Z-score threshold for anomaly detection (default: 9.0).
        min_samples (int): Minimum number of samples a group needs before
            statistics are computed for it.

    Returns:
        tuple: (filtered_df, anomalies_df) — the DataFrame with anomalous
        submissions dropped, and the DataFrame of detected anomalies.
    """
    if df is None or df.empty:
        return df, pd.DataFrame()
    # Detect first; if nothing is flagged there is nothing to drop.
    flagged = detect_anomalies(df, z_threshold, min_samples)
    if flagged.empty:
        return df, flagged
    # Drop every row whose submission produced at least one flagged metric.
    bad_ids = set(flagged["Submission ID"].dropna().unique())
    keep_mask = ~df["Submission ID"].isin(bad_ids)
    return df[keep_mask], flagged