import math

import pandas as pd


def add_model_size_groups(df, group_size=0.5, max_size=15):
"""
Add a column to the DataFrame categorizing model file sizes into size groups.
Args:
df (pandas.DataFrame): DataFrame containing model benchmark data
group_size (float): Size of each group in GB (default: 0.5)
max_size (int): Maximum size in GB to consider (default: 15)
Returns:
pandas.DataFrame: Original DataFrame with an additional 'Size Group' column
"""
    if df is None or df.empty:
        return df

    result_df = df.copy()
    if "Model Size GB" not in result_df.columns:
        # Derive the size in GB from the raw byte count in 'Model File Size'
        if "Model File Size" not in result_df.columns:
            raise ValueError("DataFrame must contain 'Model File Size' column")
        result_df["Model Size GB"] = result_df["Model File Size"] / 1024**3
    # Define a helper to assign each size to a bucket of width group_size
    def assign_size_group(size):
        if size is None or pd.isna(size):
            return "Unknown"
        if size >= max_size:
            return f">{max_size} GB"
        group_index = math.floor(size / group_size)
        lower_bound = group_index * group_size
        upper_bound = lower_bound + group_size
        # Round to 1 decimal place to avoid floating point issues in labels
        lower_bound = round(lower_bound, 1)
        upper_bound = round(upper_bound, 1)
        return f"{lower_bound}-{upper_bound} GB"
    result_df["Size Group"] = result_df["Model Size GB"].apply(assign_size_group)
    return result_df
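
# A minimal usage sketch (hypothetical data; 'Model File Size' is assumed to be
# in bytes, which is what the 1024**3 conversion above implies):
#
#     sizes = pd.DataFrame({"Model File Size": [7.2e9, 1.1e9, None]})
#     grouped = add_model_size_groups(sizes)
#     # grouped["Size Group"] -> "6.5-7.0 GB", "1.0-1.5 GB", "Unknown"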


def detect_anomalies(df, z_threshold=6.0, min_samples=5):
"""
Detect anomalies in benchmark data.
Args:
df (pd.DataFrame): DataFrame containing benchmark data
z_threshold (float): Z-score threshold for anomaly detection (default: 6.0)
min_samples (int): Minimum number of samples needed for a group to calculate statistics
Returns:
pd.DataFrame: DataFrame containing detected anomalies with relevant information
"""
    if df is None or df.empty:
        return pd.DataFrame()

    # Ensure we have the Size Group column (add_model_size_groups returns a copy)
    if "Size Group" not in df.columns:
        df = add_model_size_groups(df)
    anomalies = []
    # Group once by size bucket; the same groups are reused for both metrics
    size_groups = df.groupby("Size Group")
    for metric in ["Prompt Processing", "Token Generation"]:
        if metric not in df.columns:
            continue
        for size_group, group_df in size_groups:
            # Only process groups with enough samples for stable statistics
            if len(group_df) < min_samples:
                continue
            mean_value = group_df[metric].mean()
            std_value = group_df[metric].std()
            # Skip if the standard deviation is missing, zero, or very small
            if pd.isna(std_value) or std_value < 0.001:
                continue
            # Calculate a z-score for each entry in the group
            for _, row in group_df.iterrows():
                value = row[metric]
                if pd.isna(value):
                    continue
                z_score = abs((value - mean_value) / std_value)
                # Flag as an anomaly if the z-score exceeds the threshold
                if z_score > z_threshold:
                    anomaly_data = {
                        "Size Group": size_group,
                        "Model": row["Model"],
                        "Device": row["Device"],
                        "Device ID": row["Device ID"],
                        "Platform": row["Platform"],
                        "Metric": metric,
                        "Value": value,
                        "Mean": mean_value,
                        "Std": std_value,
                        "Z-Score": z_score,
                        "Times Faster/Slower": value / mean_value,
                        "Benchmark": row["Benchmark"],
                        "Submission ID": row["Submission ID"],
                    }
                    anomalies.append(anomaly_data)
    anomaly_df = pd.DataFrame(anomalies)
    if not anomaly_df.empty:
        anomaly_df = anomaly_df.sort_values(by="Z-Score", ascending=False)
    return anomaly_df
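
# A minimal sketch of the detection behavior (hypothetical data: nine rows near
# 100 plus one outlier at 500, all landing in the same size bucket):
#
#     pp = [100, 101, 99, 100, 102, 98, 100, 101, 99, 500]
#     bench = pd.DataFrame({
#         "Model File Size": [4.0e9] * 10,
#         "Prompt Processing": pp,
#         "Token Generation": [20] * 10,  # constant, so std ~ 0 and it is skipped
#         "Model": ["m"] * 10, "Device": ["d"] * 10, "Device ID": ["i"] * 10,
#         "Platform": ["p"] * 10, "Benchmark": ["b"] * 10,
#         "Submission ID": list(range(10)),
#     })
#     detect_anomalies(bench, z_threshold=2.5)  # flags only the 500-value row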


def filter_anomalies(df, z_threshold=9.0, min_samples=5):
"""
Filter out anomalies from a DataFrame.
Args:
df (pd.DataFrame): DataFrame containing benchmark data
z_threshold (float): Z-score threshold for anomaly detection (default: 9.0)
min_samples (int): Minimum number of samples needed for a group to calculate statistics
Returns:
tuple: (filtered_df, anomalies_df) - the filtered DataFrame without anomalies and the anomalies DataFrame
"""
    if df is None or df.empty:
        return df, pd.DataFrame()

    # Find anomalies
    anomalies = detect_anomalies(df, z_threshold, min_samples)
    if anomalies.empty:
        return df, anomalies

    # Drop every row belonging to a submission that produced a flagged metric
    anomaly_ids = set(anomalies["Submission ID"].dropna().unique())
    filtered_df = df[~df["Submission ID"].isin(anomaly_ids)]
    return filtered_df, anomalies
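
# Typical call pattern (a sketch; `df` stands for whatever benchmark DataFrame
# this module is fed):
#
#     clean_df, dropped = filter_anomalies(df)
#     if not dropped.empty:
#         print(f"Dropped {dropped['Submission ID'].nunique()} anomalous submissions")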