# TabArena-WIP / src/utils.py
# (Hugging Face file-view header residue — author: geoalgo,
#  commit message: "add normalized error", commit: c5607c4)
from pathlib import Path
import pandas as pd
import os
import re
from scipy import stats
from src.constants import ProblemTypes, MetricNames
# Metric columns (as named in the results CSVs) that are aggregated with a
# geometric mean when grouping results.
METRIC_CHOICES = [
    f"eval_metrics/{metric}"
    for metric in (
        MetricNames.normalized_error,
        MetricNames.fit_time_per_1K_rows,
        MetricNames.inference_time_per_1K_rows,
    )
]
def format_number(num):
    """Format a value for display.

    Numbers with absolute value >= 100 are rendered in scientific notation
    with one decimal, smaller numbers in fixed-point with three decimals.
    Non-numeric values are passed through unchanged.
    """
    if not isinstance(num, (int, float)):
        # e.g. model-name cells — leave untouched
        return num
    return f"{num:.1e}" if abs(num) >= 100 else f"{num:.3f}"
def norm_sNavie(df):
    """Normalize every metric column by the 'seasonal_naive' model's values.

    Divides each column (except 'model') by the value in the first row whose
    'model' is 'seasonal_naive', so that model's scores become 1.0.

    Args:
        df: DataFrame with a 'model' column and numeric metric columns;
            must contain at least one 'seasonal_naive' row (raises
            IndexError otherwise).

    Returns:
        A normalized copy of *df*; the input is not modified.
    """
    df_normalized = df.copy()
    seasonal_naive_row = df[df['model'] == 'seasonal_naive'].iloc[0]
    for column in df.columns:
        if column != 'model':  # We skip normalizing the 'model' column
            df_normalized[column] = df[column] / seasonal_naive_row[column]
    return df_normalized
def pivot_df(file_name, tab_name):
    """Read a results CSV and pivot it to one row per model.

    Metric columns are melted, renamed to short display names, then spread
    back out as '<group> (<metric>)' columns keyed by *tab_name*. When
    *tab_name* is 'univariate', the boolean flag is first mapped to the
    labels 'univariate'/'multivariate' under a 'variate_type' column.

    Returns a DataFrame with a 'model' column, values rounded to 3 decimals.
    """
    df = pd.read_csv(file_name)
    if tab_name == 'univariate':
        # Turn the boolean flag into readable group labels.
        df['univariate'] = df['univariate'].replace({True: 'univariate', False: 'multivariate'})
        df.rename(columns={'univariate': 'variate_type'}, inplace=True)
        tab_name = 'variate_type'
    melted = pd.melt(df, id_vars=[tab_name, 'model'], var_name='metric', value_name='value')
    melted['metric'] = melted['metric'].replace({
        'eval_metrics/MAPE[0.5]': 'MAPE',
        'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS'
    })
    pivoted = melted.pivot_table(index='model', columns=[tab_name, 'metric'], values='value')
    # Flatten the (group, metric) MultiIndex into single display headers.
    pivoted.columns = [f'{group} ({metric})' for group, metric in pivoted.columns]
    return pivoted.reset_index().round(3)
def rename_metrics(df):
    """Return *df* with raw eval-metric column names replaced by display names."""
    display_names = {
        f'eval_metrics/{MetricNames.normalized_error}': MetricNames.normalized_error,
        f'eval_metrics/{MetricNames.inference_time_per_1K_rows}': "Inference time / 1K rows (s)",
        f'eval_metrics/{MetricNames.fit_time_per_1K_rows}': "Fit time / 1K rows (s)",
    }
    return df.rename(columns=display_names)
def format_df(df):
    """Apply display formatting to every cell, then cast the metric columns
    (everything but the first column, assumed to hold model names) back to
    float so downstream numeric operations keep working.
    """
    formatted = df.applymap(format_number)
    # format_number yields strings; re-coerce all but the first column.
    formatted.iloc[:, 1:] = formatted.iloc[:, 1:].astype(float)
    return formatted
def unify_freq(df):
    """Map pandas-style frequency codes in df['frequency'] to readable names.

    Strips digits and any '-' suffix (e.g. '15T' -> 'T', 'W-SUN' -> 'W'),
    then maps the base code to a word ('T' -> 'Minutely', ...). Mutates the
    'frequency' column in place and returns *df*.
    """
    code_to_name = {
        'T': 'Minutely',
        'H': 'Hourly',
        'D': 'Daily',
        'W': 'Weekly',
        'M': 'Monthly',
        'Q': 'Quarterly',
        'Y': 'Yearly',
        'A': 'Yearly',
        'S': 'Secondly',
    }
    # Drop multipliers like '15' in '15T', then anything after a dash.
    base_codes = (
        df['frequency']
        .str.replace(r'\d+', '', regex=True)
        .str.split('-')
        .str[0]
    )
    df['frequency'] = base_codes.replace(code_to_name)
    return df
def pivot_existed_df(df, tab_name):
    """Pivot an already-grouped results frame to one row per model.

    Mirrors `pivot_df` but starts from an in-memory (grouped) DataFrame
    instead of a CSV, and applies `format_df` display formatting at the end.
    When *tab_name* is 'univariate', the boolean flag is first mapped to the
    labels 'univariate'/'multivariate' under a 'variate_type' column.

    Returns a DataFrame with a 'model' column and '<group> (<metric>)'
    columns, formatted via `format_df`.
    """
    df = df.reset_index()
    if tab_name == 'univariate':
        # Turn the boolean flag into readable group labels.
        df['univariate'] = df['univariate'].replace({True: 'univariate', False: 'multivariate'})
        df.rename(columns={'univariate': 'variate_type'}, inplace=True)
        tab_name = 'variate_type'
    df_melted = pd.melt(df, id_vars=[tab_name, 'model'], var_name='metric', value_name='value')
    df_melted['metric'] = df_melted['metric'].replace({
        'eval_metrics/normalized-error': 'normalized-error',
        'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS',
        'rank': 'Rank',
    })
    df_pivot = df_melted.pivot_table(index='model', columns=[tab_name, 'metric'], values='value')
    # Flatten the (group, metric) MultiIndex into single display headers.
    df_pivot.columns = [f'{tab_name} ({metric})' for tab_name, metric in df_pivot.columns]
    df_pivot = df_pivot.reset_index()
    return format_df(df_pivot)
def get_grouped_dfs(root_dir='results', ds_properties='results/dataset_properties.csv'):
    """Collect all per-run results under *root_dir*, join dataset properties,
    compute normalized error / rank / ELO, and return aggregated views.

    Returns:
        dict mapping group name -> aggregated DataFrame indexed by model:
        one entry per grouping column (currently only ProblemTypes.col_name)
        plus an 'overall' entry aggregated across all datasets.
    """
    df_list = []
    # Walk through all folders and subfolders in the root directory,
    # loading every file named all_results.csv.
    for csv_path in Path(root_dir).rglob("*csv"):
        if 'all_results.csv' in str(csv_path):
            df_list.append(pd.read_csv(csv_path))
    # Concatenate all dataframes into one
    all_results_df = pd.concat(df_list, ignore_index=True)
    all_results_df = all_results_df.sort_values(by=['model', 'dataset']).reset_index(drop=True)
    dataset_properties = pd.read_csv(ds_properties)
    # Canonicalize dataset names so they match the names used in the results:
    # 1. make all characters lowercase
    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.lower())
    # 2. replace all spaces with underscores
    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.replace(' ', '_'))
    # 3. Replace all dashes with underscores
    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.replace('-', '_'))
    # 4. Replace consecutive underscores with a single underscore. There may be more than 2 consecutive underscores
    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: re.sub('_+', '_', x))
    # 5. Remove all leading and trailing underscores
    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.strip('_'))
    df = all_results_df
    # Convert to {dataset: {property_column: value}} for row-wise joining below.
    dataset_properties_dict = dataset_properties.set_index('dataset').T.to_dict('dict')
    # Attach each dataset's property columns to the matching result rows.
    # NOTE(review): datasets present in results but missing from the
    # properties CSV keep NaN in these columns — confirm that is intended.
    for dataset in dataset_properties_dict.keys():
        for key in dataset_properties_dict[dataset].keys():
            df.loc[df['dataset'] == dataset, key] = dataset_properties_dict[dataset][key]
    # unify the frequency
    # df = unify_freq(df)
    # Add the per-dataset min-max normalized error column (in place).
    df = standardize_df(df)
    # TODO compute normalized error
    # TODO change to ELO
    RANKING_METRIC = "eval_metrics/normalized-error"
    # Compute metrics that require all methods' results, such as Rank and Elo.
    df['rank'] = df.groupby(['dataset', ProblemTypes.col_name])[f'{RANKING_METRIC}'].rank(method='first', ascending=True)
    # NOTE: ELO is currently just rank * 100 — a placeholder until a real
    # Elo computation lands (see TODO above).
    df['ELO'] = df.groupby(['dataset', ProblemTypes.col_name])[f'{RANKING_METRIC}'].rank(method='first',
                                                                                         ascending=True) * 100
    # Overall aggregation: geometric mean for the metric columns, arithmetic
    # mean for rank and ELO.
    grouped_results_overall = df.groupby(['model'])[METRIC_CHOICES].agg(stats.gmean)
    grouped_results_overall_rank = df.groupby(['model'])[['rank']].mean()
    grouped_results_overall_elo = df.groupby(['model'])[['ELO']].mean()
    grouped_results_overall = pd.concat([
        grouped_results_overall,
        grouped_results_overall_rank,
        grouped_results_overall_elo],
        axis=1
    )
    grouped_dfs = {}
    # Per-group aggregations; other candidate grouping columns:
    # "domain", 'term_length', 'frequency', 'univariate'.
    for col_name in [ProblemTypes.col_name]:
        grouped_dfs[col_name] = group_by(df, col_name)
    grouped_dfs['overall'] = grouped_results_overall
    return grouped_dfs
def standardize_df(df):
    """Add a per-dataset min-max normalized error column to *df* (in place).

    For each dataset group, maps the raw error to [0, 1] via
    (x - min) / (max - min) and stores it under the normalized-error column.
    Returns the mutated *df*.
    """
    raw_col = f'eval_metrics/{MetricNames.raw_error}'
    norm_col = f'eval_metrics/{MetricNames.normalized_error}'

    def _min_max(values):
        # Plain min-max scaling. We may want something more outlier-robust,
        # like TabRepo's (x - x.min()) / (x.median() - x.min()).
        low = values.min()
        return (values - low) / (values.max() - low)

    df[norm_col] = df.groupby('dataset')[raw_col].transform(_min_max)
    return df
def group_by(df, col_name):
    """Aggregate results per (col_name, model): geometric mean for the
    metric columns in METRIC_CHOICES, arithmetic mean for 'rank'.
    """
    keys = [col_name, 'model']
    metric_means = df.groupby(keys)[METRIC_CHOICES].agg(stats.gmean)
    rank_means = df.groupby(keys)[['rank']].mean()
    return pd.concat([metric_means, rank_means], axis=1)