""" |
|
This script scores and analyzes the experiment results from a cleaned CSV file.
|
|
|
Basic Usage: |
|
To perform analysis on the adapter experiment results::
|
|
|
    $ python scoring_and_analysis.py \
|
--csv <path to cleaned result csv file> \ |
|
--dataset_type_column <column name in csv with the dataset types> |
|
|
|
To perform analysis on the finetuning experiment results:: |
|
|
|
$ python scoring_and_analysis.py \ |
|
--csv <path to csv> \ |
|
--dataset_type_column <column name in csv with the dataset types> \ |
|
-ft |
|
|
|
Advanced Usage: |
|
By default, the script shows only the best hyperparameters for each criterion.
To see a ranking of all the hyperparameters for each criterion and visualize
how the results were selected, use the `--show_analysis` flag. Moreover, instead of
displaying only the best hyperparameters, you can use the `--topk` flag to show the
top *k* hyperparameters::
|
|
|
$ python scoring_and_analysis.py \ |
|
--csv <path to csv> \ |
|
--dataset_type_column <dataset_group_column_name> \ |
|
--show_analysis \ |
|
--topk 3 |
|
|
|
Instead of running the analysis over all possible combinations of the hyperparameters,
you can restrict the search space to a subset of experiments. This can be achieved
with the `-uargs` and `-cargs` flags for the unconstrained and the constrained
experiments, respectively::
|
|
|
$ python scoring_and_analysis.py \ |
|
--csv <path to csv> \ |
|
--dataset_type_column <dataset_group_column_name> \ |
|
-cargs 'Adapter Position' encoder \ |
|
-cargs 'Adapter Dropout' 0.5 \ |
|
-uargs 'Train Steps' 5000 |
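
Scoring:

The unconstrained experiments are ranked by the test WER (lower is better), while the
constrained experiments are ranked by the composite `Score` (higher is better). If the
`Score` column is missing from the CSV, it is computed as::

    Score = Original Scale * Adapt WERR

where `Original Scale` measures how much of the baseline Librispeech Test Other WER is
preserved (dropping linearly to zero at 3 WER points of absolute deviation) and
`Adapt WERR` is the relative WER reduction over the per-group baseline. For example
(hypothetical numbers), an original WER of 6.118 (1 point above the 5.118 baseline) and
an adapted WER of 8.0 in a group whose baseline adapted WER is 10.0 give
Original Scale = (3 - 1) / 3 = 0.667 (rounded), Adapt WERR = 0.2 and Score = 0.133.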
|
""" |
|
|
|
import argparse |
|
|
|
|
import numpy as np |
|
import pandas as pd |
|
|
|
|
|
|
|
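# Metric column and selection rule for choosing the best configuration of each experiment type:
# unconstrained runs minimize the test WER, constrained runs maximize the composite score.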
UNCONSTRAINED_EXP_KEY = {'name': 'WER: Test', 'attribute': min} |
|
CONSTRAINED_EXP_KEY = {'name': 'Score', 'attribute': max} |
|
|
|
|
|
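# Hyperparameter columns that define a single configuration for each experiment type.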
ADAPTER_HYPERPARAMETER_COLUMNS = ['Adapter Dimensions', 'Adapter Dropout', 'Stochastic Depth', 'Train Steps']
|
FINETUNING_HYPERPARAMETER_COLUMNS = ['Train Steps', 'Learning Rate'] |
|
|
|
|
|
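# Column with the WER on the adaptation test set.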
TEST_WER_COLUMN = 'WER: Test' |
|
|
|
|
|
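# Column with the WER on the original Librispeech Test Other set, used to measure degradation.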
ORIGINAL_TEST_WER_COLUMN = 'WER: Librispeech Test Other' |
|
|
|
|
|
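# Column identifying the experiment category: adapter position for adapter runs, frozen module for finetuning runs.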
EXP_CATEGORY_KEY = {'adapters': 'Adapter Position', 'finetuning': 'Frozen Module'} |
|
|
|
|
|
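# Maximum tolerated absolute deviation (in WER points) from the baseline Librispeech Test Other WER.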
MAX_DEGRADATION_PERCENTAGE = 3 |
|
|
|
|
|
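# WER of the baseline model on Librispeech Test Other, before adaptation.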
BASELINE_ORIGINAL_WER = 5.118 |
|
|
|
|
|
|
|
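# Baseline WER on each dataset group's test set, used as the reference for the relative WER reduction.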
BASELINE_ADAPTED_WER = { |
|
'irish_english_male': 20.690, |
|
'midlands_english_female': 9.612, |
|
'midlands_english_male': 11.253, |
|
'northern_english_female': 11.108, |
|
'northern_english_male': 10.180, |
|
'scottish_english_female': 12.309, |
|
'scottish_english_male': 11.942, |
|
'southern_english_female': 9.701, |
|
'southern_english_male': 10.215, |
|
'welsh_english_female': 8.514, |
|
'welsh_english_male': 11.463, |
|
} |
|
|
|
|
|
def calculate_original_scale(original_wer): |
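    """Scale factor in [0, 1] for how well the baseline Librispeech Test Other WER is preserved.

    Returns 1.0 when the original-domain WER matches BASELINE_ORIGINAL_WER and decays linearly
    to 0.0 as the absolute deviation reaches MAX_DEGRADATION_PERCENTAGE WER points.
    """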
|
wer_do = abs(original_wer - BASELINE_ORIGINAL_WER) |
|
return (MAX_DEGRADATION_PERCENTAGE - min(MAX_DEGRADATION_PERCENTAGE, wer_do)) / MAX_DEGRADATION_PERCENTAGE |
|
|
|
|
|
def calculate_adapt_werr(adapted_wer, group): |
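    """Relative WER reduction of the adapted model over the per-group baseline WER, clipped at 0."""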
|
return max(BASELINE_ADAPTED_WER[group] - adapted_wer, 0) / BASELINE_ADAPTED_WER[group] |
|
|
|
|
|
def parse_results(filepath: str, dataset_type_col: str, exp_type: str) -> pd.DataFrame:
|
"""Calculate the scoring metric for each experiment |
|
|
|
Args: |
|
filepath: Path to the csv file containing the results |
|
dataset_type_col: Name of the column containing the dataset types |
|
exp_type: Type of experiments in the csv file |
|
|
|
Returns: |
|
        Dataframe of all the experiments with the computed scores
|
""" |
|
|
|
|
df = pd.read_csv(filepath) |
|
df.drop(columns=['Model', 'Model Size'], errors='ignore', inplace=True) |
|
|
|
if exp_type == 'finetuning': |
|
df['Frozen Module'] = df['Frozen Module'].replace('-', 'null') |
|
|
|
if 'Score' not in df: |
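        # Compute the composite metric for each run: Score = Original Scale * Adapt WERR.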
|
|
|
df['Original Scale'] = df.apply(lambda x: calculate_original_scale(x[ORIGINAL_TEST_WER_COLUMN]), axis=1) |
|
df['Adapt WERR'] = df.apply(lambda x: calculate_adapt_werr(x[TEST_WER_COLUMN], x[dataset_type_col]), axis=1) |
|
df['Score'] = df['Original Scale'] * df['Adapt WERR'] |
|
|
|
|
|
df = df.round({'Original Scale': 4, 'Adapt WERR': 4, 'Score': 4}) |
|
|
|
|
|
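    # Write the computed score columns back into the same CSV so they are reused on later runs.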
df.to_csv(filepath, index=False) |
|
|
|
return df |
|
|
|
|
|
def display_analysis_table(df_analysis: pd.DataFrame, key_info: dict): |
|
"""Display the analysis table used to select the best hyperparameter configuration |
|
|
|
Args: |
|
df_analysis: Dataframe of the analysis table |
|
key_info: Dictionary containing the name of the column and the attribute to use for analysis |
|
""" |
|
|
|
column_lengths = {x: max(len(x), df_analysis[x].map(str).apply(len).max()) for x in df_analysis.columns} |
|
|
|
print(' | '.join([f'{x:^{column_lengths[x]}}' for x in df_analysis.columns])) |
|
print('-' * sum([column_lengths[x] + 3 for x in df_analysis.columns])) |
|
|
|
for idx in range(len(df_analysis)): |
|
row_str = [] |
|
for column in df_analysis.columns: |
|
row_str.append(f'{df_analysis.iloc[idx][column]:^{column_lengths[column]}}') |
|
print(' | '.join(row_str)) |
|
|
|
|
|
def display_results(df_all: pd.DataFrame, category: str, best_config: pd.Series, dataset_type_col: str, exp_type: str): |
|
"""Display the Test and the Librispeech Test Other WER for the best configuration. |
|
|
|
Args: |
|
df_all: Dataframe of all the experiments |
|
category: Adapter position or frozen module in case of finetuning |
|
best_config: Best hyperparameter configurations |
|
dataset_type_col: Name of the column containing the dataset types |
|
exp_type: Type of experiments in the dataframe |
|
""" |
|
test_wer_values, ls_test_other_wer_values = [], [] |
|
|
|
print(f'{dataset_type_col:^25} | {TEST_WER_COLUMN:<20} | {ORIGINAL_TEST_WER_COLUMN:<20}') |
|
print('-' * 70) |
|
for dtype in df_all[dataset_type_col].unique(): |
|
df_filtered = df_all[(df_all[dataset_type_col] == dtype) & (df_all[EXP_CATEGORY_KEY[exp_type]] == category)] |
|
        for col in ADAPTER_HYPERPARAMETER_COLUMNS if exp_type == 'adapters' else FINETUNING_HYPERPARAMETER_COLUMNS:
|
df_filtered = df_filtered[df_filtered[col] == best_config[col]] |
|
|
|
if len(df_filtered) == 0: |
|
continue |
|
|
|
if len(df_filtered) > 1: |
|
            raise ValueError(f'More than one row found for dtype: {dtype} and category: {category}')
|
|
|
dtype_data = df_filtered.iloc[0] |
|
test_wer_values.append(dtype_data[TEST_WER_COLUMN]) |
|
ls_test_other_wer_values.append(dtype_data[ORIGINAL_TEST_WER_COLUMN]) |
|
print( |
|
f'{dtype_data[dataset_type_col]:^25} | {dtype_data[TEST_WER_COLUMN]:^20} | {dtype_data[ORIGINAL_TEST_WER_COLUMN]:^20}' |
|
) |
|
print('-' * 70) |
|
print(f'{"Average":^25} | {np.mean(test_wer_values):^20} | {np.mean(ls_test_other_wer_values):^20}') |
|
print('\n') |
|
|
|
|
|
def get_best_config( |
|
df_exp: pd.DataFrame, dataset_type_col: str, key_info: dict, topk: int, show_analysis: bool, exp_type: str, |
|
): |
|
"""Get the best hyperparameter configuration for a given subset of experiments. |
|
|
|
Args: |
|
df_exp: Dataframe of all experiments |
|
dataset_type_col: Name of the column containing the dataset types |
|
key_info: Dictionary containing the name of the column and the attribute to use for analysis |
|
topk: Number of top-k results to display |
|
show_analysis: Whether to display the analysis table |
|
exp_type: Type of experiments in the dataframe |
|
""" |
|
|
|
    hyperparameter_cols = ADAPTER_HYPERPARAMETER_COLUMNS if exp_type == 'adapters' else FINETUNING_HYPERPARAMETER_COLUMNS
|
|
|
|
|
analysis_columns = list(set([key_info['name'], TEST_WER_COLUMN, ORIGINAL_TEST_WER_COLUMN])) |
|
|
|
df_analyze = df_exp.drop( |
|
columns=[ |
|
x |
|
for x in df_exp.columns |
|
            if x not in set(hyperparameter_cols + [EXP_CATEGORY_KEY[exp_type]] + analysis_columns)
|
] |
|
) |
|
|
|
for category in df_exp[EXP_CATEGORY_KEY[exp_type]].unique(): |
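        # Average each metric over the dataset types for every hyperparameter combination in this category.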
|
|
|
df_category_mean = ( |
|
df_analyze[df_analyze[EXP_CATEGORY_KEY[exp_type]] == category] |
|
            .groupby(hyperparameter_cols, as_index=False)[analysis_columns]
|
.mean() |
|
) |
|
|
|
|
|
df_category_mean.sort_values( |
|
            by=key_info['name'], ascending=key_info['attribute'] is min, inplace=True
|
) |
|
|
|
print('=' * len(category)) |
|
print(category.upper()) |
|
print('=' * len(category) + '\n') |
|
|
|
if show_analysis: |
|
display_analysis_table(df_category_mean, key_info) |
|
print('\n') |
|
|
|
for idx in range(min(topk, len(df_category_mean))): |
|
print('-----') |
|
print(f'Top-{idx + 1}') |
|
print('-----') |
|
|
|
df_category_best = df_category_mean.iloc[idx] |
|
|
|
            print('\nHyperparameters')
|
print('---------------\n') |
|
            for hyperparameter in hyperparameter_cols + [key_info['name']]:
|
                print(f'{hyperparameter:<20}: {df_category_best[hyperparameter]}')
|
print() |
|
|
|
print('\nResults') |
|
print('-------\n') |
|
display_results(df_exp, category, df_category_best, dataset_type_col, exp_type) |
|
|
|
|
|
def analyze_results( |
|
df_exp: pd.DataFrame, |
|
fixed_hyperparameters: list, |
|
title: str, |
|
dataset_type_col: str, |
|
key_info: dict, |
|
topk: int, |
|
show_analysis: bool, |
|
exp_type: str, |
|
): |
|
"""Perform analysis on a given subset of experiments |
|
|
|
Args: |
|
df_exp: Dataframe of all experiments |
|
        fixed_hyperparameters: List of (hyperparameter, value) pairs to fix in the analysis
|
title: Title of the analysis (for logging) |
|
dataset_type_col: Name of the column containing the dataset types |
|
key_info: Dictionary containing the name of the column and the attribute to use for analysis |
|
topk: Number of top-k results to display |
|
show_analysis: Whether to display the analysis table |
|
exp_type: Type of experiments in the dataframe |
|
""" |
|
|
|
for hyperparameter_name, hyperparameter_value in fixed_hyperparameters: |
|
df_exp = df_exp[df_exp[hyperparameter_name] == hyperparameter_value] |
|
|
|
|
|
print('+' * len(title)) |
|
print(title) |
|
print('+' * len(title) + '\n') |
|
get_best_config(df_exp, dataset_type_col, key_info, topk, show_analysis, exp_type) |
|
print() |
|
|
|
|
|
def __validate_arg_type(arg): |
|
"""Validate the type of the command line argument value.""" |
|
dtype = float if '.' in arg else int |
|
try: |
|
return dtype(arg) |
|
except ValueError: |
|
return arg |
|
|
|
|
|
if __name__ == '__main__': |
|
parser = argparse.ArgumentParser() |
|
parser.add_argument('-c', '--csv', required=True, help='Path to the cleaned results CSV file') |
|
parser.add_argument( |
|
'-dtype', |
|
'--dataset_type_column', |
|
required=True, |
|
help='Name of the column containing the dataset type. Example: For SLR83 it is "Group", for GSC it is "Dataset Size"', |
|
) |
|
parser.add_argument( |
|
'-cargs', |
|
'--constrained_args', |
|
nargs=2, |
|
action='append', |
|
default=[], |
|
type=__validate_arg_type, |
|
help='Hyperparameters to fix for the constrained experiments', |
|
) |
|
parser.add_argument( |
|
'-uargs', |
|
'--unconstrained_args', |
|
nargs=2, |
|
action='append', |
|
default=[], |
|
type=__validate_arg_type, |
|
help='Hyperparameters to fix for the unconstrained experiments', |
|
) |
|
parser.add_argument('-k', '--topk', type=int, default=1, help='Number of top-k results to display') |
|
parser.add_argument( |
|
'-ft', '--finetuning', action='store_true', help='True if the CSV contains Finetuning experiments' |
|
) |
|
parser.add_argument( |
|
'-s', '--show_analysis', action='store_true', help='Show the key values of all the dataset types' |
|
) |
|
args = parser.parse_args() |
|
|
|
|
|
exp_type = 'finetuning' if args.finetuning else 'adapters' |
|
|
|
|
|
df = parse_results(args.csv, args.dataset_type_column, exp_type) |
|
|
|
|
|
analyze_results( |
|
df, |
|
args.constrained_args, |
|
'Constrained Experiment Results', |
|
args.dataset_type_column, |
|
CONSTRAINED_EXP_KEY, |
|
args.topk, |
|
args.show_analysis, |
|
exp_type, |
|
) |
|
|
|
|
|
analyze_results( |
|
df, |
|
args.unconstrained_args, |
|
'Unconstrained Experiment Results', |
|
args.dataset_type_column, |
|
UNCONSTRAINED_EXP_KEY, |
|
args.topk, |
|
args.show_analysis, |
|
exp_type, |
|
) |
|
|