""" |
|
This script scores and analyzes the experiment results from a cleaned CSV file.
|
|
|
Basic Usage: |
|
To perform analysis on the adapter experiment results::
|
|
|
    $ python scoring_and_analysis.py \
|
--csv <path to cleaned result csv file> \ |
|
--dataset_type_column <column name in csv with the dataset types> |
|
|
|
To perform analysis on the finetuning experiment results:: |
|
|
|
$ python scoring_and_analysis.py \ |
|
--csv <path to csv> \ |
|
--dataset_type_column <column name in csv with the dataset types> \ |
|
-ft |
|
|
|
Advanced Usage: |
|
By default, the script shows only the best hyperparameters for each criterion.
To see a ranking of all the hyperparameters for each criterion and visualize
how the results were selected, use the `--show_analysis` flag. Moreover, instead of
displaying only the best hyperparameters, you can use the `--topk` flag to show the
top *k* hyperparameters::
|
|
|
$ python scoring_and_analysis.py \ |
|
--csv <path to csv> \ |
|
--dataset_type_column <dataset_group_column_name> \ |
|
--show_analysis \ |
|
--topk 3 |
|
|
|
Instead of running the analysis over all possible combinations of the hyperparameters,
you can restrict the search space to a subset of experiments. This can be achieved
with the `-uargs` and `-cargs` flags for the unconstrained and the constrained
experiments, respectively::
|
|
|
$ python scoring_and_analysis.py \ |
|
--csv <path to csv> \ |
|
--dataset_type_column <dataset_group_column_name> \ |
|
-cargs 'Adapter Position' encoder \ |
|
-cargs 'Adapter Dropout' 0.5 \ |
|
-uargs 'Train Steps' 5000 |
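
Scoring:

The unconstrained experiments are ranked by the test WER (lower is better), while the
constrained experiments are ranked by the composite `Score` (higher is better). If the
`Score` column is missing from the CSV, it is computed as::

    Score = Original Scale * Adapt WERR

where `Original Scale` measures how much of the baseline Librispeech Test Other WER is
preserved (dropping linearly to zero at 3 WER points of absolute deviation) and
`Adapt WERR` is the relative WER reduction over the per-group baseline. For example
(hypothetical numbers), an original WER of 6.118 (1 point above the 5.118 baseline) and
an adapted WER of 8.0 in a group whose baseline adapted WER is 10.0 give
Original Scale = (3 - 1) / 3 = 0.667 (rounded), Adapt WERR = 0.2 and Score = 0.133.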
|
""" |
|
|
|
import argparse |
|
|
|
|
import numpy as np |
|
import pandas as pd |
|
|
|
|
|
|
|
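# Metric column and selection rule for choosing the best configuration of each experiment type:
# unconstrained runs minimize the test WER, constrained runs maximize the composite score.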
UNCONSTRAINED_EXP_KEY = {'name': 'WER: Test', 'attribute': min} |
|
CONSTRAINED_EXP_KEY = {'name': 'Score', 'attribute': max} |
|
|
|
|
|
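# Hyperparameter columns that define a single configuration for each experiment type.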
ADAPTER_HYPERPARAMETER_COLUMNS = ['Adapter Dimensions', 'Adapter Dropout', 'Stochastic Depth', 'Train Steps']
|
FINETUNING_HYPERPARAMETER_COLUMNS = ['Train Steps', 'Learning Rate'] |
|
|
|
|
|
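# Column with the WER on the adaptation test set.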
TEST_WER_COLUMN = 'WER: Test' |
|
|
|
|
|
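# Column with the WER on the original Librispeech Test Other set, used to measure degradation.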
ORIGINAL_TEST_WER_COLUMN = 'WER: Librispeech Test Other' |
|
|
|
|
|
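# Column identifying the experiment category: adapter position for adapter runs, frozen module for finetuning runs.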
EXP_CATEGORY_KEY = {'adapters': 'Adapter Position', 'finetuning': 'Frozen Module'} |
|
|
|
|
|
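# Maximum tolerated absolute deviation (in WER points) from the baseline Librispeech Test Other WER.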
MAX_DEGRADATION_PERCENTAGE = 3 |
|
|
|
|
|
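# WER of the baseline model on Librispeech Test Other, before adaptation.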
BASELINE_ORIGINAL_WER = 5.118 |
|
|
|
|
|
|
|
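# Baseline WER on each dataset group's test set, used as the reference for the relative WER reduction.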
BASELINE_ADAPTED_WER = { |
|
'irish_english_male': 20.690, |
|
'midlands_english_female': 9.612, |
|
'midlands_english_male': 11.253, |
|
'northern_english_female': 11.108, |
|
'northern_english_male': 10.180, |
|
'scottish_english_female': 12.309, |
|
'scottish_english_male': 11.942, |
|
'southern_english_female': 9.701, |
|
'southern_english_male': 10.215, |
|
'welsh_english_female': 8.514, |
|
'welsh_english_male': 11.463, |
|
} |
|
|
|
|
|
def calculate_original_scale(original_wer): |
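    """Scale factor in [0, 1] for how well the baseline Librispeech Test Other WER is preserved.

    Returns 1.0 when the original-domain WER matches BASELINE_ORIGINAL_WER and decays linearly
    to 0.0 as the absolute deviation reaches MAX_DEGRADATION_PERCENTAGE WER points.
    """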
|
wer_do = abs(original_wer - BASELINE_ORIGINAL_WER) |
|
return (MAX_DEGRADATION_PERCENTAGE - min(MAX_DEGRADATION_PERCENTAGE, wer_do)) / MAX_DEGRADATION_PERCENTAGE |
|
|
|
|
|
def calculate_adapt_werr(adapted_wer, group): |
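    """Relative WER reduction of the adapted model over the per-group baseline WER, clipped at 0."""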
|
return max(BASELINE_ADAPTED_WER[group] - adapted_wer, 0) / BASELINE_ADAPTED_WER[group] |
|
|
|
|
|
def parse_results(filepath: str, dataset_type_col: str, exp_type: str) -> pd.DataFrame:
|
"""Calculate the scoring metric for each experiment |
|
|
|
Args: |
|
filepath: Path to the csv file containing the results |
|
dataset_type_col: Name of the column containing the dataset types |
|
exp_type: Type of experiments in the csv file |
|
|
|
Returns: |
|
        Dataframe of all the experiments with the computed scores
|
""" |
|
|
|
|
df = pd.read_csv(filepath) |
|
df.drop(columns=['Model', 'Model Size'], errors='ignore', inplace=True) |
|
|
|
if exp_type == 'finetuning': |
|
df['Frozen Module'] = df['Frozen Module'].replace('-', 'null') |
|
|
|
if 'Score' not in df: |
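        # Compute the composite metric for each run: Score = Original Scale * Adapt WERR.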
|
|
|
df['Original Scale'] = df.apply(lambda x: calculate_original_scale(x[ORIGINAL_TEST_WER_COLUMN]), axis=1) |
|
df['Adapt WERR'] = df.apply(lambda x: calculate_adapt_werr(x[TEST_WER_COLUMN], x[dataset_type_col]), axis=1) |
|
df['Score'] = df['Original Scale'] * df['Adapt WERR'] |
|
|
|
|
|
df = df.round({'Original Scale': 4, 'Adapt WERR': 4, 'Score': 4}) |
|
|
|
|
|
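    # Write the computed score columns back into the same CSV so they are reused on later runs.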
df.to_csv(filepath, index=False) |
|
|
|
return df |
|
|
|
|
|
def display_analysis_table(df_analysis: pd.DataFrame, key_info: dict): |
|
"""Display the analysis table used to select the best hyperparameter configuration |
|
|
|
Args: |
|
df_analysis: Dataframe of the analysis table |
|
key_info: Dictionary containing the name of the column and the attribute to use for analysis |
|
""" |
|
|
|
column_lengths = {x: max(len(x), df_analysis[x].map(str).apply(len).max()) for x in df_analysis.columns} |
|
|
|
print(' | '.join([f'{x:^{column_lengths[x]}}' for x in df_analysis.columns])) |
|
print('-' * sum([column_lengths[x] + 3 for x in df_analysis.columns])) |
|
|
|
for idx in range(len(df_analysis)): |
|
row_str = [] |
|
for column in df_analysis.columns: |
|
row_str.append(f'{df_analysis.iloc[idx][column]:^{column_lengths[column]}}') |
|
print(' | '.join(row_str)) |
|
|
|
|
|
def display_results(df_all: pd.DataFrame, category: str, best_config: pd.Series, dataset_type_col: str, exp_type: str): |
|
"""Display the Test and the Librispeech Test Other WER for the best configuration. |
|
|
|
Args: |
|
df_all: Dataframe of all the experiments |
|
category: Adapter position or frozen module in case of finetuning |
|
best_config: Best hyperparameter configurations |
|
dataset_type_col: Name of the column containing the dataset types |
|
exp_type: Type of experiments in the dataframe |
|
""" |
|
test_wer_values, ls_test_other_wer_values = [], [] |
|
|
|
print(f'{dataset_type_col:^25} | {TEST_WER_COLUMN:<20} | {ORIGINAL_TEST_WER_COLUMN:<20}') |
|
print('-' * 70) |
|
for dtype in df_all[dataset_type_col].unique(): |
|
df_filtered = df_all[(df_all[dataset_type_col] == dtype) & (df_all[EXP_CATEGORY_KEY[exp_type]] == category)] |
|
        for col in ADAPTER_HYPERPARAMETER_COLUMNS if exp_type == 'adapters' else FINETUNING_HYPERPARAMETER_COLUMNS:
|
df_filtered = df_filtered[df_filtered[col] == best_config[col]] |
|
|
|
if len(df_filtered) == 0: |
|
continue |
|
|
|
if len(df_filtered) > 1: |
|
            raise ValueError(f'More than one row found for dtype: {dtype} and category: {category}')
|
|
|
dtype_data = df_filtered.iloc[0] |
|
test_wer_values.append(dtype_data[TEST_WER_COLUMN]) |
|
ls_test_other_wer_values.append(dtype_data[ORIGINAL_TEST_WER_COLUMN]) |
|
print( |
|
f'{dtype_data[dataset_type_col]:^25} | {dtype_data[TEST_WER_COLUMN]:^20} | {dtype_data[ORIGINAL_TEST_WER_COLUMN]:^20}' |
|
) |
|
print('-' * 70) |
|
print(f'{"Average":^25} | {np.mean(test_wer_values):^20} | {np.mean(ls_test_other_wer_values):^20}') |
|
print('\n') |
|
|
|
|
|
def get_best_config( |
|
df_exp: pd.DataFrame, dataset_type_col: str, key_info: dict, topk: int, show_analysis: bool, exp_type: str, |
|
): |
|
"""Get the best hyperparameter configuration for a given subset of experiments. |
|
|
|
Args: |
|
df_exp: Dataframe of all experiments |
|
dataset_type_col: Name of the column containing the dataset types |
|
key_info: Dictionary containing the name of the column and the attribute to use for analysis |
|
topk: Number of top-k results to display |
|
show_analysis: Whether to display the analysis table |
|
exp_type: Type of experiments in the dataframe |
|
""" |
|
|
|
    hyperparameter_cols = ADAPTER_HYPERPARAMETER_COLUMNS if exp_type == 'adapters' else FINETUNING_HYPERPARAMETER_COLUMNS
|
|
|
|
|
analysis_columns = list(set([key_info['name'], TEST_WER_COLUMN, ORIGINAL_TEST_WER_COLUMN])) |
|
|
|
df_analyze = df_exp.drop( |
|
columns=[ |
|
x |
|
for x in df_exp.columns |
|
            if x not in set(hyperparameter_cols + [EXP_CATEGORY_KEY[exp_type]] + analysis_columns)
|
] |
|
) |
|
|
|
for category in df_exp[EXP_CATEGORY_KEY[exp_type]].unique(): |
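        # Average each metric over the dataset types for every hyperparameter combination in this category.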
|
|
|
df_category_mean = ( |
|
df_analyze[df_analyze[EXP_CATEGORY_KEY[exp_type]] == category] |
|
            .groupby(hyperparameter_cols, as_index=False)[analysis_columns]
|
.mean() |
|
) |
|
|
|
|
|
df_category_mean.sort_values( |
|
            by=key_info['name'], ascending=key_info['attribute'] is min, inplace=True
|
) |
|
|
|
print('=' * len(category)) |
|
print(category.upper()) |
|
print('=' * len(category) + '\n') |
|
|
|
if show_analysis: |
|
display_analysis_table(df_category_mean, key_info) |
|
print('\n') |
|
|
|
for idx in range(min(topk, len(df_category_mean))): |
|
print('-----') |
|
print(f'Top-{idx + 1}') |
|
print('-----') |
|
|
|
df_category_best = df_category_mean.iloc[idx] |
|
|
|
            print('\nHyperparameters')
|
print('---------------\n') |
|
            for hyperparameter in hyperparameter_cols + [key_info['name']]:
|
                print(f'{hyperparameter:<20}: {df_category_best[hyperparameter]}')
|
print() |
|
|
|
print('\nResults') |
|
print('-------\n') |
|
display_results(df_exp, category, df_category_best, dataset_type_col, exp_type) |
|
|
|
|
|
def analyze_results( |
|
df_exp: pd.DataFrame, |
|
fixed_hyperparameters: list, |
|
title: str, |
|
dataset_type_col: str, |
|
key_info: dict, |
|
topk: int, |
|
show_analysis: bool, |
|
exp_type: str, |
|
): |
|
"""Perform analysis on a given subset of experiments |
|
|
|
Args: |
|
df_exp: Dataframe of all experiments |
|
        fixed_hyperparameters: List of (hyperparameter, value) pairs to fix in the analysis
|
title: Title of the analysis (for logging) |
|
dataset_type_col: Name of the column containing the dataset types |
|
key_info: Dictionary containing the name of the column and the attribute to use for analysis |
|
topk: Number of top-k results to display |
|
show_analysis: Whether to display the analysis table |
|
exp_type: Type of experiments in the dataframe |
|
""" |
|
|
|
for hyperparameter_name, hyperparameter_value in fixed_hyperparameters: |
|
df_exp = df_exp[df_exp[hyperparameter_name] == hyperparameter_value] |
|
|
|
|
|
print('+' * len(title)) |
|
print(title) |
|
print('+' * len(title) + '\n') |
|
get_best_config(df_exp, dataset_type_col, key_info, topk, show_analysis, exp_type) |
|
print() |
|
|
|
|
|
def __validate_arg_type(arg): |
|
"""Validate the type of the command line argument value.""" |
|
dtype = float if '.' in arg else int |
|
try: |
|
return dtype(arg) |
|
except ValueError: |
|
return arg |
|
|
|
|
|
if __name__ == '__main__': |
|
parser = argparse.ArgumentParser() |
|
parser.add_argument('-c', '--csv', required=True, help='Path to the cleaned results CSV file') |
|
parser.add_argument( |
|
'-dtype', |
|
'--dataset_type_column', |
|
required=True, |
|
help='Name of the column containing the dataset type. Example: For SLR83 it is "Group", for GSC it is "Dataset Size"', |
|
) |
|
parser.add_argument( |
|
'-cargs', |
|
'--constrained_args', |
|
nargs=2, |
|
action='append', |
|
default=[], |
|
type=__validate_arg_type, |
|
help='Hyperparameters to fix for the constrained experiments', |
|
) |
|
parser.add_argument( |
|
'-uargs', |
|
'--unconstrained_args', |
|
nargs=2, |
|
action='append', |
|
default=[], |
|
type=__validate_arg_type, |
|
help='Hyperparameters to fix for the unconstrained experiments', |
|
) |
|
parser.add_argument('-k', '--topk', type=int, default=1, help='Number of top-k results to display') |
|
parser.add_argument( |
|
'-ft', '--finetuning', action='store_true', help='True if the CSV contains Finetuning experiments' |
|
) |
|
parser.add_argument( |
|
'-s', '--show_analysis', action='store_true', help='Show the key values of all the dataset types' |
|
) |
|
args = parser.parse_args() |
|
|
|
|
|
exp_type = 'finetuning' if args.finetuning else 'adapters' |
|
|
|
|
|
df = parse_results(args.csv, args.dataset_type_column, exp_type) |
|
|
|
|
|
analyze_results( |
|
df, |
|
args.constrained_args, |
|
'Constrained Experiment Results', |
|
args.dataset_type_column, |
|
CONSTRAINED_EXP_KEY, |
|
args.topk, |
|
args.show_analysis, |
|
exp_type, |
|
) |
|
|
|
|
|
analyze_results( |
|
df, |
|
args.unconstrained_args, |
|
'Unconstrained Experiment Results', |
|
args.dataset_type_column, |
|
UNCONSTRAINED_EXP_KEY, |
|
args.topk, |
|
args.show_analysis, |
|
exp_type, |
|
) |
|
|