import multiprocessing
import os
from itertools import repeat

import numpy as np
import pandas as pd

from models import fastaiModel
from models.wavelet import WaveletModel
from utilities.utils import *

class SCPExperiment:
    """
    Experiment on SCP-ECG statements.
    All SCP-based experiments are performed and evaluated in the same way.
    """

    def __init__(self, experiment_name, task, data_folder, output_folder, models,
                 sampling_frequency=100, min_samples=0, train_fold=8, val_fold=9,
                 test_fold=10, folds_type='strat'):
        self.models = models
        self.min_samples = min_samples
        self.task = task
        self.train_fold = train_fold
        self.val_fold = val_fold
        self.test_fold = test_fold
        self.folds_type = folds_type
        self.experiment_name = experiment_name
        self.output_folder = output_folder
        self.data_folder = data_folder
        self.sampling_frequency = sampling_frequency

        # create the experiment folder structure if needed
        os.makedirs(self.output_folder + self.experiment_name, exist_ok=True)
        os.makedirs(self.output_folder + self.experiment_name + '/results/', exist_ok=True)
        os.makedirs(self.output_folder + self.experiment_name + '/models/', exist_ok=True)
        os.makedirs(self.output_folder + self.experiment_name + '/data/', exist_ok=True)
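
        # Resulting layout under output_folder (a sketch; the individual
        # files are written by prepare(), perform() and evaluate() below):
        #   <experiment_name>/
        #       data/      y_train.npy, y_val.npy, y_test.npy, ...
        #       models/    <model_name>/y_*_pred.npy, <model_name>/results/
        #       results/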

    def prepare(self):
        # Load PTB-XL data
        self.data, self.raw_labels = load_dataset(self.data_folder, self.sampling_frequency)
        # Preprocess label data
        self.labels = compute_label_aggregations(self.raw_labels, self.data_folder, self.task)
        # Select relevant data and convert to one-hot
        self.data, self.labels, self.Y, _ = select_data(
            self.data, self.labels, self.task, self.min_samples,
            self.output_folder + self.experiment_name + '/data/')
        self.input_shape = self.data[0].shape

        # test fold (default: fold 10) for testing
        self.X_test = self.data[self.labels.strat_fold == self.test_fold]
        self.y_test = self.Y[self.labels.strat_fold == self.test_fold]
        # validation fold (default: fold 9) for validation
        self.X_val = self.data[self.labels.strat_fold == self.val_fold]
        self.y_val = self.Y[self.labels.strat_fold == self.val_fold]
        # remaining folds (default: folds 1-8) for training
        self.X_train = self.data[self.labels.strat_fold <= self.train_fold]
        self.y_train = self.Y[self.labels.strat_fold <= self.train_fold]

        # Preprocess signal data
        self.X_train, self.X_val, self.X_test = preprocess_signals(
            self.X_train, self.X_val, self.X_test,
            self.output_folder + self.experiment_name + '/data/')
        self.n_classes = self.y_train.shape[1]

        # save train, validation and test labels
        self.y_train.dump(self.output_folder + self.experiment_name + '/data/y_train.npy')
        self.y_val.dump(self.output_folder + self.experiment_name + '/data/y_val.npy')
        self.y_test.dump(self.output_folder + self.experiment_name + '/data/y_test.npy')

        # naive baseline: predict the training-set class prevalences for every sample
        model_name = 'naive'
        mpath = self.output_folder + self.experiment_name + '/models/' + model_name + '/'
        # create folder for model outputs
        os.makedirs(mpath, exist_ok=True)
        os.makedirs(mpath + 'results/', exist_ok=True)
        mean_y = np.mean(self.y_train, axis=0)
        np.array([mean_y] * len(self.y_train)).dump(mpath + 'y_train_pred.npy')
        np.array([mean_y] * len(self.y_val)).dump(mpath + 'y_val_pred.npy')
        np.array([mean_y] * len(self.y_test)).dump(mpath + 'y_test_pred.npy')
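
        # For intuition (illustrative numbers, not taken from the dataset):
        # with one-hot training labels [[1, 0], [1, 1], [0, 0]], mean_y is
        # [0.67, 0.33], and that single vector is dumped as the prediction
        # for every train/val/test sample -- a prevalence baseline that any
        # trained model should beat.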

    def perform(self):
        for model_description in self.models:
            model_name = model_description['model_name']
            model_type = model_description['model_type']
            model_params = model_description['parameters']

            mpath = self.output_folder + self.experiment_name + '/models/' + model_name + '/'
            # create folder for model outputs
            os.makedirs(mpath, exist_ok=True)
            os.makedirs(mpath + 'results/', exist_ok=True)

            n_classes = self.Y.shape[1]
            # load the respective model
            if model_type == 'WAVELET':
                model = WaveletModel(model_name, n_classes, self.sampling_frequency, mpath,
                                     self.input_shape, **model_params)
            elif model_type == 'FastaiModel':
                model = fastaiModel.FastaiModel(model_name, n_classes, self.sampling_frequency,
                                                mpath, self.input_shape, **model_params)
            else:
                raise ValueError('unknown model_type: ' + str(model_type))

            # fit model
            model.fit(self.X_train, self.y_train, self.X_val, self.y_val)
            # predict and dump
            model.predict(self.X_train).dump(mpath + 'y_train_pred.npy')
            model.predict(self.X_val).dump(mpath + 'y_val_pred.npy')
            model.predict(self.X_test).dump(mpath + 'y_test_pred.npy')

        # create ensemble predictions via a simple mean across all model
        # predictions (excluding the naive baseline)
        model_name = 'ensemble'
        ensemblepath = self.output_folder + self.experiment_name + '/models/' + model_name + '/'
        # create folder for model outputs
        os.makedirs(ensemblepath, exist_ok=True)
        os.makedirs(ensemblepath + 'results/', exist_ok=True)
        # load all predictions
        ensemble_train, ensemble_val, ensemble_test = [], [], []
        for model_description in os.listdir(self.output_folder + self.experiment_name + '/models/'):
            if model_description not in ['ensemble', 'naive']:
                mpath = self.output_folder + self.experiment_name + '/models/' + model_description + '/'
                ensemble_train.append(np.load(mpath + 'y_train_pred.npy', allow_pickle=True))
                ensemble_val.append(np.load(mpath + 'y_val_pred.npy', allow_pickle=True))
                ensemble_test.append(np.load(mpath + 'y_test_pred.npy', allow_pickle=True))
        # dump mean predictions
        np.array(ensemble_train).mean(axis=0).dump(ensemblepath + 'y_train_pred.npy')
        np.array(ensemble_val).mean(axis=0).dump(ensemblepath + 'y_val_pred.npy')
        np.array(ensemble_test).mean(axis=0).dump(ensemblepath + 'y_test_pred.npy')
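
        # Each entry of ensemble_* has shape (n_samples, n_classes), so the
        # mean over axis 0 of the stacked array is the unweighted average of
        # the per-model probability estimates, written out in the same
        # y_*_pred.npy format the individual models use.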

    def evaluate(self, n_bootstrapping_samples=100, n_jobs=20, bootstrap_eval=False, dumped_bootstraps=True):
        # get labels
        y_train = np.load(self.output_folder + self.experiment_name + '/data/y_train.npy', allow_pickle=True)
        y_val = np.load(self.output_folder + self.experiment_name + '/data/y_val.npy', allow_pickle=True)
        y_test = np.load(self.output_folder + self.experiment_name + '/data/y_test.npy', allow_pickle=True)

        # if bootstrapping, generate (or reload previously dumped) sample
        # indices for each split
        if bootstrap_eval:
            if not dumped_bootstraps:
                train_samples = np.array(get_appropriate_bootstrap_samples(y_train, n_bootstrapping_samples))
                val_samples = np.array(get_appropriate_bootstrap_samples(y_val, n_bootstrapping_samples))
                test_samples = np.array(get_appropriate_bootstrap_samples(y_test, n_bootstrapping_samples))
            else:
                train_samples = np.load(self.output_folder + self.experiment_name + '/train_bootstrap_ids.npy',
                                        allow_pickle=True)
                val_samples = np.load(self.output_folder + self.experiment_name + '/val_bootstrap_ids.npy',
                                      allow_pickle=True)
                test_samples = np.load(self.output_folder + self.experiment_name + '/test_bootstrap_ids.npy',
                                       allow_pickle=True)
        else:
            # a single index "sample" covering each full split (point estimates only)
            train_samples = np.array([range(len(y_train))])
            val_samples = np.array([range(len(y_val))])
            test_samples = np.array([range(len(y_test))])

        # store samples for future evaluations
        train_samples.dump(self.output_folder + self.experiment_name + '/train_bootstrap_ids.npy')
        val_samples.dump(self.output_folder + self.experiment_name + '/val_bootstrap_ids.npy')
        test_samples.dump(self.output_folder + self.experiment_name + '/test_bootstrap_ids.npy')

        # iterate over all models fitted so far
        for m in sorted(os.listdir(self.output_folder + self.experiment_name + '/models')):
            print(m)
            mpath = self.output_folder + self.experiment_name + '/models/' + m + '/'
            rpath = mpath + 'results/'

            # load predictions
            y_train_pred = np.load(mpath + 'y_train_pred.npy', allow_pickle=True)
            y_val_pred = np.load(mpath + 'y_val_pred.npy', allow_pickle=True)
            y_test_pred = np.load(mpath + 'y_test_pred.npy', allow_pickle=True)

            if self.experiment_name == 'exp_ICBEB':
                # compute classwise thresholds such that the recall-focused Gbeta score is optimized
                thresholds = find_optimal_cutoff_thresholds_for_Gbeta(y_train, y_train_pred)
            else:
                thresholds = None

            pool = multiprocessing.Pool(n_jobs)

            # training set: point estimate plus bootstrap mean and empirical
            # 90% interval (5th/95th percentiles)
            tr_df = pd.concat(pool.starmap(generate_results,
                                           zip(train_samples, repeat(y_train), repeat(y_train_pred),
                                               repeat(thresholds))))
            tr_df_point = generate_results(range(len(y_train)), y_train, y_train_pred, thresholds)
            tr_df_result = pd.DataFrame(
                np.array([
                    tr_df_point.mean().values,
                    tr_df.mean().values,
                    tr_df.quantile(0.05).values,
                    tr_df.quantile(0.95).values]),
                columns=tr_df.columns,
                index=['point', 'mean', 'lower', 'upper'])

            # validation set
            val_df = pd.concat(pool.starmap(generate_results,
                                            zip(val_samples, repeat(y_val), repeat(y_val_pred),
                                                repeat(thresholds))))
            val_df_point = generate_results(range(len(y_val)), y_val, y_val_pred, thresholds)
            val_df_result = pd.DataFrame(
                np.array([
                    val_df_point.mean().values,
                    val_df.mean().values,
                    val_df.quantile(0.05).values,
                    val_df.quantile(0.95).values]),
                columns=val_df.columns,
                index=['point', 'mean', 'lower', 'upper'])

            # test set
            te_df = pd.concat(pool.starmap(generate_results,
                                           zip(test_samples, repeat(y_test), repeat(y_test_pred),
                                               repeat(thresholds))))
            te_df_point = generate_results(range(len(y_test)), y_test, y_test_pred, thresholds)
            te_df_result = pd.DataFrame(
                np.array([
                    te_df_point.mean().values,
                    te_df.mean().values,
                    te_df.quantile(0.05).values,
                    te_df.quantile(0.95).values]),
                columns=te_df.columns,
                index=['point', 'mean', 'lower', 'upper'])

            pool.close()

            # dump results
            tr_df_result.to_csv(rpath + 'tr_results.csv')
            val_df_result.to_csv(rpath + 'val_results.csv')
            te_df_result.to_csv(rpath + 'te_results.csv')
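

# A minimal usage sketch: the data/output paths, the experiment/task names,
# and the model configuration below are hypothetical placeholders, not fixed
# anywhere in this file; the task string is assumed to be one understood by
# compute_label_aggregations().
if __name__ == '__main__':
    models = [{'model_name': 'fastai_resnet1d_wang',  # hypothetical config name
               'model_type': 'FastaiModel',
               'parameters': {}}]
    exp = SCPExperiment(experiment_name='exp0',
                        task='diagnostic',
                        data_folder='./data/ptbxl/',
                        output_folder='./output/',
                        models=models)
    exp.prepare()
    exp.perform()
    exp.evaluate(bootstrap_eval=False)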