File size: 7,919 Bytes
041508c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import numpy as np
from tqdm import tqdm


def _age_group(age):
    """Return the age-bracket label for ``age``, or None if no bracket matches (e.g. NaN)."""
    if age < 20:
        return '<20'
    if age < 40:
        return '20-40'
    if age < 60:
        return '40-60'
    if age < 80:
        return '60-80'
    if age >= 80:
        return '>=80'
    # Every comparison is False for NaN, so such ages contribute no label.
    return None


def stratify_df(df, new_col_name, n_folds=10, nr_clean_folds=0):
    """Assign every record of ``df`` to a patient-wise stratified fold.

    All records of one patient end up in the same fold. Stratification labels
    per patient are the keys of the ``scp_codes`` dicts, the gender (taken from
    the patient's first record, repeated once per record) and one age bracket
    per record.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``patient_id``, ``scp_codes`` (dict-like per
        row; only the keys are used), ``sex`` (0 is treated as male, anything
        else as female) and ``age``. The optional column ``validated_by_human``
        determines the quality of a record.
    new_col_name : str
        Name of the column that receives the fold number.
    n_folds : int
        Number of folds; every fold gets the target ratio ``1 / n_folds``.
    nr_clean_folds : int
        Forwarded to ``stratify``: number of trailing folds reserved for
        patients whose records all have quality >= 1.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place: column ``quality`` (0/1) is
        added and ``new_col_name`` holds fold numbers starting at 1. Rows of
        patients that were not assigned to any fold keep the value 0.
    """
    # compute qualities as described in PTB-XL report:
    # 1 if the record is flagged as validated by a human, 0 otherwise
    if 'validated_by_human' in df.columns:
        df['quality'] = [1 if validated else 0 for validated in df['validated_by_human']]
    else:
        df['quality'] = np.zeros(len(df), dtype=int)

    # create stratified folds according to patients
    pat_ids = np.array(sorted(set(df.patient_id.values)))
    p_labels = []
    p_qualities = []
    ecgs_per_patient = []

    for pid in tqdm(pat_ids):
        sel = df[df.patient_id == pid]

        codes = [code for scp in sel.scp_codes.values for code in scp.keys()]
        gender = 'male' if sel.sex.values[0] == 0 else 'female'
        age_groups = [group for group in map(_age_group, sel.age.values) if group is not None]

        # Build the label array once per patient instead of re-concatenating per record.
        p_labels.append(np.array(codes + [gender] * len(sel) + age_groups))
        ecgs_per_patient.append(len(sel))
        # A patient is only as clean as the worst of their records.
        p_qualities.append(sel.quality.min())
    classes = sorted(set(item for labels in p_labels for item in labels))

    stratified_data_ids, _ = stratify(p_labels, classes, [1 / n_folds] * n_folds, p_qualities,
                                      ecgs_per_patient, nr_clean_folds)

    df[new_col_name] = np.zeros(len(df), dtype=int)
    for fold_nr, fold_ids in enumerate(tqdm(stratified_data_ids), start=1):
        # stratify() returns positions into pat_ids; map them back to patient ids.
        fold_pat_ids = [pat_ids[pos] for pos in fold_ids]
        # Single .loc assignment: chained indexing (df[col][mask] = ...) may write
        # to a temporary copy and is a no-op under pandas Copy-on-Write.
        df.loc[df.patient_id.isin(fold_pat_ids), new_col_name] = fold_nr

    return df


def stratify(data, classes, ratios, qualities, ecgs_per_patient, nr_clean_folds=1):
    """Distribute patients over folds with iterative multi-label stratification.

    Modified from https://vict0rs.ch/2018/05/24/sample-multilabel-dataset/ (based on Sechidis 2011).

    Parameters
    ----------
    data : sequence of sequences
        ``data[i]`` holds all labels of patient ``i``; a label may occur several
        times. Every label must be contained in ``classes``.
    classes : sequence
        All labels that can occur in ``data``.
    ratios : sequence of float
        Target share of each fold (expected to sum to 1).
    qualities : sequence
        Quality per patient. Patients with quality < 1 are never placed in one
        of the last ``nr_clean_folds`` folds.
    ecgs_per_patient : sequence of int
        Number of ECG records per patient; used to track fold sizes.
    nr_clean_folds : int
        Number of trailing folds that only accept patients with quality >= 1.

    Returns
    -------
    stratified_data_ids : list of list
        For every fold the sorted indices (into ``data``) of its patients.
    stratified_data : list of list
        For every fold the label sequences of its patients, in the same order.

    Raises
    ------
    ValueError
        If a patient with quality < 1 has to be assigned but
        ``nr_clean_folds`` leaves no fold open for such patients.

    Notes
    -----
    Patients without any label are never assigned to a fold. The function
    reseeds NumPy's global random generator with 0 on every call, so the result
    is deterministic.
    """
    np.random.seed(0)  # fix the random seed (this reseeds NumPy's global generator)

    n_folds = len(ratios)
    # Low-quality patients may only use the first n_open_folds folds. Clamp at zero:
    # a negative slice bound would silently re-open an arbitrary prefix of the folds.
    n_open_folds = max(n_folds - nr_clean_folds, 0)

    # size is the number of ecgs that still have to be distributed
    size = np.sum(ecgs_per_patient)

    # Organize data per label: for each label l, per_label_data[l] contains the list of patients
    # in data which have this label (potentially multiple identical entries)
    per_label_data = {c: [] for c in classes}
    for i, d in enumerate(data):
        for l in d:
            per_label_data[l].append(i)

    # In order not to compute lengths each time, they are tracked here.
    subset_sizes = [r * size for r in ratios]  # remaining capacity per fold, in ecgs
    # remaining capacity per label and fold, in label occurrences
    per_label_subset_sizes = {c: [r * len(per_label_data[c]) for r in ratios] for c in classes}

    # For each subset we want, the set of sample-ids which should end up in it
    stratified_data_ids = [set() for _ in range(n_folds)]

    print("Assigning patients to folds...")
    size_prev = size + 1  # just for output
    while size > 0:
        # Report progress whenever the remaining size drops below another multiple of 1000.
        if int(size_prev / 1000) > int(size / 1000):
            print("Remaining patients/ecgs to distribute:", size, "non-empty labels:",
                  np.sum([1 for l, label_data in per_label_data.items() if len(label_data) > 0]))
        size_prev = size

        # Number of still unassigned occurrences per label (labels without any are skipped).
        lengths = {l: len(label_data) for l, label_data in per_label_data.items() if label_data}
        if not lengths:
            # Only patients without labels remain; they cannot be stratified.
            # "No label" could be a class in itself: it's up to you to format your data accordingly.
            break
        # Rarest remaining label first (ties: first label in `classes` order).
        label = min(lengths, key=lengths.get)

        # Patients carrying this label, ordered by how often they carry it (descending).
        unique_samples, unique_counts = np.unique(per_label_data[label], return_counts=True)
        idxs_sorted = np.argsort(unique_counts, kind='stable')[::-1]

        for current_id in unique_samples[idxs_sorted]:
            # Re-read on every iteration: the list is updated in place after each assignment.
            subset_sizes_for_label = per_label_subset_sizes[label]

            # if quality is bad remove clean folds (i.e. sample cannot be assigned to clean folds)
            if qualities[current_id] < 1:
                if n_open_folds == 0:
                    raise ValueError(
                        "nr_clean_folds=%d leaves none of the %d folds open for patients with quality < 1"
                        % (nr_clean_folds, n_folds))
                subset_sizes_for_label = subset_sizes_for_label[:n_open_folds]

            # Find argmax clj i.e. subset in greatest need of the current label
            label_need = np.asarray(subset_sizes_for_label)
            largest_subsets = np.flatnonzero(label_need == label_need.max())

            if len(largest_subsets) == 1:
                # if there is a single best choice: assign it
                subset = largest_subsets[0]
            else:
                # Otherwise prefer the candidate in greatest need of ecgs overall;
                # remaining ties are broken randomly (seeded above).
                total_need = np.array(subset_sizes)[largest_subsets]
                largest_subsets2 = np.flatnonzero(total_need == total_need.max())
                subset = largest_subsets[np.random.choice(largest_subsets2)]

            # Store the sample's id in the selected subset
            stratified_data_ids[subset].add(current_id)

            # All ecgs of this patient are now distributed and occupy the selected fold.
            size -= ecgs_per_patient[current_id]
            subset_sizes[subset] -= ecgs_per_patient[current_id]

            # The selected fold needs one occurrence less of every label the patient carries.
            for l in data[current_id]:
                per_label_subset_sizes[l][subset] -= 1

            # Remove the patient from the pool. Only the lists of the patient's own labels can
            # contain it, so the other labels do not need to be scanned.
            for l in set(data[current_id]):
                per_label_data[l] = [y for y in per_label_data[l] if y != current_id]

    # Create the stratified dataset as a list of subsets, each containing the original labels
    stratified_data_ids = [sorted(strat) for strat in stratified_data_ids]
    stratified_data = [
        [data[i] for i in strat] for strat in stratified_data_ids
    ]

    # Return both the stratified indexes, to be used to sample the `features` associated with your labels
    # And the stratified labels dataset
    return stratified_data_ids, stratified_data