deep-multipit/scripts/_utils.py at main · sysbio-curie/deep-multipit · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
import inspect
import os
import sys
import warnings

import numpy as np
import pandas as pd
from joblib import Parallel
from lifelines.statistics import logrank_test
from tqdm.auto import tqdm

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

from dmultipit.base.base_dataset import CustomSubset, check_transform
import dmultipit.dataset.dataset as module_data
import dmultipit.model.model as module_arch
import dmultipit.model.attentions as module_att
import dmultipit.model.embeddings as module_emb


def get_dataset(
        labels,
        list_raw_data,
        dataset_name,
        list_unimodal_processings,
        multimodal_processing,
        indexes,
        drop_modas=False,
        keep_unlabelled=False,
        radiomics=None,
        rad_transform=None,
):
    """
    Create multimodal dataset from raw data

    Parameters
    ----------
    labels: 1D numpy array or pandas serie (n_samples,)
        Label for each sample. It should be ordered in the same way as the raw data sets provied in *list_raw_data*.

    list_raw_data: list of numpy arrays or pandas dataframes of shape (n_samples, n_features_1),
        (n_samples, n_features_2)... The rows of the different datasets should be ordered in the same way (i.e., same
        order of samples each time).

    dataset_name: str
        Name of the Dataset object to consider. It should refer to an object from dmultipit.dataset.dataset and
        inheritating from dmultipit.base.base_dataset.MultiModalDataset.

    list_unimodal_processings: list of dictionaries, sklearn.base.TransformerMixin and None
        List of processing operations to apply to each modality separately.
        * If None no operation is performed for the corresponding modality
        * If dictionary it should define a processing strategy to be fitted on the data
         (e.g., {'scaling': {'with_std': True}, 'PCA': {'n_components': 0.9}} could define a standard scaling
         operations followed by a PCA (to be defined in the fit_process method !))
        * If sklearn.base.TransformerMixin it should correspond to a fitted transformer !

    multimodal_processing: dict, sklearn.base.TransformerMixin and None
        Processing operations to apply to the multimodal data set.
        * If None no operation is performed
        * If dictionary it should define a processing strategy to be fitted on the data (to be defined in the
         fit_multimodal_process method !)
        * If sklearn.base.TransformerMixin it should correspond to a fitted transformer !

    indexes: list of string or None
        List of sample indexes to consider in the data set. If None the function does not return a Dataset (only None
        value).

    drop_modas: bool
        If True, random droping of modalities will be applied to each sample (i.e., data augmentation).
        See dmultipit.dataset.dataset.DropModalities. The default is False

    keep_unlabelled: bool
        If True, keep unlabelled samples in the dataset. Discard them otherwise.
        See dmultipit.base.base_dataset.MultiModalDataset. The default is False.

    radiomics: int or None
        Position index of radiomics data within the list of raw data sets. If None, it is assumed no radiomic data
        were included in the list of raw data. This argument is only taken into accound when dataset_name is
        'MSKCCDataSet'. The default is None.

    rad_transform: dict or _transformers.MSKCCRadiomicsTransform object, or None
        Transformer for radiomic data. This argument is only taken into accound when dataset_name is 'MSKCCDataSet'.
        The default is None.

    Returns
    -------
    data_set: MultiModalDataset object or None
        A None value is returned when no indexes were passed as input.

    bool_mask_missing: boolean array of size (n_samples,) or None
        A None value is returned when no indexes were passed as input. Otherwise, indicate samples with only NaN values.
    """

    if indexes is not None:

        # select indexes in raw data sets and discard all samples with only missing values
        data_sets, labels, bool_mask_missing = _select_indexes(list_raw_data, labels, indexes)

        # create MultiModalDataset
        if dataset_name == "MSKCCDataset":

            if (multimodal_processing is not None) or keep_unlabelled:
                warnings.warn("multimodal_processing is not used with MSKCC dataset and will be ignored as well as "
                              "keep_unlabelled.")

            dataset = getattr(module_data, dataset_name)(
                list_raw_data=data_sets,
                labels=labels,
                list_unimodal_processings=list_unimodal_processings,
                radiomics=radiomics,
                rad_transform=rad_transform,
            )
        else:
            dataset = getattr(module_data, dataset_name)(
                list_raw_data=data_sets,
                labels=labels,
                list_unimodal_processings=list_unimodal_processings,
                multimodal_processing=multimodal_processing,
                keep_unlabelled=keep_unlabelled,
            )

        if drop_modas:
            setattr(dataset, "transform", module_data.DropModalities())

    else:
        dataset, bool_mask_missing = None, None

    return dataset, bool_mask_missing


def train_test_split(
        train_index,
        test_index,
        labels,
        list_raw_data,
        dataset_name,
        list_unimodal_processings,
        multimodal_processing,
        drop_modas,
        keep_unlabelled,
        radiomics=None,
        rad_transform=None,
):
    """
    Create training and test data for a given train-test split. Unimodal and multimodal processings are fitted to the
    training data and subsequently applied to the test data.

    Parameters
    ----------
    train_index: list of int.
        Indexes for training data.

    test_index: list of int.
        Indexes for test data.

    Returns
    -------
    dataset_train: MultiModalDataset object
        Training dataset with labelled data.

    dataset_train_unlabelled: MultiModalDataset object or None
        Training dataset with unlabelled data. None if self.keep_unlabelled is False.

    dataset_test: MultiModalDataset object
        Test dataset.

    bool_mask_test: boolean array of shape (n_test_samples,)
        Indicate samples with only NaN values.
    """

    # create training dataset
    dataset_train, _ = get_dataset(
        labels,
        list_raw_data,
        dataset_name,
        list_unimodal_processings,
        multimodal_processing,
        train_index,
        drop_modas,
        keep_unlabelled,
        radiomics,
        rad_transform,
    )

    # create training dataset with unlabelled data if needed (for semi-supervised strategy)
    dataset_train_unlabelled = None

    if keep_unlabelled:
        if (dataset_train.unlabelled_data is not None) and (
                len(dataset_train.unlabelled_data) > 0
        ):
            dataset_train_unlabelled = CustomSubset(
                dataset_train, dataset_train.unlabelled_data
            )
            dataset_train = CustomSubset(
                dataset_train,
                list(
                    set(range(len(dataset_train))) - set(dataset_train.unlabelled_data)
                ),
            )
        else:
            warnings.warn(
                "Training data contains no unlabelled data. dataset_train_unlabelled is set to None"
            )

    # create test dataset with fitted unimodal and multimodal processings from training data
    dataset_test, bool_mask_test = get_dataset(
        labels,
        list_raw_data,
        dataset_name,
        dataset_train.list_unimodal_processings,
        dataset_train.multimodal_processing,
        test_index,
        keep_unlabelled=False,
        radiomics=radiomics,
        rad_transform=dataset_train.rad_transform
        if rad_transform is not None
        else None,
    )

    return dataset_train, dataset_train_unlabelled, dataset_test, bool_mask_test


def train_val_test_split(
        train_index,
        test_index,
        val_index,
        labels,
        list_raw_data,
        dataset_name,
        list_unimodal_processings,
        multimodal_processing,
        drop_modas,
        keep_unlabelled,
        radiomics=None,
        rad_transform=None,
):
    """
    Create training, validation and test data for a given train-val-test split. Unimodal and multimodal processings are
    fitted to the training data and subsequently applied to the validation and test data.

    Parameters
    ----------
    train_index: list of int
        Indexes for training data.

    test_index: list of int
        Indexes for test data.

    val_index: list of int
        Indexes for validation data.

    Returns
    -------
    dataset_train: MultiModalDataset object
        Training dataset with labelled data.

    dataset_train_unlabelled: MultiModalDataset object or None
        Training dataset with unlabelled data.

    dataset_val: MultiModalDataset object
        Validation dataset.

    dataset_test: MultiModalDataset object
        Test dataset.

    bool_mask_test: boolean array of shape (n_test_samples,)
        Indicate samples with only NaN values.

    bool_mask_train: boolean array of shape (n_val_samples,)
        Indicate samples with only NaN values.
    """

    # create training dataset
    dataset_train, bool_mask_train = get_dataset(
        labels,
        list_raw_data,
        dataset_name,
        list_unimodal_processings,
        multimodal_processing,
        train_index,
        drop_modas,
        keep_unlabelled,
        radiomics,
        rad_transform,
    )

    # create training dataset with unlabelled data if needed (for semi-supervised strategy)
    dataset_train_unlabelled = None

    if keep_unlabelled:
        if (dataset_train.unlabelled_data is not None) and (
                len(dataset_train.unlabelled_data) > 0
        ):
            dataset_train_unlabelled = CustomSubset(
                dataset_train, dataset_train.unlabelled_data
            )
            dataset_train = CustomSubset(
                dataset_train,
                list(
                    set(range(len(dataset_train))) - set(dataset_train.unlabelled_data)
                ),
            )
        else:
            warnings.warn(
                "Training data contains no unlabelled data. dataset_train_unlabelled is set to None"
            )

    # create validation dataset with fitted unimodal and multimodal processings from training data
    dataset_val, _ = get_dataset(
        labels,
        list_raw_data,
        dataset_name,
        dataset_train.list_unimodal_processings,
        dataset_train.multimodal_processing,
        val_index,
        keep_unlabelled=False,
        radiomics=radiomics,
        rad_transform=dataset_train.rad_transform
        if rad_transform is not None
        else None,
    )

    # create test dataset with fitted unimodal and multimodal processings from training data
    dataset_test, bool_mask_test = get_dataset(
        labels,
        list_raw_data,
        dataset_name,
        dataset_train.list_unimodal_processings,
        dataset_train.multimodal_processing,
        test_index,
        keep_unlabelled=False,
        radiomics=radiomics,
        rad_transform=dataset_train.rad_transform
        if rad_transform is not None
        else None,
    )

    return (
        dataset_train,
        dataset_train_unlabelled,
        dataset_val,
        dataset_test,
        bool_mask_test,
        bool_mask_train,
    )


def build_model(config_dict, device, training_data=None, logger=None):
    """
    Build multimodal predictive model from configuration dictionary

    Parameters
    ----------
    config_dict: dict

    device: str
        Torch.device on which to allocate model weights

    training_data: dmultipit.base.base_dataset.MultiModalDataset object or None
        Training data set. If None, the architecture of the model is not updated. The default is None.

    logger: logging device or None
        The default is None.

    Returns
    -------
    model: dmultipit.model.model.InterAttentionFusion or dmultipit.model.model.LateAttentionFusion

    """
    # update architecture (after training pre-processings)
    if training_data is not None:
        config_dict = _update_architecture(config_dict, training_data)

    # build embedding modules for each modality
    embeddings = [
        config_dict.init_obj(
            ("architecture", "modality_embeddings", modality), module_emb
        )
        for modality in config_dict["architecture"]["order"]
    ]

    if config_dict["architecture"]["intermediate_fusion"]:

        # build intermediate fusion model (see dmultipit.model.model)
        model = module_arch.InterAttentionFusion(
            modality_embeddings=embeddings,
            attention=config_dict.init_obj(["architecture", "attention"], module_att),
            predictor=config_dict.init_obj(["architecture", "predictor"], module_emb),
        )
    else:

        # build late fusion model (see dmultipit.model.model), where attention modules have the same dim input as
        # embedding modules for the different modalities
        model = module_arch.LateAttentionFusion(
            modality_embeddings=embeddings,
            multimodalattention=config_dict.init_obj(
                ["architecture", "attention"],
                module_att,
                dim_input=[
                    config_dict["architecture"]["modality_embeddings"][m]["args"][
                        "dim_input"
                    ]
                    for m in config_dict["architecture"]["order"]
                ],
            ),
        )

    if logger is not None:
        logger.info(model)

    model = model.to(device)

    return model


def _update_architecture(config_dict, training_data):
    """
    Update configuration dictionary (i.e., model architecture) to take into account pre-processing operations (e.g.,
    changes in the input dimensions)

    Parameters
    ----------
    config_dict: dict
        Configuration dictionary

    training_data: dmultipit.base.base_dataset.MultiModalDataset object
        Training data set

    Returns
    -------
    config_dict: dict
        Updated configuration dictionary
    """

    # If multimodal pre-processing is performed (last processing operations) use the get_multimodal_dimension method
    # of the dmultipit.base.base_transformer.MultimodalTransformer estimator to extract the updated dimension of each
    # modality
    if (len(config_dict["architecture"]["order"]) > 1) and (training_data.multimodal_processing is not None):
        list_new_dim = training_data.multimodal_processing.get_multimodal_dimension()
        for new, moda in zip(list_new_dim, config_dict["architecture"]["order"]):
            config_dict["architecture"]["modality_embeddings"][moda]["args"]["dim_input"] = new

    # Otherwise, use the get_dimension method of the last dmultimot.base.base_transformer.MultimodalTransformer applied
    # to each modality to obtain the updated input dimensions.
    else:
        for process, moda in zip(training_data.list_unimodal_processings, config_dict["architecture"]["order"]):
            if check_transform(process):
                classes = inspect.getmro(process.__class__)
                if classes[0].__name__ == "Pipeline":
                    process = process[-1]
                new = process.get_dimension()
                config_dict["architecture"]["modality_embeddings"][moda]["args"]["dim_input"] = new

            # special case for MKSCC dataset and radiomic transform (when no unimodal processing is applied)
            elif config_dict["MSKCC"] and (moda in {"radiomics_PL", "radiomics_PC", "radiomics_LN"}):
                new = len(training_data.rad_transform.selected_features_[moda.split("_")[1]])
                config_dict["architecture"]["modality_embeddings"][moda]["args"]["dim_input"] = new

    return config_dict


class ProgressParallel(Parallel):
    """Custom tqdm progress bar for parallel computing"""

    def __init__(self, use_tqdm=True, total=None, *args, **kwargs):
        self._use_tqdm = use_tqdm
        self._total = total
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        with tqdm(disable=not self._use_tqdm, total=self._total) as self._pbar:
            return Parallel.__call__(self, *args, **kwargs)

    def print_progress(self):
        if self._total is None:
            self._pbar.total = self.n_dispatched_tasks
        self._pbar.n = self.n_completed_tasks
        self._pbar.refresh()


def find_logrank_threshold(risk_score, labels_surv):
    """
    Find cutoff value that maximize logrank test (searching between 30th and 71st percentiles of provided risk scores)

    Parameters
    ----------
    risk_score: 1D array of shape (n_samples,)
        Predicted survival risk score.

    labels_surv: sksurv.util.Surv array of shape (n_samples,)
        Structured array containing event indicators (i.e., censored or not) and observed times.

    Returns
    -------
        float, optimal cutoff value
    """
    cutoffs, pvals = [], []
    for p in np.arange(30, 71):
        c = np.percentile(risk_score, p)
        group1 = risk_score <= c
        group2 = risk_score > c
        test = logrank_test(
            durations_A=labels_surv[group1]["time"],
            durations_B=labels_surv[group2]["time"],
            event_observed_A=1 * (labels_surv[group1]["event"]),
            event_observed_B=1 * (labels_surv[group2]["event"]),
        )
        cutoffs.append(c)
        pvals.append(test.summary["p"].values[0])
    return cutoffs[np.argmin(pvals)]


def _select_indexes(list_raw_data, labels, indexes):
    """
    Select a subset of indexes in a list of raw datasets and discard samples with only missing values

    Note
    ----
    The main difficulty that makes the function quite complex is for dealing with radiomic data from MSKCC dataset
    which are loaded as a tuple of dataframe (radiomic data) and indexes (all samples included in the analysis, even
    those with missing radiomic modalities).
    """

    # select indexes in raw data sets
    data_sets = []
    missing_modalities = np.zeros((len(indexes), len(list_raw_data)))
    for i, raw_data in enumerate(list_raw_data):
        if isinstance(raw_data, np.ndarray):
            raw_data_subset = raw_data.copy()[indexes, :]
            missing_modalities[:, i] = np.sum(np.isnan(raw_data_subset), axis=1) == raw_data_subset.shape[1]
            data_sets.append(raw_data_subset)
        elif isinstance(raw_data, pd.DataFrame):
            raw_data_subset = raw_data.copy().iloc[indexes, :].values
            missing_modalities[:, i] = np.sum(np.isnan(raw_data_subset), axis=1) == raw_data_subset.shape[1]
            data_sets.append(raw_data_subset)

        # dealing with radiomic MSKCC data
        elif isinstance(raw_data, tuple):
            new_ind_subset = raw_data[1][indexes]
            raw_data_subset = raw_data[0].loc[new_ind_subset.intersection(raw_data[0].index)]
            raw_data_subset.index.names = ["main_index"]
            temp = pd.DataFrame(index=new_ind_subset)
            tempbis = (raw_data_subset[raw_data_subset['job_tag'] == "filtered-radiomics"]
                       .drop(columns=["main_index", "job_tag", "site", "lesion_index"], errors="ignore"))
            tempbis = tempbis[~tempbis.index.duplicated(keep='first')]
            temp['missing'] = tempbis.isna().sum(axis=1) == tempbis.shape[1]
            temp = temp.fillna(value=True)
            missing_modalities[:, i] = temp["missing"].values
            data_sets.append((raw_data_subset, new_ind_subset))

    bool_mask_missing = np.all(missing_modalities, axis=1)

    cured_data_sets = []
    for data in data_sets:
        if isinstance(data, np.ndarray):
            cured_data_sets.append(data[~bool_mask_missing])

        # dealing with radiomic MSKCC data
        elif isinstance(data, tuple):
            new_ind = data[1][~bool_mask_missing]
            new_data = data[0].loc[new_ind.intersection(data[0].index)]
            new_data.index.names = ["main_index"]
            cured_data_sets.append((new_data, new_ind))

    # select indexes in labels
    if isinstance(labels, np.ndarray):
        labels = labels.copy()[indexes][~bool_mask_missing]
    else:
        labels = labels.copy().iloc[indexes][~bool_mask_missing]

    return cured_data_sets, labels, bool_mask_missing