# Source code for pytrial.tasks.indiv_outcome.tabular.transtab

import pdb
from copy import deepcopy

import transtab
import numpy as np
import pandas as pd
import torch

from pytrial.data.patient_data import TabularPatientBase
from pytrial.utils.check import (
    check_checkpoint_file, check_model_dir, check_model_config_file, make_dir_if_not_exist
)
from .base import TabularIndivBase, IndivTabDataset

class BuildModel:
    '''
    Factory that builds the underlying transtab model from a config dict.

    Calling ``BuildModel(config)`` does not create a ``BuildModel`` instance;
    ``__new__`` directly returns the tuple ``(model, collate_fn)``.

    Parameters
    ----------
    config: dict
        Keyword arguments forwarded to ``transtab.build_classifier`` or
        ``transtab.build_contrastive_learner``. The optional key
        ``contrastive_pretrain`` (default False) selects which builder is used
        and is not forwarded. The passed dict is left unmodified.

    Returns
    -------
    (model, collate_fn): tuple
        ``collate_fn`` is None for the supervised classifier; for contrastive
        pretraining it is whatever ``transtab.build_contrastive_learner`` returns.
    '''
    def __new__(cls, config):
        # Work on a shallow copy so popping the switch never alters the caller's dict.
        build_kwargs = dict(config)
        contrastive_pretrain = build_kwargs.pop('contrastive_pretrain', False)

        if contrastive_pretrain:
            clf, collate_fn = transtab.build_contrastive_learner(**build_kwargs)
        else:
            clf = transtab.build_classifier(**build_kwargs)
            collate_fn = None
        return clf, collate_fn

class TransTab(TabularIndivBase):
    '''
    Implement transtab model for tabular individual outcome
    prediction in clinical trials [1]_.

    Parameters
    ----------
    mode: str
        The task's objective, one of `binary` or `multiclass`.
        (`multilabel` and `regression` pass validation but have no dedicated
        evaluation metric yet -- TODO.)
        Can be left as None if `contrastive_pretrain` is set True.

    categorical_columns: list
        a list of categorical feature names.

    numerical_columns: list
        a list of numerical feature names.

    binary_columns: list
        a list of binary feature names, accept binary indicators like (yes,no); (true,false); (0,1).

    contrastive_pretrain: bool(default=False)
        whether or not take a contrastive pretraining. If set true,
        `num_class` will be ignored.

    num_class: int
        number of output classes to be predicted.

    hidden_dim: int
        the dimension of hidden embeddings.

    num_layer: int
        the number of transformer layers used in the encoder.

    num_attention_head: int
        the number of heads of multihead self-attention layer in the transformers.

    hidden_dropout_prob: float
        the dropout ratio in the transformer encoder.

    ffn_dim: int
        the dimension of feed-forward layer in the transformer layer.

    activation: str
        the name of used activation functions, support ``"relu"``, ``"gelu"``, ``"selu"``, ``"leakyrelu"``.

    learning_rate: float
        Learning rate for optimization based on SGD. Use torch.optim.Adam by default.

    weight_decay: float
        Regularization strength for l2 norm; must be a positive float.
        Smaller values specify weaker regularization.

    batch_size: int
        Batch size when doing SGD optimization.

    epochs: int
        Maximum number of iterations taken for the solvers to converge.

    num_worker: int
        Number of workers used to do dataloading during training.

    device: str
        Target device to train the model, as `cuda:0` or `cpu`.

    experiment_id: str, optional (default='test')
        The name of current experiment. Decide the saved model checkpoint name.

    Notes
    -----
    .. [1] Wang, Z., & Sun, J. (2022). TransTab: Learning Transferable Tabular Transformers Across Tables. NeurIPS'22.
    '''
    def __init__(self,
        mode=None,
        categorical_columns=None,
        numerical_columns=None,
        binary_columns=None,
        contrastive_pretrain=False,
        num_class=2,
        hidden_dim=128,
        num_layer=2,
        num_attention_head=8,
        hidden_dropout_prob=0,
        ffn_dim=256,
        activation='relu',
        learning_rate=1e-4,
        weight_decay=1e-4,
        batch_size=64,
        epochs=10,
        num_worker=0,
        device='cuda:0',
        experiment_id='test'):
        super().__init__(experiment_id)

        # `mode` defaults to None and may stay None for contrastive pretraining,
        # so only normalize the case when a string was actually given.
        if isinstance(mode, str):
            mode = mode.lower()
        if not contrastive_pretrain:
            assert mode in ['binary', 'multiclass', 'regression', 'multilabel'], 'Must specify `mode` for supervised classifcation.'

        self.config = {
            'categorical_columns':categorical_columns,
            'numerical_columns':numerical_columns,
            'binary_columns':binary_columns,
            'contrastive_pretrain':contrastive_pretrain,
            'num_class':num_class,
            'hidden_dim':hidden_dim,
            'num_layer':num_layer,
            'num_attention_head':num_attention_head,
            'hidden_dropout_prob':hidden_dropout_prob,
            'ffn_dim':ffn_dim,
            'activation':activation,
            'device':device,
            'mode':mode,
            'learning_rate':learning_rate,
            'weight_decay':weight_decay,
            'batch_size':batch_size,
            'epochs':epochs,
            'num_worker':num_worker,
        }
        self._save_config(self.config)
        self._build_model()
        self.device = device

    def fit(self, train_data, valid_data=None):
        '''Train TransTab model to predict patient outcome
        with tabular input data.

        Parameters
        ----------
        train_data: list[dict]
            a list of patient data, each patient is a dict of
            {

            'x': TabularPatientBase or pd.DataFrame,

            'y': pd.Series or np.ndarray

            }.

            TransTab can learn from multiple different tabular datasets.

        valid_data: dict
            Validation data during the training for early stopping.
            valid_data =

            {

            'x': TabularPatientBase or pd.DataFrame,

            'y': pd.Series or np.ndarray

            }

        '''
        self._input_data_check(train_data)
        if valid_data is not None:
            self._input_data_check(valid_data)
        self._fit_model(train_data=train_data, valid_data=valid_data)

    def predict(self, test_data):
        '''
        Make prediction probability based on the learned model.

        Parameters
        ----------
        test_data: TabularPatientBase or pd.DataFrame
            Contain all patient features.

        Returns
        -------
        ypred: np.ndarray or torch.Tensor

            - For binary classification, return shape (n, );
            - For multiclass classification, return shape (n, n_class).

        '''
        self._input_data_check(test_data)
        data = self._parse_input_data(test_data)
        ypred = transtab.predict(self.model, x_test=data)
        return ypred

    def save_model(self, output_dir=None):
        '''
        Save the learned transtab model to the disk.

        Parameters
        ----------
        output_dir: str or None
            The dir to save the learned model.
            If set None, will save model to `self.checkout_dir`.
        '''
        if output_dir is not None:
            make_dir_if_not_exist(output_dir)
        else:
            output_dir = self.checkout_dir

        # The config is stored next to the weights so `load_model` can restore it.
        self._save_config(self.config, output_dir=output_dir)
        self.model.save(output_dir)

    def load_model(self, checkpoint):
        '''
        Load the learned transtab model from the given checkpoint dir.

        Parameters
        ----------
        checkpoint: str
            The input dir that stores the pretrained model.

            - If a directory, the only checkpoint file `*.pth.tar` will be loaded.
            - If a filepath, will load from this file.
        '''
        config_filename = check_model_config_file(checkpoint)
        if config_filename is not None:
            config = self._load_config(config_filename)
            # Stored values override the current ones; the model object itself is
            # not rebuilt here, only its weights are loaded below.
            self.config.update(config)
        self.model.load(checkpoint)

    def update(self, config):
        '''Update the configuration of feature extractor's column map for *cat*, *num*, and *bin* cols.
        Or update the number of classes for the output classifier layer.

        Parameters
        ----------
        config: dict
            a dict of configurations: keys `cat:list`, `num:list`, `bin:list` are to specify the new column names;
            key `num_class:int` is to specify the number of classes for finetuning on a new dataset.

        '''
        self.model.update(config)

    def _build_model(self):
        '''Create `self.model` and `self.collate_fn` from a copy of `self.config`.'''
        config = deepcopy(self.config)
        self.model, self.collate_fn = BuildModel(config)

    def _fit_model(self, train_data, valid_data=None):
        '''Convert the inputs to transtab's format and run `transtab.train`.

        Parameters
        ----------
        train_data: list[dict] or dict
            Training data as accepted by `fit`.

        valid_data: dict or None
            Optional validation data as accepted by `fit`.
        '''
        train_data = self._parse_input_data(train_data)
        if valid_data is not None:
            valid_data = self._parse_input_data(valid_data)

        train_kwargs = dict(self.config)

        # NOTE(review): `transtab.train` is documented to take `lr`, `num_epoch`
        # and `num_workers`; under this class's own key names the values would
        # fall into its `**kwargs` and be ignored. Confirm against the installed
        # transtab version. `setdefault` keeps any explicitly configured value.
        for own_key, train_key in (
            ('learning_rate', 'lr'),
            ('epochs', 'num_epoch'),
            ('num_worker', 'num_workers'),
        ):
            if own_key in train_kwargs:
                train_kwargs.setdefault(train_key, train_kwargs[own_key])

        eval_metric = {'binary': 'auc', 'multiclass': 'acc'}.get(self.config['mode'])
        if eval_metric is None:
            # No dedicated metric for this mode (e.g. contrastive pretraining with
            # mode=None): fall back to the validation loss instead of failing on
            # an unbound variable.
            # NOTE(review): assumes transtab supports 'val_loss' together with
            # `eval_less_is_better` -- confirm.
            eval_metric = 'val_loss'
            train_kwargs.setdefault('eval_less_is_better', True)

        transtab.train(
            self.model,
            train_data,
            valset=valid_data,
            eval_metric=eval_metric,
            output_dir=self.checkout_dir,
            collate_fn=self.collate_fn,
            **train_kwargs
            )

    def _parse_input_data(self, inputs):
        '''Convert user inputs into plain dataframes / (dataframe, label) pairs.

        Parameters
        ----------
        inputs: list[dict], dict, pd.DataFrame or TabularPatientBase
            - list of dicts: returns a list of ``(x_dataframe, y)`` tuples;
            - dict: returns one ``(x_dataframe, y)`` tuple;
            - pd.DataFrame: returned unchanged;
            - TabularPatientBase: its ``.df`` is returned.

        Raises
        ------
        TypeError
            If `inputs` (or an `x` entry) has an unsupported type.
        '''
        def _to_frame(features):
            if isinstance(features, pd.DataFrame):
                return features
            if isinstance(features, TabularPatientBase):
                return features.df
            raise TypeError(
                'Get unaccepted input data format, expect `pd.DataFrame` or `TabularPatientBase`, get {} instead.'.format(type(features))
            )

        if isinstance(inputs, list):
            return [(_to_frame(item['x']), item['y']) for item in inputs]

        if isinstance(inputs, dict):
            return (_to_frame(inputs['x']), inputs['y'])

        if isinstance(inputs, (pd.DataFrame, TabularPatientBase)):
            return _to_frame(inputs)

        raise TypeError(
            'Get unaccepted inputs, expect a list of dict, a dict, `pd.DataFrame` or `TabularPatientBase`, get {} instead.'.format(type(inputs))
        )

    def _input_data_check(self, inputs):
        '''
        Check the training / testing data fits the formats.
        Target to (1) check if inputs valid,
                    if not, give tips about the data problem.

        Parameters
        ----------
        inputs: [{
                'x': TabularPatientBase or pd.DataFrame,
                'y': pd.Series or np.ndarray
                },...]
                'x' contain all patient features; 'y' contain labels
                for each row.
                A single dict, or a bare `pd.DataFrame` / `TabularPatientBase`
                holding only features (as passed to `predict`), is accepted too.

        Raises
        ------
        AssertionError
            If the format is not accepted or NaN values are found.
        '''
        def _check_features(features):
            # Report the type of the offending object itself.
            assert isinstance(features, (pd.DataFrame, TabularPatientBase)), 'Get unaccepted input data format, expect `pd.DataFrame` or `TabularPatientBase`, get {} instead.'.format(type(features))

        def _check_no_nan(features):
            if isinstance(features, pd.DataFrame):
                assert not features.isnull().values.any(), 'Find NaN in input dataframe, please check your input, or try to pass `TabularPatientBase` as inputs.'
            if isinstance(features, TabularPatientBase):
                assert not features.df.isnull().values.any(), 'Find NaN in input dataset, please check your input, or try to pass `TabularPatientBase` as inputs.'

        def _check_single(item):
            if isinstance(item, dict):
                assert 'x' in item, 'No input patient data found in inputs.'
                features = item['x']
                _check_features(features)
                if 'y' in item:
                    assert isinstance(item['y'], (pd.Series, np.ndarray)), 'Get unaccepted target format, expect `pd.Series` or `np.ndarray`, get {} instead.'.format(type(item['y']))
                    assert not pd.isnull(item['y']).any(), 'Find NaN in input targets, please check.'
            else:
                # Feature-only input: the object itself is the feature table,
                # so it must not be indexed with 'x'.
                features = item
                _check_features(features)
            _check_no_nan(features)

        if isinstance(inputs, list):
            for item in inputs:
                _check_single(item)
        else:
            _check_single(inputs)