Source code for pytrial.tasks.indiv_outcome.tabular.logistic_regression

'''
Implement Logistic Regression model for tabular individual outcome
prediction in clinical trials.
'''
import pdb
import os
import joblib
import pickle

from sklearn.linear_model import LogisticRegression as lr_model

from pytrial.data.patient_data import TabularPatientBase
from pytrial.utils.check import check_checkpoint_file, check_model_dir, check_model_config_file, make_dir_if_not_exist
from .base import TabularIndivBase

class BuildModel:
    '''Construct a scikit-learn LogisticRegression estimator from a pytrial config dict.'''
    def __new__(cls, config):
        if config['dual']:
            solver = 'liblinear'
        else:
            solver = 'lbfgs'
        model = lr_model(
            C = 1/config['weight_decay'],
            dual = config['dual'],
            solver = solver,
            max_iter = config['epochs'],
            )
        return model

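# For illustration, a config such as
#   {'weight_decay': 0.5, 'dual': False, 'epochs': 200}
# builds the equivalent of
#   lr_model(C=2.0, dual=False, solver='lbfgs', max_iter=200)
# i.e. `weight_decay` acts as the inverse of scikit-learn's `C`.
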
class LogisticRegression(TabularIndivBase):
    '''
    Implement Logistic Regression model for tabular individual outcome
    prediction in clinical trials. Currently only supports `binary classification`.

    Parameters
    ----------
    weight_decay: float
        Regularization strength for the l2 norm; must be a positive float.
        Smaller values specify weaker regularization (`weight_decay` is the
        inverse of scikit-learn's `C`).

    dual: bool
        Dual or primal formulation. Dual formulation is only implemented for
        the l2 penalty with the liblinear solver. Prefer `dual=False` when
        `n_samples > n_features`.

    epochs: int
        Maximum number of iterations taken for the solver to converge.

    experiment_id: str, optional (default='test')
        The name of the current experiment. Decides the saved model checkpoint name.
    '''
    def __init__(self,
        weight_decay=1,
        dual=False,
        epochs=100,
        experiment_id='test',
        ) -> None:
        super().__init__(experiment_id=experiment_id)
        self.config = {
            'weight_decay': weight_decay,
            'dual': dual,
            'epochs': epochs,
            'experiment_id': experiment_id,
            'model_name': 'logistic_regression',
            }
        self._save_config(self.config)

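    # Illustrative configuration (hypothetical values): stronger regularization
    # and a larger iteration budget could be requested as
    #   model = LogisticRegression(weight_decay=10, epochs=500, experiment_id='trial-outcome')
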
    def fit(self, train_data, valid_data=None):
        '''Train a logistic regression model to predict patient outcomes
        from tabular input data.

        Parameters
        ----------
        train_data: dict
            {
                'x': TabularPatientBase or pd.DataFrame,
                'y': pd.Series or np.ndarray
            }

            - 'x' contains all patient features;
            - 'y' contains the label for each row.

        valid_data: Ignored.
            Not used, present here for API consistency by convention.
        '''
        self._input_data_check(train_data)
        self._build_model()
        if isinstance(train_data['x'], TabularPatientBase):
            dataset = train_data['x']
            x_feat = dataset.df
            y = train_data['y']
        else:
            x_feat = train_data['x']
            y = train_data['y']
        self._fit_model(x_feat, y)

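    # Usage sketch for `fit` (hypothetical variable names): `df` is a pandas
    # DataFrame of patient features, `labels` an array-like of binary outcomes.
    #   model = LogisticRegression()
    #   model.fit({'x': df, 'y': labels})
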
    def predict(self, test_data):
        '''
        Make probability predictions with the learned model.
        Results are saved to `self.result_dir`.

        Parameters
        ----------
        test_data: dict
            {
                'x': TabularPatientBase or pd.DataFrame,
                'y': pd.Series or np.ndarray
            }

            - 'x' contains all patient features;
            - 'y' contains the label for each row; ignored by the prediction function.

        Returns
        -------
        ypred: np.ndarray
            The predicted probability for each patient.

            - For binary classification, returns shape (n, );
            - For multiclass classification, returns shape (n, n_class).
        '''
        self._input_data_check(test_data)
        dataset = test_data['x']
        if isinstance(dataset, TabularPatientBase):
            x_feat = dataset.df
        else:
            x_feat = dataset
        ypred = self.model.predict_proba(x_feat)
        if ypred.shape[1] == 2: # binary
            ypred = ypred[:, 1]
        # save results to dir
        pickle.dump(ypred, open(os.path.join(self.result_dir, 'pred.pkl'), 'wb'))
        if isinstance(test_data, dict):
            if 'y' in test_data:
                pickle.dump(test_data['y'], open(os.path.join(self.result_dir, 'label.pkl'), 'wb'))
        return ypred

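    # Usage sketch for `predict` (hypothetical names): for binary outcomes the
    # call returns a 1-d array of positive-class probabilities and pickles the
    # predictions (and labels, if given) under `self.result_dir`.
    #   ypred = model.predict({'x': df_test})
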
    def save_model(self, output_dir=None):
        '''
        Save the learned logistic regression model to disk.

        Parameters
        ----------
        output_dir: str or None
            The dir to save the learned model.
            If set to None, the model is saved to `self.checkout_dir`.
        '''
        if output_dir is not None:
            make_dir_if_not_exist(output_dir)
        else:
            output_dir = self.checkout_dir
        self._save_config(self.config, output_dir=output_dir)
        ckpt_path = os.path.join(output_dir, 'indiv-tabular.model')
        joblib.dump(self.model, ckpt_path)

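    # Usage sketch for `save_model` (hypothetical path): writes the config plus
    # an `indiv-tabular.model` joblib file into the directory.
    #   model.save_model('./checkpoints/lr')
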
    def load_model(self, checkpoint=None):
        '''
        Load a learned logistic regression model from disk.

        Parameters
        ----------
        checkpoint: str or None
            - If a directory, the only checkpoint file ending with `.model` in it will be loaded;
            - If a filepath, will load from this file;
            - If None, will load from `self.checkout_dir`.
        '''
        if checkpoint is None:
            checkpoint = self.checkout_dir
        checkpoint_filename = check_checkpoint_file(checkpoint, suffix='model')
        config_filename = check_model_config_file(checkpoint)
        self.model = joblib.load(checkpoint_filename)
        self.config = self._load_config(config_filename)

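    # Usage sketch for `load_model` (hypothetical path): restores the estimator
    # and config written by `save_model`.
    #   model2 = LogisticRegression()
    #   model2.load_model('./checkpoints/lr')
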
    def _build_model(self):
        self.model = BuildModel(self.config)

    def _fit_model(self, x_feat, y):
        self.model.fit(x_feat, y)
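
# Minimal end-to-end sketch with synthetic data; it assumes the TabularIndivBase
# parent sets up `self.result_dir` and `self.checkout_dir` from `experiment_id`.
# The feature matrix and labels are hypothetical stand-ins for real patient data.
if __name__ == '__main__':
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    df = pd.DataFrame(
        rng.normal(size=(100, 5)),
        columns=[f'feat_{i}' for i in range(5)],
        )
    labels = rng.integers(0, 2, size=100)

    model = LogisticRegression(weight_decay=1.0, epochs=200, experiment_id='demo')
    model.fit({'x': df, 'y': labels})
    ypred = model.predict({'x': df, 'y': labels})
    print('predicted probability shape:', ypred.shape)

    model.save_model()  # saves to self.checkout_dir
    model.load_model()  # reloads from self.checkout_dir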