# Source code for pytrial.tasks.trial_simulation.tabular.gaussian_copula

'''
Implement a Gaussian copula model for simulating tabular patient data
in clinical trials.
'''
import os
import warnings
import joblib
from copulas.multivariate import GaussianMultivariate

from .base import TabularSimulationBase
from pytrial.data.patient_data import TabularPatientBase
from pytrial.utils.check import check_checkpoint_file, check_model_dir, check_model_config_file, make_dir_if_not_exist

# Installs a blanket 'ignore' filter at import time: with no category or
# module restriction, this suppresses warnings for the whole process, not
# only those raised from this module.
warnings.filterwarnings('ignore')


class BuildModel:
    '''
    Factory for the underlying copula model.

    ``BuildModel()`` never yields a ``BuildModel`` instance: ``__new__``
    hands back a newly constructed ``GaussianMultivariate`` instead.
    '''
    def __new__(cls) -> GaussianMultivariate:
        copula = GaussianMultivariate()
        return copula

class GaussianCopula(TabularSimulationBase):
    '''
    Implement Gaussian Copula model for tabular patient simulation.

    Parameters
    ----------
    experiment_id: str, optional (default='trial_simulation.tabular.gaussiancopula')
        The name of current experiment. Decide the saved model checkpoint name.
    '''
    def __init__(
        self,
        experiment_id='trial_simulation.tabular.gaussiancopula',
    ) -> None:
        super().__init__(experiment_id=experiment_id)

    def fit(self, train_data):
        '''
        Train gaussian copula model to simulate patient outcome with tabular input data.

        Parameters
        ----------
        train_data: dict or TabularPatientBase
            The training data, which is the real tabular patient data.
            A dict is wrapped into a ``TabularPatientBase`` (with
            ``transform=True``) before fitting.
        '''
        self._input_data_check(train_data)
        self._build_model()

        # Normalize dict input to a TabularPatientBase up front. The wrapped
        # object (not the raw dict) must be the one kept on `self`, because
        # `.metadata` is read from it below and `predict` later calls its
        # `reverse_transform`.
        if isinstance(train_data, dict):
            train_data = TabularPatientBase(train_data, transform=True)

        self._fit_model(train_data.df)
        self.metadata = train_data.metadata
        self.raw_dataset = train_data

    def predict(self, n=200):
        '''
        Simulate new tabular data by sampling `n` rows from the fitted model.

        Parameters
        ----------
        n: int
            The number of synthetic samples to generate.

        Returns
        -------
        ypred:
            The sampled table after being passed through the training
            dataset's ``reverse_transform``.

        Raises
        ------
        AttributeError
            If no training dataset is attached to this instance. `fit` is
            the only method that sets it; `load_model` restores the model
            but not the dataset.
        '''
        raw_dataset = getattr(self, 'raw_dataset', None)
        if raw_dataset is None:
            raise AttributeError(
                'No training dataset is attached to this model; call `fit` '
                'before `predict` (`load_model` restores only the copula).'
            )

        ypred = self.model.sample(n)  # sample in the transformed space
        ypred = raw_dataset.reverse_transform(ypred)  # transform back
        return ypred

    def save_model(self, output_dir=None):
        '''
        Save the learned gaussian copula model to the disk.

        The model is written with ``joblib.dump`` to
        ``<output_dir>/gaussiancopula.model``. No config file is written.

        Parameters
        ----------
        output_dir: str or None
            The dir to save the learned model.
            If set None, will save model to `self.checkout_dir`.
        '''
        if output_dir is None:
            output_dir = self.checkout_dir
        # Ensure the target exists for the default directory as well, so the
        # dump below cannot fail on a missing folder.
        make_dir_if_not_exist(output_dir)

        ckpt_path = os.path.join(output_dir, 'gaussiancopula.model')
        joblib.dump(self.model, ckpt_path)

    def load_model(self, checkpoint=None):
        '''
        Load the learned gaussian copula model from the disk.

        Only `self.model` is restored; the training dataset used by
        `predict` is not part of the checkpoint.

        Parameters
        ----------
        checkpoint: str or None
            The path to the saved model.

            - If a directory, the only checkpoint file `.model` will be loaded.
            - If a filepath, will load from this file;
            - If None, will load from `self.checkout_dir`.
        '''
        if checkpoint is None:
            checkpoint = self.checkout_dir

        checkpoint_filename = check_checkpoint_file(checkpoint, suffix='model')
        # NOTE(review): the returned config path is not used, and `save_model`
        # writes no config file. The call is kept in case the helper validates
        # the path; confirm and remove if it has no side effects.
        check_model_config_file(checkpoint)

        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code. Only load checkpoints from trusted sources.
        self.model = joblib.load(checkpoint_filename)

    def _build_model(self):
        '''Create a fresh copula model and store it on `self.model`.'''
        self.model = BuildModel()

    def _fit_model(self, data):
        '''Fit `self.model` on `data` (the training table).'''
        self.model.fit(data)