# Source code for pytrial.tasks.trial_patient_match.models.compose

import pdb

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

from pytrial.utils.check import (
    check_checkpoint_file,
    check_model_dir,
    check_model_config_file,
    make_dir_if_not_exist,
)
from .base import PatientTrialMatchBase
from ..data import TrialCollator, PatientCollator
from ..losses import COMPOSELoss
from ..trainer import PatientTrialTrainer

class COMPOSE(PatientTrialMatchBase):
    '''
    Leverage COMPOSE model for patient-trial matching [1]_.

    One patient's label = [[0,1,2,3], [2,3,4,5]] where
    the first is a list of indices of inclusion criteria that the patient satisfies;
    the second is a list of indices of exclusion criteria that the patient does not satisfy.
    For the other criteria we do not know if the patient satisfies them or not.

    In this regard, we need to:

    (1) predict "match" for all inclusion criteria of a trial for recruiting the patient
    (2) predict "unmatch" for all exclusion criteria of a trial for recruiting the patient

    Prediction logits:

    - 0 is unmatch
    - 1 is match
    - 2 is unknown

    Parameters
    ----------
    order: list[str]
        The order of events within each visit in the patient's EHR data, e.g.,
        order=['diag','med','prod'].

    vocab_size: list[int]
        The vocabulary size of each of the patient's EHR event types, e.g., diag, med, prod.
        The sizes are summed to obtain the width of the multi-hot visit encoding.

    max_visit: int
        Maximum number of visits to load for EHRs inputs.

    word_dim: int
        The dimension of input word embeddings for encoding eligibility criteria.

    conv_dim: int
        The dimension of convolutional layers for processing eligibility criteria embeddings.

    mem_dim: int
        The hidden dimension of the EHR memory network (encode patient EHRs).

    mlp_dim: int
        The hidden dimension of the MLP layers in the Query Network.

    demo_dim: int
        The input dimensions of patient demographic information, e.g., age, gender.

    margin: float
        The margin when compute nn.CosineEmbeddingLoss on patient embedding and
        inclusion/exclusion criteria embeddings. Refer to Eq. (12) of the reference paper.

    epochs: int, optional (default=50)
        Number of iterations (epochs) over the training data.

    batch_size: int, optional (default=512)
        Number of samples in each training batch.

    learning_rate: float, optional (default=1e-3)
        The learning rate.

    weight_decay: float (default=0)
        The weight decay during training.

    num_worker: int
        Number of workers used to do dataloading during training.

    device: str
        The model device.

    experiment_id: str
        Identifier forwarded to the base class constructor.

    Notes
    -----
    .. [1] Gao, J., et al. (2020, August). COMPOSE: cross-modal pseudo-siamese network for
        patient trial matching. In Proceedings of the 26th ACM SIGKDD International Conference
        on Knowledge Discovery & Data Mining (pp. 803-812).
    '''
    def __init__(self,
        order,
        vocab_size,
        max_visit=20,
        word_dim=768,
        conv_dim=128,
        mem_dim=320,
        mlp_dim=512,
        demo_dim=3,
        margin=0.3,
        batch_size=512,
        epochs=50,
        learning_rate=1e-3,
        weight_decay=0,
        num_worker=0,
        device='cuda:0',
        experiment_id='test',
        ) -> None:
        super().__init__(experiment_id)
        self.config = {
            'order': order,
            'max_visit': max_visit,
            'vocab_size': vocab_size,
            'word_dim': word_dim,
            'conv_dim': conv_dim,
            'mem_dim': mem_dim,
            'mlp_dim': mlp_dim,
            'demo_dim': demo_dim,
            'margin': margin,
            'batch_size': batch_size,
            'learning_rate': learning_rate,
            'weight_decay': weight_decay,
            'epochs': epochs,
            'num_worker': num_worker,
            'device': device,
        }
        self.device = device
        # All event-type vocabularies share one multi-hot vector per visit.
        self.config['total_vocab_size'] = sum(vocab_size)
        self._build_model()

    def predict(self, test_data, return_dict=False):
        '''
        Predict the matching of patient and ECs of each target trial.
        Will make predictions for each patient, go through all included trials.
        For example, given 1000 patients and 10 trials, will generate predictions
        as the shape of ``1000 x 10 x num_eligibility_criteria``.

        Parameters
        ----------
        test_data: dict
            A dict with the keys ``'patient'`` and ``'trial'`` holding the patient
            dataset and the trial dataset, respectively. The patient dataset must
            expose ``metadata['visit']['mode']`` and ``metadata['label']['mode']``.

        return_dict: bool
            If True, the trial-level predictions (``'pred_trial'``) are converted
            from a list of tuples to a dictionary.

        Returns
        -------
        dict
            Keys ``'pred_trial'``, ``'pred_inc'`` and ``'pred_exc'``.
        '''
        self._input_data_check(test_data)
        patient_dataloader, trial_dataloader = self.get_test_dataloader(test_data)
        pred_res = self._predict_on_dataloader(patient_dataloader, trial_dataloader)
        if return_dict:
            pred_res['pred_trial'] = self._tuple_result_to_dict(pred_res['pred_trial'])
        return pred_res

    def fit(self, train_data, valid_data=None):
        '''
        Fit patient-trial matching model.

        Parameters
        ----------
        train_data: dict
            A dict with the keys ``'patient'`` and ``'trial'`` holding the patient
            dataset and the trial dataset, respectively.

        valid_data: dict
            A dict contains patient and trial data for evaluation.
            Same format as `train_data`.
        '''
        self._input_data_check(train_data)
        if valid_data is not None:
            self._input_data_check(valid_data)
        self._fit_model(train_data, valid_data)

    def save_model(self, output_dir):
        '''
        Save the learned patient-match model to the disk.

        Parameters
        ----------
        output_dir: str or None
            The dir to save the learned model.
            If set None, will save model to `self.checkout_dir`.
        '''
        if output_dir is not None:
            make_dir_if_not_exist(output_dir)
        else:
            output_dir = self.checkout_dir

        self._save_config(self.config, output_dir=output_dir)
        self._save_checkpoint({'model': self.model}, output_dir=output_dir)

    def load_model(self, checkpoint):
        '''
        Load model and the pre-encoded trial embeddings from the given
        checkpoint dir.

        Parameters
        ----------
        checkpoint: str
            The input dir that stores the pretrained model.
            If a directory, the only checkpoint file `*.pth.tar` will be loaded.
            If a filepath, will load from this file.
        '''
        checkpoint_filename = check_checkpoint_file(checkpoint)
        config_filename = check_model_config_file(checkpoint)
        # SECURITY: torch.load unpickles the file; only load checkpoints from trusted sources.
        state_dict = torch.load(checkpoint_filename)
        if config_filename is not None:
            config = self._load_config(config_filename)
            self.config.update(config)
        # The checkpoint stores the whole module object, not just its weights.
        self.model = state_dict['model']

    def get_train_dataloader(self, train_data):
        '''Build shuffled patient and trial dataloaders for training.'''
        trial_data = train_data['trial']
        patient_data = train_data['patient']
        patient_dataloader = DataLoader(
            patient_data,
            batch_size=self.config['batch_size'],
            num_workers=self.config['num_worker'],
            pin_memory=True,
            shuffle=True,
            collate_fn=PatientCollator(
                config={
                    'visit_mode': patient_data.metadata['visit']['mode'],
                    'label_mode': patient_data.metadata['label']['mode'],
                }
            ),
        )
        trial_dataloader = DataLoader(
            trial_data,
            batch_size=self.config['batch_size'],
            num_workers=self.config['num_worker'],
            pin_memory=True,
            shuffle=True,
            collate_fn=TrialCollator(),
        )
        return patient_dataloader, trial_dataloader

    def get_test_dataloader(self, test_data):
        '''Build ordered patient and trial dataloaders for prediction.'''
        trial_data = test_data['trial']
        patient_data = test_data['patient']
        patient_dataloader = DataLoader(
            patient_data,
            batch_size=self.config['batch_size'],
            num_workers=self.config['num_worker'],
            pin_memory=False,
            shuffle=False,
            collate_fn=PatientCollator(
                config={
                    'visit_mode': patient_data.metadata['visit']['mode'],
                    'label_mode': patient_data.metadata['label']['mode'],
                }
            ),
        )
        trial_dataloader = DataLoader(
            trial_data,
            batch_size=1,  # get one trial once when making predictions
            num_workers=self.config['num_worker'],
            pin_memory=False,
            shuffle=False,
            collate_fn=TrialCollator(),
        )
        return patient_dataloader, trial_dataloader

    def _build_model(self):
        '''Instantiate the COMPOSE network from ``self.config``.'''
        self.model = BuildModel(
            total_vocab_size=self.config['total_vocab_size'],
            word_dim=self.config['word_dim'],
            conv_dim=self.config['conv_dim'],
            mem_dim=self.config['mem_dim'],
            demo_dim=self.config['demo_dim'],
            mlp_dim=self.config['mlp_dim'],
        )
        if 'cuda' in self.device:
            self.model.cuda()

    def _fit_model(self, train_data, valid_data):
        '''Build dataloaders and loss models, then hand over to the trainer.'''
        patient_dataloader, trial_dataloader = self.get_train_dataloader(train_data)
        loss_models = self._build_loss_model()

        # need to customize the training process in trainer
        train_objectives = [
            [[patient_dataloader, trial_dataloader], loss_model]
            for loss_model in loss_models
        ]
        trainer = PatientTrialTrainer(
            model=self,
            train_objectives=train_objectives,
            test_data=valid_data,
        )
        trainer.train(**self.config)

    @torch.no_grad()
    def _predict_on_dataloader(self, patient_dataloader, trial_dataloader, return_label=False):
        '''
        Return trial level matching predictions.
        Return label for evaluation if required.

        The trial dataloader is the outer loop; for every trial batch the whole
        patient dataloader is traversed.
        '''
        self.eval()

        # record predictions
        pred_all, label_all = [], []
        pred_inc, pred_exc = [], []

        # record labels
        label_trial_inc, label_trial_exc = [], []
        label_patient_inc, label_patient_exc = [], []

        for i, batch_trial in enumerate(trial_dataloader):
            pred_patient_inc, pred_patient_exc = [], []
            label_trial = []
            label_trial_inc_, label_trial_exc_ = [], []

            for batch_ehr in patient_dataloader:
                inputs = {}
                inputs.update(batch_ehr)
                inputs.update(batch_trial)
                inputs = self._prepare_input(inputs)
                pred = self.model(inputs)
                pred_patient_inc.append(pred['logit_inc'])
                pred_patient_exc.append(pred['logit_exc'])

                if return_label:
                    # match trial ec index with patient index
                    match_inc, match_inc_ec = self._match_patient_with_trial_ec(
                        batch_trial=batch_trial, batch_ehr=batch_ehr, ec='inc', ec_idx=0)
                    match_exc, match_exc_ec = self._match_patient_with_trial_ec(
                        batch_trial=batch_trial, batch_ehr=batch_ehr, ec='exc', ec_idx=1)
                    match_label = match_inc * ~match_exc
                    label_trial.append(match_label.astype(int))
                    label_trial_inc_.append(match_inc_ec)  # bs_patient, num_ec
                    label_trial_exc_.append(match_exc_ec)

                    if i == 0:
                        # patient labels do not depend on the trial: collect them once
                        label_patient_inc.extend(y[0] for y in batch_ehr['y'])
                        label_patient_exc.extend(y[1] for y in batch_ehr['y'])

            if return_label:
                label_trial_inc_ = np.concatenate(label_trial_inc_)
                label_trial_exc_ = np.concatenate(label_trial_exc_)
                label_trial_inc.append(label_trial_inc_)
                label_trial_exc.append(label_trial_exc_)
                label_all.append(np.concatenate(label_trial))

            # get a batch of trial to all EHRs matching
            pred_patient_inc = torch.cat(pred_patient_inc, 1)  # 1, num_patients, num_inc_ec, output_class
            pred_patient_exc = torch.cat(pred_patient_exc, 1)  # 1, num_patients, num_exc_ec, output_class

            pred_trial_inc = self._match_trial_for_patients(pred_patient_inc, 'inc')
            pred_trial_exc = self._match_trial_for_patients(pred_patient_exc, 'exc')
            # iff both inc and exc are matched, shape=(bs_trial, bs_patient)
            pred_trial = pred_trial_exc * pred_trial_inc

            pred_all.append((batch_trial['nct_id'], pred_trial.detach().cpu().numpy()))
            pred_inc.append(pred_patient_inc.detach().cpu().numpy())
            pred_exc.append(pred_patient_exc.detach().cpu().numpy())

        return_res = {
            'pred_trial': pred_all,  # trial-level prediction
            'pred_inc': pred_inc,  # inclusion-level prediction
            'pred_exc': pred_exc,  # exclusion-level prediction
        }

        if return_label:
            return_res['label_trial'] = np.stack(label_all)  # trial-level labels
            return_res['label_patient_inc'] = label_patient_inc  # patient inclusion criteria label
            return_res['label_patient_exc'] = label_patient_exc  # patient exclusion criteria label
            return_res['label_trial_inc'] = label_trial_inc  # trial inclusion criteria index
            return_res['label_trial_exc'] = label_trial_exc  # trial exclusion criteria index

        return return_res

    def _build_loss_model(self):
        '''Return the list of loss modules wrapping ``self.model``.'''
        return [COMPOSELoss(self.model)]

    def _prepare_input(self, data):
        '''
        Prepare inputs for compose model encoding.

        `data` is expected to contain::

            {
            'v': patient visit events,
            'x': patient features,
            'y': which ECs patient are matched,
            'inc_ec_index': sampled batch of inclusion criteria index,
            'exc_ec_index': sampled batch of exclusion criteria index,
            'inc_ec_emb': sampled batch of inclusion criteria embedding,
            'exc_ec_emb': sampled batch of exclusion criteria embedding,
            }
        '''
        inputs = self._prepare_ehr_input(data=data)
        trial_inputs = self._prepare_trial_patient_match_input(data=data)
        inputs.update(trial_inputs)
        return inputs

    def _prepare_ehr_input(self, data):
        '''
        Turn a collated patient batch into dense tensors.

        Returns a dict with ``'x'`` (float features), ``'v'`` (zero-padded
        visit tensor of shape ``(n_patient, max_len, total_vocab_size)``) and
        ``'mask'`` (1 for real visits, 0 for padding).
        '''
        inputs = {}
        visits = data['v']
        feature = data['x']
        if not isinstance(feature, torch.Tensor):
            feature = torch.tensor(feature)
        # NOTE(review): the scraped source had a second, truncated `feature = ...`
        # statement here whose right-hand side was lost; confirm against upstream.

        max_visit = self.config['max_visit']
        first_event = self.config['order'][0]
        # visit counts are read from the first event type, truncated to `max_visit`
        v_lengths = [len(patient_visits[:max_visit]) for patient_visits in visits[first_event]]

        mask = torch.zeros(len(v_lengths), max(v_lengths))
        v = torch.zeros(len(v_lengths), max(v_lengths), self.config['total_vocab_size'])
        for idx, n_visit in enumerate(v_lengths):
            v[idx, :n_visit] = torch.tensor(
                self._translate_dense_visits_to_sparse(
                    {k: visits[k][idx][:max_visit] for k in visits}
                )
            )
            mask[idx, :n_visit] = 1

        if 'cuda' in self.device:
            v = v.cuda()
            mask = mask.cuda()
            feature = feature.cuda()

        inputs['x'] = feature.float()
        inputs['mask'] = mask
        inputs['v'] = v
        return inputs

    def _prepare_trial_patient_match_input(self, data):
        '''Prepare inclusion and exclusion criteria inputs and merge them.'''
        outputs = {}
        inc_inputs = self._prepare_ec_input(data, 'inc', 0)  # patient label 0 corresponds to inclusion
        exc_inputs = self._prepare_ec_input(data, 'exc', 1)  # patient label 1 corresponds to exclusion
        outputs.update(inc_inputs)
        outputs.update(exc_inputs)
        return outputs

    def _prepare_ec_input(self, data, ec, label_idx):
        '''
        Pad criteria embeddings of one kind (``ec`` is ``'inc'`` or ``'exc'``)
        and build the per-criterion labels.

        Trials and patients are paired positionally (trial1 -> patient1, ...);
        only the first ``min(n_trial, n_patient)`` pairs receive embeddings
        and labels.
        '''
        patient_label = data['y']
        ec_idx_list = data[f'{ec}_ec_index']
        ec_emb_list = data[f'{ec}_ec_emb']

        n_ec_per_trial = [len(l) for l in ec_idx_list]
        max_n_ec = max(n_ec_per_trial)
        ec_mask = torch.zeros(len(n_ec_per_trial), max_n_ec)
        num_patient = len(patient_label)
        num_sample = min(len(n_ec_per_trial), num_patient)

        ec_label = []
        ec_emb_ts = []
        for idx in range(num_sample):
            # go through all trials
            # if len(patients) > len(trials)
            # only take the top len(trials) samples
            n_ec = n_ec_per_trial[idx]
            ec_embs = torch.zeros(max_n_ec, ec_emb_list[0].shape[-1])
            ec_embs[:n_ec] = ec_emb_list[idx]
            ec_emb_ts.append(ec_embs)
            ec_mask[idx][:n_ec] = 1

            p_label = patient_label[idx][label_idx]

            # each trial match each patient
            # trial1 -> patient1
            # trial2 -> patient2
            # ...
            # trial10 -> patient10
            # the remaining of patients will be ignored if len(trial) == 10
            in_patient_label = np.in1d(ec_idx_list[idx], p_label)

            if ec == 'inc':
                # inclusion
                label_ = torch.tensor(in_patient_label.astype(int))  # match is 1
                label_[label_ == 0] = 2  # for the other inclusion it is unknown
            else:
                # exclusion
                label_ = torch.tensor((~in_patient_label).astype(int))  # unmatch is 0
                label_[label_ != 0] = 2  # for the other exclusion it is unknown

            if 'cuda' in self.device:
                label_ = label_.cuda()

            ec_label.append(label_)

        ec_emb_ts = torch.stack(ec_emb_ts)

        if 'cuda' in self.device:
            ec_emb_ts = ec_emb_ts.cuda()
            ec_mask = ec_mask.cuda()

        # pad_sequence fills with 0; padded positions are marked 0 in the mask
        ec_label = pad_sequence(ec_label, batch_first=True)

        outputs = {
            f'{ec}_emb': ec_emb_ts,
            f'{ec}_emb_mask': ec_mask,
            f'{ec}_label': ec_label,
        }
        return outputs

    def _match_patient_with_trial_ec(self, batch_trial, batch_ehr, ec, ec_idx):
        '''
        Compare one trial's criteria indices with every patient's label list.

        Returns
        -------
        tuple of np.ndarray
            ``(match_label, match_ec_label)``: one boolean per patient, and one
            row of per-criterion codes per patient (inclusion: 1 match / 2 unknown;
            exclusion: 0 unmatch / 2 unknown).
        '''
        # assume test dataloader only has 1 sample each batch
        trial_ec_idx = batch_trial[f'{ec}_ec_index']
        patient_label = batch_ehr['y']
        match_label, match_ec_label = [], []
        for p_label in patient_label:
            matched = np.in1d(trial_ec_idx, p_label[ec_idx]).astype(int)
            if ec_idx == 0:
                # inclusion
                label = sum(matched) == len(trial_ec_idx)
                matched[matched == 0] = 2  # unk
            else:
                # exclusion
                label = sum(matched) == 0
                matched[matched == 0] = 2  # unk
                matched[matched == 1] = 0  # unmatch

            match_ec_label.append(matched)
            match_label.append(label)
        return np.array(match_label), np.stack(match_ec_label)
class BuildModel(nn.Module):
    '''
    Build COMPOSE model.

    Combines an eligibility-criteria encoder (`ECEmbedding`), a patient EHR
    memory network (`EHRMemoryNetwork`) and a `QueryNetwork` that scores each
    criterion against each patient's memory.
    '''
    def __init__(self,
        total_vocab_size,
        word_dim,
        conv_dim,
        mem_dim,
        demo_dim,
        mlp_dim,
        ) -> None:
        super().__init__()
        self.embedding_network = ECEmbedding(word_dim=word_dim, conv_dim=conv_dim)
        self.memory_network = EHRMemoryNetwork(
            total_vocab_size=total_vocab_size,
            word_dim=word_dim,
            mem_dim=mem_dim,
            demo_dim=demo_dim,
        )
        self.query_network = QueryNetwork(mem_dim=mem_dim, conv_dim=conv_dim, mlp_dim=mlp_dim)

    def forward(self, inputs):
        '''
        Parameters
        ----------
        inputs: dict
            Must contain ``'v'``, ``'x'``, ``'mask'``, ``'inc_emb'``,
            ``'inc_emb_mask'``, ``'exc_emb'`` and ``'exc_emb_mask'``.

        Returns
        -------
        dict
            Logits, memory responses and transformed queries for inclusion
            and exclusion criteria.
        '''
        ehr = inputs['v']  # tensor bs, n_visit, n_event
        demo = inputs['x']  # tensor bs, n_feature
        ehr_mask = inputs['mask']
        memory = self.memory_network(ehr, demo, ehr_mask)  # batch_size, 2, mem_dim

        inc_ec, inc_ec_mask = inputs['inc_emb'], inputs['inc_emb_mask']
        _, inc_ec_emb = self.embedding_network(inc_ec, inc_ec_mask)  # bs, num_ec, 4*conv_dim

        exc_ec, exc_ec_mask = inputs['exc_emb'], inputs['exc_emb_mask']
        _, exc_ec_emb = self.embedding_network(exc_ec, exc_ec_mask)  # bs, num_ec, 4*conv_dim

        # each output: bs_trial, bs_ehr, num_ec, 3
        output_inc, response_inc, query_inc = self.query_network(memory, inc_ec_emb, inc_ec_mask)
        output_exc, response_exc, query_exc = self.query_network(memory, exc_ec_emb, exc_ec_mask)

        return {
            'logit_inc': output_inc,
            'logit_exc': output_exc,
            'response_inc': response_inc,
            'response_exc': response_exc,
            'query_inc': query_inc,
            'query_exc': query_exc,
        }


class CausalConv1d(torch.nn.Conv1d):
    '''1-D convolution that left-pads its input so the output keeps the input length
    (for stride 1) and position ``t`` only sees positions ``<= t``.'''
    def __init__(self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        dilation=1,
        groups=1,
        bias=True):
        super(CausalConv1d, self).__init__(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=0,
            dilation=dilation,
            groups=groups,
            bias=bias)
        # amount of left padding needed to keep the convolution causal
        self.__padding = (kernel_size - 1) * dilation

    def forward(self, input):
        return super(CausalConv1d, self).forward(F.pad(input, (self.__padding, 0)))


class HighwayBlock(nn.Module):
    '''Gated block: ``t * conv_z(x) + (1 - t) * x`` with ``t = sigmoid(conv_t(x))``.'''
    def __init__(self, input_dim, kernel_size):
        super(HighwayBlock, self).__init__()
        self.conv_t = CausalConv1d(input_dim, input_dim, kernel_size)
        self.conv_z = CausalConv1d(input_dim, input_dim, kernel_size)

    def forward(self, input):
        t = torch.sigmoid(self.conv_t(input))
        z = t * self.conv_z(input) + (1 - t) * input
        return z


class ECEmbedding(nn.Module):
    '''Encode a sequence of eligibility-criteria embeddings with multi-width
    causal convolutions followed by three highway blocks.'''
    def __init__(self, word_dim, conv_dim):
        super(ECEmbedding, self).__init__()
        # TODO
        # add trainable embeddings: ec_emb, ec_emb_fixed

        # Input: batch_size * sentence_len * embd_dim
        self.word_dim = word_dim
        self.conv_dim = conv_dim

        # High-way network for word features
        self.init_conv1 = CausalConv1d(word_dim, conv_dim, 1)
        self.init_conv2 = CausalConv1d(word_dim, conv_dim, 3)
        self.init_conv3 = CausalConv1d(word_dim, conv_dim, 5)
        self.init_conv4 = CausalConv1d(word_dim, conv_dim, 7)

        self.highway1 = HighwayBlock(4 * conv_dim, 3)
        self.highway2 = HighwayBlock(4 * conv_dim, 3)
        self.highway3 = HighwayBlock(4 * conv_dim, 3)

        self.pool = nn.AdaptiveMaxPool1d(1)

    def forward(self, input, mask):
        '''
        Parameters
        ----------
        input: torch.Tensor
            Shape ``(B, L, word_dim)``.
        mask: torch.Tensor
            Shape ``(B, L)``; 0 marks padded positions.

        Returns
        -------
        tuple of torch.Tensor
            ``(pooled, per_position)`` with shapes ``(B, 4*conv_dim)`` and
            ``(B, L, 4*conv_dim)``.
        '''
        input = input.permute(0, 2, 1)  # B, word_dim, L (channels first for Conv1d)
        conv1 = self.init_conv1(input)
        conv2 = self.init_conv2(input)
        conv3 = self.init_conv3(input)
        conv4 = self.init_conv4(input)
        concat = torch.cat((conv1, conv2, conv3, conv4), dim=1)  # B, 4*conv_dim, L

        highway_res = self.highway1(concat)
        highway_res = torch.relu(highway_res)
        highway_res = self.highway2(highway_res)
        highway_res = torch.relu(highway_res)
        highway_res = self.highway3(highway_res)
        highway_res = torch.relu(highway_res)  # B, 4*conv_dim, L

        highway_res = highway_res * mask.unsqueeze(1)  # zero out padded positions
        pooled_res = self.pool(highway_res)
        pooled_res = pooled_res.squeeze(-1)  # B, 4*conv_dim
        highway_res = highway_res.permute(0, 2, 1)  # B, L, 4*conv_dim
        return pooled_res, highway_res


class EHRMemoryNetwork(nn.Module):
    '''Fold a patient's visit sequence into one memory slot and pair it with a
    demographics slot.'''
    def __init__(self, total_vocab_size, word_dim, mem_dim, demo_dim):
        super(EHRMemoryNetwork, self).__init__()
        self.mem_dim = mem_dim
        self.ehr_embedding_matrix = nn.Linear(total_vocab_size, word_dim, bias=False)
        self.erase_layer = nn.Linear(word_dim, mem_dim)
        self.add_layer = nn.Linear(word_dim, mem_dim)
        self.demo_embd = nn.Linear(demo_dim, mem_dim)
        self.init_memory = nn.Parameter(torch.randn(1, mem_dim))

    def forward(self, input, demo, mask):
        '''
        TODO: use hierarchy of codes to enhance code embeddings.
        Now: we only make multihot embedding of the input codes.

        Parameters
        ----------
        input: torch.Tensor
            Shape ``(bs, num_visit, total_vocab_size)``.
        demo: torch.Tensor
            Shape ``(bs, demo_dim)``.
        mask: torch.Tensor
            Shape ``(bs, num_visit)``; 0 marks padded visits, which leave the
            memory unchanged.

        Returns
        -------
        torch.Tensor
            Shape ``(bs, 2, mem_dim)``: visit memory and demographics memory.
        '''
        batch_size = input.size(0)
        time_step = input.size(1)

        # Expand up front so the result is (bs, mem_dim) even with zero visits;
        # otherwise the stack with `demo_mem` below fails on a (1, mem_dim) tensor.
        memory = self.init_memory.expand(batch_size, -1)
        demo_mem = torch.tanh(self.demo_embd(demo.float()))

        for i in range(time_step):
            cur_input = input[:, i, ...]  # bs, vocab_size
            cur_input = self.ehr_embedding_matrix(cur_input)  # bs, word_dim

            erase = torch.sigmoid(self.erase_layer(cur_input))  # bs, mem_dim
            add = torch.tanh(self.add_layer(cur_input))  # bs, mem_dim

            cur_mask = mask[:, i].reshape(batch_size, 1)  # bs, 1
            erase = erase * cur_mask
            add = add * cur_mask

            # NOTE(review): the update is multiplicative, memory * ((1 - erase) + add),
            # rather than memory * (1 - erase) + add — confirm this is intended.
            update_ratio = (1 - erase) + add  # bs, mem_dim
            memory = memory * update_ratio  # bs, mem_dim

        memory = torch.stack([memory, demo_mem], dim=1)  # bs, 2, mem_dim
        return memory


class QueryNetwork(nn.Module):
    '''Attend from each criterion embedding over the patient memory slots and
    classify the (response, query) pair.'''
    def __init__(self, mem_dim, conv_dim, mlp_dim):
        super(QueryNetwork, self).__init__()
        self.word_trans = nn.Linear(4 * conv_dim, mem_dim, bias=False)
        self.mlp = nn.Linear(2 * mem_dim, mlp_dim)
        self.output = nn.Linear(mlp_dim, 3)  # 0: unmatch, 1: match, 2: unknown

    def forward(self, memory, query, ec_mask):
        '''
        Parameters
        ----------
        memory: torch.Tensor
            Shape ``(bs_ehr, 2, mem_dim)``.
        query: torch.Tensor
            Shape ``(bs_trial, num_ec, 4*conv_dim)``.
        ec_mask: torch.Tensor
            Criteria padding mask; not used here (kept for interface
            compatibility).

        Returns
        -------
        tuple of torch.Tensor
            ``(logits, responses, trans_query)`` with shapes
            ``(bs_trial, bs_ehr, num_ec, 3)``, ``(bs_trial, bs_ehr, num_ec, mem_dim)``
            and ``(bs_trial, bs_ehr, num_ec, mem_dim)``.
        '''
        trans_query = self.word_trans(query)
        trans_query = torch.relu(trans_query)  # bs_trial, num_ec, mem_dim

        # replicate every trial's queries for every patient:
        # bs_trial, bs_ehr, num_ec, mem_dim
        trans_query = trans_query[None].repeat(len(memory), 1, 1, 1).permute(1, 0, 2, 3)

        memory_t = memory.permute(0, 2, 1)  # bs_ehr, mem_dim, 2 (loop invariant)
        pred_list = []
        response_list = []
        # go through all trials
        for t_query in trans_query:
            # t_query.shape = bs_ehr, num_ec, mem_dim
            attention = torch.bmm(t_query, memory_t)  # bs_ehr, num_ec, 2
            attention = torch.softmax(attention, dim=-1)  # attention over memory slots
            response = torch.bmm(attention, memory)  # bs_ehr, num_ec, mem_dim
            output = torch.cat((response, t_query), dim=-1)  # bs_ehr, num_ec, 2*mem_dim
            output = self.mlp(output)  # bs_ehr, num_ec, mlp_dim
            output = torch.relu(output)
            output = self.output(output)  # bs_ehr, num_ec, 3
            pred_list.append(output)
            response_list.append(response)

        pred_list = torch.stack(pred_list, 0)  # bs_trial, bs_ehr, num_ec, output_class
        response_list = torch.stack(response_list, 0)  # bs_trial, bs_ehr, num_ec, mem_dim
        return pred_list, response_list, trans_query