# Source code for pytrial.tasks.trial_search.models.base

'''
TODO: add .from_pretrained to load pretrained trial search model
from cloud storage.
'''
import abc
import os
import json
import pdb

import torch
import pandas as pd

from pytrial.utils.check import check_model_dir
from pytrial.utils.check import make_dir_if_not_exist

class TrialSearchBase(abc.ABC):
    '''Abstract base class for all trial search algorithms.

    Creates per-experiment checkpoint and result directories under
    ``./experiments_records/<experiment_id>/``.

    Parameters
    ----------
    experiment_id: str, optional (default = 'test')
        The name of current experiment.
    '''
    @abc.abstractmethod
    def __init__(self, experiment_id='test'):
        check_model_dir(experiment_id)
        self.checkout_dir = os.path.join('./experiments_records', experiment_id, 'checkpoints')
        self.result_dir = os.path.join('./experiments_records', experiment_id, 'results')
        make_dir_if_not_exist(self.checkout_dir)
        make_dir_if_not_exist(self.result_dir)

    @abc.abstractmethod
    def fit(self, train_data, valid_data):
        '''
        Fit the model with training data. Need to implement in subclass.

        Parameters
        ----------
        train_data: dict
            Training data for model fitting.
            train_data = {
                'x': pd.DataFrame,
                'fields': list[str],
                'y': pd.Series or np.array,
            }
        valid_data: dict
            Validation data.
            valid_data = {
                'x': pd.DataFrame,
                'fields': list[str],
                'y': pd.Series or np.array,
            }

        Returns
        -------
        self: object
            The trained model.
        '''
        raise NotImplementedError

    @abc.abstractmethod
    def predict(self, test_data):
        '''
        Make predictions on the given test data. Need to implement in subclass.

        Parameters
        ----------
        test_data: dict
            Test data; presumably the same layout as ``fit``'s ``train_data``
            (TODO confirm against concrete subclasses).
        '''
        raise NotImplementedError

    @abc.abstractmethod
    def load_model(self, checkpoint):
        '''
        Load model states from a checkpoint. Need to implement in subclass.

        Parameters
        ----------
        checkpoint: str
            The path to the saved model.

        Returns
        -------
        self: object
            The loaded pretrained model.
        '''
        raise NotImplementedError

    @abc.abstractmethod
    def save_model(self, output_dir):
        '''
        Save model states to disk. Need to implement in subclass.

        Parameters
        ----------
        output_dir: str
            The directory to save the model states.
        '''
        raise NotImplementedError

    @abc.abstractmethod
    def encode(self, inputs):
        '''
        Encode input documents into embeddings. Need to implement in subclass.

        Parameters
        ----------
        inputs: dict
            The input documents.
        '''
        raise NotImplementedError

    def train(self, mode=True):
        '''Put the wrapped ``self.model`` in training mode and return self.'''
        self.training = mode
        self.model.train()
        return self

    def eval(self, mode=False):
        '''Put the wrapped ``self.model`` in evaluation mode and return self.'''
        self.training = mode
        self.model.eval()
        return self

    @abc.abstractmethod
    def _build_model(self):
        '''Construct the underlying model object. Need to implement in subclass.'''
        raise NotImplementedError

    @abc.abstractmethod
    def __getitem__(self, tag):
        '''
        Get the embeddings of documents by the trial tags.

        Parameters
        ----------
        tag: str, int, list[str], list[int]
            The tag (or tags) to be looked up in the model.

        Returns
        -------
        The embeddings of each document.
        '''
        raise NotImplementedError

    def _save_checkpoint(self, state, epoch_id=0, is_best=False, output_dir=None, filename='checkpoint.pth.tar'):
        '''Serialize ``state`` with ``torch.save`` into ``output_dir``.

        File name is ``latest.<filename>`` when ``epoch_id < 1``,
        ``best.<filename>`` when ``is_best``, else ``<epoch_id>.<filename>``.
        '''
        if output_dir is None:
            output_dir = self.checkout_dir
        if epoch_id < 1:
            filepath = os.path.join(output_dir, 'latest.' + filename)
        elif is_best:
            filepath = os.path.join(output_dir, 'best.' + filename)
        else:
            # BUGFIX: previously joined against self.checkout_dir here, which
            # silently ignored a caller-supplied output_dir for numbered epochs.
            filepath = os.path.join(output_dir, str(epoch_id) + '.' + filename)
        torch.save(state, filepath)

    def _save_model_config(self, model_config, output_dir=None):
        '''Write ``model_config`` as pretty-printed JSON to ``model_config.json``.'''
        if output_dir is None:
            output_dir = self.checkout_dir
        temp_path = os.path.join(output_dir, "model_config.json")
        # Opening with mode 'w' truncates an existing file, so no explicit
        # os.remove is needed beforehand.
        with open(temp_path, "w", encoding='utf-8') as f:
            f.write(json.dumps(model_config, indent=4))

    def _load_model_config(self, checkpoint=''):
        '''Load the predictor JSON config, from ``checkpoint`` if given,
        otherwise from ``<checkout_dir>/model_config.json``.'''
        if checkpoint == '':
            temp_path = os.path.join(self.checkout_dir, 'model_config.json')
            assert os.path.exists(
                temp_path), 'cannot find predictor_config.json, please check it in dir {0}'.format(
                self.checkout_dir)
        else:
            temp_path = checkpoint
            assert os.path.exists(
                temp_path), 'cannot find checkpoint file from path: {0}'.format(
                checkpoint)
        print('load predictor config file from {0}'.format(temp_path))
        with open(temp_path, 'r') as f:
            predictor_config = json.load(f)
        return predictor_config

    def _input_data_check(self, inputs):
        '''
        Check the training / testing data fits the formats.
        Target to (1) check if inputs valid, if not, give tips about the data problem.

        Parameters
        ----------
        inputs: {
                'x': pd.DataFrame,
                'fields': list[str],
                'tag': str,
            }
        '''
        # check overall input format
        assert 'x' in inputs, 'No input trial doc dataframe found in inputs.'
        df = inputs['x']
        # narrow excepts: a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit.
        if 'fields' in inputs:
            try:
                _ = df[inputs['fields']]
            except KeyError:
                raise Exception('Cannot find the specified `fields` in inputs dataframe.')
        if 'tag' in inputs:
            try:
                _ = df[inputs['tag']]
            except KeyError:
                raise Exception('Cannot find the specified `tag` in inputs dataframe.')
        # check data type
        try:
            _ = df.applymap(str)
        except Exception:
            raise Exception('Cannot transform the input dataframe to str type, please check the inputs.')

    def _process_dataframe(self, df, fields):
        '''Flatten selected dataframe columns into one lowercase text column.

        Each cell becomes "<column name>: <value>" and all columns are joined
        with spaces; the 'nct_id' column (if any) is dropped first.

        Returns
        -------
        pd.DataFrame with a single 'text' column.
        '''
        if fields is not None:
            df = df[fields]
        if 'nct_id' in df:
            df = df.drop(['nct_id'], axis=1)
        df = df.applymap(str)
        # x is a column Series, so x.name is the column label
        df = df.apply(lambda x: x.name + ': ' + x)
        df = df.applymap(lambda x: x.lower())
        df_raw_texts = df.agg(' '.join, axis=1)
        df_raw_texts = pd.DataFrame(df_raw_texts, columns=['text'])
        return df_raw_texts
def whitening_torch_final(embeddings):
    '''
    Whiten the embeddings: center them and rotate/scale so the components of
    the result are decorrelated with unit scatter (i.e. the centered output
    satisfies out.T @ out == I).

    Parameters
    ----------
    embeddings: torch.Tensor
        The embeddings to be whitened. The shape is (n, d).

    Returns
    -------
    torch.Tensor
        Whitened embeddings of shape (n, d).
    '''
    mu = torch.mean(embeddings, dim=0, keepdim=True)
    centered = embeddings - mu  # hoisted: was computed twice
    cov = torch.mm(centered.t(), centered)
    # torch.svd is deprecated since PyTorch 1.8; torch.linalg.svd returns
    # (U, S, Vh) instead of (U, S, V). Only U and S are used here.
    # NOTE(review): if cov is rank-deficient, 1/sqrt(s) produces inf — same
    # as the original behavior; callers must supply n > d well-spread points.
    u, s, _ = torch.linalg.svd(cov)
    W = torch.mm(u, torch.diag(1.0 / torch.sqrt(s)))
    return torch.mm(centered, W)