Source code for pytrial.data.demo_data

'''
Provide easy-to-access functions to load ready-to-use demo data
from the './demo_data' folder.
'''
import os
import dill
import wget

import pandas as pd

from .patient_data import TabularPatientBase
from ..utils.trial_utils import ClinicalTrials
from ..utils.tabular_utils import read_csv_to_df, load_table_config

TRIALSIM_DATA_URL = 'https://storage.googleapis.com/pytrial/TrialSim-data.xlsx'
SYNTHETIC_DATA_URL = 'https://github.com/RyanWangZf/PromptEHR/raw/main/demo_data/synthetic_ehr/data.pkl'
SEQ_TRIAL_PATIENT_URL = 'https://storage.googleapis.com/pytrial/seq_patient_nct00174655.zip'
TOP_URL = 'https://github.com/futianfan/clinical-trial-outcome-prediction/raw/main/data/'
TRIAL_PATIENT_URL = 'https://storage.googleapis.com/pytrial/demo_trial_patient_data.zip'
TRIAL_DOCUMENT_URL = 'https://storage.googleapis.com/pytrial/clinical_trials.csv'

__all__ = [
    'load_mimic_ehr_sequence',
    'load_synthetic_ehr_sequence',
    'load_trial_patient_sequence',
    'load_trial_patient_tabular',
    'load_trial_outcome_data',
    'load_trial_document_data',
]


def load_synthetic_ehr_sequence(input_dir=None, n_sample=None):
    '''
    Load synthetic EHR patient sequence data, which was generated by PromptEHR
    (https://arxiv.org/pdf/2211.01761.pdf).

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, we will download the demo data and
        save it to './demo_data/synthetic_ehr'. Make sure to remove this folder if it is empty.

    n_sample: int
        The number of samples we want to load. If None, all data will be loaded.
    '''
    if input_dir is None:
        input_dir = './demo_data/synthetic_ehr'

    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
        url = SYNTHETIC_DATA_URL
        filename = wget.download(url, out=input_dir)
        print(f'Downloaded synthetic EHRs to {input_dir}.')

    with open(os.path.join(input_dir, 'data.pkl'), 'rb') as f:
        x = dill.load(f)

    # ZW: temporary fix for the misspelled key stored in the pickle
    x['cat_cardinalities'] = x.pop('cat_cardinalties')

    if n_sample is not None:
        # cut to get smaller demo data
        x['visit'] = x['visit'][:n_sample]
        x['y'] = x['y'][:n_sample]
        x['feature'] = x['feature'][:n_sample]

    return x
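
# Example usage (illustrative sketch, not part of the original module; assumes network
# access to SYNTHETIC_DATA_URL and a writable './demo_data' folder):
#
#     data = load_synthetic_ehr_sequence(n_sample=100)
#     print(len(data['visit']))          # number of loaded patient sequences
#     print(data['cat_cardinalities'])   # cardinality of each categorical feature
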
def load_mimic_ehr_sequence(input_dir=None, n_sample=None):
    '''
    Load EHR patient sequence data, which needs to be accessed via
    https://physionet.org/content/mimiciii/1.4/.

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, we will look for the demo data in
        './demo_data/demo_patient_sequence/ehr'.

    n_sample: int
        The number of samples we want to load. If None, all data will be loaded.
    '''
    if input_dir is None:
        input_dir = './demo_data/demo_patient_sequence/ehr'

    if not os.path.exists(input_dir):
        raise ValueError(f'Please download the MIMIC-III dataset and put it in {input_dir}.')

    visit = dill.load(open(os.path.join(input_dir, 'visits.pkl'), 'rb'))
    voc = dill.load(open(os.path.join(input_dir, 'voc.pkl'), 'rb'))

    # apply some simple preprocessing
    feature = pd.read_csv(os.path.join(input_dir, 'feature.csv'), index_col=0)
    label = feature['MORTALITY'].values
    x = feature[['AGE', 'GENDER', 'ETHNICITY']]
    tabx = TabularPatientBase(x)
    x = tabx.df.values  # get processed patient features in matrix form

    if n_sample is not None:
        # cut to get smaller demo data
        visit = visit[:n_sample]
        label = label[:n_sample]
        x = x[:n_sample]

    n_num_feature = 1
    cat_cardinalities = []
    for i in range(n_num_feature, x.shape[1]):
        cat_cardinalities.append(len(list(set(x[:, i]))))

    return {
        'visit': visit,
        'voc': voc,
        'order': ['diag', 'prod', 'med'],
        'mortality': label,
        'feature': x,
        'n_num_feature': n_num_feature,
        'cat_cardinalities': cat_cardinalities,
    }
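
# Example usage (illustrative sketch, not part of the original module; assumes visits.pkl,
# voc.pkl, and feature.csv derived from MIMIC-III have already been prepared under input_dir):
#
#     data = load_mimic_ehr_sequence(input_dir='./demo_data/demo_patient_sequence/ehr', n_sample=50)
#     print(data['order'])          # event types: ['diag', 'prod', 'med']
#     print(data['feature'].shape)  # (n_sample, n_features)
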
def load_trial_patient_sequence(input_dir=None):
    '''
    Load synthetic sequential trial patient records.

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, we will download the demo data and save it
        to './demo_data/demo_patient_sequence/trial'. Make sure to remove this folder if it is empty.
    '''
    if input_dir is None:
        input_dir = './demo_data/demo_patient_sequence/trial'

    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
        url = SEQ_TRIAL_PATIENT_URL
        filename = wget.download(url, out=input_dir)

        # unzip the downloaded archive
        import zipfile
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall(input_dir)
        print(f'\nDownloaded trial patient sequence data to {input_dir}.')

    # load patient data
    print('#' * 5 + 'Demo Data Folder' + '#' * 5)
    print(os.listdir(input_dir))
    print('#' * 20)

    visit = dill.load(open(os.path.join(input_dir, 'visit.pkl'), 'rb'))
    vocs = dill.load(open(os.path.join(input_dir, 'voc.pkl'), 'rb'))
    feature = pd.read_csv(os.path.join(input_dir, 'feature.csv'))
    v_stage = dill.load(open(os.path.join(input_dir, 'visit_stage.pkl'), 'rb'))
    orders = list(vocs.keys())

    # data preprocessing
    label_relapse = feature['num relapse']
    label_mortality = feature['death'].values
    x = feature.drop(['num relapse', 'death', 'RUSUBJID'], axis=1)
    x['weight'] = x['weight'].replace({'>= 125': '125'}).astype(float)
    tabx = TabularPatientBase(x)
    x = tabx.df.values  # get processed patient features in matrix form

    return {
        'feature': x,
        'visit': visit,
        'voc': vocs,
        'order': orders,
        'visit_stage': v_stage,
        'relapse': label_relapse,
        'mortality': label_mortality,
    }
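
# Example usage (illustrative sketch, not part of the original module; downloads and
# unzips the demo archive on the first call):
#
#     data = load_trial_patient_sequence()
#     print(data['order'])           # event types taken from the vocabulary keys
#     print(data['mortality'][:5])   # first five mortality labels
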
def load_trial_outcome_data(input_dir=None, phase='I', split='train'):
    '''
    Load trial outcome prediction (TOP) benchmark data.

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, we will download the demo data and save it
        to './demo_data/demo_trial_outcome_data'. Make sure to remove this folder if it is empty.

    phase: {'I', 'II', 'III'}
        The phase of the trial data.

    split: {'train', 'test', 'valid'}
        The split of the trial data.
    '''
    BENCHMARK_DATA_URL = 'https://storage.googleapis.com/pytrial/HINT-benchmark-data/hint_benchmark_dataset_w_date.zip'

    if input_dir is None:
        input_dir = './demo_data/demo_trial_outcome_data'

    if not os.path.exists(input_dir):
        os.makedirs(input_dir)

        # download the benchmark data
        wget.download(BENCHMARK_DATA_URL, out=input_dir)

        # unzip the downloaded archive
        import zipfile
        with zipfile.ZipFile(os.path.join(input_dir, 'hint_benchmark_dataset_w_date.zip'), 'r') as zip_ref:
            zip_ref.extractall(input_dir)
        print(f'\nDownloaded trial outcome data to {input_dir}.')

    filename = 'phase_{}_{}.csv'.format(phase, split)
    filename = os.path.join(input_dir, filename)

    # load trial outcome data
    df = pd.read_csv(filename)
    return {'data': df}
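
# Example usage (illustrative sketch, not part of the original module; downloads and
# unzips the HINT benchmark archive on the first call):
#
#     train = load_trial_outcome_data(phase='I', split='train')
#     test = load_trial_outcome_data(phase='I', split='test')
#     print(train['data'].shape, test['data'].shape)
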
def load_trial_document_data(input_dir=None, n_sample=None, source='preprocessed', date='20221001'):
    '''
    Load trial document data obtained from ClinicalTrials.gov.

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, we will download the demo data and save it
        to './demo_data/demo_trial_document'. Make sure to remove this folder if it is empty.

    n_sample: int
        The number of samples we want to load. If None, all data will be loaded.

    source: {'clinicaltrials.gov', 'preprocessed'}
        The source of the data. If 'clinicaltrials.gov', we will download the raw data from that
        website and process it. If 'preprocessed', we will load the preprocessed data.

    date: str
        The date of the clinicaltrials.gov copy. Only valid when ``source='clinicaltrials.gov'``.
    '''
    if input_dir is None:
        input_dir = './demo_data/demo_trial_document'

    if not os.path.exists(input_dir):
        os.makedirs(input_dir)

    filepath = os.path.join(input_dir, 'clinical_trials.csv')
    label_filepath = os.path.join(input_dir, 'TrialSim-data.xlsx')

    if not os.path.exists(filepath):
        if source == 'clinicaltrials.gov':
            print('Downloading clinicaltrials.gov data...')
            # download and process the raw data
            client = ClinicalTrials()
            client.download(date=date, output_dir=input_dir)
        else:
            # download the preprocessed data
            print('Downloading preprocessed clinical trial documents data (copy of 10/01/2022)...')
            url = TRIAL_DOCUMENT_URL
            filepath = wget.download(url, out=os.path.join(input_dir, 'clinical_trials.csv'))

    if not os.path.exists(label_filepath):
        # download demo labels
        wget.download(TRIALSIM_DATA_URL, out=label_filepath)

    df = pd.read_csv(filepath, index_col=0)
    df_tr = df  # all data
    df_val = pd.read_excel(label_filepath, index_col=0)
    df_v = pd.DataFrame({'nct_id': df_val.iloc[:, :11].to_numpy().flatten()})
    df_v = df_v.merge(df, on='nct_id', how='inner')
    df_tr = pd.concat([df_tr, df_v], axis=0).drop_duplicates()

    if n_sample is not None:
        # cut to get smaller demo data
        df_tr = df_tr.iloc[:n_sample]

    return {
        'x': df_tr,
        'fields': ['title', 'intervention_name', 'disease', 'keyword'],
        'ctx_fields': ['description', 'criteria'],
        'tag': 'nct_id',
        'x_val': df_val.iloc[:, :11],
        'y_val': df_val.iloc[:, 11:],
    }
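
# Example usage (illustrative sketch, not part of the original module; the default
# source='preprocessed' downloads a fixed 10/01/2022 copy of the ClinicalTrials.gov documents):
#
#     data = load_trial_document_data(n_sample=1000)
#     print(data['fields'])       # short-text fields
#     print(data['ctx_fields'])   # long-text context fields
#     print(data['x'].shape)
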
def load_trial_patient_tabular(input_dir=None):
    '''
    Load synthetic tabular trial patient records.

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, we will download the demo data and save it
        to './demo_data/demo_trial_patient_data'. Make sure to remove this folder if it is empty.
    '''
    if input_dir is None:
        input_dir = './demo_data/demo_trial_patient_data'

    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
        url = TRIAL_PATIENT_URL
        filename = wget.download(url, out=input_dir)

        # unzip the downloaded archive
        import zipfile
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall(input_dir)
        print(f'\nDownloaded trial patient data to {input_dir}.')

    # load patient data
    df = read_csv_to_df(os.path.join(input_dir, 'data_processed.csv'), index_col=0)
    table_config = load_table_config(input_dir)
    return {'data': df, 'metadata': table_config}
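
# Example usage (illustrative sketch, not part of the original module; downloads and
# unzips the tabular demo data on the first call):
#
#     data = load_trial_patient_tabular()
#     print(data['data'].head())   # processed patient table
#     print(data['metadata'])      # table configuration loaded by load_table_config
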