Source code for pytrial.data.demo_data

'''
Provide easy-to-access functions to load ready-to-use demo data
from the './demo_data' folder.
'''
import os
import dill
import wget

import pandas as pd

from .patient_data import TabularPatientBase
from ..utils.trial_utils import ClinicalTrials
from ..utils.tabular_utils import read_csv_to_df, load_table_config

TRIALSIM_DATA_URL = 'https://storage.googleapis.com/pytrial/TrialSim-data.xlsx'
SYNTHETIC_DATA_URL = 'https://github.com/RyanWangZf/PromptEHR/raw/main/demo_data/synthetic_ehr/data.pkl'
SEQ_TRIAL_PATIENT_URL = 'https://storage.googleapis.com/pytrial/seq_patient_nct00174655.zip'
TOP_URL = 'https://github.com/futianfan/clinical-trial-outcome-prediction/raw/main/data/'
TRIAL_PATIENT_URL = 'https://storage.googleapis.com/pytrial/demo_trial_patient_data.zip'
TRIAL_DOCUMENT_URL = 'https://storage.googleapis.com/pytrial/clinical_trials.csv'

__all__ = [
    'load_mimic_ehr_sequence',
    'load_synthetic_ehr_sequence',
    'load_trial_patient_sequence',
    'load_trial_patient_tabular',
    'load_trial_outcome_data',
    'load_trial_document_data',
]


def load_synthetic_ehr_sequence(input_dir=None, n_sample=None):
    '''
    Load synthetic EHR patient sequence data, which was generated by PromptEHR
    (https://arxiv.org/pdf/2211.01761.pdf).

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, we will download the demo data and
        save it to './demo_data/synthetic_ehr'. Make sure to remove this folder if it is empty.

    n_sample: int
        The number of samples we want to load. If None, all data will be loaded.
    '''
    if input_dir is None:
        input_dir = './demo_data/synthetic_ehr'

    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
        url = SYNTHETIC_DATA_URL
        filename = wget.download(url, out=input_dir)
        print(f'Downloaded synthetic EHRs to {input_dir}.')

    with open(os.path.join(input_dir, 'data.pkl'), 'rb') as f:
        x = dill.load(f)

    # ZW: temporary fix for the misspelled key stored in the pickle
    x['cat_cardinalities'] = x.pop('cat_cardinalties')

    if n_sample is not None:
        # cut to get smaller demo data
        x['visit'] = x['visit'][:n_sample]
        x['y'] = x['y'][:n_sample]
        x['feature'] = x['feature'][:n_sample]

    return x
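
# Example usage (illustrative sketch, not part of the original module; assumes network
# access to SYNTHETIC_DATA_URL and a writable './demo_data' folder):
#
#     data = load_synthetic_ehr_sequence(n_sample=100)
#     print(len(data['visit']))          # number of loaded patient sequences
#     print(data['cat_cardinalities'])   # cardinality of each categorical feature
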
def load_mimic_ehr_sequence(input_dir=None, n_sample=None):
    '''
    Load EHR patient sequence data, which needs to be accessed via
    https://physionet.org/content/mimiciii/1.4/.

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, we will look for the demo data in
        './demo_data/demo_patient_sequence/ehr'.

    n_sample: int
        The number of samples we want to load. If None, all data will be loaded.
    '''
    if input_dir is None:
        input_dir = './demo_data/demo_patient_sequence/ehr'

    if not os.path.exists(input_dir):
        raise ValueError(f'Please download the MIMIC-III dataset and put it in {input_dir}.')

    visit = dill.load(open(os.path.join(input_dir, 'visits.pkl'), 'rb'))
    voc = dill.load(open(os.path.join(input_dir, 'voc.pkl'), 'rb'))

    # apply some simple preprocessing
    feature = pd.read_csv(os.path.join(input_dir, 'feature.csv'), index_col=0)
    label = feature['MORTALITY'].values
    x = feature[['AGE', 'GENDER', 'ETHNICITY']]
    tabx = TabularPatientBase(x)
    x = tabx.df.values  # get processed patient features in matrix form

    if n_sample is not None:
        # cut to get smaller demo data
        visit = visit[:n_sample]
        label = label[:n_sample]
        x = x[:n_sample]

    n_num_feature = 1
    cat_cardinalities = []
    for i in range(n_num_feature, x.shape[1]):
        cat_cardinalities.append(len(list(set(x[:, i]))))

    return {
        'visit': visit,
        'voc': voc,
        'order': ['diag', 'prod', 'med'],
        'mortality': label,
        'feature': x,
        'n_num_feature': n_num_feature,
        'cat_cardinalities': cat_cardinalities,
    }
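
# Example usage (illustrative sketch, not part of the original module; assumes visits.pkl,
# voc.pkl, and feature.csv derived from MIMIC-III have already been prepared under input_dir):
#
#     data = load_mimic_ehr_sequence(input_dir='./demo_data/demo_patient_sequence/ehr', n_sample=50)
#     print(data['order'])          # event types: ['diag', 'prod', 'med']
#     print(data['feature'].shape)  # (n_sample, n_features)
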
def load_trial_patient_sequence(input_dir=None):
    '''
    Load synthetic sequential trial patient records.

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, we will download the demo data and save it
        to './demo_data/demo_patient_sequence/trial'. Make sure to remove this folder if it is empty.
    '''
    if input_dir is None:
        input_dir = './demo_data/demo_patient_sequence/trial'

    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
        url = SEQ_TRIAL_PATIENT_URL
        filename = wget.download(url, out=input_dir)

        # unzip the downloaded archive
        import zipfile
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall(input_dir)
        print(f'\nDownloaded trial patient sequence data to {input_dir}.')

    # load patient data
    print('#' * 5 + 'Demo Data Folder' + '#' * 5)
    print(os.listdir(input_dir))
    print('#' * 20)

    visit = dill.load(open(os.path.join(input_dir, 'visit.pkl'), 'rb'))
    vocs = dill.load(open(os.path.join(input_dir, 'voc.pkl'), 'rb'))
    feature = pd.read_csv(os.path.join(input_dir, 'feature.csv'))
    v_stage = dill.load(open(os.path.join(input_dir, 'visit_stage.pkl'), 'rb'))
    orders = list(vocs.keys())

    # data preprocessing
    label_relapse = feature['num relapse']
    label_mortality = feature['death'].values
    x = feature.drop(['num relapse', 'death', 'RUSUBJID'], axis=1)
    x['weight'] = x['weight'].replace({'>= 125': '125'}).astype(float)
    tabx = TabularPatientBase(x)
    x = tabx.df.values  # get processed patient features in matrix form

    return {
        'feature': x,
        'visit': visit,
        'voc': vocs,
        'order': orders,
        'visit_stage': v_stage,
        'relapse': label_relapse,
        'mortality': label_mortality,
    }
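
# Example usage (illustrative sketch, not part of the original module; downloads and
# unzips the demo archive on the first call):
#
#     data = load_trial_patient_sequence()
#     print(data['order'])           # event types taken from the vocabulary keys
#     print(data['mortality'][:5])   # first five mortality labels
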
def load_trial_outcome_data(input_dir=None, phase='I', split='train'):
    '''
    Load trial outcome prediction (TOP) benchmark data.

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, we will download the demo data and save it
        to './demo_data/demo_trial_outcome_data'. Make sure to remove this folder if it is empty.

    phase: {'I', 'II', 'III'}
        The phase of the trial data.

    split: {'train', 'test', 'valid'}
        The split of the trial data.
    '''
    BENCHMARK_DATA_URL = 'https://storage.googleapis.com/pytrial/HINT-benchmark-data/hint_benchmark_dataset_w_date.zip'

    if input_dir is None:
        input_dir = './demo_data/demo_trial_outcome_data'

    if not os.path.exists(input_dir):
        os.makedirs(input_dir)

        # download the benchmark data
        wget.download(BENCHMARK_DATA_URL, out=input_dir)

        # unzip the downloaded archive
        import zipfile
        with zipfile.ZipFile(os.path.join(input_dir, 'hint_benchmark_dataset_w_date.zip'), 'r') as zip_ref:
            zip_ref.extractall(input_dir)
        print(f'\nDownloaded trial outcome data to {input_dir}.')

    filename = 'phase_{}_{}.csv'.format(phase, split)
    filename = os.path.join(input_dir, filename)

    # load trial outcome data
    df = pd.read_csv(filename)
    return {'data': df}
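
# Example usage (illustrative sketch, not part of the original module; downloads and
# unzips the HINT benchmark archive on the first call):
#
#     train = load_trial_outcome_data(phase='I', split='train')
#     test = load_trial_outcome_data(phase='I', split='test')
#     print(train['data'].shape, test['data'].shape)
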
def load_trial_document_data(input_dir=None, n_sample=None, source='preprocessed', date='20221001'):
    '''
    Load trial document data obtained from ClinicalTrials.gov.

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, we will download the demo data and save it
        to './demo_data/demo_trial_document'. Make sure to remove this folder if it is empty.

    n_sample: int
        The number of samples we want to load. If None, all data will be loaded.

    source: {'clinicaltrials.gov', 'preprocessed'}
        The source of the data. If 'clinicaltrials.gov', we will download the raw data from that
        website and process it. If 'preprocessed', we will load the preprocessed data.

    date: str
        The date of the clinicaltrials.gov copy. Only valid when ``source='clinicaltrials.gov'``.
    '''
    if input_dir is None:
        input_dir = './demo_data/demo_trial_document'

    if not os.path.exists(input_dir):
        os.makedirs(input_dir)

    filepath = os.path.join(input_dir, 'clinical_trials.csv')
    label_filepath = os.path.join(input_dir, 'TrialSim-data.xlsx')

    if not os.path.exists(filepath):
        if source == 'clinicaltrials.gov':
            print('Downloading clinicaltrials.gov data...')
            # download and process the raw data
            client = ClinicalTrials()
            client.download(date=date, output_dir=input_dir)
        else:
            # download the preprocessed data
            print('Downloading preprocessed clinical trial documents data (copy of 10/01/2022)...')
            url = TRIAL_DOCUMENT_URL
            filepath = wget.download(url, out=os.path.join(input_dir, 'clinical_trials.csv'))

    if not os.path.exists(label_filepath):
        # download demo labels
        wget.download(TRIALSIM_DATA_URL, out=label_filepath)

    df = pd.read_csv(filepath, index_col=0)
    df_tr = df  # all data
    df_val = pd.read_excel(label_filepath, index_col=0)
    df_v = pd.DataFrame({'nct_id': df_val.iloc[:, :11].to_numpy().flatten()})
    df_v = df_v.merge(df, on='nct_id', how='inner')
    df_tr = pd.concat([df_tr, df_v], axis=0).drop_duplicates()

    if n_sample is not None:
        # cut to get smaller demo data
        df_tr = df_tr.iloc[:n_sample]

    return {
        'x': df_tr,
        'fields': ['title', 'intervention_name', 'disease', 'keyword'],
        'ctx_fields': ['description', 'criteria'],
        'tag': 'nct_id',
        'x_val': df_val.iloc[:, :11],
        'y_val': df_val.iloc[:, 11:],
    }
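
# Example usage (illustrative sketch, not part of the original module; the default
# source='preprocessed' downloads a fixed 10/01/2022 copy of the ClinicalTrials.gov documents):
#
#     data = load_trial_document_data(n_sample=1000)
#     print(data['fields'])       # short-text fields
#     print(data['ctx_fields'])   # long-text context fields
#     print(data['x'].shape)
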
def load_trial_patient_tabular(input_dir=None):
    '''
    Load synthetic tabular trial patient records.

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, we will download the demo data and save it
        to './demo_data/demo_trial_patient_data'. Make sure to remove this folder if it is empty.
    '''
    if input_dir is None:
        input_dir = './demo_data/demo_trial_patient_data'

    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
        url = TRIAL_PATIENT_URL
        filename = wget.download(url, out=input_dir)

        # unzip the downloaded archive
        import zipfile
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall(input_dir)
        print(f'\nDownloaded trial patient data to {input_dir}.')

    # load patient data
    df = read_csv_to_df(os.path.join(input_dir, 'data_processed.csv'), index_col=0)
    table_config = load_table_config(input_dir)
    return {'data': df, 'metadata': table_config}
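
# Example usage (illustrative sketch, not part of the original module; downloads and
# unzips the tabular demo data on the first call):
#
#     data = load_trial_patient_tabular()
#     print(data['data'].head())   # processed patient table
#     print(data['metadata'])      # table configuration loaded by load_table_config
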