'''
Provide easy-to-access functions to load ready-to-use demo data
from the './demo_data' folder.
'''
import pdb
import os
import dill
import wget
import pandas as pd
from .patient_data import TabularPatientBase
from ..utils.trial_utils import ClinicalTrials
from ..utils.tabular_utils import read_csv_to_df, load_table_config
# Remote locations of the demo datasets fetched by the loaders in this module.
# Spreadsheet used by `load_trial_document_data` as the validation label file.
TRIALSIM_DATA_URL = 'https://storage.googleapis.com/pytrial/TrialSim-data.xlsx'
# Pickled synthetic EHR data fetched by `load_synthetic_ehr_sequence`.
SYNTHETIC_DATA_URL = 'https://github.com/RyanWangZf/PromptEHR/raw/main/demo_data/synthetic_ehr/data.pkl'
# Zip archive fetched and extracted by `load_trial_patient_sequence`.
SEQ_TRIAL_PATIENT_URL = 'https://storage.googleapis.com/pytrial/seq_patient_nct00174655.zip'
# NOTE(review): not referenced by any loader visible in this module -- confirm whether it is still needed.
TOP_URL = 'https://github.com/futianfan/clinical-trial-outcome-prediction/raw/main/data/'
# Zip archive fetched and extracted by `load_trial_patient_tabular`.
TRIAL_PATIENT_URL = 'https://storage.googleapis.com/pytrial/demo_trial_patient_data.zip'
# CSV of preprocessed trial documents fetched by `load_trial_document_data`.
TRIAL_DOCUMENT_URL = 'https://storage.googleapis.com/pytrial/clinical_trials.csv'
# Public API of this module: the demo-data loader functions.
__all__ = [
'load_mimic_ehr_sequence',
'load_synthetic_ehr_sequence',
'load_trial_patient_sequence',
'load_trial_patient_tabular',
'load_trial_outcome_data',
'load_trial_document_data',
]
def load_synthetic_ehr_sequence(input_dir=None, n_sample=None):
    '''
    Load synthetic EHR patient sequence data, which was generated by PromptEHR (https://arxiv.org/pdf/2211.01761.pdf).

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, './demo_data/synthetic_ehr' is used.
        If 'data.pkl' is not found in that folder, the folder is created when needed
        and the demo data is downloaded into it.

    n_sample: int
        The number of samples we want to load. If None, all data will be loaded.

    Returns
    -------
    dict
        The content of 'data.pkl'. The entries 'visit', 'y' and 'feature' are cut to the
        first `n_sample` items when `n_sample` is given, and the misspelled key
        'cat_cardinalties' is exposed as 'cat_cardinalities'.
    '''
    if input_dir is None:
        input_dir = './demo_data/synthetic_ehr'

    data_path = os.path.join(input_dir, 'data.pkl')

    # Key the download on the data file rather than on the folder, so that an
    # existing but empty folder no longer ends in a FileNotFoundError.
    if not os.path.exists(data_path):
        os.makedirs(input_dir, exist_ok=True)
        wget.download(SYNTHETIC_DATA_URL, out=input_dir)
        print(f'Download synthetic EHRs to {input_dir}.')

    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(data_path, 'rb') as f:
        x = dill.load(f)

    # ZW: temporary workaround for the typo in the stored key. Rename only when
    # the misspelled key is present so a corrected data file keeps working.
    if 'cat_cardinalties' in x:
        x['cat_cardinalities'] = x.pop('cat_cardinalties')

    if n_sample is not None:
        # cut to get smaller demo data
        for key in ('visit', 'y', 'feature'):
            x[key] = x[key][:n_sample]
    return x
def load_mimic_ehr_sequence(input_dir=None, n_sample=None):
    '''
    Load EHR patient sequence data, which needs to be accessed via https://physionet.org/content/mimiciii/1.4/.

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, we will look for the demo data in
        './demo_data/demo_patient_sequence/ehr'.

    n_sample: int
        The number of samples we want to load. If None, all data will be loaded.

    Returns
    -------
    dict
        A dict with the keys 'visit', 'voc', 'order', 'mortality', 'feature',
        'n_num_feature' and 'cat_cardinalities'.

    Raises
    ------
    ValueError
        If `input_dir` does not exist. This dataset is never downloaded automatically.
    '''
    if input_dir is None:
        input_dir = './demo_data/demo_patient_sequence/ehr'

    if not os.path.exists(input_dir):
        raise ValueError(f'Please download the MIMIC-III dataset and put it in {input_dir}.')

    # Use context managers so the pickle files are closed right after loading.
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(os.path.join(input_dir, 'visits.pkl'), 'rb') as f:
        visit = dill.load(f)
    with open(os.path.join(input_dir, 'voc.pkl'), 'rb') as f:
        voc = dill.load(f)

    # make some simple processing
    feature = pd.read_csv(os.path.join(input_dir, 'feature.csv'), index_col=0)
    label = feature['MORTALITY'].values
    x = feature[['AGE','GENDER','ETHNICITY']]
    tabx = TabularPatientBase(x)
    x = tabx.df.values # get processed patient features in matrix form

    if n_sample is not None:
        # cut to get smaller demo data
        visit = visit[:n_sample]
        label = label[:n_sample]
        x = x[:n_sample]

    # The first column is treated as the only numerical feature (presumably AGE --
    # verify against the column order produced by TabularPatientBase). Every
    # remaining column is treated as categorical.
    n_num_feature = 1
    # Cardinalities are counted on the (possibly truncated) matrix.
    cat_cardinalities = [
        len(set(x[:, i])) for i in range(n_num_feature, x.shape[1])
    ]

    return {
        'visit':visit,
        'voc':voc,
        'order':['diag','prod','med'],
        'mortality':label,
        'feature':x,
        'n_num_feature':n_num_feature,
        'cat_cardinalities':cat_cardinalities,
    }
def load_trial_patient_sequence(input_dir=None):
    '''
    Load synthetic sequential trial patient records.

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, './demo_data/demo_patient_sequence/trial'
        is used. If the folder does not exist, it is created and the demo data is downloaded
        and extracted into it. Make sure to remove this folder if it is empty.

    Returns
    -------
    dict
        A dict with the keys 'feature' (processed patient features in matrix form),
        'visit', 'voc', 'order' (the keys of 'voc'), 'visit_stage',
        'relapse' (the 'num relapse' column as a pandas Series) and
        'mortality' (the 'death' column as an array).
    '''
    if input_dir is None:
        input_dir = './demo_data/demo_patient_sequence/trial'

    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
        filename = wget.download(SEQ_TRIAL_PATIENT_URL, out=input_dir)
        # unzip the downloaded archive next to it
        import zipfile
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall(input_dir)
        print(f'\n Download trial patient sequence data to {input_dir}.')

    # load patient data
    print("#"*5+'Demo Data Folder'+"#"*5)
    print(os.listdir(input_dir))
    print("#"*20)

    # Use context managers so the pickle files are closed right after loading.
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(os.path.join(input_dir, 'visit.pkl'), 'rb') as f:
        visit = dill.load(f)
    with open(os.path.join(input_dir, 'voc.pkl'), 'rb') as f:
        vocs = dill.load(f)
    feature = pd.read_csv(os.path.join(input_dir, 'feature.csv'))
    with open(os.path.join(input_dir, 'visit_stage.pkl'), 'rb') as f:
        v_stage = dill.load(f)
    orders = list(vocs.keys())

    # data preprocessing
    label_relapse = feature['num relapse']
    label_mortality = feature['death'].values
    x = feature.drop(['num relapse','death','RUSUBJID'], axis=1)
    # the capped string value '>= 125' is mapped to 125 so the column can be cast to float
    x['weight'] = x['weight'].replace({'>= 125':'125'}).astype(float)
    tabx = TabularPatientBase(x)
    x = tabx.df.values # get processed patient features in matrix form

    return {
        'feature':x,
        'visit':visit,
        'voc':vocs,
        'order':orders,
        'visit_stage':v_stage,
        'relapse':label_relapse,
        'mortality':label_mortality,
    }
def load_trial_outcome_data(input_dir=None, phase='I', split='train'):
    '''
    Load trial outcome prediction (TOP) benchmark data.

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, './demo_data/demo_trial_outcome_data'
        is used. If the folder does not exist, it is created and the benchmark data is
        downloaded and extracted into it. Make sure to remove this folder if it is empty.

    phase: {'I','II','III'}
        The phase of the trial data. Can be 'I', 'II', 'III'.

    split: {'train', 'test', 'valid'}
        The split of the trial data. Can be 'train', 'test', 'valid'.

    Returns
    -------
    dict
        A dict with the single key 'data', holding the content of
        'phase_{phase}_{split}.csv' as a pandas DataFrame.
    '''
    BENCHMARK_DATA_URL = 'https://storage.googleapis.com/pytrial/HINT-benchmark-data/hint_benchmark_dataset_w_date.zip'

    if input_dir is None:
        input_dir = './demo_data/demo_trial_outcome_data'

    if not os.path.exists(input_dir):
        os.makedirs(input_dir)

        # download the benchmark data; use the path reported by wget instead of
        # re-building it by hand, as the other loaders in this module do
        zip_path = wget.download(BENCHMARK_DATA_URL, out=input_dir)

        # unzip the downloaded archive next to it
        import zipfile
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(input_dir)
        print(f'\n Download trial data to {input_dir}.')

    filename = os.path.join(input_dir, f'phase_{phase}_{split}.csv')

    # load trial data
    df = pd.read_csv(filename)
    return {'data':df}
def load_trial_document_data(input_dir=None,
    n_sample=None,
    source='preprocessed',
    date='20221001',
    ):
    '''
    Load trial document data obtained from ClinicalTrials.gov.

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, './demo_data/demo_trial_document'
        is used. The folder is created if it does not exist, and any of the files
        'clinical_trials.csv' / 'TrialSim-data.xlsx' missing from it is downloaded.

    n_sample: int
        The number of samples we want to load. If None, all data will be loaded.

    source: {'clinicaltrials.gov', 'preprocessed'}
        The source of the data. If 'clinicaltrials.gov', we will download the raw data from
        that website and process it. If 'preprocessed', we will load the preprocessed data.
        Any other value is handled like 'preprocessed'.

    date: str
        The date of the clinicaltrials.gov copy. Only valid when ``source='clinicaltrials.gov'``.

    Returns
    -------
    dict
        A dict with the keys 'x' (trial documents as a DataFrame, cut to `n_sample` rows
        when given), 'fields', 'ctx_fields', 'tag', 'x_val' (the first 11 columns of the
        label sheet) and 'y_val' (the remaining columns of the label sheet).
    '''
    if input_dir is None:
        input_dir = './demo_data/demo_trial_document'

    if not os.path.exists(input_dir):
        os.makedirs(input_dir)

    filepath = os.path.join(input_dir, 'clinical_trials.csv')
    label_filepath = os.path.join(input_dir, 'TrialSim-data.xlsx')

    if not os.path.exists(filepath):
        if source == 'clinicaltrials.gov':
            print("Downloading clinicaltrials.gov data...")
            # download demo data
            # NOTE(review): presumably this writes 'clinical_trials.csv' into
            # input_dir, which is read below -- confirm against ClinicalTrials.download.
            client = ClinicalTrials()
            client.download(date=date, output_dir=input_dir)
        else:
            # download the preprocessed data
            print("Downloading preprocessed clinical trial documents data (copy of 10/01/2022)...")
            filepath = wget.download(TRIAL_DOCUMENT_URL, out=filepath)

    if not os.path.exists(label_filepath):
        # download demo label
        wget.download(TRIALSIM_DATA_URL, out=label_filepath)

    df = pd.read_csv(filepath, index_col=0)
    df_tr = df # all data

    df_val = pd.read_excel(label_filepath, index_col=0)
    # the first 11 columns of the label sheet are flattened into one 'nct_id'
    # column and joined with the documents on that column
    df_v = pd.DataFrame({'nct_id':df_val.iloc[:,:11].to_numpy().flatten()})
    df_v = df_v.merge(df, on='nct_id', how='inner')
    df_tr = pd.concat([df_tr, df_v], axis=0).drop_duplicates()

    if n_sample is not None:
        # cut to get smaller demo data
        df_tr = df_tr.iloc[:n_sample]

    return {
        'x': df_tr,
        'fields':['title','intervention_name','disease','keyword'],
        'ctx_fields':['description','criteria'],
        'tag': 'nct_id',
        'x_val':df_val.iloc[:,:11],
        'y_val':df_val.iloc[:,11:],
    }
def load_trial_patient_tabular(input_dir=None):
    '''
    Load synthetic tabular trial patient records.

    Parameters
    ----------
    input_dir: str
        The folder that stores the demo data. If None, './demo_data/demo_trial_patient_data'
        is used. If the folder does not exist, it is created and the demo data is downloaded
        and extracted into it. Make sure to remove this folder if it is empty.

    Returns
    -------
    dict
        A dict with the keys 'data' (the content of 'data_processed.csv' as read by
        `read_csv_to_df`) and 'metadata' (the result of `load_table_config(input_dir)`).
    '''
    if input_dir is None:
        input_dir = './demo_data/demo_trial_patient_data'

    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
        filename = wget.download(TRIAL_PATIENT_URL, out=input_dir)
        # unzip the downloaded archive next to it
        import zipfile
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall(input_dir)
        print(f'\n Download trial patient data to {input_dir}.')

    # load patient data
    df = read_csv_to_df(os.path.join(input_dir, 'data_processed.csv'), index_col=0)
    table_config = load_table_config(input_dir)
    return {'data':df, 'metadata':table_config}