# utils/data_loader.py · scan

import numpy as np 
import pandas as pd
import scipy
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE

def _select_markers(exprs,marker_list,name_pool):
    indicies = []  # indices of selected markers
    for index,top_gene in enumerate(marker_list):
        idx = [i for i,name in enumerate(name_pool) if top_gene == name]
        if len(idx) == 0:  continue
        else:  indicies.append(idx)
    indicies = np.array(indicies)
    indicies = np.squeeze(indicies)
    output_data = exprs[:,indicies]
    return output_data
def _get_split_with_test(exprs,clin_y,rest_idx,test_idx,seed=1024):
    '''
        Separate the held-out test rows from the remaining rows and build four
        stratified, shuffled train/validation folds over the remaining rows.

        Args:
            exprs: 2-D array, rows selected by `rest_idx` / `test_idx`.
            clin_y: 2-D array with the same row order as `exprs`; its last
                column (cast to int) is the stratification target.
            rest_idx: row indices used for training/validation.
            test_idx: row indices of the held-out test set.
            seed: random_state given to StratifiedKFold.

        Returns:
            (skf_splits, x_test, cy_test, x_rest, cy_rest) where `skf_splits` is a
            list of [train_index, val_index] pairs indexing into the rest rows.
    '''
    # rows available for training / validation
    x_rest = exprs[rest_idx,:]
    # rows of the fixed test set
    x_test = exprs[test_idx,:]
    cy_rest = clin_y[rest_idx,:]
    cy_test = clin_y[test_idx,:]

    # stratify on the last clinical column
    strat_target = cy_rest[:,-1].astype(int)
    folder = StratifiedKFold(n_splits=4,random_state=seed,shuffle=True)
    skf_splits = [[tr_idx,va_idx] for tr_idx,va_idx in folder.split(x_rest,strat_target)]
    return skf_splits,x_test,cy_test,x_rest,cy_rest

def split_nsclc_data_with_test(expr_path,clin_path,name_path,save_path,test_idx_path,new_markers=None):
    '''
        Write one standardized train/validation/test .npz file per stratified
        fold of the labelled NSCLC cohort.

        The held-out test rows come from `test_idx_path`; the remaining rows are
        split into folds by `_get_split_with_test`. For every fold the expression
        and clinical features are standardized with statistics fitted on that
        fold's training rows only, and the result is saved to
        `save_path + '_' + str(fold) + '.npz'`.

        Args:
            expr_path: .npz whose 'clinical' entry is the expression matrix
                (rows: samples, columns: genes).
            clin_path: .npz whose 'clinical' entry is the clinical matrix. Its
                last column ('complete') is dropped; of the remaining columns the
                first three are used as clinical features, the third-from-last as
                overall-survival time, the second-from-last as event and the last
                as label.
            name_path: .npz whose 'name' entry names every expression column.
            save_path: output path prefix.
            test_idx_path: .npz with 'rest_idx' and 'test_idx' row indices.
            new_markers: optional gene names replacing the built-in 15-marker
                list; None or an empty sequence selects the built-in list.

        Saved arrays: x_* (expression), c_* (clinical), y_* (label), e_* (event)
        and o_* (survival time) for train/valid/test, plus the fitted scaler
        statistics x_mean, x_scale, c_mean and c_scale.

        Example usage in main():
        # split_nsclc_data_with_test(
        #     '../data/nsclc/cohort/cohort_data.npz',
        #     '../data/nsclc/cohort/clinical_data.npz',
        #     '../data/nsclc/cohort/cohort_name_pool.npz',
        #     '../data/nsclc/nsclc_',
        #     '../data/nsclc/cohort/nsclc_test_idx.npz')
    '''

    # for unlabelled samples, should check cohort/clinical_data.npz (['no_clinical'])
    # all data in ['clinical'] come with corresponding labels
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files
    with np.load(expr_path,allow_pickle=True) as archive:
        expr = archive['clinical']
    with np.load(clin_path,allow_pickle=True) as archive:
        clin = archive['clinical']
    clin = clin[:,:-1]  # last column is 'complete', which is not used
    with np.load(name_path,allow_pickle=True) as archive:
        name = archive['name']
    with np.load(test_idx_path,allow_pickle=True) as archive:
        rest_idx = archive['rest_idx']
        test_idx = archive['test_idx']

    # selected biomarkers with our systems biology selector
    marker_list = ['EPCAM','HIF1A','PKM','PTK7','ALCAM','CADM1','SLC2A1',
        'CUL1','CUL3','EGFR','ELAVL1','GRB2','NRF1','RNF2','RPA2']
    if new_markers is None or len(new_markers) == 0:
        markers = marker_list
    else:
        markers = new_markers

    # test set and several train/validation splits
    skf_splits,x_test_all,cy_test,x_rest,cy_rest = _get_split_with_test(expr,clin,rest_idx,test_idx)

    # Marker selection only picks columns, so it is done once here rather than
    # three times per fold; the fold indices then pick rows from the result.
    x_rest_sel = _select_markers(x_rest,markers,name)
    x_test_sel = _select_markers(x_test_all,markers,name)

    # The test-set targets do not depend on the fold.
    c_test_raw = cy_test[:,:3]
    o_test,e_test,y_test = cy_test[:,-3],cy_test[:,-2],cy_test[:,-1]

    for fold,(train_index,val_index) in enumerate(skf_splits):
        x_train,x_valid = x_rest_sel[train_index],x_rest_sel[val_index]
        cy_train,cy_valid = cy_rest[train_index],cy_rest[val_index]

        c_train,c_valid = cy_train[:,:3],cy_valid[:,:3]
        o_train,o_valid = cy_train[:,-3],cy_valid[:,-3]
        e_train,e_valid = cy_train[:,-2],cy_valid[:,-2]
        y_train,y_valid = cy_train[:,-1],cy_valid[:,-1]

        # Scalers are fitted on the training rows only and then applied to
        # validation and test rows.
        scaler_x = preprocessing.StandardScaler().fit(x_train.astype(np.float32))
        x_train = scaler_x.transform(x_train.astype(np.float32))
        x_valid = scaler_x.transform(x_valid.astype(np.float32))
        x_test = scaler_x.transform(x_test_sel.astype(np.float32))
        scaler_c = preprocessing.StandardScaler().fit(c_train.astype(np.float32))
        c_train = scaler_c.transform(c_train.astype(np.float32))
        c_valid = scaler_c.transform(c_valid.astype(np.float32))
        c_test = scaler_c.transform(c_test_raw.astype(np.float32))

        np.savez_compressed(save_path + '_' + str(fold) + '.npz',
            x_train=x_train,x_valid=x_valid,x_test=x_test,
            c_train=c_train,c_valid=c_valid,c_test=c_test,
            y_train=y_train,y_valid=y_valid,y_test=y_test,
            e_train=e_train,e_valid=e_valid,e_test=e_test,
            o_train=o_train,o_valid=o_valid,o_test=o_test,
            x_mean=scaler_x.mean_,x_scale=scaler_x.scale_,
            c_mean=scaler_c.mean_,c_scale=scaler_c.scale_)

def get_nsclc_data_unlabeled(expr_path,clin_path,name_path,save_path,new_markers=None):
    '''
        Store all unlabeled NSCLC samples in a single .npz file, separated into
        samples with a complete clinical record and samples with missing values.

        Args:
            expr_path: .npz whose 'no_clinical' entry is the expression matrix
                (rows: samples, columns: genes).
            clin_path: .npz whose 'no_clinical' entry is the clinical matrix with
                columns [age, gender, stage, ostime, event, complete]; the
                trailing `complete` column is dropped. Missing values are
                marked as -1.
            name_path: .npz whose 'name' entry names every expression column.
            save_path: output path; '.npz' is appended.
            new_markers: optional gene names replacing the built-in 15-marker
                list; None or an empty sequence selects the built-in list.

        Saved arrays: x/c/o/e (expression, first three clinical columns, ostime,
        event) with suffix `_w_full` for samples without any -1 entry and
        `_n_full` for samples with at least one -1 entry.

        Example usage in main():
        # get_nsclc_data_unlabeled(
        #     '../data/nsclc/cohort/cohort_data.npz',
        #     '../data/nsclc/cohort/clinical_data.npz',
        #     '../data/nsclc/cohort/cohort_name_pool.npz',
        #     '../data/nsclc/nsclc_unlabeled')
    '''
    # for unlabeled samples, should check cohort/clinical_data.npz (['no_clinical'])
    # all data in ['clinical'] come with corresponding labels
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files
    with np.load(expr_path,allow_pickle=True) as archive:
        expr = archive['no_clinical']
    with np.load(clin_path,allow_pickle=True) as archive:
        clin = archive['no_clinical']
    clin = clin[:,:-1]  # last column is 'complete', which is not used
    with np.load(name_path,allow_pickle=True) as archive:
        name = archive['name']

    marker_list = ['EPCAM','HIF1A','PKM','PTK7','ALCAM','CADM1','SLC2A1',
        'CUL1','CUL3','EGFR','ELAVL1','GRB2','NRF1','RNF2','RPA2']
    if new_markers is None or len(new_markers) == 0:
        markers = marker_list
    else:
        markers = new_markers
    expr = _select_markers(expr,markers,name)

    # A sample is incomplete when any clinical entry equals -1. The comparison
    # must happen element-wise *before* the reduction: the previous
    # `np.any(row) == -1` compared a boolean with -1, which is never true, so
    # every sample was filed under `_w_full`.
    has_missing = np.any(clin == -1,axis=1)
    is_complete = ~has_missing

    # 62/102 with full clinical information; 40/102 without
    np.savez_compressed(save_path + '.npz',
        x_w_full=expr[is_complete],c_w_full=clin[is_complete,0:3],
        o_w_full=clin[is_complete,3],e_w_full=clin[is_complete,4],
        x_n_full=expr[has_missing],c_n_full=clin[has_missing,0:3],
        o_n_full=clin[has_missing,3],e_n_full=clin[has_missing,4])

def split_breast_cancer_data(train_file_path,test_file_path,save_path,val_idx,features_c=None,features_x=None):
    '''
        Build one standardized train/validation/test .npz file for the breast
        cancer data, using fold `val_idx` of a 4-fold stratified split as the
        validation set.

        Samples with Event == 0 (censored) and samples with a missing clinical
        feature are removed. The label is 1 when 'Time' is below the 60 cutoff.
        Features are standardized with statistics fitted on the training rows.

        Args:
            train_file_path: pickled DataFrame with one row per feature
                (including 'Event' and 'Time') and one column per patient.
            test_file_path: .npz whose 'test_sample_hold_out' entry lists the
                patient ids of the held-out test set.
            save_path: output path prefix; `str(val_idx) + '.npz'` is appended.
            val_idx: integer index (0-3) of the fold used for validation.
            features_c: optional clinical feature names; None or empty selects
                the 10 built-in clinical features.
            features_x: optional gene names; None or empty selects the 20
                built-in biomarkers.

        Saved arrays: x_* (genes), c_* (clinical), o_* (time), y_* (label) for
        train/valid/test, plus the scaler statistics `mean` and `scale`.

        Example usage in main():
        # split_breast_cancer_data(
        #     'data/breast/metabric.pkl','data/breast/test_sample_hold_out.npz',
        #     'data/breast/breast_',
        #     0)
    '''

    if features_c is None or len(features_c) == 0:   # 10 clinical features
        clinical_feature = ['Age', 'Menopausal State', 'Size', 'Radio Therapy',
            'Chemotherapy', 'Hormone Therapy', 'Neoplasm Histologic Grade', 'Cellularity',
            'Surgery-breast conserving', 'Surgery-mastectomy']
    else:  clinical_feature = features_c
    if features_x is None or len(features_x) == 0:  # 20 biomarkers
        gene_feature = ['ESR1','PGR','ERBB2','MKI67','PLAU',
            'ELAVL1','EGFR','BTRC','FBXO6','SHMT2','KRAS','SRPK2',
            'YWHAQ','PDHA1','EWSR1','ZDHHC17','ENO1','DBN1','PLK1','GSK3B']
    else:  gene_feature = features_x

    # row: feature; col: patient
    # NOTE(review): read_pickle unpickles arbitrary objects; only load trusted files
    combine = pd.read_pickle(train_file_path)
    event = combine.loc['Event'].values
    idx_censor = np.array(event == 0)  # data with no events are considered censored
    combine = combine.loc[:,~idx_censor]
    # `combine.loc[:,idx_censor]` then are the unlabeled data => breast_unlabeled.npz

    # drop patients with any missing clinical feature
    # (clinical_feature is never None here, so no guard is needed)
    idx_null = combine.loc[clinical_feature].isnull().values.any(axis=0)
    combine = combine.loc[:,~idx_null]  # possibly take out idx_null for `unsupervised data`
    all_feature = np.concatenate([['Time'],clinical_feature, gene_feature])
    combine = combine.loc[all_feature,:]

    all_sample = combine.columns.values.astype(str)
    with np.load(test_file_path) as archive:
        test_sample = archive['test_sample_hold_out']
    test_sample = np.intersect1d(test_sample, all_sample)
    test = combine.loc[:,test_sample]
    rest = combine.drop(test_sample, axis=1)

    cutoff = 60  # die before cutoff = 60 => label = 1
    o_test = test.loc['Time',:].values
    o_rest = rest.loc['Time',:].values
    y_test = o_test<cutoff
    y_rest = o_rest<cutoff
    # after dropping 'Time' the columns are [clinical features..., gene features...]
    X_test = test.drop('Time', axis=0).values.T
    X_rest = rest.drop('Time', axis=0).values.T

    # No shuffling, hence no random_state: recent scikit-learn raises ValueError
    # when random_state is set while shuffle is False, and older releases
    # ignored the seed in that case, so the resulting folds are unchanged.
    skf = StratifiedKFold(n_splits=4)
    skf_splits = list(skf.split(X_rest, y_rest))
    train_index, val_index = skf_splits[val_idx]
    X_train, X_val = X_rest[train_index], X_rest[val_index]
    y_train, y_val = y_rest[train_index], y_rest[val_index]
    o_train, o_val = o_rest[train_index], o_rest[val_index]

    # standardize with training statistics only
    scaler = preprocessing.StandardScaler().fit(X_train.astype(np.float32))
    X_train = scaler.transform(X_train.astype(np.float32))
    X_val = scaler.transform(X_val.astype(np.float32))
    X_test = scaler.transform(X_test.astype(np.float32))

    n_clin = len(clinical_feature)
    n_gene = len(gene_feature)
    np.savez_compressed(save_path + str(val_idx) + '.npz',
        x_train=X_train[:,-n_gene:],
        x_valid=X_val[:,-n_gene:],
        x_test=X_test[:,-n_gene:],
        c_train=X_train[:,:n_clin],
        c_valid=X_val[:,:n_clin],
        c_test=X_test[:,:n_clin],
        o_train=o_train.astype(float),o_valid=o_val.astype(float),o_test=o_test.astype(float),
        y_train=y_train.astype(float),y_valid=y_val.astype(float),y_test=y_test.astype(float),
        mean=scaler.mean_,scale=scaler.scale_)

def create_breast_unlabeled_data(train_file_path,test_file_path,save_path,features_x=None):
    '''
        Extract unlabeled samples from METABRIC raw data for semi-supervised learning.
        Here "unlabeled" means the samples whose 'Event' value is 0 (censored).

        Args:
            train_file_path: pickled DataFrame with one row per feature
                (including 'Event') and one column per patient.
            test_file_path: not used by this function; kept so existing calls
                keep working.
            save_path: path of the .npz file to write.
            features_x: optional gene names; None or empty selects the 20
                built-in biomarkers.

        Saved arrays (float, one row per unlabeled patient, not normalized):
            unlabeled_c: the 10 clinical features; may contain NaN.
            unlabeled_x: the gene features.

        Example usage in main():
        # create_breast_unlabeled_data(
        #     'data/breast/metabric.pkl','data/breast/test_sample_hold_out.npz',
        #     'data/breast/breast_unlabeled.npz')
    '''

    # all clinical records available
    clinical_feature = ['Age', 'Menopausal State', 'Size', 'Radio Therapy',
        'Chemotherapy', 'Hormone Therapy', 'Neoplasm Histologic Grade', 'Cellularity',
        'Surgery-breast conserving', 'Surgery-mastectomy']
    if features_x is None or len(features_x) == 0:  # 20 biomarkers
        gene_feature = ['ESR1','PGR','ERBB2','MKI67','PLAU',
            'ELAVL1','EGFR','BTRC','FBXO6','SHMT2','KRAS','SRPK2',
            'YWHAQ','PDHA1','EWSR1','ZDHHC17','ENO1','DBN1','PLK1','GSK3B']
    else:  gene_feature = features_x

    # row: feature; col: patient
    # NOTE(review): read_pickle unpickles arbitrary objects; only load trusted files
    combine = pd.read_pickle(train_file_path)
    event = combine.T['Event'].values
    idx_censor = np.array(event == 0)  # data with no events are considered censored
    censor = combine.loc[:,idx_censor]
    all_feature = np.concatenate([clinical_feature,gene_feature])
    unlabeled = censor.loc[all_feature,:]
    unlabeled = np.array(unlabeled.T,dtype=float)

    # columns are [clinical features..., gene features...]; split at the
    # clinical-feature count instead of a hard-coded 10
    n_clin = len(clinical_feature)
    unlabeled_c = unlabeled[:,:n_clin]
    unlabeled_x = unlabeled[:,n_clin:]

    # unlabeled_c is not suitable for normalizing since it may contain NaNs
    # normalization needs to follow the x_train distribution
    np.savez_compressed(save_path,
        unlabeled_x=unlabeled_x,
        unlabeled_c=unlabeled_c)

def main():
    '''
        Script entry point.

        Every dataset-generation call below is commented out, so running the
        script does nothing until one of them is uncommented. The docstring
        also gives the function a body: a `def` followed only by comments is
        a syntax error.
    '''
    # split_nsclc_data_with_test(
    #     '../data/nsclc/cohort/cohort_data.npz',
    #     '../data/nsclc/cohort/clinical_data.npz',
    #     '../data/nsclc/cohort/cohort_name_pool.npz',
    #     '../data/nsclc/nsclc_',
    #     '../data/nsclc/cohort/nsclc_test_idx.npz')

    # get_nsclc_data_unlabeled(
    #     '../data/nsclc/cohort/cohort_data.npz',
    #     '../data/nsclc/cohort/clinical_data.npz',
    #     '../data/nsclc/cohort/cohort_name_pool.npz',
    #     '../data/nsclc/nsclc_unlabeled')

    # split_breast_cancer_data(
    #     'data/breast/metabric.pkl','data/breast/test_sample_hold_out.npz',
    #     'data/breast/breast_',
    #     0)

    # create_breast_unlabeled_data(
    #     'data/breast/metabric.pkl','data/breast/test_sample_hold_out.npz',
    #     'data/breast/breast_unlabeled.npz')

# Call main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()