"""Dataset preparation helpers for the NSCLC and METABRIC (breast) cohorts.

Each public function reads raw cohort files, selects biomarker / clinical
features, standardises them with statistics fitted on the training portion
only, and writes the result to a compressed ``.npz`` archive.

NOTE(security): inputs are read with ``np.load(..., allow_pickle=True)`` and
``pd.read_pickle``; both can execute arbitrary code, so only load trusted files.
"""
import numpy as np
import pandas as pd
import scipy
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE

# Biomarkers selected with the systems biology selector (NSCLC cohort).
_NSCLC_MARKERS = (
    'EPCAM', 'HIF1A', 'PKM', 'PTK7', 'ALCAM', 'CADM1', 'SLC2A1',
    'CUL1', 'CUL3', 'EGFR', 'ELAVL1', 'GRB2', 'NRF1', 'RNF2', 'RPA2',
)

# 10 clinical features used for the METABRIC cohort.
_BREAST_CLINICAL_FEATURES = (
    'Age', 'Menopausal State', 'Size', 'Radio Therapy', 'Chemotherapy',
    'Hormone Therapy', 'Neoplasm Histologic Grade', 'Cellularity',
    'Surgery-breast conserving', 'Surgery-mastectomy',
)

# 20 biomarkers used for the METABRIC cohort.
_BREAST_GENE_FEATURES = (
    'ESR1', 'PGR', 'ERBB2', 'MKI67', 'PLAU',
    'ELAVL1', 'EGFR', 'BTRC', 'FBXO6', 'SHMT2', 'KRAS', 'SRPK2',
    'YWHAQ', 'PDHA1', 'EWSR1', 'ZDHHC17', 'ENO1', 'DBN1', 'PLK1', 'GSK3B',
)


def _or_default(user_values, default_values):
    """Return ``user_values`` unless it is None/empty, else a list copy of the defaults."""
    if user_values is None or len(user_values) == 0:
        return list(default_values)
    return user_values


def _standardize(train, *others):
    """Fit a StandardScaler on ``train`` (as float32) and transform every array.

    Returns:
        (scaler, [train_scaled, *others_scaled])
    """
    scaler = preprocessing.StandardScaler().fit(train.astype(np.float32))
    scaled = [scaler.transform(arr.astype(np.float32)) for arr in (train,) + others]
    return scaler, scaled


def _select_markers(exprs, marker_list, name_pool):
    """Select the columns of ``exprs`` whose names appear in ``marker_list``.

    Args:
        exprs: 2-D array indexed as ``exprs[sample, feature]``.
        marker_list: marker names to keep, in the desired output order.
        name_pool: sequence of feature names aligned with the columns of ``exprs``.

    Returns:
        ``exprs[:, indices]``, always 2-D.  Markers that are absent from
        ``name_pool`` are skipped; a marker that occurs several times
        contributes one column per occurrence.
    """
    indices = []  # indices of selected markers
    for marker in marker_list:
        indices.extend(i for i, name in enumerate(name_pool) if name == marker)
    # A flat integer index keeps the column axis even for one (or zero) markers;
    # the previous append + squeeze collapsed a single match to a 0-d index.
    indices = np.array(indices, dtype=int)
    return exprs[:, indices]


def _get_split_with_test(exprs, clin_y, rest_idx, test_idx, seed=1024):
    """Recover the held-out test set and build 4 stratified train/validation folds.

    Args:
        exprs: expression matrix indexed as ``exprs[sample, feature]``.
        clin_y: clinical matrix whose last column is the class label.
        rest_idx: row indices of the non-test samples.
        test_idx: row indices of the test samples.
        seed: random state of the shuffled ``StratifiedKFold``.

    Returns:
        ``(skf_splits, x_test, cy_test, x_rest, cy_rest)`` where ``skf_splits``
        is a list of ``[train_index, val_index]`` pairs indexing into the
        ``*_rest`` arrays.
    """
    # recover original test set
    x_rest, x_test = exprs[rest_idx, :], exprs[test_idx, :]
    cy_rest, cy_test = clin_y[rest_idx, :], clin_y[test_idx, :]

    # create K folds of train/validation splits, stratified on the label column
    skf = StratifiedKFold(n_splits=4, random_state=seed, shuffle=True)
    labels = cy_rest[:, -1].astype(int)
    skf_splits = [[train_index, val_index]
                  for train_index, val_index in skf.split(x_rest, labels)]
    return skf_splits, x_test, cy_test, x_rest, cy_rest


def split_nsclc_data_with_test(expr_path, clin_path, name_path, save_path,
                               test_idx_path, new_markers=None):
    '''
    Build the NSCLC train/validation/test archives, one per fold.

    The labelled cohort (key ``'clinical'``) is split into the fixed test set
    given by ``test_idx_path`` and 4 stratified train/validation folds.  For
    every fold the selected markers and the first three clinical columns are
    standardised with statistics of that fold's training part, and the result
    is written to ``save_path + '_' + str(fold) + '.npz'``.

    Clinical columns after dropping the trailing 'complete' column (per the
    original notes): age, gender (male == 1 / female == 0), stage (1-5),
    os time (months), event, label.

    Args:
        expr_path: ``.npz`` with the expression matrix under ``'clinical'``.
        clin_path: ``.npz`` with the clinical matrix under ``'clinical'``.
        name_path: ``.npz`` with the gene names under ``'name'``.
        save_path: output path prefix.
        test_idx_path: ``.npz`` with ``'rest_idx'`` and ``'test_idx'``.
        new_markers: optional marker names replacing the default list;
            ``None`` or an empty sequence selects the defaults.

    Example usage in main():
    # split_nsclc_data_with_test(
    #     '../data/nsclc/cohort/cohort_data.npz',
    #     '../data/nsclc/cohort/clinical_data.npz',
    #     '../data/nsclc/cohort/cohort_name_pool.npz',
    #     '../data/nsclc/nsclc_',
    #     '../data/nsclc/cohort/nsclc_test_idx.npz')
    '''
    # for unlabelled samples, should check cohort/clinical_data.npz (['no_clinical'])
    # all data in ['clinical'] come with corresponding labels
    expr = np.load(expr_path, allow_pickle=True)['clinical']
    clin = np.load(clin_path, allow_pickle=True)['clinical']
    clin = clin[:, :-1]  # last column is 'complete' which is of no use
    name = np.load(name_path, allow_pickle=True)['name']

    markers = _or_default(new_markers, _NSCLC_MARKERS)

    index = np.load(test_idx_path, allow_pickle=True)
    rest_idx = index['rest_idx']
    test_idx = index['test_idx']

    # test set and several train/validation splits
    skf_splits, x_test_raw, cy_test, x_rest, cy_rest = _get_split_with_test(
        expr, clin, rest_idx, test_idx)

    # Column selection does not depend on the fold, so do it once.
    x_rest = _select_markers(x_rest, markers, name)
    x_test_raw = _select_markers(x_test_raw, markers, name)
    c_test_raw = cy_test[:, :3]
    o_test, e_test, y_test = cy_test[:, -3], cy_test[:, -2], cy_test[:, -1]

    for idx, (train_index, val_index) in enumerate(skf_splits):
        cy_train, cy_valid = cy_rest[train_index], cy_rest[val_index]

        o_train, o_valid = cy_train[:, -3], cy_valid[:, -3]
        e_train, e_valid = cy_train[:, -2], cy_valid[:, -2]
        y_train, y_valid = cy_train[:, -1], cy_valid[:, -1]

        # Scalers are fitted on the training part only to avoid leakage.
        scaler_x, (x_train, x_valid, x_test) = _standardize(
            x_rest[train_index], x_rest[val_index], x_test_raw)
        scaler_c, (c_train, c_valid, c_test) = _standardize(
            cy_train[:, :3], cy_valid[:, :3], c_test_raw)

        np.savez_compressed(
            save_path + '_' + str(idx) + '.npz',
            x_train=x_train, x_valid=x_valid, x_test=x_test,
            c_train=c_train, c_valid=c_valid, c_test=c_test,
            y_train=y_train, y_valid=y_valid, y_test=y_test,
            e_train=e_train, e_valid=e_valid, e_test=e_test,
            o_train=o_train, o_valid=o_valid, o_test=o_test,
            x_mean=scaler_x.mean_, x_scale=scaler_x.scale_,
            c_mean=scaler_c.mean_, c_scale=scaler_c.scale_)


def get_nsclc_data_unlabeled(expr_path, clin_path, name_path, save_path,
                             new_markers=None):
    '''
    Store all unlabeled NSCLC samples (key ``'no_clinical'``) in a single file.

    Samples are divided into those with full clinical information (``*_w_full``)
    and those with at least one missing clinical value (``*_n_full``); missing
    values are marked as -1.  The archive is written to ``save_path + '.npz'``.

    Args:
        expr_path: ``.npz`` with the expression matrix under ``'no_clinical'``.
        clin_path: ``.npz`` with the clinical matrix under ``'no_clinical'``.
        name_path: ``.npz`` with the gene names under ``'name'``.
        save_path: output path without the ``.npz`` suffix.
        new_markers: optional marker names replacing the default list;
            ``None`` or an empty sequence selects the defaults.

    Example usage in main():
    # get_nsclc_data_unlabeled(
    #     '../data/nsclc/cohort/cohort_data.npz',
    #     '../data/nsclc/cohort/clinical_data.npz',
    #     '../data/nsclc/cohort/cohort_name_pool.npz',
    #     '../data/nsclc/nsclc_unlabeled')
    '''
    # for unlabeled samples, should check cohort/clinical_data.npz (['no_clinical'])
    # all data in ['clinical'] come with corresponding labels
    expr = np.load(expr_path, allow_pickle=True)['no_clinical']
    clin = np.load(clin_path, allow_pickle=True)['no_clinical']
    clin = clin[:, :-1]  # last column is 'complete' which is of no use
    name = np.load(name_path, allow_pickle=True)['name']

    expr = _select_markers(expr, _or_default(new_markers, _NSCLC_MARKERS), name)

    # c = [age, gender, stage, ostime, event, complete] where `complete` already dropped
    # missing values are marked as -1
    # The comparison must happen element-wise *before* reducing with any();
    # `np.any(row) == -1` compared a boolean with -1 and was always False.
    missing = np.any(clin == -1, axis=1)
    full = ~missing

    # 62/102 with full clinical information; 40/102 without
    np.savez_compressed(
        save_path + '.npz',
        x_w_full=expr[full], c_w_full=clin[full, 0:3],
        o_w_full=clin[full, 3], e_w_full=clin[full, 4],
        x_n_full=expr[missing], c_n_full=clin[missing, 0:3],
        o_n_full=clin[missing, 3], e_n_full=clin[missing, 4])


def split_breast_cancer_data(train_file_path, test_file_path, save_path, val_idx,
                             features_c=None, features_x=None):
    '''
    Build one METABRIC train/validation/test archive for fold ``val_idx``.

    Only samples with an observed event and complete clinical features are
    kept.  The label is 1 when ``Time`` is below the 60 cutoff.  Features are
    standardised with statistics of the training part and the archive is
    written to ``save_path + str(val_idx) + '.npz'``.

    Args:
        train_file_path: pickled DataFrame (row: feature; col: patient).
        test_file_path: ``.npz`` with held-out sample ids under
            ``'test_sample_hold_out'``.
        save_path: output path prefix.
        val_idx: index (0-3) of the stratified fold used for validation.
        features_c: optional clinical feature names; ``None``/empty = defaults.
        features_x: optional gene feature names; ``None``/empty = defaults.

    Example usage in main():
    # split_breast_cancer_data(
    #     'data/breast/metabric.pkl','data/breast/test_sample_hold_out.npz',
    #     'data/breast/breast_',
    #     0)
    '''
    clinical_feature = _or_default(features_c, _BREAST_CLINICAL_FEATURES)
    gene_feature = _or_default(features_x, _BREAST_GENE_FEATURES)

    # row: feature; col: patient
    combine = pd.read_pickle(train_file_path)
    event = combine.loc['Event'].values
    idx_censor = np.array(event == 0)  # data with no events are considered censored
    # `combine.loc[:,idx_censor]` then are the unlabeled data => breast_unlabeled.npz
    combine = combine.loc[:, ~idx_censor]

    # drop samples with any NA clinical feature
    idx_null = combine.loc[clinical_feature].isnull().values.any(axis=0)
    combine = combine.loc[:, ~idx_null]  # possibly take out idx_null for `unsupervised data`

    all_feature = np.concatenate([['Time'], clinical_feature, gene_feature])
    combine = combine.loc[all_feature, :]
    all_sample = combine.columns.values.astype(str)

    test_sample = np.load(test_file_path)['test_sample_hold_out']
    test_sample = np.intersect1d(test_sample, all_sample)

    test = combine.loc[:, test_sample]
    rest = combine.drop(test_sample, axis=1)

    cutoff = 60  # die before cutoff = 60 => label = 1
    o_test = test.loc['Time', :].values
    o_rest = rest.loc['Time', :].values
    y_test = o_test < cutoff
    y_rest = o_rest < cutoff
    X_test = test.drop('Time', axis=0).values.T
    X_rest = rest.drop('Time', axis=0).values.T

    # Unshuffled folds: a random_state without shuffle=True has no effect and
    # is rejected with a ValueError by current scikit-learn releases.
    skf = StratifiedKFold(n_splits=4)
    skf_splits = list(skf.split(X_rest, y_rest))
    # int() lets a fold given as '0' (as in older examples) index the list.
    train_index, val_index = skf_splits[int(val_idx)]

    y_train, y_val = y_rest[train_index], y_rest[val_index]
    o_train, o_val = o_rest[train_index], o_rest[val_index]

    scaler, (X_train, X_val, X_test) = _standardize(
        X_rest[train_index], X_rest[val_index], X_test)

    # Columns are ordered [clinical..., gene...] after 'Time' was dropped.
    n_clin, n_gene = len(clinical_feature), len(gene_feature)
    np.savez_compressed(
        save_path + str(val_idx) + '.npz',
        x_train=X_train[:, -n_gene:],
        x_valid=X_val[:, -n_gene:],
        x_test=X_test[:, -n_gene:],
        c_train=X_train[:, :n_clin],
        c_valid=X_val[:, :n_clin],
        c_test=X_test[:, :n_clin],
        o_train=o_train.astype(float), o_valid=o_val.astype(float),
        o_test=o_test.astype(float),
        y_train=y_train.astype(float), y_valid=y_val.astype(float),
        y_test=y_test.astype(float),
        mean=scaler.mean_, scale=scaler.scale_)


def create_breast_unlabeled_data(train_file_path, test_file_path, save_path,
                                 features_x=None):
    '''
    Extract unlabeled samples from METABRIC raw data for semi-supervised learning.

    The samples written are the censored ones (``Event == 0``); their clinical
    features may contain NaN's.

    Args:
        train_file_path: pickled DataFrame (row: feature; col: patient).
        test_file_path: unused; kept for interface compatibility.
        save_path: output ``.npz`` path.
        features_x: optional gene feature names; ``None``/empty = defaults.

    Example usage in main():
    # create_breast_unlabeled_data(
    #     'data/breast/metabric.pkl','data/breast/test_sample_hold_out.npz',
    #     'data/breast/breast_unlabeled.npz')
    '''
    # all clinical records available
    clinical_feature = list(_BREAST_CLINICAL_FEATURES)
    gene_feature = _or_default(features_x, _BREAST_GENE_FEATURES)

    # row: feature; col: patient
    combine = pd.read_pickle(train_file_path)
    event = combine.loc['Event'].values
    idx_censor = np.array(event == 0)  # data with no events are considered censored
    censor = combine.loc[:, idx_censor]

    all_feature = np.concatenate([clinical_feature, gene_feature])
    unlabeled = np.array(censor.loc[all_feature, :].T, dtype=float)

    n_clin = len(clinical_feature)
    unlabeled_c = unlabeled[:, :n_clin]
    unlabeled_x = unlabeled[:, n_clin:]

    # unlabeled_c is not suitable for normalizing since it contains NaN's
    # normalization needs to follow the x_train distribution
    np.savez_compressed(save_path,
                        unlabeled_x=unlabeled_x,
                        unlabeled_c=unlabeled_c)


def main():
    """Entry point; uncomment the preprocessing call(s) to run."""
    # split_nsclc_data_with_test(
    #     '../data/nsclc/cohort/cohort_data.npz',
    #     '../data/nsclc/cohort/clinical_data.npz',
    #     '../data/nsclc/cohort/cohort_name_pool.npz',
    #     '../data/nsclc/nsclc_',
    #     '../data/nsclc/cohort/nsclc_test_idx.npz')

    # get_nsclc_data_unlabeled(
    #     '../data/nsclc/cohort/cohort_data.npz',
    #     '../data/nsclc/cohort/clinical_data.npz',
    #     '../data/nsclc/cohort/cohort_name_pool.npz',
    #     '../data/nsclc/nsclc_unlabeled')

    # split_breast_cancer_data(
    #     'data/breast/metabric.pkl','data/breast/test_sample_hold_out.npz',
    #     'data/breast/breast_',
    #     0)

    # create_breast_unlabeled_data(
    #     'data/breast/metabric.pkl','data/breast/test_sample_hold_out.npz',
    #     'data/breast/breast_unlabeled.npz')


if __name__ == "__main__":
    main()