"""Assemble the independent-cohort test archive from three CSV files.

Reads the marker-name pool, the cohort marker matrix and the clinical table
from a data directory, derives encoded clinical covariates, selects a fixed
list of gene markers, and writes everything into one compressed ``.npz`` file.
"""
import os
import warnings

import numpy as np
import pandas as pd

# Defaults reproduce the locations the script has always used.
DEFAULT_DATA_DIR = '../data/nsclc/indep'
DEFAULT_OUT_PATH = '../data/nsclc/indep.npz'

# Gene markers exported as features; this order is the column order of x_test.
GENE_FEATURES = ('EPCAM', 'HIF1A', 'PKM', 'PTK7', 'ALCAM', 'CADM1', 'SLC2A1',
                 'CUL1', 'CUL3', 'EGFR', 'ELAVL1', 'GRB2', 'NRF1', 'RNF2', 'RPA2')

# Width of the stage one-hot encoding (stage values 1..N_STAGES).
N_STAGES = 5


def _one_hot_gender(gender):
    """Return an (n, 2) float array with column 1 set where gender > 0.5."""
    gender_num = np.asarray(gender, dtype=float)
    encoded = np.zeros((gender_num.shape[0], 2))
    # NOTE(review): column 0 is never set, so rows with gender <= 0.5 encode
    # as [0, 0] rather than [1, 0]. Kept as-is so the saved C_test layout does
    # not change; confirm against whatever consumes the archive before
    # turning this into a true one-hot encoding.
    encoded[gender_num > 0.5, 1] = 1
    return encoded


def _one_hot_stage(stage, n_stages=N_STAGES):
    """Return an (n, n_stages) one-hot array for integer stages 1..n_stages.

    Raises:
        ValueError: if any stage is missing, non-integer or out of range.
    """
    stage_num = np.asarray(stage, dtype=float)
    valid = (
        np.isfinite(stage_num)
        & (stage_num >= 1)
        & (stage_num <= n_stages)
        & (stage_num == np.floor(stage_num))
    )
    if not valid.all():
        # Without this check a stage of 0 would index column -1 and be
        # recorded as the highest stage instead of being reported.
        bad_rows = np.flatnonzero(~valid)
        raise ValueError(
            'stage must be an integer in 1..{}; offending rows {} have values {}'.format(
                n_stages, bad_rows.tolist(), stage_num[bad_rows].tolist()))
    encoded = np.zeros((stage_num.shape[0], n_stages))
    # Explicit int cast: the clinical table may load as float64.
    encoded[np.arange(stage_num.shape[0]), stage_num.astype(int) - 1] = 1
    return encoded


def _select_gene_rows(data, names, targets):
    """Collect the rows of ``data`` whose name matches a target gene.

    Matching is case-insensitive. Rows are returned in target order; a name
    occurring several times contributes every matching row.

    Args:
        data: 2-D array laid out as [marker, patient].
        names: marker names, one per row of ``data``.
        targets: gene names to extract.

    Returns:
        The selected rows transposed back to [patient, marker].
    """
    # Index the name pool once instead of rescanning it for every target.
    rows_by_gene = {}
    for idx, gene in enumerate(names):
        # str() keeps a blank/NaN cell in the name pool from raising here.
        rows_by_gene.setdefault(str(gene).upper(), []).append(idx)

    selected = []
    missing = []
    for target in targets:
        hits = rows_by_gene.get(target.upper(), [])
        if not hits:
            missing.append(target)
        selected.extend(data[hits, :])

    if missing:
        # A missing marker silently shifts the feature columns; make it visible.
        warnings.warn(
            'gene(s) not found in name pool, omitted from x_test: {}'.format(
                ', '.join(missing)))
    return np.transpose(np.array(selected))


def main(data_dir=DEFAULT_DATA_DIR, out_path=DEFAULT_OUT_PATH):
    """Build the test archive and write it with ``np.savez_compressed``.

    Args:
        data_dir: directory holding ``name_pool.csv`` (tab-separated, with a
            header row), ``cohort_data.csv`` (comma-separated, no header) and
            ``clin_os_old.csv`` (comma-separated, with a header row).
        out_path: destination of the compressed archive.

    Raises:
        ValueError: if the stage column holds a value outside 1..5.

    The archive contains ``x_test`` (selected gene markers), ``c_test``
    (columns 1-3 of the clinical table as read), ``C_test`` (column 1 followed
    by the encoded gender and stage), ``y_test``, ``o_test`` and ``e_test``
    (columns 6, 4 and 5 of the clinical table respectively).
    """
    name = pd.read_csv(os.path.join(data_dir, 'name_pool.csv'),
                       sep='\t', low_memory=False)
    name = np.array(name)[:, 0]

    data = pd.read_csv(os.path.join(data_dir, 'cohort_data.csv'),
                       sep=',', header=None, low_memory=False)
    data = np.array(data)

    clin_table = pd.read_csv(os.path.join(data_dir, 'clin_os_old.csv'),
                             sep=',', low_memory=False)
    clin_table = np.array(clin_table)

    # Column 0 of the clinical table is not used.
    age = clin_table[:, 1]
    gender = clin_table[:, 2]
    stage = clin_table[:, 3]
    ostime = clin_table[:, 4]
    event = clin_table[:, 5]
    label = clin_table[:, 6]

    # gender to one-hot vector
    Gender = _one_hot_gender(gender)

    # stage to one-hot vector
    Stage = _one_hot_stage(stage)

    # concatenate clinical information
    clin = np.column_stack((age, gender, stage))
    print(clin)
    print(np.shape(clin))

    Clin = np.column_stack((age, Gender, Stage))
    print(Clin)
    print(np.shape(Clin))

    # select gene feature
    data = np.transpose(data)  # [marker, patient]
    data_out = _select_gene_rows(data, name, GENE_FEATURES)

    np.savez_compressed(out_path,
                        x_test=data_out, c_test=clin, y_test=label,
                        o_test=ostime, e_test=event, C_test=Clin)


if __name__ == '__main__':
    main()