# scan/utils/proc_nsclc_indep.py
import pandas as pd
import numpy as np



def main(data_dir='../data/nsclc'):
    """Build the independent NSCLC test archive ``indep.npz``.

    Reads three tables from ``<data_dir>/indep``:

    * ``name_pool.csv`` (tab-separated): marker names, taken from the
      first column.
    * ``cohort_data.csv`` (comma-separated, no header): one row per
      patient, one column per marker.
    * ``clin_os_old.csv`` (comma-separated): columns 1..6 are used as age,
      gender, stage, OS time, event and label; column 0 is not used.

    Writes ``<data_dir>/indep.npz`` (compressed) with the arrays:

    * ``x_test``: the selected marker columns, one row per patient.
    * ``c_test``: raw ``[age, gender, stage]`` per patient.
    * ``C_test``: ``[age, gender(2 cols), stage(5 cols)]`` per patient.
    * ``y_test``, ``o_test``, ``e_test``: label, OS time and event.

    Args:
        data_dir: Directory that contains the ``indep`` sub-directory and
            receives ``indep.npz``. Defaults to the path the script has
            always used, so calling ``main()`` behaves as before.

    Raises:
        ValueError: If the stage column holds missing or non-integer values.
        IndexError: If a stage code falls outside the 5 encoded columns.
    """
    import warnings
    from pathlib import Path

    root = Path(data_dir)
    indep_dir = root / 'indep'

    # NOTE(review): this file is read WITH a header row while cohort_data.csv
    # is read with header=None. If name_pool.csv has no header line, its first
    # name is consumed as the header and every index shifts by one -- confirm.
    name = pd.read_csv(indep_dir / 'name_pool.csv', sep='\t', low_memory=False)
    name = np.array(name)[:, 0]

    data = pd.read_csv(indep_dir / 'cohort_data.csv', sep=',', header=None,
                       low_memory=False)
    data = np.array(data)  # [patient, marker]

    clin = pd.read_csv(indep_dir / 'clin_os_old.csv', sep=',', low_memory=False)
    clin = np.array(clin)

    age, gender, stage, ostime, event, label = (clin[:, col] for col in range(1, 7))
    n_patients = clin.shape[0]

    # Gender encoding: column 1 is set when gender > 0.5.
    # NOTE(review): column 0 is never set, so this is not a true one-hot
    # vector. It is kept unchanged because consumers of C_test may depend on
    # the existing layout -- confirm before changing.
    gender_onehot = np.zeros((n_patients, 2))
    gender_onehot[np.asarray(gender, dtype=float) > 0.5, 1] = 1

    # Stage to one-hot vector (stage code k -> column k-1).
    # The codes are cast to int explicitly: when the clinical table is fully
    # numeric, np.array(DataFrame) yields float64 and a float cannot be used
    # as an array index.
    stage_num = np.asarray(stage, dtype=float)
    if not np.all(np.isfinite(stage_num)) or np.any(stage_num != np.round(stage_num)):
        raise ValueError('stage column must contain integer stage codes')
    stage_idx = stage_num.astype(int) - 1
    stage_onehot = np.zeros((n_patients, 5))
    # As with the original per-row indexing, a code of 0 wraps around to the
    # last column and codes above 5 raise IndexError.
    stage_onehot[np.arange(n_patients), stage_idx] = 1

    # Concatenate clinical information (raw and encoded variants).
    clin = np.column_stack((age, gender, stage))
    print(clin)
    print(np.shape(clin))

    clin_encoded = np.column_stack((age, gender_onehot, stage_onehot))
    print(clin_encoded)
    print(np.shape(clin_encoded))

    # Select gene features.
    gene_feature = ['EPCAM', 'HIF1A', 'PKM', 'PTK7', 'ALCAM', 'CADM1', 'SLC2A1',
                    'CUL1', 'CUL3', 'EGFR', 'ELAVL1', 'GRB2', 'NRF1', 'RNF2', 'RPA2']

    # Case-insensitive lookup: upper-cased marker name -> all its positions.
    # str() guards against non-string entries (e.g. NaN from an empty cell).
    positions = {}
    for idx, gene in enumerate(name):
        positions.setdefault(str(gene).upper(), []).append(idx)

    selected = []
    for targ_gene in gene_feature:
        hits = positions.get(targ_gene.upper(), [])
        if len(hits) != 1:
            # A missing or repeated marker changes the number and order of
            # feature columns; the output is unchanged but this is surfaced.
            warnings.warn(
                'gene %s matched %d entries in name_pool.csv (expected 1)'
                % (targ_gene, len(hits)))
        selected.extend(hits)

    # Pick the marker columns in gene_feature order; rows stay per patient.
    data_out = data[:, np.asarray(selected, dtype=int)]

    np.savez_compressed(root / 'indep.npz',
                        x_test=data_out, c_test=clin, y_test=label,
                        o_test=ostime, e_test=event, C_test=clin_encoded)


# Run the preprocessing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()