# Source: scan/utils/proc_breast_indep.py (raw view)
import pandas as pd
import numpy as np
import re



# Free-text patterns for the clinical annotations embedded in the matrix cells.
_DFS_EVENT_RE = re.compile(r"dfs\s*evt: ([0-1])")
# Accepts "55.26", "60", "60." and ".5".  Requiring a decimal point (as the
# previous pattern did) silently discarded whole-month follow-up times, and
# allowing a lone "." made float() raise.
_DFS_MONTHS_RE = re.compile(
    r"dfs time \(months\): \s*([0-9]+(?:\.[0-9]*)?|\.[0-9]+)")


def _first_capture(pattern, cells):
    """Return group 1 of the first cell that matches *pattern*, else None."""
    for cell in cells:
        hit = pattern.search(str(cell))
        if hit is not None:
            return hit.group(1)
    return None


def main(matrix_path='../data/breast/GSE21653_series_matrix.txt',
         out_path='../data/breast/indep.npz',
         horizon_months=60.0):
    """Extract DFS labels from a tab-separated matrix and save them as .npz.

    The file is read with pandas (first line used as the header) and every
    column is scanned for a ``dfs evt: <0|1>`` cell and a
    ``dfs time (months): <number>`` cell.  Only columns that carry both
    annotations *and* a DFS event of 1 are kept.

    Args:
        matrix_path: Tab-separated input file.
        out_path: Destination of the compressed archive.
        horizon_months: A kept sample is labelled 1.0 when its DFS time is
            less than or equal to this value, otherwise 0.0.

    The archive contains three equally long 1-D arrays:
        y_test: binary labels (float64),
        o_test: DFS time in months (float32),
        e_test: DFS event indicator (float32; always 1 after filtering).
    """
    data = pd.read_csv(matrix_path, sep='\t', low_memory=False)
    data = np.array(data)
    print(np.shape(data))

    pats, events, months = [], [], []
    for column in data.T:
        # Last row is collected as the sample identifier -- presumably the
        # patient/sample ID; it is only used for the diagnostic print below.
        pats.append(column[-1] if len(column) else None)

        # Exactly one entry per column is appended to each list so that
        # events, months and pats can never drift out of alignment, even if
        # an annotation is repeated within a column.
        event = _first_capture(_DFS_EVENT_RE, column)
        month = _first_capture(_DFS_MONTHS_RE, column)
        events.append(np.nan if event is None else float(event))
        months.append(np.nan if month is None else float(month))

    print(np.shape(events))
    print(np.shape(months))
    print(np.shape(pats))

    events = np.array(events, dtype=np.float32)
    months = np.array(months, dtype=np.float32)

    # Keep only samples with a recorded DFS event (== 1) and a known DFS
    # time.  Missing events are NaN, and NaN == 1 is False, so they drop out.
    keep = (events == 1) & ~np.isnan(months)

    out_events = events[keep]
    out_months = months[keep]
    # Label 1.0: event occurred within the horizon; 0.0: event occurred later.
    out_labels = (out_months <= horizon_months).astype(np.float64)

    print(np.shape(out_labels))
    print(np.shape(out_events))
    print(np.shape(out_months))

    np.savez_compressed(out_path,
        y_test=out_labels,o_test=out_months,e_test=out_events)

# Run the preprocessing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()