"""Build the independent DFS test set from a tab-separated series-matrix file.

Each column of the input table is treated as one patient. The cells of a
column are scanned for a ``dfs evt: <0|1>`` annotation and a
``dfs time (months): <float>`` annotation. Patients that have both values
and whose event flag is non-zero are written, together with a binary label,
to a compressed ``.npz`` archive.
"""
import re

import numpy as np
import pandas as pd

# Annotation patterns, compiled once because they are applied to every cell.
_EVENT_RE = re.compile(r"dfs\s*evt: ([0-1])")
# NOTE(review): this pattern requires a decimal point, so an integer-valued
# time such as "dfs time (months): 60" is treated as missing -- confirm that
# this is intended for the input data.
_MONTHS_RE = re.compile(r"dfs time \(months\): (\s*[0-9]*\.[0-9]*)")

# Patients whose DFS time is at most this many months get label 1.0.
_LABEL_CUTOFF_MONTHS = 60.0


def _parse_patient_column(column):
    """Return ``(event, months)`` parsed from one patient column.

    Args:
        column: iterable of cell values; each cell is converted with ``str``
            before matching.

    Returns:
        A ``(event, months)`` tuple of floats. Either element is ``None``
        when no cell of the column carries a parsable annotation of that
        kind.
    """
    event = None
    months = None
    for cell in column:
        text = str(cell)
        # Only the first hit of each kind is kept so that every column
        # contributes exactly one event and one time value; appending once
        # per hit would shift the values of all subsequent patients.
        if event is None:
            found = _EVENT_RE.search(text)
            if found is not None:
                event = float(found.group(1))
        if months is None:
            found = _MONTHS_RE.search(text)
            if found is not None:
                try:
                    months = float(found.group(1))
                except ValueError:
                    # The pattern also matches a lone "."; treat it as
                    # missing instead of aborting the whole run.
                    months = None
        if event is not None and months is not None:
            break
    return event, months


def _select_event_patients(events, months):
    """Filter patients and derive labels.

    Args:
        events: per-patient event flags (``None`` when missing).
        months: per-patient DFS times in months (``None`` when missing).

    Returns:
        ``(labels, kept_months, kept_events)`` lists covering, in input
        order, the patients that have both values and a non-zero event
        flag. ``labels`` holds ``1.0`` when the time is at most
        ``_LABEL_CUTOFF_MONTHS`` and ``0.0`` otherwise; the kept values are
        ``numpy.float32`` scalars.
    """
    labels, kept_months, kept_events = [], [], []
    for event, month in zip(events, months):
        if event is None or month is None:
            continue  # missing values
        if event == 0:
            continue  # include only patients with a DFS event
        # Compare and store in float32, the precision of the saved arrays.
        month32 = np.float32(month)
        labels.append(1.0 if month32 <= _LABEL_CUTOFF_MONTHS else 0.0)
        kept_events.append(np.float32(event))
        kept_months.append(month32)
    return labels, kept_months, kept_events


def main(input_path='../data/breast/GSE21653_series_matrix.txt',
         output_path='../data/breast/indep.npz'):
    """Convert the series-matrix table into the ``indep.npz`` test archive.

    Args:
        input_path: tab-separated input table, one patient per column.
            A relative path is resolved against the current working
            directory.
        output_path: destination of the compressed archive; it receives the
            arrays ``y_test`` (labels), ``o_test`` (DFS times in months) and
            ``e_test`` (event flags).

    The shapes of the intermediate and final collections are printed as a
    progress report.
    """
    data = np.array(pd.read_csv(input_path, sep='\t', low_memory=False))
    print(np.shape(data))

    pats, events, months = [], [], []
    for column in data.T:
        # The last cell of each column is recorded per patient (presumably
        # the patient identifier); it is only used for the shape report.
        pats.append(column[-1])
        event, month = _parse_patient_column(column)
        events.append(event)
        months.append(month)

    print(np.shape(events))
    print(np.shape(months))
    print(np.shape(pats))

    out_labels, out_months, out_events = _select_event_patients(events, months)

    print(np.shape(out_labels))
    print(np.shape(out_events))
    print(np.shape(out_months))

    np.savez_compressed(output_path,
                        y_test=out_labels, o_test=out_months, e_test=out_events)


if __name__ == '__main__':
    main()