"""Merge per-case chr/pos cluster assignments into the table read from
41seqInfoIDprob.csv and write one `<case>_chrPosCluster_ID.csv` per case.

Each merged per-case frame is also collected in the module-level list `dfs`.
"""
import os
from glob import glob

import pandas as pd

seqinfoIDprobPath = '/IPMNPDAC_WGS/Data/Data/sigDPC/step2seqInfoSigOut/'
caseChrPosClusterpath = '/IPMNPDAC_WGS/Data/Data/sigDPC/step3caseChrPosCluster/'
outpath = '/IPMNPDAC_WGS/Data/Data/sigDPC/step4bcaseSampleClusterID/'

# 1) load the seqinfo table.
# NOTE(review): no `case_chrs_pos` column is added here; the merge below only
# works if the CSV already has a column in common with the cluster table
# (presumably `case_chrs_pos`) -- confirm against the step-2 output.
seqinfoIDprob = pd.read_csv(os.path.join(seqinfoIDprobPath, '41seqInfoIDprob.csv'))

# 2) process caseChrPosCluster by adding column case_chrs_pos, and merge
dfs = []
# glob() returns matches in arbitrary order; sort so that the order of `dfs`
# (and of the files written) is reproducible between runs and machines.
for fn in sorted(glob(os.path.join(caseChrPosClusterpath, '*_chrPosCluster.csv'))):
    # The case id is the part of the file name before the first underscore.
    # basename() is used instead of splitting on '/' so this does not depend
    # on the platform's path separator.
    caseid = os.path.basename(fn).split('_')[0]

    chrPosCluster_df = pd.read_csv(fn)
    # Build the key "<caseid>_<chr_pos>" as the first column.
    chrPosCluster_df.insert(
        0,
        'case_chrs_pos',
        caseid + '_' + chrPosCluster_df['chr_pos'].astype(str),
    )
    chrPosCluster_df = chrPosCluster_df[['case_chrs_pos', 'clusterNo']]

    # With no `on=` given, pandas inner-joins on every column name the two
    # frames share.
    chrPosCluster_ID = seqinfoIDprob.merge(chrPosCluster_df)
    # Keep only the first row for each key.
    chrPosCluster_ID = chrPosCluster_ID.drop_duplicates(
        subset='case_chrs_pos', keep='first'
    )

    chrPosCluster_ID.to_csv(
        os.path.join(outpath, '{}_chrPosCluster_ID.csv'.format(caseid)),
        index=False,
    )
    dfs.append(chrPosCluster_ID)