import os
import glob
import shutil

import pandas as pd

# Input root. Every entry matching '*pindelSNVmsDPC' below it is treated as a
# per-case directory whose name starts with '<caseID>_'.
datapath = '/mnt/d/1a_18_04_MU/5B_PancreaticCancer/4_IPMN_WGS/6b_callersDPC/10_CNVpindelSNVmsDPC/1_msDPC_mostUextract/'
# Destination directory for the per-case '<caseID>_chrPosCluster.csv' tables.
outputpath = '/mnt/b/1_3July23VersionDataPlotCode/gitdata/Data/sigDPC/step3caseChrPosCluster/'


def build_chr_pos_cluster(dfchrpos, dfcluste):
    """Return the chr/pos/cluster table built from two per-case tables.

    The two frames are placed side by side (``pd.concat(..., axis=1)``), a
    ``chr_pos`` key of the form ``'<chr>_<pos>'`` is inserted as the first
    column, and the result is reduced to the columns
    ``chr_pos, chr, pos, clusterNo`` (``clusterNo`` is the renamed
    ``most.likely.cluster`` column).

    Args:
        dfchrpos: First table, as read from the case directory.
        dfcluste: Second table, as read from the case directory.

    Returns:
        A new DataFrame with columns ``chr_pos``, ``chr``, ``pos`` and
        ``clusterNo``.

    Raises:
        KeyError: If ``chr``, ``pos`` or ``most.likely.cluster`` is missing
            from the combined table.
    """
    # NOTE(review): the side-by-side concat pairs rows by index label, so it
    # presumably relies on both files listing the same variants in the same
    # order -- confirm against the upstream DPClust outputs.
    combined = pd.concat([dfchrpos, dfcluste], axis=1)
    combined.insert(
        0,
        'chr_pos',
        combined['chr'].astype(str) + '_' + combined['pos'].astype(str),
    )
    combined = combined[['chr_pos', 'chr', 'pos', 'most.likely.cluster']]
    return combined.rename(columns={'most.likely.cluster': 'clusterNo'})


# Per-case tables collected in processing order.
dfs = []
# glob returns matches in arbitrary order; sorting keeps the order of `dfs`
# reproducible from run to run.
for fd in sorted(glob.glob(datapath + '*pindelSNVmsDPC')):
    # The case ID is the part of the directory name before the first '_'.
    caseID = os.path.basename(fd).split('_')[0]
    # NOTE(review): `snvFile` names the DP/cluster-info file and `clustFile`
    # the union-filtered SNV file, which looks swapped relative to the
    # variable names; the output is unaffected because both tables are
    # combined column-wise.
    snvFile = '{}__DP_and_cluster_info_0.01.txt'.format(caseID)  # the file here could be snv+pindel
    clustFile = '{}__union_filtered_SNVs.txt'.format(caseID)
    # Resolve against the current working directory (no-op for absolute paths).
    pathx = os.path.abspath(fd)
    snvFilex = os.path.join(pathx, snvFile)
    clustFilex = os.path.join(pathx, clustFile)
    dfchrpos = pd.read_csv(snvFilex, sep='\t')
    dfcluste = pd.read_csv(clustFilex, sep='\t')
    dfIDcluster = build_chr_pos_cluster(dfchrpos, dfcluste)
    dfs.append(dfIDcluster)
    dfIDcluster.to_csv(
        os.path.join(outputpath, '{}_chrPosCluster.csv'.format(caseID)),
        index=False,
    )