# IPMNPDAC_WGS/clusterAssignment/sbsClusterAssignment_step3.py · IPMNPDACpaperArchive

import os
import glob
import shutil
import pandas as pd

# Step 3: for every case folder, build a per-mutation table of
# (chr_pos, chr, pos, clusterNo) and write it out as one CSV per case.

# Input root: scanned for sub-folders whose names end in 'pindelSNVmsDPC'.
datapath = '/mnt/d/1a_18_04_MU/5B_PancreaticCancer/4_IPMN_WGS/6b_callersDPC/10_CNVpindelSNVmsDPC/1_msDPC_mostUextract/'
# Output root: receives one '<caseID>_chrPosCluster.csv' per case.
outputpath = '/mnt/b/1_3July23VersionDataPlotCode/gitdata/Data/sigDPC/step3caseChrPosCluster/'

# Per-case tables, collected in the order glob returns the folders.
dfs = []
for fd in glob.glob(datapath + '*pindelSNVmsDPC'):
    # The case ID is the part of the folder name before the first underscore.
    caseID = os.path.basename(fd).split('_')[0]

    # Both inputs are tab-separated and live inside the case folder.
    # NOTE (from the original author): the cluster-info file could be
    # snv+pindel rather than SNVs only.
    clusterInfoPath = os.path.join(fd, '{}__DP_and_cluster_info_0.01.txt'.format(caseID))
    unionSnvPath = os.path.join(fd, '{}__union_filtered_SNVs.txt'.format(caseID))
    dfClusterInfo = pd.read_csv(clusterInfoPath, sep='\t')
    dfUnionSnv = pd.read_csv(unionSnvPath, sep='\t')

    # The two tables are glued side by side purely by row position, so they
    # must describe the same mutations in the same order. A length mismatch
    # would otherwise be NaN-padded silently and produce bogus 'nan_nan' keys.
    if len(dfClusterInfo) != len(dfUnionSnv):
        raise ValueError(
            '{}: row count mismatch between {} ({} rows) and {} ({} rows)'.format(
                caseID, clusterInfoPath, len(dfClusterInfo),
                unionSnvPath, len(dfUnionSnv)))

    # The combined table must expose 'chr', 'pos' and 'most.likely.cluster';
    # which input file supplies which column is not visible from this script.
    dfIDcluster = pd.concat([dfClusterInfo, dfUnionSnv], axis=1)
    dfIDcluster.insert(
        0, 'chr_pos',
        dfIDcluster['chr'].astype(str) + '_' + dfIDcluster['pos'].astype(str))
    dfIDcluster = dfIDcluster[['chr_pos', 'chr', 'pos', 'most.likely.cluster']]
    dfIDcluster = dfIDcluster.rename(columns={'most.likely.cluster': 'clusterNo'})
    dfs.append(dfIDcluster)

    # Create the output folder on first use so a fresh checkout does not fail
    # on the write; nothing is created when no case folder matched.
    os.makedirs(outputpath, exist_ok=True)
    dfIDcluster.to_csv(
        os.path.join(outputpath, '{}_chrPosCluster.csv'.format(caseID)),
        index=False)