import os from glob import glob import pandas as pd import pyranges as pr # a) function for SV matching cluster def msClusterSV(msDPCdata, svdata, pathout): # 1) Load data caseMsDPC = pd.read_csv(msDPCdata) caseID = msDPCdata.split('/')[-1].split('_')[0] caseMsDPC['chrom'] = 'chr' + caseMsDPC['chr'].astype(str) caseMsDPC = caseMsDPC.rename(columns={"most.likely.cluster": "msCluster"}) allsv = pd.read_csv(svdata) caseSV = allsv[allsv['samples'].str.contains(caseID)] # 2) Convert to PyRanges # SNV is a single base interval snv = pr.PyRanges(pd.DataFrame({"Chromosome": caseMsDPC['chrom'],"Start": caseMsDPC['pos'],"End": caseMsDPC['pos']+1, "msCluster": caseMsDPC['msCluster']})) sv = pr.PyRanges(pd.DataFrame({"Chromosome": caseSV['CHROM'],"Start": caseSV['START'],"End": caseSV['END'], "typeTumorSample": caseSV['typeTumorSample'],"samples": caseSV['samples'],"SVLEN": caseSV['SVLEN'], "svclass": caseSV['svclass'],"CHR2": caseSV['CHR2'],"rgn": caseSV['rgn'],"gene": caseSV['gene'], "tid": caseSV['tid'],"PASS": caseSV['PASS']})) #3) Perform overlap join (very fast C implementation) joined = snv.join(sv) # Convert back to pandas result = joined.df.rename(columns={"Start": "snvPos"}) result.to_csv(f"{pathout}{caseID}_snvSVCluster.csv", index=False) # b)Batch execution def allFolderFileRun(): pathinput = os.path.expanduser('/IPMNPDAC_WGS/Data/svDriverCluster/CasesCHROMposClusts/') sVdatapath = os.path.expanduser('/IPMNPDAC_WGS/Data/svDriverCluster/all41SVs.csv') outpathx = os.path.expanduser('/IPMNPDAC_WGS/Data/svDriverCluster/') for filex in glob(pathinput + '*_chrPosCluster.csv'): msClusterSV(filex, sVdatapath, outpathx) if __name__=="__main__": allFolderFileRun()