# IPMNPDACpaperArchive / IPMNPDAC_WGS / clusterAssignment / sbsClusterAssignment_step2.py
# sbsClusterAssignment_step2.py
import pandas as pd 

# Step 2 of the SBS cluster assignment: attach the per-sample, per-mutation-type
# signature probabilities to the per-mutation sequence-info table and write the
# combined table to CSV.

# 1) Process the probabilities file by adding the key column sampleMut,
#    built as "<SampleNames>_<MutationType>", e.g. case10_2_A[C>A]A
# ID lookup table: row i maps previousID[i] (old sample ID) to ID2025[i] (new sample ID).
sampleID = pd.read_csv('/mnt/b/1_3July23VersionDataPlotCode/Figure1/sampleID2025.csv')
sampleNewID = list(sampleID.ID2025)
sampleOldID = list(sampleID.previousID)

pathsigout = '/mnt/e/5_signatureResult34_41samples/41SBS96/SBS96/Suggested_Solution/COSMIC_SBS96_Decomposed_Solution/Activities/'
sigoutseq = pd.read_csv(pathsigout + 'De_Novo_MutationType_Probabilities.txt', sep='\t')

# Rename samples in the SampleNames column only, so that no other string column
# (e.g. MutationType) can be rewritten by an ID pattern that happens to match it.
# NOTE(review): the old IDs are used as regular-expression patterns and match
# substrings; if one old ID is a prefix of another (or contains regex
# metacharacters) the renaming may be wrong -- confirm the IDs are collision-free.
sigoutseq['SampleNames'] = sigoutseq['SampleNames'].replace(sampleOldID, sampleNewID, regex=True)
sigoutseq.insert(
    0,
    'sampleMut',
    [f'{sample}_{mut}' for sample, mut in zip(sigoutseq['SampleNames'], sigoutseq['MutationType'])],
)

# 2) Merge seqinfo and SBS probabilities on the column 'sampleMut'
seqInfopath = '/IPMNPDAC_WGS/Data/Data/sigDPC/step1seqProcessed/'
seqInfoSigOut = '/IPMNPDAC_WGS/Data/Data/sigDPC/step2seqInfoSigOut/'
allSampleSeqInfo = pd.read_csv(seqInfopath + '41Samples_SBSseqinfo.csv')
# The join key is stated explicitly: without `on=`, pandas joins on every column
# name the two tables share, which silently changes the key if another column
# name ever overlaps. The join type stays the pandas default (inner), so
# mutations without a matching probabilities row are dropped.
segInfoSBSprob = allSampleSeqInfo.merge(sigoutseq, on='sampleMut')
# Keep the mutation identifiers plus every column whose name contains 'SBS'.
colNameSBS = [col for col in segInfoSBSprob.columns if 'SBS' in col]
segInfoSBSprob = segInfoSBSprob[['samples', 'chrs', 'pos', 'sampleMut'] + colNameSBS]
segInfoSBSprob.to_csv(seqInfoSigOut + '41seqInfoSBSprob.csv', index=False)