# Source: IPMNPDACpaperArchive / IPMNPDAC_WGS / clusterAssignment / id83ClusterAssignment_step1.py
# File: id83ClusterAssignment_step1.py
import os
from glob import glob
import pandas as pd
import numpy as np
from natsort import natsorted
from natsort import index_natsorted

# Table holding the previous sample identifiers and their 2025 replacements.
sampleID = pd.read_csv(
    '/mnt/b/1_3July23VersionDataPlotCode/Figure1/sampleID2025.csv'
)
# Parallel lists: these are passed together to DataFrame.replace() later in
# the script (old values first, new values second).
sampleOldID = list(sampleID['previousID'])
sampleNewID = list(sampleID['ID2025'])

# 1) ID83_seqinfo
# Directory containing the tab-separated, header-less '*seqinfo.txt' tables.
id83seqinfoPath = '/mnt/e/5_signatureResult34_41samples/42ID83/output/vcf_files/ID/'

# glob() returns paths in arbitrary (filesystem-dependent) order.  Row order
# matters later in the script, where drop_duplicates(keep='first') decides
# which row survives, so sort the paths to make the result reproducible.
seqinfoFiles = sorted(glob(id83seqinfoPath + '*seqinfo.txt'))
if not seqinfoFiles:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise FileNotFoundError(
        'No *seqinfo.txt files found in ' + id83seqinfoPath
    )

idDFs = []
for fm in seqinfoFiles:
    idDf = pd.read_csv(fm, sep='\t', header=None)
    # Each file is expected to have exactly these seven columns.
    idDf.columns = ['samples', 'chrs', 'pos', 'mutationType', 'REF', 'ALT', 'orientation']
    # REF and ALT are not used by the rest of the script.
    idDf = idDf[['samples', 'chrs', 'pos', 'mutationType', 'orientation']]
    # Drop the first two characters of every mutation type; the remainder is
    # what gets combined with the sample name into the 'sampleMut' join key.
    # NOTE(review): presumably the dropped prefix is a strand tag such as
    # "U:" - confirm against the seqinfo format.
    idMutTypeSeq = [mutType[2:] for mutType in idDf['mutationType']]
    idDf.insert(4, 'mutationTypeSeq', idMutTypeSeq)
    idDFs.append(idDf)

IDdf = pd.concat(idDFs)
# Join key "<sample>_<mutation type without prefix>"; a plain list is inserted
# so the duplicated row index left by concat() plays no role.
IDdf_sampleMut = [
    str(sampleName) + '_' + str(mutTypeSeq)
    for sampleName, mutTypeSeq in zip(IDdf['samples'], IDdf['mutationTypeSeq'])
]
IDdf.insert(2, 'sampleMut', IDdf_sampleMut)

# 2) indel-ID83 Mutation Probabilities
# Directory of the decomposed-solution "Activities" output.
id83ActivitiesPath = '/mnt/e/5_signatureResult34_41samples/42ID83/ID83/Suggested_Solution/COSMIC_ID83_Decomposed_Solution/Activities/'
IDmutProb = pd.read_csv(
    id83ActivitiesPath + 'De_Novo_MutationType_Probabilities.txt',
    sep='\t',
)
# Join key "<Sample Names>_<MutationType>", stored in a column named
# 'sampleMut' so it can be matched with the seqinfo table.
IDsampleMut = [
    '_'.join((str(sampleName), str(mutType)))
    for sampleName, mutType in zip(IDmutProb['Sample Names'], IDmutProb['MutationType'])
]
IDmutProb.insert(2, 'sampleMut', IDsampleMut)
# Keep only the join key and the seven ID* probability columns.
IDmutProb = IDmutProb.loc[
    :, ['sampleMut', 'ID1', 'ID2', 'ID5', 'ID6', 'ID8', 'ID9', 'ID14']
]

# 3) merged by the common column sampleMut
# 'sampleMut' is the only column the two tables share; name it explicitly so
# the join key cannot silently change if either table gains a column.
df_IDseqProb = IDdf.merge(IDmutProb, on='sampleMut')

# Case identifier = text before the first underscore of the sample name.
caseid = df_IDseqProb['samples'].astype(str).str.split('_').str[0]
# "<case>_<chr>_<pos>"
case_chrs_pos = (caseid + '_' + df_IDseqProb['chrs'].astype(str)
                 + '_' + df_IDseqProb['pos'].astype(str))
# "<case>_<chr>_<pos>_<mutation type>".  The mutation type is taken from the
# 'mutationTypeSeq' column rather than re-parsed out of 'sampleMut' with
# split('_')[2], which only works when the sample name contains exactly one
# underscore.  Vectorized string operations also cope with an empty merge.
case_chr_pos_mut = case_chrs_pos + '_' + df_IDseqProb['mutationTypeSeq'].astype(str)

# Final column order: case_chrs_pos, samples, case_chr_pos_mut, ID* columns.
df_IDseqProb = df_IDseqProb[['samples', 'ID1', 'ID2', 'ID5', 'ID6', 'ID8', 'ID9', 'ID14']].copy()
df_IDseqProb.insert(0, 'case_chrs_pos', case_chrs_pos)
df_IDseqProb.insert(2, 'case_chr_pos_mut', case_chr_pos_mut)

# Rename old sample identifiers to the new ones in every string column.
# NOTE(review): with regex=True each old ID is an unanchored regular
# expression, so an ID that is a prefix of another (e.g. "case1" inside
# "case12") would also match - confirm the ID table cannot trigger this.
df_IDseqProb = df_IDseqProb.replace(sampleOldID, sampleNewID, regex=True)
# NOTE(review): this filter runs after the renaming above, so it only removes
# rows if "case12_S12" is still the sample name at this point - verify.
df_IDseqProb = df_IDseqProb[df_IDseqProb['samples'] != 'case12_S12']
# Keep one row per case/chr/pos/mutation; the first occurrence wins.
df_IDseqProb = df_IDseqProb.drop_duplicates(subset='case_chr_pos_mut', keep='first')
df_IDseqProb.to_csv('/IPMNPDAC_WGS/Data/Data/sigDPC/step2seqInfoSigOut/41seqInfoIDprob.csv', index=False)