#对预测得到的结果转化为谱图的形式 #目前改了by和BY,exp暂时还没改 import pandas as pd import numpy as np import ipdb from utils import * import torch import masses from masses import * # ----------------------- dict ------------------------------# MAXFICHARGE = int(num_col/12) if MAXFICHARGE == 2: ionname_by = "b1,b1n,b1o,b1h,b1nh,b1oh,y1,y1n,y1o,y1h,y1nh,y1oh,"\ "b2,b2n,b2o,b2h,b2nh,b2oh,y2,y2n,y2o,y2h,y2nh,y2oh".split( ',') ionname_BY = "B1,B1n,B1o,B1f,Y1,Y1n,Y1o,Y1f,"\ "B2,B2n,B2o,B2f,Y2,Y2n,Y2o,Y2f".split( ',') if MAXFICHARGE == 3: ionname_by = "b1,b1n,b1o,b1h,b1nh,b1oh,y1,y1n,y1o,y1h,y1nh,y1oh,"\ "b2,b2n,b2o,b2h,b2nh,b2oh,y2,y2n,y2o,y2h,y2nh,y2oh,"\ "b3,b3n,b3o,b3h,b3nh,b3oh,y3,y3n,y3o,y3h,y3nh,y3oh".split( ',') ionname_BY = "B1,B1n,B1o,B1f,Y1,Y1n,Y1o,Y1f,"\ "B2,B2n,B2o,B2f,Y2,Y2n,Y2o,Y2f,"\ "B3,B3n,B3o,B3f,Y3,Y3n,Y3o,Y3f".split( ',') lossdict_by = {"noloss": '', "H2O": 'o', "NH3": 'n', '(HexNAc)1': 'h', '1(NH3)1(+HexNAc)': 'nh', '1(H2O)1(+HexNAc)': 'oh'} lossdict_BY = {"noloss": '', "H2O": 'o', "NH3": 'n', 'FUC': 'f'} fields_byBY = "SourceFile,id,PEP.StrippedSequence,iden_pep,PlausibleStruct,PP.Charge,metric,FragmentMz,FI.FrgType,FI.LossType,FI.Charge,FI.Intensity,FI.FrgNum,Precurmass_cal".split(',') fields_exp = "SourceFile,id,PEP.StrippedSequence,iden_pep,PlausibleStruct,PP.Charge,FragmentMz,FI.FrgType,FI.LossType,FI.Charge,FI.Intensity,FI.FrgNum,Precurmass_cal".split(',') # ----------------------- func ------------------------------# def byBYextract(row): outputdataframe1 = pd.DataFrame(columns=fields_byBY) # StrippedSequence = "".join([vocab.to_word(i) for i in eval(row['repsequence'])]) StrippedSequence=row["ipeptide"] PPCharge=int(row["charge"]) metric=float(row["metric"]) id=row["id"] global id_glyspec global id_iden_pep glyspec=id_glyspec[id] ms2by=row["ms2by"] ms2BY=row["ms2BY"] iden_pep=id_iden_pep[id] # import ipdb # ipdb.set_trace() PlausibleStruct=iden_pep.split("_")[-1] glysite=iden_pep.split("_")[2] # Precurmass_exp=glyspec_pepmass_mgf[glyspec] Precurmass_cal= (calcModpepMass(peptide_process(iden_pep))+MASS["H2O"])/PPCharge+MASS["H+"]#根据iden_pep计算母离子质荷比 # deltamz=abs(Precurmass_exp-Precurmass_cal)/Precurmass_exp FRAG_MODE=["HCD_by","HCD_1"] mz_calc=masses.pepfragmass(iden_pep,FRAG_MODE) #得到iontype和m/z对 # ipdb.set_trace() #by jmax=ms2by.size()[1] imax=ms2by.size()[0] # ipdb.set_trace() assert jmax==num_col for j in range(0,jmax): for i in range(0,imax): intensity_pred=ms2by[i][j] #从ms2中提取对应位置的强度 if intensity_pred <=0: continue intensity_pred=float(intensity_pred) ionname=ionname_by[j] #从ionname列表中获得对应结果的离子类型 FIFrgType=ionname[0] FICharge=int(ionname[1]) FILossType=ionname[2:] FILossType=list(lossdict_by.keys())[list(lossdict_by.values()).index(FILossType)] assert FIFrgType in ["b","y"] # ipdb.set_trace() if FIFrgType=="b": FragmentNumber=i+1 masskey="b" + str(FragmentNumber) + "_" + str(FICharge) if FIFrgType=="y": FragmentNumber=imax - i masskey="y" + str(FragmentNumber) + "_" + str(FICharge) # import ipdb if FIFrgType=="b" and int(FragmentNumber)<=int(glysite) and ionname.endswith("h"): continue if FIFrgType=="y" and int(i+1)>int(glysite) and ionname.endswith("h"): continue if FILossType!="noloss": loss=ionname[2] if loss !="h": loss=list(lossdict_by.keys())[list(lossdict_by.values()).index(loss)] masskey+="_loss_"+FILossType if ionname.endswith("h"): masskey+="_"+"(HexNAc)1" # import ipdb # ipdb.set_trace() else: masskey+="_"+"(HexNAc)0" counter=0 for k in mz_calc: for dict in k: key=[str(key) for key in dict.keys()] assert len(key)==1 if masskey == key[0]: value=[float(value) for value in dict.values()] assert len(value)==1 FragmentMz=value[0] counter+=1 if counter>1: raise ValueError("loop number more than one") # ipdb.set_trace() if counter==1: outputdataframe1.loc[len(outputdataframe1)] = [glyspec,id, StrippedSequence,iden_pep,PlausibleStruct,\ PPCharge,metric,FragmentMz,\ FIFrgType, FILossType,FICharge,intensity_pred,\ FragmentNumber,Precurmass_cal] #BY jmax=ms2BY.size()[1] imax=ms2BY.size()[0] assert jmax==num_col*2/3 for j in range(0,jmax): for i in range(0,imax): intensity_pred=ms2BY[i][j] #从ms2中提取对应位置的强度 if intensity_pred <=0: continue intensity_pred=float(intensity_pred) ionname=ionname_BY[j] #从ionname列表中获得对应结果的离子类型 FIFrgType=ionname[0] FICharge=int(ionname[1]) FILossType=ionname[2:] if FILossType=="f" and "F" not in PlausibleStruct: continue if FILossType=="f" and FIFrgType=="B": continue FILossType=list(lossdict_BY.keys())[list(lossdict_BY.values()).index(FILossType)] FragmentNumber=i #"_" + str(k) + "_" + str(ficharge)+"_loss_H2O" masskey="_" + str(FragmentNumber) + "_" + str(FICharge) if FILossType!="noloss": masskey+="_loss_"+FILossType counter=0 # if FILossType=="FUC": # ipdb.set_trace() for k in mz_calc: for dict in k: key=[str(key) for key in dict.keys()] assert len(key)==1 if FIFrgType == key[0][0]: if key[0].endswith(masskey): value=[float(value) for value in dict.values()] assert len(value)==1 FragmentMz=value[0] counter+=1 if counter>1: raise ValueError("loop number more than one") # import ipdb # ipdb.set_trace() if counter==1: outputdataframe1.loc[len(outputdataframe1)] = [glyspec,id, StrippedSequence,iden_pep,PlausibleStruct,\ PPCharge,metric,FragmentMz,\ FIFrgType, FILossType,FICharge,intensity_pred,\ FragmentNumber,Precurmass_cal] # import ipdb # ipdb.set_trace() return outputdataframe1 def expbyBYextract(row): outputdataframe1 = pd.DataFrame(columns=fields_exp) StrippedSequence = row["peptide"] PPCharge=int(row["charge"]) id=row["_id"] glyspec=row["GlySpec"] # import ipdb # ipdb.set_trace() ms2by=row["ions_by"] ms2BY=row["ions_BY"] # iden_pep=id_iden_pep[id] # import ipdb # ipdb.set_trace() iden_pep=row["iden_pep"] PlausibleStruct=iden_pep.split("_")[-1] glysite=iden_pep.split("_")[2] # Precurmass_exp=glyspec_pepmass_mgf[glyspec] Precurmass_cal= (calcModpepMass(peptide_process(iden_pep))+MASS["H2O"])/PPCharge+MASS["H+"]#根据iden_pep计算母离子质荷比 # deltamz=abs(Precurmass_exp-Precurmass_cal)/Precurmass_exp FRAG_MODE=["HCD_by","HCD_1"] mz_calc=masses.pepfragmass(iden_pep,FRAG_MODE) #得到iontype和m/z对 # ipdb.set_trace() #by jmax=ms2by.size()[1] imax=ms2by.size()[0] # ipdb.set_trace() assert jmax==num_col for j in range(0,jmax): for i in range(0,imax): intensity_pred=ms2by[i][j] #从ms2中提取对应位置的强度 if intensity_pred <=0: continue intensity_pred=float(intensity_pred) ionname=ionname_by[j] #从ionname列表中获得对应结果的离子类型 FIFrgType=ionname[0] FICharge=int(ionname[1]) FILossType=ionname[2:] FILossType=list(lossdict_by.keys())[list(lossdict_by.values()).index(FILossType)] assert FIFrgType in ["b","y"] # ipdb.set_trace() if FIFrgType=="b": FragmentNumber=i+1 masskey="b" + str(FragmentNumber) + "_" + str(FICharge) if FIFrgType=="y": FragmentNumber=imax - i masskey="y" + str(FragmentNumber) + "_" + str(FICharge) # import ipdb if FIFrgType=="b" and int(FragmentNumber)<=int(glysite) and ionname.endswith("h"): continue if FIFrgType=="y" and int(i+1)>int(glysite) and ionname.endswith("h"): continue if FILossType!="noloss": loss=ionname[2] if loss !="h": loss=list(lossdict_by.keys())[list(lossdict_by.values()).index(loss)] masskey+="_loss_"+FILossType if ionname.endswith("h"): masskey+="_"+"(HexNAc)1" # import ipdb # ipdb.set_trace() else: masskey+="_"+"(HexNAc)0" counter=0 for k in mz_calc: for dict in k: key=[str(key) for key in dict.keys()] assert len(key)==1 if masskey == key[0]: value=[float(value) for value in dict.values()] assert len(value)==1 FragmentMz=value[0] counter+=1 if counter>1: raise ValueError("loop number more than one") # ipdb.set_trace() if counter==1: outputdataframe1.loc[len(outputdataframe1)] = [glyspec,id, StrippedSequence,iden_pep,PlausibleStruct,\ PPCharge,FragmentMz,\ FIFrgType, FILossType,FICharge,intensity_pred,\ FragmentNumber,Precurmass_cal] #BY jmax=ms2BY.size()[1] imax=ms2BY.size()[0] assert jmax==num_col*2/3 for j in range(0,jmax): for i in range(0,imax): intensity_pred=ms2BY[i][j] #从ms2中提取对应位置的强度 if intensity_pred <=0: continue intensity_pred=float(intensity_pred) ionname=ionname_BY[j] #从ionname列表中获得对应结果的离子类型 FIFrgType=ionname[0] FICharge=int(ionname[1]) FILossType=ionname[2:] if FILossType=="f" and "F" not in PlausibleStruct: continue if FILossType=="f" and FIFrgType=="B": continue FILossType=list(lossdict_BY.keys())[list(lossdict_BY.values()).index(FILossType)] FragmentNumber=i #"_" + str(k) + "_" + str(ficharge)+"_loss_H2O" masskey="_" + str(FragmentNumber) + "_" + str(FICharge) if FILossType!="noloss": masskey+="_loss_"+FILossType counter=0 # if FILossType=="FUC": # ipdb.set_trace() for k in mz_calc: for dict in k: key=[str(key) for key in dict.keys()] assert len(key)==1 if FIFrgType == key[0][0]: if key[0].endswith(masskey): value=[float(value) for value in dict.values()] assert len(value)==1 FragmentMz=value[0] counter+=1 if counter>1: raise ValueError("loop number more than one") # import ipdb # ipdb.set_trace() if counter==1: outputdataframe1.loc[len(outputdataframe1)] = [glyspec,id, StrippedSequence,iden_pep,PlausibleStruct,\ PPCharge,FragmentMz,\ FIFrgType, FILossType,FICharge,intensity_pred,\ FragmentNumber,Precurmass_cal] # import ipdb # ipdb.set_trace() return outputdataframe1 def byBYextract_reidentification(row): outputdataframe1 = pd.DataFrame(columns=["GlySpec","modelpep","FragmentMz",\ "FIFrgType", "FILossType","FICharge","intensity_pred",\ "FragmentNumber","FragmentGlycan"]) import ipdb GlySpec=row["GlySpec"] FRAG_MODE=["HCD_1"] # ipdb.set_trace() #BY for num in ["1","2"]: modelpep=row["modelpep"+num] ms2BY=row["maskmodel"+num] mz_calc=masses.pepfragmass(modelpep,FRAG_MODE) #得到iontype和m/z对 # ipdb.set_trace() jmax=ms2BY.size()[1] imax=ms2BY.size()[0] assert jmax==num_col*2/3 # ipdb.set_trace() for j in range(0,jmax): for i in range(0,imax): intensity_pred=ms2BY[i][j] #从ms2中提取对应位置的强度 if intensity_pred <=0: continue intensity_pred=float(intensity_pred) ionname=ionname_BY[j] #从ionname列表中获得对应结果的离子类型 FIFrgType=ionname[0] FICharge=int(ionname[1]) FILossType=ionname[2:] FILossType=list(lossdict_BY.keys())[list(lossdict_BY.values()).index(FILossType)] FragmentNumber=i masskey="_" + str(FragmentNumber) + "_" + str(FICharge) if FILossType!="noloss": masskey+="_loss_"+FILossType counter=0 # ipdb.set_trace() for k in mz_calc: for dict in k: key=[str(key) for key in dict.keys()] assert len(key)==1 if FIFrgType == key[0][0]: if key[0].endswith(masskey): value=[float(value) for value in dict.values()] assert len(value)==1 FragmentMz=value[0] fragnode=str(key[0].split("_")[0][1:]) # ipdb.set_trace() counter+=1 if counter>1: raise ValueError("loop number more than one") # import ipdb # ipdb.set_trace() if counter==1: outputdataframe1.loc[len(outputdataframe1)] = [GlySpec,modelpep,FragmentMz,\ FIFrgType, FILossType,FICharge,intensity_pred,\ FragmentNumber,fragnode] # import ipdb # ipdb.set_trace() return outputdataframe1 # ----------------------- execute ------------------------------# def postprocessing(savename): filename=savename+"_byBY_result.csv" df=pd.read_csv(filename) # import ipdb # ipdb.set_trace() # df=df.loc[df["metric"]==max(df["metric"])] #取一个进行测试 # df.to_csv(datafold+"del.csv",index=False) print(df.columns) df["ms2by"]=df["ms2by"].apply(lambda x:torch.Tensor(eval(x))) df["ms2BY"]=df["ms2BY"].apply(lambda x:torch.Tensor(eval(x))) #训练的时候ms2使用了log2(x+1),这里变回来#检查一下 #'repsequence', 'charge', 'ipeptide', 'iPlausibleStruct', 'ms2', 'cos','id' print(f"The number of the lines of the input files {len(df)}") filename_expjson=savename+"_byBY_testdata.json" dfexp=pd.read_json(filename_expjson) # dfexp=dfexp.loc[dfexp["_id"]==2665] # dfexp.to_csv(datafold+"exp_6923_del.csv",index=False) print(dfexp.columns) # import ipdb # ipdb.set_trace() global id_glyspec global id_iden_pep id_glyspec = dict(zip(dfexp['_id'],dfexp['GlySpec'])) #产生_id,GlySpec字典 id_iden_pep=dict(zip(dfexp['_id'],dfexp['iden_pep'])) # import ipdb # ipdb.set_trace() # dfinput=pd.read_csv(dfinput) # ipdb.set_trace() outputdataframe_BY=pd.concat(df.apply(byBYextract,axis=1).to_list()) outputdataframe_BY.reset_index(inplace=True,drop=True) print(f"The number of the lines of the output files {len(outputdataframe_BY)}") outputdataframe_BY.to_csv(savename+"_predictmonomz.csv",index=False) # import ipdb # ipdb.set_trace() dfexp["ions_by"]=dfexp["ions_by"].apply(lambda x:torch.Tensor(x)) dfexp["ions_BY"]=dfexp["ions_BY"].apply(lambda x:torch.Tensor(x)) print(dfexp.columns) # dfexp.to_csv(datafold+"exp_6923_del.csv",index=False print(f"The number of the lines of the input files {len(dfexp)}") # outputdataframe_expBY = pd.DataFrame(columns=fields_exp) # import ipdb # ipdb.set_trace() outputdataframe_expBY=pd.concat(dfexp.apply(expbyBYextract,axis=1).to_list()) outputdataframe_expBY.reset_index(inplace=True,drop=True) print(f"The number of the lines of the output files {len(outputdataframe_expBY)}") outputdataframe_expBY.to_csv(savename+"_expmonomz.csv",index=False) def Precurmass_cal(innstance): iden_pep=innstance["iden_pep"] PPCharge=int(innstance["charge"]) Precurmass_cal=(calcModpepMass(peptide_process(iden_pep))+MASS["H2O"])/PPCharge+MASS["H+"] return Precurmass_cal def postprocessing_nointensity(savename,dfinputname): filename=savename+"_byBY_result.csv" df=pd.read_csv(filename) print(df.columns) print(f"The number of the lines of the input files {len(df)}") filename_expjson=savename+"_byBY_testdata.json" dfexp=pd.read_json(filename_expjson) print(dfexp.columns) print(f"The number of the lines of the input exp files {len(dfexp)}") # import ipdb # ipdb.set_trace() dfexp=dfexp[['_id','GlySpec',"iden_pep"]] df=df[[ 'charge', 'ipeptide', 'iPlausibleStruct','metric', "metricBY_cos",'id']] df.columns=['charge', 'PEP.StrippedSequence', 'PlausibleStruct','metric',"metricBY", '_id'] df=pd.merge(df,dfexp,on="_id",how="left") # import ipdb # ipdb.set_trace() dfinput=pd.read_csv(dfinputname) # ipdb.set_trace() dfinput=dfinput[['GlySpec','pepmass_mgf',"matching_ration","iden_pep"]].drop_duplicates() print(len(dfinput)) df=pd.merge(df,dfinput,on=["GlySpec","iden_pep"],how="left") # ipdb.set_trace() df["Precurmass_cal"]= df.apply(Precurmass_cal,axis=1) df["deltamzpercent"]=abs(df["pepmass_mgf"]-df["Precurmass_cal"])/df["Precurmass_cal"] df.to_csv(savename+"_nointensity_predictmonomz.csv",index=False) # if __name__ == '__main__': # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/IgG/PXD015360_IgG/test_replace_predict/test_byBY_PXD015360_IgG_data_1st_target_decoy_model" # dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/IgG/PXD015360_IgG/PXD015360_IgG_data_1st_redo1_nocoelute_target_decoy_Retained_all.csv" # postprocessing_nointensity(savename,dfinputname) # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD009654/test_replace_predict/test_byBY_PXD009654_human_data_1st_target_decoy_model" # dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD009654/PXD009654_data_1st_redo1_nocoelute_target_decoy_Retained_all.csv" # postprocessing_nointensity(savename,dfinputname) # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/IgG/PXD009716/test_replace_predict/PXD009716_IgG_data_1st_target_decoy_NAGF_model_0.930816" # dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/IgG/PXD009716/PXD009716_IgG_data_1st_target_decoy_Retained_all.csv" # postprocessing_nointensity(savename,dfinputname) # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/syn/PXD023980/test_replace_predict/test_byBY_PXD023980_syn_test_9_glycans_redo1_model_0.927153" # dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/syn/PXD023980/PXD023980_Syn_test_9_glycans_redo1_data_1st.csv" # postprocessing_nointensity(savename,dfinputname) # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/syn/PXD023980/test_replace_predict/test_byBY_PXD023980_syn_identification_task3" # dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/syn/PXD023980/Syn_PXD023980_task3_data_1st.csv" # postprocessing_nointensity(savename,dfinputname) # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD016428/test_replace_predict/test_byBY_PXD016428_IgG_data_1st_target_decoy_model_0.933599" # dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD016428/PXD016428_serum_data_1st_target_decoy_Retained_all.csv" # postprocessing_nointensity(savename,dfinputname) # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD016428/test_replace_predict/test_byBY_PXD016428_IgG_data_1st_target_decoy_model_0.937599" # dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD016428/PXD016428_serum_data_1st_target_decoy_Retained_all.csv" # postprocessing_nointensity(savename,dfinputname) # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/mouse/Five_tissues/test_replace_predict/test_byBY_PXD005411_model_0.927740" # postprocessing(savename)