#对预测得到的结果转化为谱图的形式
#目前改了by和BY,exp暂时还没改
import pandas as pd
import numpy as np
import ipdb
from utils import *
import torch
import masses
from masses import *
# ----------------------- dict ------------------------------#
MAXFICHARGE = 2
if MAXFICHARGE == 2:
ionname_by = "b1,b1n,b1o,b1h,b1nh,b1oh,b1hh,b1nhh,b1ohh,y1,y1n,y1o,y1h,y1nh,y1oh,y1hh,y1nhh,y1ohh,"\
"b2,b2n,b2o,b2h,b2nh,b2oh,b2hh,b2nhh,b2ohh,y2,y2n,y2o,y2h,y2nh,y2oh,y2hh,y2nhh,y2ohh".split(
',')
ionname_BY = "B1,B1n,B1o,B1f,Y1,Y1n,Y1o,Y1f,"\
"B2,B2n,B2o,B2f,Y2,Y2n,Y2o,Y2f".split(',')
lossdict_by = {"noloss": '', "H2O": 'o', "NH3": 'n', '(HexNAc)1_(HexNAc)0': 'h','(HexNAc)1_(HexNAc)1': 'hh',"(HexNAc)0":"",
'1(NH3)1(+HexNAc)': 'nh', '1(H2O)1(+HexNAc)': 'oh','1(NH3)2(+HexNAc)': 'nhh', '1(H2O)2(+HexNAc)': 'ohh'}
lossdict_BY = {"noloss": '', "H2O": 'o', "NH3": 'n', 'FUC': 'f'}
fields_byBY = "SourceFile,id,PEP.StrippedSequence,iden_pep,PlausibleStruct,PP.Charge,metric,FragmentMz,FI.FrgType,FI.LossType,FI.Charge,FI.Intensity,FI.FrgNum,Precurmass_cal".split(',')
fields_exp = "SourceFile,id,PEP.StrippedSequence,iden_pep,PlausibleStruct,PP.Charge,FragmentMz,FI.FrgType,FI.LossType,FI.Charge,FI.Intensity,FI.FrgNum,Precurmass_cal".split(',')
# ----------------------- func ------------------------------#
def parse_hexnac_suffix(suffix: str):
"""将糖基组合解析为标准形式:'h', 'hh' 等"""
states = suffix.split("_")
hex_count = states.count("(HexNAc)1")
return "h" * hex_count
def normalize_hexnac_combo(k: str):
"""标准化糖基组合顺序,避免重复定义"""
parts = k.split("_")
gly_parts = sorted([p for p in parts if p.startswith("(HexNAc)")])
return "_".join(parts[:-len(gly_parts)] + gly_parts)
def byBYextract(row):
outputdataframe1 = pd.DataFrame(columns=fields_byBY)
# StrippedSequence = "".join([vocab.to_word(i) for i in eval(row['repsequence'])])
StrippedSequence=row["ipeptide"]
PPCharge=int(row["charge"])
metric=float(row["metric"])
id=row["id"]
global id_glyspec
global id_iden_pep
glyspec=id_glyspec[id]
ms2by=row["ms2by"]
ms2BY=row["ms2BY"]
iden_pep=id_iden_pep[id]
# import ipdb
# ipdb.set_trace()
PlausibleStruct=iden_pep.split("_")[-1]
glysite=iden_pep.split("_")[2]
# Precurmass_exp=glyspec_pepmass_mgf[glyspec]
Precurmass_cal= (calcModpepMass(peptide_process(iden_pep))+MASS["H2O"])/PPCharge+MASS["H+"]#根据iden_pep计算母离子质荷比
# deltamz=abs(Precurmass_exp-Precurmass_cal)/Precurmass_exp
FRAG_MODE=["HCD_by","HCD_1"]
mz_calc=masses.pepfragmass(iden_pep,FRAG_MODE) #得到iontype和m/z对
# ipdb.set_trace()
#by
jmax=ms2by.size()[1]
imax=ms2by.size()[0]
# import ipdb
# ipdb.set_trace()
assert jmax==len(ionname_by)
for j in range(0,jmax):
for i in range(0,imax):
intensity_pred=ms2by[i][j] #从ms2中提取对应位置的强度
if intensity_pred <=0:
continue
intensity_pred=float(intensity_pred)
ionname=ionname_by[j] #从ionname列表中获得对应结果的离子类型
FIFrgType=ionname[0]
FICharge=int(ionname[1])
if FIFrgType=="b":
FragmentNumber=i+1
masskey=ionname+ "_" + str(FragmentNumber)
if FIFrgType=="y":
FragmentNumber=imax - i
masskey=ionname+ "_" + str(FragmentNumber)
counter=0
for k in mz_calc:
for dict in k:
key=[str(key) for key in dict.keys()]
assert len(key)==1
ke=key[0]
if ke[0].islower(): # b/y ions
ke = normalize_hexnac_combo(ke)
FragmentTypek = ke[0]
FragmentNumberk = int(ke.split("_")[0][1:])
Fragmentchargek = int(ke.split("_")[1])
Fragmentlossk = ""
if "loss" in ke:
index_loss= ke.split("_").index("loss")+1
Fragmentlossk = ke.split("_")[index_loss].upper().replace("H20", "H2O")
Fragmentlossk = lossdict_by.get(Fragmentlossk, "")
hex_suffix = "_".join(x for x in ke.split("_") if x.startswith("(HexNAc)"))
Fragmenthexnac = parse_hexnac_suffix(hex_suffix)
ion_k = FragmentTypek + str(Fragmentchargek) + Fragmentlossk + Fragmenthexnac+"_"+str(FragmentNumberk)
# else: # B/Y ions
# # ion_type_BY.append(ion_type)
# import ipdb
# ipdb.set_trace()
if masskey == ion_k:
value=[float(value) for value in dict.values()]
assert len(value)==1
FragmentMz=value[0]
counter+=1
# if counter>1:
# import ipdb
# ipdb.set_trace()
# raise ValueError("loop number more than one")
# ipdb.set_trace()
if counter==1:
outputdataframe1.loc[len(outputdataframe1)] = [glyspec,id, StrippedSequence,iden_pep,PlausibleStruct,\
PPCharge,metric,FragmentMz,\
FIFrgType, masskey,FICharge,intensity_pred,\
FragmentNumber,Precurmass_cal]
#BY
jmax=ms2BY.size()[1]
imax=ms2BY.size()[0]
assert jmax==len(ionname_BY)
for j in range(0,jmax):
for i in range(0,imax):
intensity_pred=ms2BY[i][j] #从ms2中提取对应位置的强度
if intensity_pred <=0:
continue
intensity_pred=float(intensity_pred)
ionname=ionname_BY[j] #从ionname列表中获得对应结果的离子类型
FIFrgType=ionname[0]
FICharge=int(ionname[1])
FILossType=ionname[2:]
if FILossType=="f" and "F" not in PlausibleStruct:
continue
if FILossType=="f" and FIFrgType=="B":
continue
FILossType=list(lossdict_BY.keys())[list(lossdict_BY.values()).index(FILossType)]
FragmentNumber=i
#"_" + str(k) + "_" + str(ficharge)+"_loss_H2O"
masskey="_" + str(FragmentNumber) + "_" + str(FICharge)
if FILossType!="noloss":
masskey+="_loss_"+FILossType
counter=0
# if FILossType=="FUC":
# ipdb.set_trace()
for k in mz_calc:
for dict in k:
key=[str(key) for key in dict.keys()]
assert len(key)==1
if FIFrgType == key[0][0]:
if key[0].endswith(masskey):
value=[float(value) for value in dict.values()]
assert len(value)==1
FragmentMz=value[0]
counter+=1
if counter>1:
raise ValueError("loop number more than one")
# import ipdb
# ipdb.set_trace()
if counter==1:
outputdataframe1.loc[len(outputdataframe1)] = [glyspec,id, StrippedSequence,iden_pep,PlausibleStruct,\
PPCharge,metric,FragmentMz,\
FIFrgType, FILossType,FICharge,intensity_pred,\
FragmentNumber,Precurmass_cal]
# import ipdb
# ipdb.set_trace()
return outputdataframe1
def expbyBYextract(row):
outputdataframe1 = pd.DataFrame(columns=fields_exp)
StrippedSequence = row["peptide"]
PPCharge=int(row["charge"])
id=row["_id"]
glyspec=row["GlySpec"]
# import ipdb
# ipdb.set_trace()
ms2by=row["ions_by"]
ms2BY=row["ions_BY"]
# iden_pep=id_iden_pep[id]
# import ipdb
# ipdb.set_trace()
iden_pep=row["iden_pep"]
PlausibleStruct=iden_pep.split("_")[-1]
glysite=iden_pep.split("_")[2]
# Precurmass_exp=glyspec_pepmass_mgf[glyspec]
Precurmass_cal= (calcModpepMass(peptide_process(iden_pep))+MASS["H2O"])/PPCharge+MASS["H+"]#根据iden_pep计算母离子质荷比
# deltamz=abs(Precurmass_exp-Precurmass_cal)/Precurmass_exp
FRAG_MODE=["HCD_by","HCD_1"]
mz_calc=masses.pepfragmass(iden_pep,FRAG_MODE) #得到iontype和m/z对
# ipdb.set_trace()
#by
jmax=ms2by.size()[1]
imax=ms2by.size()[0]
# ipdb.set_trace()
assert jmax==len(ionname_by)
for j in range(0,jmax):
for i in range(0,imax):
intensity_pred=ms2by[i][j] #从ms2中提取对应位置的强度
if intensity_pred <=0:
continue
intensity_pred=float(intensity_pred)
ionname=ionname_by[j] #从ionname列表中获得对应结果的离子类型
FIFrgType=ionname[0]
FICharge=int(ionname[1])
FILossType=ionname[2:]
FILossType=list(lossdict_by.keys())[list(lossdict_by.values()).index(FILossType)]
assert FIFrgType in ["b","y"]
# ipdb.set_trace()
if FIFrgType=="b":
FragmentNumber=i+1
masskey=ionname+ "_" + str(FragmentNumber)
if FIFrgType=="y":
FragmentNumber=imax - i
masskey=ionname+ "_" + str(FragmentNumber)
counter=0
for k in mz_calc:
for dict in k:
key=[str(key) for key in dict.keys()]
assert len(key)==1
ke=key[0]
if ke[0].islower(): # b/y ions
ke = normalize_hexnac_combo(ke)
FragmentTypek = ke[0]
FragmentNumberk = int(ke.split("_")[0][1:])
Fragmentchargek = int(ke.split("_")[1])
Fragmentlossk = ""
if "loss" in ke:
index_loss= ke.split("_").index("loss")+1
Fragmentlossk = ke.split("_")[index_loss].upper().replace("H20", "H2O")
Fragmentlossk = lossdict_by.get(Fragmentlossk, "")
hex_suffix = "_".join(x for x in ke.split("_") if x.startswith("(HexNAc)"))
Fragmenthexnac = parse_hexnac_suffix(hex_suffix)
ion_k = FragmentTypek + str(Fragmentchargek) + Fragmentlossk + Fragmenthexnac+"_"+str(FragmentNumberk)
# else: # B/Y ions
# # ion_type_BY.append(ion_type)
# import ipdb
# ipdb.set_trace()
if masskey == ion_k:
value=[float(value) for value in dict.values()]
assert len(value)==1
FragmentMz=value[0]
counter+=1
# ipdb.set_trace()
if counter==1:
outputdataframe1.loc[len(outputdataframe1)] = [glyspec,id, StrippedSequence,iden_pep,PlausibleStruct,\
PPCharge,FragmentMz,\
FIFrgType, FILossType,FICharge,intensity_pred,\
FragmentNumber,Precurmass_cal]
#BY
jmax=ms2BY.size()[1]
imax=ms2BY.size()[0]
assert jmax==16
# ipdb.set_trace()
for j in range(0,jmax):
for i in range(0,imax):
intensity_pred=ms2BY[i][j] #从ms2中提取对应位置的强度
if intensity_pred <=0:
continue
intensity_pred=float(intensity_pred)
ionname=ionname_BY[j] #从ionname列表中获得对应结果的离子类型
FIFrgType=ionname[0]
FICharge=int(ionname[1])
FILossType=ionname[2:]
if FILossType=="f" and "F" not in PlausibleStruct:
continue
if FILossType=="f" and FIFrgType=="B":
continue
FILossType=list(lossdict_BY.keys())[list(lossdict_BY.values()).index(FILossType)]
FragmentNumber=i
#"_" + str(k) + "_" + str(ficharge)+"_loss_H2O"
masskey="_" + str(FragmentNumber) + "_" + str(FICharge)
if FILossType!="noloss":
masskey+="_loss_"+FILossType
counter=0
# if FILossType=="FUC":
# ipdb.set_trace()
for k in mz_calc:
for dict in k:
key=[str(key) for key in dict.keys()]
assert len(key)==1
if FIFrgType == key[0][0]:
if key[0].endswith(masskey):
value=[float(value) for value in dict.values()]
assert len(value)==1
FragmentMz=value[0]
counter+=1
if counter>1:
raise ValueError("loop number more than one")
# import ipdb
# ipdb.set_trace()
if counter==1:
outputdataframe1.loc[len(outputdataframe1)] = [glyspec,id, StrippedSequence,iden_pep,PlausibleStruct,\
PPCharge,FragmentMz,\
FIFrgType, FILossType,FICharge,intensity_pred,\
FragmentNumber,Precurmass_cal]
# import ipdb
# ipdb.set_trace()
return outputdataframe1
def byBYextract_reidentification(row):
outputdataframe1 = pd.DataFrame(columns=["GlySpec","modelpep","FragmentMz",\
"FIFrgType", "FILossType","FICharge","intensity_pred",\
"FragmentNumber","FragmentGlycan"])
import ipdb
GlySpec=row["GlySpec"]
FRAG_MODE=["HCD_1"]
# ipdb.set_trace()
#BY
for num in ["1","2"]:
modelpep=row["modelpep"+num]
ms2BY=row["maskmodel"+num]
mz_calc=masses.pepfragmass(modelpep,FRAG_MODE) #得到iontype和m/z对
# ipdb.set_trace()
jmax=ms2BY.size()[1]
imax=ms2BY.size()[0]
assert jmax==16
# ipdb.set_trace()
for j in range(0,jmax):
for i in range(0,imax):
intensity_pred=ms2BY[i][j] #从ms2中提取对应位置的强度
if intensity_pred <=0:
continue
intensity_pred=float(intensity_pred)
ionname=ionname_BY[j] #从ionname列表中获得对应结果的离子类型
FIFrgType=ionname[0]
FICharge=int(ionname[1])
FILossType=ionname[2:]
FILossType=list(lossdict_BY.keys())[list(lossdict_BY.values()).index(FILossType)]
FragmentNumber=i
masskey="_" + str(FragmentNumber) + "_" + str(FICharge)
if FILossType!="noloss":
masskey+="_loss_"+FILossType
counter=0
# ipdb.set_trace()
for k in mz_calc:
for dict in k:
key=[str(key) for key in dict.keys()]
assert len(key)==1
if FIFrgType == key[0][0]:
if key[0].endswith(masskey):
value=[float(value) for value in dict.values()]
assert len(value)==1
FragmentMz=value[0]
fragnode=str(key[0].split("_")[0][1:])
# ipdb.set_trace()
counter+=1
if counter>1:
raise ValueError("loop number more than one")
# import ipdb
# ipdb.set_trace()
if counter==1:
outputdataframe1.loc[len(outputdataframe1)] = [GlySpec,modelpep,FragmentMz,\
FIFrgType, FILossType,FICharge,intensity_pred,\
FragmentNumber,fragnode]
# import ipdb
# ipdb.set_trace()
return outputdataframe1
# ----------------------- execute ------------------------------#
def postprocessing(savename):
filename=savename+"_byBY_result.csv"
df=pd.read_csv(filename)
# import ipdb
# ipdb.set_trace()
# df=df.loc[df["metric"]==max(df["metric"])] #取一个进行测试
# df.to_csv(datafold+"del.csv",index=False)
print(df.columns)
df["ms2by"]=df["ms2by"].apply(lambda x:torch.Tensor(eval(x)))
df["ms2BY"]=df["ms2BY"].apply(lambda x:torch.Tensor(eval(x)))
#训练的时候ms2使用了log2(x+1),这里变回来#检查一下
#'repsequence', 'charge', 'ipeptide', 'iPlausibleStruct', 'ms2', 'cos','id'
print(f"The number of the lines of the input files {len(df)}")
filename_expjson=savename+"_byBY_testdata.json"
dfexp=pd.read_json(filename_expjson)
# dfexp=dfexp.loc[dfexp["_id"]==2665]
# dfexp.to_csv(datafold+"exp_6923_del.csv",index=False)
print(dfexp.columns)
# import ipdb
# ipdb.set_trace()
global id_glyspec
global id_iden_pep
id_glyspec = dict(zip(dfexp['_id'],dfexp['GlySpec'])) #产生_id,GlySpec字典
id_iden_pep=dict(zip(dfexp['_id'],dfexp['iden_pep']))
# import ipdb
# ipdb.set_trace()
# dfinput=pd.read_csv(dfinput)
# ipdb.set_trace()
outputdataframe_BY=pd.concat(df.apply(byBYextract,axis=1).to_list())
outputdataframe_BY.reset_index(inplace=True,drop=True)
print(f"The number of the lines of the output files {len(outputdataframe_BY)}")
outputdataframe_BY.to_csv(savename+"_predictmonomz.csv",index=False)
# import ipdb
# ipdb.set_trace()
dfexp["ions_by"]=dfexp["ions_by"].apply(lambda x:torch.Tensor(x))
dfexp["ions_BY"]=dfexp["ions_BY"].apply(lambda x:torch.Tensor(x))
print(dfexp.columns)
# dfexp.to_csv(datafold+"exp_6923_del.csv",index=False
print(f"The number of the lines of the input files {len(dfexp)}")
# outputdataframe_expBY = pd.DataFrame(columns=fields_exp)
# import ipdb
# ipdb.set_trace()
outputdataframe_expBY=pd.concat(dfexp.apply(expbyBYextract,axis=1).to_list())
outputdataframe_expBY.reset_index(inplace=True,drop=True)
print(f"The number of the lines of the output files {len(outputdataframe_expBY)}")
outputdataframe_expBY.to_csv(savename+"_expmonomz.csv",index=False)
def Precurmass_cal(innstance):
iden_pep=innstance["iden_pep"]
PPCharge=int(innstance["charge"])
Precurmass_cal=(calcModpepMass(peptide_process(iden_pep))+MASS["H2O"])/PPCharge+MASS["H+"]
return Precurmass_cal
def postprocessing_nointensity(savename,dfinputname):
filename=savename+"_byBY_result.csv"
df=pd.read_csv(filename)
print(df.columns)
print(f"The number of the lines of the input files {len(df)}")
filename_expjson=savename+"_byBY_testdata.json"
dfexp=pd.read_json(filename_expjson)
print(dfexp.columns)
print(f"The number of the lines of the input exp files {len(dfexp)}")
# import ipdb
# ipdb.set_trace()
dfexp=dfexp[['_id','GlySpec',"iden_pep"]]
df=df[[ 'charge', 'ipeptide', 'iPlausibleStruct','metric', "metricBY_cos",'id']]
df.columns=['charge', 'PEP.StrippedSequence', 'PlausibleStruct','metric',"metricBY", '_id']
df=pd.merge(df,dfexp,on="_id",how="left")
# import ipdb
# ipdb.set_trace()
dfinput=pd.read_csv(dfinputname)
# ipdb.set_trace()
dfinput=dfinput[['GlySpec','pepmass_mgf',"matching_ration","iden_pep"]].drop_duplicates()
print(len(dfinput))
df=pd.merge(df,dfinput,on=["GlySpec","iden_pep"],how="left")
# ipdb.set_trace()
df["Precurmass_cal"]= df.apply(Precurmass_cal,axis=1)
df["deltamzpercent"]=abs(df["pepmass_mgf"]-df["Precurmass_cal"])/df["Precurmass_cal"]
df.to_csv(savename+"_nointensity_predictmonomz.csv",index=False)
if __name__ == '__main__':
# savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/IgG/PXD015360_IgG/test_replace_predict/test_byBY_PXD015360_IgG_data_1st_target_decoy_model"
# dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/IgG/PXD015360_IgG/PXD015360_IgG_data_1st_redo1_nocoelute_target_decoy_Retained_all.csv"
# postprocessing_nointensity(savename,dfinputname)
# savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD009654/test_replace_predict/test_byBY_PXD009654_human_data_1st_target_decoy_model"
# dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD009654/PXD009654_data_1st_redo1_nocoelute_target_decoy_Retained_all.csv"
# postprocessing_nointensity(savename,dfinputname)
# savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/IgG/PXD009716/test_replace_predict/PXD009716_IgG_data_1st_target_decoy_NAGF_model_0.930816"
# dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/IgG/PXD009716/PXD009716_IgG_data_1st_target_decoy_Retained_all.csv"
# postprocessing_nointensity(savename,dfinputname)
# savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/syn/PXD023980/test_replace_predict/test_byBY_PXD023980_syn_test_9_glycans_redo1_model_0.927153"
# dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/syn/PXD023980/PXD023980_Syn_test_9_glycans_redo1_data_1st.csv"
# postprocessing_nointensity(savename,dfinputname)
# savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/syn/PXD023980/test_replace_predict/test_byBY_PXD023980_syn_identification_task3"
# dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/syn/PXD023980/Syn_PXD023980_task3_data_1st.csv"
# postprocessing_nointensity(savename,dfinputname)
# savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD016428/test_replace_predict/test_byBY_PXD016428_IgG_data_1st_target_decoy_model_0.933599"
# dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD016428/PXD016428_serum_data_1st_target_decoy_Retained_all.csv"
# postprocessing_nointensity(savename,dfinputname)
# savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD016428/test_replace_predict/test_byBY_PXD016428_IgG_data_1st_target_decoy_model_0.937599"
# dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD016428/PXD016428_serum_data_1st_target_decoy_Retained_all.csv"
# postprocessing_nointensity(savename,dfinputname)
# savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/mouse/Five_tissues/test_replace_predict/test_byBY_PXD005411_model_0.927740"
# postprocessing(savename)
postprocessing("/remote-home1/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/code2/task_processing/NO/multiple/multiple_data/test_replace_predict/test_byBY_multiple_Dev")