DeepGPO_code/postprocessing_spectra.py · DeepGPO

#对预测得到的结果转化为谱图的形式
#目前改了by和BY，exp暂时还没改
import pandas as pd
import numpy as np
import ipdb
from utils import *
import torch
import masses
from masses import *
# ----------------------- dict ------------------------------#
MAXFICHARGE = int(num_col/12)
if MAXFICHARGE == 2:
    ionname_by = "b1,b1n,b1o,b1h,b1nh,b1oh,y1,y1n,y1o,y1h,y1nh,y1oh,"\
        "b2,b2n,b2o,b2h,b2nh,b2oh,y2,y2n,y2o,y2h,y2nh,y2oh".split(
        ',')
    ionname_BY = "B1,B1n,B1o,B1f,Y1,Y1n,Y1o,Y1f,"\
                "B2,B2n,B2o,B2f,Y2,Y2n,Y2o,Y2f".split(
                ',')

if MAXFICHARGE == 3:
    ionname_by = "b1,b1n,b1o,b1h,b1nh,b1oh,y1,y1n,y1o,y1h,y1nh,y1oh,"\
        "b2,b2n,b2o,b2h,b2nh,b2oh,y2,y2n,y2o,y2h,y2nh,y2oh,"\
        "b3,b3n,b3o,b3h,b3nh,b3oh,y3,y3n,y3o,y3h,y3nh,y3oh".split(
        ',')
    ionname_BY = "B1,B1n,B1o,B1f,Y1,Y1n,Y1o,Y1f,"\
        "B2,B2n,B2o,B2f,Y2,Y2n,Y2o,Y2f,"\
        "B3,B3n,B3o,B3f,Y3,Y3n,Y3o,Y3f".split(
        ',')
lossdict_by = {"noloss": '', "H2O": 'o', "NH3": 'n',  '(HexNAc)1': 'h',
                '1(NH3)1(+HexNAc)': 'nh', '1(H2O)1(+HexNAc)': 'oh'}
lossdict_BY = {"noloss": '', "H2O": 'o', "NH3": 'n', 'FUC': 'f'}
fields_byBY = "SourceFile,id,PEP.StrippedSequence,iden_pep,PlausibleStruct,PP.Charge,metric,FragmentMz,FI.FrgType,FI.LossType,FI.Charge,FI.Intensity,FI.FrgNum,Precurmass_cal".split(',')
fields_exp = "SourceFile,id,PEP.StrippedSequence,iden_pep,PlausibleStruct,PP.Charge,FragmentMz,FI.FrgType,FI.LossType,FI.Charge,FI.Intensity,FI.FrgNum,Precurmass_cal".split(',')
    
# ----------------------- func ------------------------------#

def byBYextract(row):
    outputdataframe1 = pd.DataFrame(columns=fields_byBY)
    # StrippedSequence = "".join([vocab.to_word(i) for i in eval(row['repsequence'])])
    StrippedSequence=row["ipeptide"]
    PPCharge=int(row["charge"])
    metric=float(row["metric"])
    id=row["id"]
    global id_glyspec
    global id_iden_pep
    glyspec=id_glyspec[id]
    ms2by=row["ms2by"]
    ms2BY=row["ms2BY"]
    iden_pep=id_iden_pep[id]
    

    # import ipdb
    # ipdb.set_trace()
    PlausibleStruct=iden_pep.split("_")[-1]
    glysite=iden_pep.split("_")[2]
    # Precurmass_exp=glyspec_pepmass_mgf[glyspec]
    Precurmass_cal= (calcModpepMass(peptide_process(iden_pep))+MASS["H2O"])/PPCharge+MASS["H+"]#根据iden_pep计算母离子质荷比
    # deltamz=abs(Precurmass_exp-Precurmass_cal)/Precurmass_exp
    FRAG_MODE=["HCD_by","HCD_1"]
    mz_calc=masses.pepfragmass(iden_pep,FRAG_MODE) #得到iontype和m/z对
    # ipdb.set_trace()  
    #by  
    jmax=ms2by.size()[1]
    imax=ms2by.size()[0]
    # ipdb.set_trace()
    assert jmax==num_col
    for j in range(0,jmax):
        for i in range(0,imax):
            intensity_pred=ms2by[i][j] #从ms2中提取对应位置的强度
            if intensity_pred <=0:
                continue
            intensity_pred=float(intensity_pred)
            ionname=ionname_by[j] #从ionname列表中获得对应结果的离子类型
            FIFrgType=ionname[0]
            FICharge=int(ionname[1])
            FILossType=ionname[2:]
            FILossType=list(lossdict_by.keys())[list(lossdict_by.values()).index(FILossType)]
            assert FIFrgType in ["b","y"]
            # ipdb.set_trace()
            if FIFrgType=="b":
                FragmentNumber=i+1
                masskey="b" + str(FragmentNumber) + "_" + str(FICharge)
            if FIFrgType=="y":
                FragmentNumber=imax - i
                masskey="y" + str(FragmentNumber) + "_" + str(FICharge)
            # import ipdb
            if FIFrgType=="b" and int(FragmentNumber)<=int(glysite) and ionname.endswith("h"):
                continue
            if FIFrgType=="y" and int(i+1)>int(glysite) and ionname.endswith("h"):
                continue
            if FILossType!="noloss":
                loss=ionname[2]
                if loss !="h":
                    loss=list(lossdict_by.keys())[list(lossdict_by.values()).index(loss)]
                    masskey+="_loss_"+FILossType
            if ionname.endswith("h"):
                masskey+="_"+"(HexNAc)1"
                # import ipdb
                # ipdb.set_trace()
            else:
                masskey+="_"+"(HexNAc)0"

            counter=0
            for k in mz_calc: 
                for dict in k:
                    key=[str(key) for key in dict.keys()]
                    assert len(key)==1
                    if masskey == key[0]:
                        value=[float(value) for value in dict.values()]
                        assert len(value)==1
                        FragmentMz=value[0]
                        counter+=1
                        if counter>1:
                            raise ValueError("loop number more than one")
            # ipdb.set_trace()
            if counter==1:
                outputdataframe1.loc[len(outputdataframe1)] = [glyspec,id, StrippedSequence,iden_pep,PlausibleStruct,\
                                                           PPCharge,metric,FragmentMz,\
                                                           FIFrgType, FILossType,FICharge,intensity_pred,\
                                                           FragmentNumber,Precurmass_cal]                       


    #BY
    jmax=ms2BY.size()[1]
    imax=ms2BY.size()[0]
    assert jmax==num_col*2/3
    for j in range(0,jmax):
        for i in range(0,imax):
            intensity_pred=ms2BY[i][j] #从ms2中提取对应位置的强度
            if intensity_pred <=0:
                continue
            intensity_pred=float(intensity_pred)
            ionname=ionname_BY[j] #从ionname列表中获得对应结果的离子类型
            FIFrgType=ionname[0]
            FICharge=int(ionname[1])
            FILossType=ionname[2:]
            if FILossType=="f" and "F" not in PlausibleStruct:
                continue
            if FILossType=="f" and FIFrgType=="B":
                continue
            
            FILossType=list(lossdict_BY.keys())[list(lossdict_BY.values()).index(FILossType)]
            FragmentNumber=i
            
            #"_" + str(k) + "_" + str(ficharge)+"_loss_H2O"
            masskey="_" + str(FragmentNumber) + "_" + str(FICharge)
            if FILossType!="noloss":
                masskey+="_loss_"+FILossType
            counter=0
            # if FILossType=="FUC":
            #     ipdb.set_trace()
            for k in mz_calc: 
                for dict in k:
                    key=[str(key) for key in dict.keys()]
                    assert len(key)==1
                    if FIFrgType == key[0][0]:
                        if key[0].endswith(masskey):
                            value=[float(value) for value in dict.values()]
                            assert len(value)==1
                            FragmentMz=value[0]
                            counter+=1
                            if counter>1:
                                raise ValueError("loop number more than one")
            # import ipdb
            # ipdb.set_trace()
            if counter==1:
                outputdataframe1.loc[len(outputdataframe1)] = [glyspec,id, StrippedSequence,iden_pep,PlausibleStruct,\
                                                           PPCharge,metric,FragmentMz,\
                                                           FIFrgType, FILossType,FICharge,intensity_pred,\
                                                           FragmentNumber,Precurmass_cal]
            
    # import ipdb
    # ipdb.set_trace()
    return outputdataframe1

def expbyBYextract(row):
    outputdataframe1 = pd.DataFrame(columns=fields_exp)
    StrippedSequence = row["peptide"]
    PPCharge=int(row["charge"])
    id=row["_id"]
    glyspec=row["GlySpec"]
    # import ipdb
    # ipdb.set_trace()
    ms2by=row["ions_by"]
    ms2BY=row["ions_BY"]
    # iden_pep=id_iden_pep[id]
    # import ipdb
    # ipdb.set_trace()
    iden_pep=row["iden_pep"]
    PlausibleStruct=iden_pep.split("_")[-1]
    glysite=iden_pep.split("_")[2]
    # Precurmass_exp=glyspec_pepmass_mgf[glyspec]
    Precurmass_cal= (calcModpepMass(peptide_process(iden_pep))+MASS["H2O"])/PPCharge+MASS["H+"]#根据iden_pep计算母离子质荷比
    # deltamz=abs(Precurmass_exp-Precurmass_cal)/Precurmass_exp
    FRAG_MODE=["HCD_by","HCD_1"]
    mz_calc=masses.pepfragmass(iden_pep,FRAG_MODE) #得到iontype和m/z对
    # ipdb.set_trace()  
    #by  
    jmax=ms2by.size()[1]
    imax=ms2by.size()[0]
    # ipdb.set_trace()
    assert jmax==num_col
    for j in range(0,jmax):
        for i in range(0,imax):
            intensity_pred=ms2by[i][j] #从ms2中提取对应位置的强度
            if intensity_pred <=0:
                continue
            intensity_pred=float(intensity_pred)
            ionname=ionname_by[j] #从ionname列表中获得对应结果的离子类型
            FIFrgType=ionname[0]
            FICharge=int(ionname[1])
            FILossType=ionname[2:]
            FILossType=list(lossdict_by.keys())[list(lossdict_by.values()).index(FILossType)]
            assert FIFrgType in ["b","y"]
            # ipdb.set_trace()
            if FIFrgType=="b":
                FragmentNumber=i+1
                masskey="b" + str(FragmentNumber) + "_" + str(FICharge)
            if FIFrgType=="y":
                FragmentNumber=imax - i
                masskey="y" + str(FragmentNumber) + "_" + str(FICharge)
            # import ipdb
            if FIFrgType=="b" and int(FragmentNumber)<=int(glysite) and ionname.endswith("h"):
                continue
            if FIFrgType=="y" and int(i+1)>int(glysite) and ionname.endswith("h"):
                continue
            if FILossType!="noloss":
                loss=ionname[2]
                if loss !="h":
                    loss=list(lossdict_by.keys())[list(lossdict_by.values()).index(loss)]
                    masskey+="_loss_"+FILossType
            if ionname.endswith("h"):
                masskey+="_"+"(HexNAc)1"
                # import ipdb
                # ipdb.set_trace()
            else:
                masskey+="_"+"(HexNAc)0"

            counter=0
            for k in mz_calc: 
                for dict in k:
                    key=[str(key) for key in dict.keys()]
                    assert len(key)==1
                    if masskey == key[0]:
                        value=[float(value) for value in dict.values()]
                        assert len(value)==1
                        FragmentMz=value[0]
                        counter+=1
                        if counter>1:
                            raise ValueError("loop number more than one")
            # ipdb.set_trace()
            if counter==1:
                outputdataframe1.loc[len(outputdataframe1)] = [glyspec,id, StrippedSequence,iden_pep,PlausibleStruct,\
                                                           PPCharge,FragmentMz,\
                                                           FIFrgType, FILossType,FICharge,intensity_pred,\
                                                           FragmentNumber,Precurmass_cal]                       


    #BY
    jmax=ms2BY.size()[1]
    imax=ms2BY.size()[0]
    assert jmax==num_col*2/3
    for j in range(0,jmax):
        for i in range(0,imax):
            intensity_pred=ms2BY[i][j] #从ms2中提取对应位置的强度
            if intensity_pred <=0:
                continue
            intensity_pred=float(intensity_pred)
            ionname=ionname_BY[j] #从ionname列表中获得对应结果的离子类型
            FIFrgType=ionname[0]
            FICharge=int(ionname[1])
            FILossType=ionname[2:]
            if FILossType=="f" and "F" not in PlausibleStruct:
                continue
            if FILossType=="f" and FIFrgType=="B":
                continue
            
            FILossType=list(lossdict_BY.keys())[list(lossdict_BY.values()).index(FILossType)]
            FragmentNumber=i
            
            #"_" + str(k) + "_" + str(ficharge)+"_loss_H2O"
            masskey="_" + str(FragmentNumber) + "_" + str(FICharge)
            if FILossType!="noloss":
                masskey+="_loss_"+FILossType
            counter=0
            # if FILossType=="FUC":
            #     ipdb.set_trace()
            for k in mz_calc: 
                for dict in k:
                    key=[str(key) for key in dict.keys()]
                    assert len(key)==1
                    if FIFrgType == key[0][0]:
                        if key[0].endswith(masskey):
                            value=[float(value) for value in dict.values()]
                            assert len(value)==1
                            FragmentMz=value[0]
                            counter+=1
                            if counter>1:
                                raise ValueError("loop number more than one")
            # import ipdb
            # ipdb.set_trace()
            if counter==1:
                outputdataframe1.loc[len(outputdataframe1)] = [glyspec,id, StrippedSequence,iden_pep,PlausibleStruct,\
                                                           PPCharge,FragmentMz,\
                                                           FIFrgType, FILossType,FICharge,intensity_pred,\
                                                           FragmentNumber,Precurmass_cal]
            
    # import ipdb
    # ipdb.set_trace()
    return outputdataframe1

def byBYextract_reidentification(row):
    outputdataframe1 = pd.DataFrame(columns=["GlySpec","modelpep","FragmentMz",\
                                             "FIFrgType", "FILossType","FICharge","intensity_pred",\
                                             "FragmentNumber","FragmentGlycan"])
    import ipdb
    GlySpec=row["GlySpec"]
    FRAG_MODE=["HCD_1"]
    # ipdb.set_trace()  

    #BY
    for num in ["1","2"]:
        modelpep=row["modelpep"+num]
        ms2BY=row["maskmodel"+num]
        mz_calc=masses.pepfragmass(modelpep,FRAG_MODE) #得到iontype和m/z对
        # ipdb.set_trace()
        jmax=ms2BY.size()[1]
        imax=ms2BY.size()[0]
        assert jmax==num_col*2/3
        # ipdb.set_trace()
        for j in range(0,jmax):
            for i in range(0,imax):
                intensity_pred=ms2BY[i][j] #从ms2中提取对应位置的强度
                if intensity_pred <=0:
                    continue
                intensity_pred=float(intensity_pred)
                ionname=ionname_BY[j] #从ionname列表中获得对应结果的离子类型
                FIFrgType=ionname[0]
                FICharge=int(ionname[1])
                FILossType=ionname[2:]
                
                FILossType=list(lossdict_BY.keys())[list(lossdict_BY.values()).index(FILossType)]
                FragmentNumber=i
                masskey="_" + str(FragmentNumber) + "_" + str(FICharge)
                if FILossType!="noloss":
                    masskey+="_loss_"+FILossType
                counter=0
                # ipdb.set_trace()
                for k in mz_calc: 
                    for dict in k:
                        key=[str(key) for key in dict.keys()]
                        assert len(key)==1
                        if FIFrgType == key[0][0]:
                            if key[0].endswith(masskey):
                                value=[float(value) for value in dict.values()]
                                assert len(value)==1
                                FragmentMz=value[0]
                                fragnode=str(key[0].split("_")[0][1:])
                                # ipdb.set_trace()
                                counter+=1
                                if counter>1:
                                    raise ValueError("loop number more than one")
                # import ipdb
                # ipdb.set_trace()
                if counter==1:
                    outputdataframe1.loc[len(outputdataframe1)] = [GlySpec,modelpep,FragmentMz,\
                                                            FIFrgType, FILossType,FICharge,intensity_pred,\
                                                            FragmentNumber,fragnode]
            
    # import ipdb
    # ipdb.set_trace()
    return outputdataframe1
# ----------------------- execute ------------------------------#
def postprocessing(savename):
    filename=savename+"_byBY_result.csv"
    df=pd.read_csv(filename)
    # import ipdb
    # ipdb.set_trace()
    # df=df.loc[df["metric"]==max(df["metric"])] #取一个进行测试
    # df.to_csv(datafold+"del.csv",index=False)
    print(df.columns)
    df["ms2by"]=df["ms2by"].apply(lambda x:torch.Tensor(eval(x)))
    df["ms2BY"]=df["ms2BY"].apply(lambda x:torch.Tensor(eval(x)))
    #训练的时候ms2使用了log2（x+1)，这里变回来#检查一下
    #'repsequence', 'charge', 'ipeptide', 'iPlausibleStruct', 'ms2', 'cos','id'
    print(f"The number of the lines of the input files {len(df)}")
    filename_expjson=savename+"_byBY_testdata.json"
    dfexp=pd.read_json(filename_expjson)
    # dfexp=dfexp.loc[dfexp["_id"]==2665] 
    # dfexp.to_csv(datafold+"exp_6923_del.csv",index=False)
    print(dfexp.columns)
    # import ipdb
    # ipdb.set_trace()
    global id_glyspec
    global id_iden_pep
    id_glyspec = dict(zip(dfexp['_id'],dfexp['GlySpec'])) #产生_id,GlySpec字典
    id_iden_pep=dict(zip(dfexp['_id'],dfexp['iden_pep']))
    # import ipdb
    # ipdb.set_trace()
    # dfinput=pd.read_csv(dfinput)
    # ipdb.set_trace()
    outputdataframe_BY=pd.concat(df.apply(byBYextract,axis=1).to_list())
    outputdataframe_BY.reset_index(inplace=True,drop=True)
    print(f"The number of the lines of the output files {len(outputdataframe_BY)}")
    outputdataframe_BY.to_csv(savename+"_predictmonomz.csv",index=False)


    
    # import ipdb
    # ipdb.set_trace()
    dfexp["ions_by"]=dfexp["ions_by"].apply(lambda x:torch.Tensor(x))
    dfexp["ions_BY"]=dfexp["ions_BY"].apply(lambda x:torch.Tensor(x))
    print(dfexp.columns)
    # dfexp.to_csv(datafold+"exp_6923_del.csv",index=False
    print(f"The number of the lines of the input files {len(dfexp)}")
    # outputdataframe_expBY = pd.DataFrame(columns=fields_exp)
    # import ipdb
    # ipdb.set_trace()
    outputdataframe_expBY=pd.concat(dfexp.apply(expbyBYextract,axis=1).to_list())
    outputdataframe_expBY.reset_index(inplace=True,drop=True)
    print(f"The number of the lines of the output files {len(outputdataframe_expBY)}")
    outputdataframe_expBY.to_csv(savename+"_expmonomz.csv",index=False)

def Precurmass_cal(innstance):
    iden_pep=innstance["iden_pep"]
    PPCharge=int(innstance["charge"])
    Precurmass_cal=(calcModpepMass(peptide_process(iden_pep))+MASS["H2O"])/PPCharge+MASS["H+"]
    return Precurmass_cal
def postprocessing_nointensity(savename,dfinputname):
    filename=savename+"_byBY_result.csv"
    df=pd.read_csv(filename)
    print(df.columns)
    print(f"The number of the lines of the input files {len(df)}")
    filename_expjson=savename+"_byBY_testdata.json"
    dfexp=pd.read_json(filename_expjson)
    print(dfexp.columns)
    print(f"The number of the lines of the input exp files {len(dfexp)}")
    # import ipdb
    # ipdb.set_trace()
    dfexp=dfexp[['_id','GlySpec',"iden_pep"]]
    df=df[[ 'charge', 'ipeptide', 'iPlausibleStruct','metric', "metricBY_cos",'id']]
    df.columns=['charge', 'PEP.StrippedSequence', 'PlausibleStruct','metric',"metricBY", '_id']
    df=pd.merge(df,dfexp,on="_id",how="left")
    # import ipdb
    # ipdb.set_trace()
    dfinput=pd.read_csv(dfinputname)
    # ipdb.set_trace()
    dfinput=dfinput[['GlySpec','pepmass_mgf',"matching_ration","iden_pep"]].drop_duplicates()
    print(len(dfinput))
    df=pd.merge(df,dfinput,on=["GlySpec","iden_pep"],how="left")
    # ipdb.set_trace()
    df["Precurmass_cal"]= df.apply(Precurmass_cal,axis=1)
    df["deltamzpercent"]=abs(df["pepmass_mgf"]-df["Precurmass_cal"])/df["Precurmass_cal"]
    df.to_csv(savename+"_nointensity_predictmonomz.csv",index=False)


# if __name__ == '__main__':  

    # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/IgG/PXD015360_IgG/test_replace_predict/test_byBY_PXD015360_IgG_data_1st_target_decoy_model"
    # dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/IgG/PXD015360_IgG/PXD015360_IgG_data_1st_redo1_nocoelute_target_decoy_Retained_all.csv"
    # postprocessing_nointensity(savename,dfinputname)


    # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD009654/test_replace_predict/test_byBY_PXD009654_human_data_1st_target_decoy_model"
    # dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD009654/PXD009654_data_1st_redo1_nocoelute_target_decoy_Retained_all.csv"
    # postprocessing_nointensity(savename,dfinputname)

    # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/IgG/PXD009716/test_replace_predict/PXD009716_IgG_data_1st_target_decoy_NAGF_model_0.930816"
    # dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/IgG/PXD009716/PXD009716_IgG_data_1st_target_decoy_Retained_all.csv"
    # postprocessing_nointensity(savename,dfinputname)


    # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/syn/PXD023980/test_replace_predict/test_byBY_PXD023980_syn_test_9_glycans_redo1_model_0.927153"
    # dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/syn/PXD023980/PXD023980_Syn_test_9_glycans_redo1_data_1st.csv"
    # postprocessing_nointensity(savename,dfinputname)
    
    # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/syn/PXD023980/test_replace_predict/test_byBY_PXD023980_syn_identification_task3"
    # dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/syn/PXD023980/Syn_PXD023980_task3_data_1st.csv"
    # postprocessing_nointensity(savename,dfinputname)

    # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD016428/test_replace_predict/test_byBY_PXD016428_IgG_data_1st_target_decoy_model_0.933599"
    # dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD016428/PXD016428_serum_data_1st_target_decoy_Retained_all.csv"
    # postprocessing_nointensity(savename,dfinputname)


    # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD016428/test_replace_predict/test_byBY_PXD016428_IgG_data_1st_target_decoy_model_0.937599"
    # dfinputname="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/PXD016428/PXD016428_serum_data_1st_target_decoy_Retained_all.csv"
    # postprocessing_nointensity(savename,dfinputname)

    # savename="/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/mouse/Five_tissues/test_replace_predict/test_byBY_PXD005411_model_0.927740"
    # postprocessing(savename)