import pandas as pd
import os
import numpy as np
from pathlib import Path
import masses
import mgf_processing
from weights import *
# --------------------------- argparse ---------------------#
import argparse
def parsering():
parser = argparse.ArgumentParser()
    # Training parameters
    parser.add_argument('--datafold', type=str,
                        default="/remote-home1/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/mouse/PXD005411/",
                        help='data folder')
    parser.add_argument('--dfname', type=str,
                        default="pGlycoDB-GP-FDR-Pro_PXD005411.txt",
                        help='pGlyco3 raw search result file')
    parser.add_argument('--mgfdatafold', type=str,
                        default="MSConvert_mgf_PXD005411/",
                        help='mgf data folder')
    parser.add_argument('--output_name', type=str,
                        default="PXD005411_MouseBrain_data_1st.csv", help='output file name')
parser.add_argument('--only_duplicated', type=str,default="Drop_duplicated", help='Duplicated/Drop_duplicated/Retained_all')
    parser.add_argument('--mgfsourceorign', type=str,default="pGlyco3", help='tool used to produce the mgf files (MsConvert or pGlyco3)')
parser.add_argument('--fragmentation', type=str,default="HCD", help='HCD/EThCD/ETD')
parser.add_argument('--enzyme', type=str,default="None", help='protease used')
    parser.add_argument('--filter_jsonname', type=str,default="SStruespectrum_filtered_O_", help='prefix of the filtered spectrum json file')
    parser.add_argument('--not_use_weights', action='store_true', help='do not use LocalizedSiteGroups-based weights even if the column is present')
args = parser.parse_args()
return args
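# Example invocation (a sketch; the defaults above are one author's paths, and the
# script name "prepare_data.py" is illustrative -- substitute this file's actual name):
#   python prepare_data.py --datafold /path/to/PXD005411/ \
#       --dfname pGlycoDB-GP-FDR-Pro_PXD005411.txt \
#       --mgfdatafold MSConvert_mgf_PXD005411/ \
#       --fragmentation HCD --only_duplicated Drop_duplicated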
args=parsering()
DFNAME=args.datafold+args.dfname
mgfdatafold=args.datafold+args.mgfdatafold
output_name=args.datafold+args.output_name
only_duplicated=args.only_duplicated
mgfsourceorign=args.mgfsourceorign
assert mgfsourceorign in ["pGlyco3","MsConvert"], "mgfsourceorign not in [pGlyco3,MsConvert]"
fragmentation=args.fragmentation
assert fragmentation in ["HCD","ETD","EThCD"], "fragmentation not in [HCD,ETD,EThCD]"
Enzyme=args.enzyme
if args.not_use_weights:
# Code when not using weights
print("Not using weights")
else:
# Code when using weights
print("Using weights")
# --------------------------- hyper parameters ---------------------#
FRAG_AVA=["ETD","HCD_1","HCD_by","HCD_BY_2"]
if fragmentation=="HCD":
FRAG_INDEX=[1,2] #"HCD_1" for BY prediction. "HCD_by" for by prediction
if fragmentation=="ETD":
FRAG_INDEX=[0]
if fragmentation=="EThCD":
FRAG_INDEX=[0,1,2]
FRAG_MODE=[FRAG_AVA[i] for i in FRAG_INDEX]
print(f"FRAG_MODE: {FRAG_MODE}")
jsonfold= os.path.join(mgfdatafold, "json/")
jsonname="SStruespectrum.json"
filter_jsonname=args.filter_jsonname+args.only_duplicated+".json" # unlike SStruespectrum.json, keeps only the scans present in the identification results, to narrow the search space
TOLER=20
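# TOLER is the fragment matching tolerance in ppm: an experimental peak at m/z k matches a
# theoretical m/z within +/- k * TOLER / 1e6 (e.g. +/- 0.02 Da at m/z 1000 for 20 ppm).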
# --------------------------- pglyco3 result processing---------------------#
def pglyco3_result(DFNAME):
df=pd.read_csv(DFNAME,sep="\t")
df.reset_index(inplace=True,drop=True)
df_column=list(df.columns)
print(f"Columns of df {df_column}",end = "\n\n")
print(f"df rank should all be 1.. Please check!!: {list(df['Rank'].drop_duplicates())}",end = "\n\n")
assert list(df['Rank'].drop_duplicates())==[1]
df["Peptide"]=df["Peptide"].str.replace("J","N")
if "LocalizedSiteGroups" in df_column:
if args.not_use_weights:
use_weights=False
else:
use_weights=True
print("weight is considered: ", use_weights)
else:
use_weights=False
if use_weights:
df=df[["RawName","Scan", 'Charge',"Peptide","Mod",
"PlausibleStruct",'GlySite',"RT","PrecursorMZ",'TotalFDR',"LocalizedSiteGroups"]]
else:
df=df[["RawName","Scan", 'Charge',"Peptide","Mod",
"PlausibleStruct",'GlySite',"RT","PrecursorMZ",'TotalFDR']]
print(f"Row number of df {len(df)}",end = "\n\n")
df.drop_duplicates(inplace=True)
print(f"Row number of df after drop_duplicates {len(df)}",end = "\n\n")
return df,use_weights
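# pglyco3_result returns the trimmed identification table together with a flag telling
# whether LocalizedSiteGroups-based weights will be used downstream.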
def combine_iden_pep(instance):
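    # Builds a unique glycopeptide identifier of the form
    # "<Peptide>_<abbreviated Mods or None>_<0-based glycosite>_<Charge>_<PlausibleStruct>"
    # (see the worked example next to its use in pglyco3_processing below).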
a=instance["Peptide"]
b=instance["Mod"]
e=""
if not pd.isna(b):
b=b.rstrip(";")
for i in b.split(";"):
for k in i.split(","):
k=k[:3]+"."
e+=k
b=e
else:
b=None
c=instance["GlySite"]-1 #GlySite 是从1开始的,会比index J 大一
d=instance["Charge"]
e=instance["PlausibleStruct"]
return str(a)+"_"+str(b)+"_"+str(c)+"_"+str(d)+"_"+str(e)
def pglyco3_processing(df,
only_duplicated="Drop_duplicated"):
"""Create required columns.
Args:
duplicated: True or False: whether or not only peak duplicated columns.
True: only duplicated row are retained for repeatability test.
False: only rows with lowest totalFDR for duplicated columns or unique columns are retained for training.
"""
df["iden_pep"]=df.apply(combine_iden_pep,axis=1) #eg. JASQNQDNVYQGGGVCLDCQHHTTGINCER_16.Car.19.Car.28.Car._0_4_(N(N(H(H(H))(H(H)))))
if only_duplicated=="Duplicated":
df1=df[["iden_pep"]].loc[df["iden_pep"].duplicated()].drop_duplicates()
df=df.loc[df["iden_pep"].isin(df1["iden_pep"])]
print("Waiting to process multiply glycopeptides")
if only_duplicated == "Drop_duplicated":
        df.sort_values(by='TotalFDR',ascending=True,inplace=True)
        df.drop_duplicates(subset=['iden_pep'],inplace=True) # keep the identification with the lowest TotalFDR for each glycopeptide
df.reset_index(drop=True,inplace=True)
if only_duplicated == "Retained_all":
pass
return df
# --------------------------- spectrum filtration---------------------#
# Find the corresponding spectra in the json file to narrow the search space
def json_extraction(jsonfold=jsonfold,
jsonname=jsonname,
filename=filter_jsonname,
mgfsourceorign=mgfsourceorign):
datalis=pd.read_json(os.path.join(jsonfold, jsonname))
datalis["title"]=datalis["SourceFile"].map(str) + "-" + datalis["Spectrum"].map(str)
    datalis=datalis.loc[datalis["title"].isin(df["GlySpec"])] # relies on the module-level df built in the __main__ block
print("Please ensure the Spectrum numbers of MsConvert json files match those of the pGlyco3 result!")
datalis.reset_index(inplace=True, drop=True)
datalis.to_json(os.path.join(jsonfold, filename))
return datalis
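# json_extraction keys each spectrum by "SourceFile-Spectrum"; this must match the
# "RawName-Scan" GlySpec column built in the __main__ block below.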
# ----------------------- ions picking ------------------------------#
def fragment_training(instance):
spectrum=instance["GlySpec"]
datalis_1=datalis.loc[datalis["title"]==spectrum]
datalis_1=datalis_1.reset_index(drop=True)
iden_pep=instance["iden_pep"]
    mz_calc=masses.pepfragmass(iden_pep,FRAG_MODE,3) # iden_pep already encodes the glycosite index, avoiding ambiguity when the peptide contains multiple J/N residues
ppm=TOLER
FragmentMz=[]
for mz in mz_calc:
for ion in mz:
FragmentMz.append(list(ion.values())[0])
FragmentMz=list(set(FragmentMz))
mass={"FragmentMz":FragmentMz}
    # FragmentMz: all theoretical fragment m/z values calculated above
mzdict=mgf_processing.putTintensity(ppm, mass, datalis_1)
    for k in list(mzdict.keys()): # drop theoretical m/z values with no matched experimental intensity
if mzdict[k]==0:
del mzdict[k]
    mzdict_1={}
    # attach the fragment ion types to each matched theoretical m/z
    for i in mz_calc:
        for a in i:
            mz_calc_1=list(a.values())[0]
            if mz_calc_1 in mzdict:
                ion_type=list(a.keys())[0]
                intensity=mzdict[mz_calc_1]
                if mz_calc_1 not in mzdict_1:
                    mzdict_1[mz_calc_1]=([ion_type],intensity)
                else:
                    mzdict_1[mz_calc_1][0].append(ion_type)
return mzdict_1
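# fragment_training returns a dict mapping each matched theoretical m/z to a tuple of
# (list of fragment ion types producing that m/z, intensity assigned by mgf_processing.putTintensity):
#   {calc_mz: ([ion_type, ...], intensity), ...}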
def mz_matching(instance):
spectrum=instance["GlySpec"]
datalis_1=datalis.loc[datalis["title"]==spectrum]
datalis_1=datalis_1.reset_index(drop=True)
iden_pep=instance["iden_pep"]
mz_calc=masses.pepfragmass(iden_pep,["HCD_BY_2"],4) #iden_pep已经改成了glysite,避免多J的可能
ppm=TOLER
FragmentMz=[]
for mz in mz_calc:
for ion in mz:
FragmentMz.append(list(ion.values())[0])
FragmentMz=list(set(FragmentMz))
FragmentMz.sort()
mzexp=datalis_1["mz"][0]
mzexp.sort()
matchmz=[]
for k in mzexp:
i = (np.abs(np.array(FragmentMz) - k)).argmin()
        if abs(FragmentMz[i] - k) < k * TOLER * 1 / 1000000: # TOLER is the ppm tolerance here; could be switched back to an args.ppm parameter
matchmz.append(k)
return {"matchmz":len(matchmz),"calc":len(FragmentMz),"mzexp":len(mzexp)}
# --------------------------- execution ---------------------#
if __name__=="__main__":
DFNAME_path = Path(DFNAME)
print(DFNAME_path)
assert DFNAME_path.exists()
# pglyco3 formatted result
df_fp,use_weights=pglyco3_result(DFNAME)
df=pglyco3_processing(df_fp,
only_duplicated=only_duplicated)
# if mgfsourceorign=="MsConvert":
df["GlySpec"]=df["RawName"].map(str) + "-" + df["Scan"].map(str)
#json file
json_path=Path(jsonfold,jsonname)
if json_path.exists():
print(f"{jsonname} exists.")
else:
print(f"{jsonname} does not exist. Begin mgf_process to produce required file...")
datalis=mgf_processing.mgf_process(mgfdatafold=mgfdatafold,sourceorign=mgfsourceorign)
#filtered json file
file3_name_path = Path(jsonfold,filter_jsonname)
# if file3_name_path.exists():
# print(f"{filter_jsonname} exists.")
# datalis=pd.read_json(os.path.join(jsonfold, filter_jsonname))
# else:
print(f"{file3_name_path} does not exist. Begin json_extraction to produce required file...")
datalis=json_extraction(jsonfold=jsonfold,
jsonname=jsonname,
filename=filter_jsonname,
mgfsourceorign=mgfsourceorign)
datalis.drop_duplicates(subset="title",inplace=True)
df=df[df["GlySpec"].isin(datalis["title"])]
assert len(df["GlySpec"].drop_duplicates())==len(datalis["title"].drop_duplicates())
if use_weights:
df["weights"] = df.apply(lambda x: weights(x, Enzyme), axis=1)
df=df[[ "GlySpec",'Charge',"RT","PrecursorMZ",'Peptide', 'Mod', 'PlausibleStruct', 'GlySite', 'iden_pep',"TotalFDR","weights"]]
else:
df=df[[ "GlySpec",'Charge',"RT","PrecursorMZ",'Peptide', 'Mod', 'PlausibleStruct', 'GlySite', 'iden_pep',"TotalFDR"]]
df.drop_duplicates(subset=["GlySpec",'Charge',"RT","PrecursorMZ",'Peptide', 'Mod', 'PlausibleStruct', 'GlySite', 'iden_pep',"TotalFDR"],inplace=True)
df.reset_index(drop=True,inplace=True)
print("len(df_iden_pep.drop_duplicates())",len(df["iden_pep"].drop_duplicates()))
print("len(df)",len(df))
df["ions"]=df.apply(fragment_training,axis=1)
df.to_csv(output_name,index=False)