#难点: #1.谱图数目非常少,严格的过滤规则会降低灵敏度。并且酶切和软件打分并不能给出非常置信的结果 #2.软件定位存在各种不准确的谱图,比如没有位点打分,或者多位点,需要去提高准确性 #3.StcE的酶切特性是有争议的,非常难以确定规则 #方法: #为了不过度减少灵敏度,并保证一定程度上的准确性。 #相比于直接按照一定的规则去过滤谱图,并且这个过滤规则因为不确定是按照酶切还是位点打分,过于复杂 #所以改成给不同的谱图一个权重。这个权重是加在loss函数中的 #具体: #给训练谱图加权重 是单个位点的概率分的平方 #满足酶切特点概率分乘以2,最少是1。 #整个肽段只有一个可能位点最少是1。 #多位点是0。报告结果与定位localized group不一致为0. #note:除此以外,一个训练策略就是不去重 #这算不算few-shot,如果审稿人问为什么不做few-shot怎么办 Enzyme_dict={"None":[],"Trypsin":[],"OgpA":[1],"IMPa":[1], "SmE":[1,-1],"StcE":[1,-2], "AMsia":[1,-1],"FAsia":[1,-1],"WAsia":[1,-1],"YAsia":[1,-1], "AM0627":[1,-1],"F290A":[1,-1],"W149A":[1,-1],"Y287A":[1,-1]} def multi(instance): if type(instance)==float: return "not_found" elif ";" in instance: import ipdb ipdb.set_trace() return "Poss_multi" else: instance = instance.replace("{", "").replace("}", "") parts = instance.split(',') t1_num = parts[0][1:] t2_num = parts[1][1:] try: last_number = float(parts[-1]) #概率是否需要平方 # last_number=last_number*last_number except ValueError: return "最后一个元素不是数字" if t1_num != t2_num: return last_number / 2 else: return last_number def checksite(LocalizedSiteGroups,GlySite): if type(LocalizedSiteGroups)==float: return True elif ";" in LocalizedSiteGroups: return True else: LocalizedSiteGroups = LocalizedSiteGroups.replace("{", "").replace("}", "") parts = LocalizedSiteGroups.split(',') t1_num = int(parts[0][1:]) t2_num = int(parts[1][1:]) if t1_num!=GlySite and t2_num!=GlySite: return False else: return True def weights(instance, Enzyme): LocalizedSiteGroups=instance["LocalizedSiteGroups"] GlySite=instance["GlySite"] Peptide=instance["Peptide"] prob=multi(LocalizedSiteGroups) if prob=="Poss_multi": prob=0 return prob elif not checksite(LocalizedSiteGroups,GlySite): prob=0 return prob else: if not Enzyme in Enzyme_dict.keys(): raise KeyError("Enzyme not in Enzyme_dict.") for i in Enzyme_dict[Enzyme]: if i<0: Peptide_length=len(Peptide) i=Peptide_length+i+1 if i==GlySite: if prob=="not_found": prob=1 return prob else: prob= prob*2 prob=max(1,prob) return prob if prob=="not_found": #如果肽段只有一个位点,那么最少也为1 ST_count=Peptide.count("S")+Peptide.count("T") if ST_count==1: prob=1 else: prob=0 return prob else: ST_count=Peptide.count("S")+Peptide.count("T") if ST_count==1: # prob= prob*2 # prob=max(1,prob) prob=1 return prob else: prob= prob return prob # import pandas as pd # df=pd.read_csv("/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/NO/PXD004590/PXD004590_Chr/pGlycoDB-GP-FDR-Pro-Quant-Site.txt",sep="\t") # Enzyme = "None" # or any other enzyme you want to use # df["weights"] = df.apply(lambda x: weights(x, Enzyme), axis=1) # import ipdb # ipdb.set_trace()