# DeepGPO / DeepGPO_code / weights.py
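# Challenges:
# 1. There are very few spectra, so strict filtering rules would reduce sensitivity. In addition, neither the enzyme-cleavage rule nor the software score gives a highly confident result.
# 2. The software's site localization is inaccurate for many spectra (e.g. no site score, or multiple candidate sites), so accuracy needs to be improved.
# 3. The cleavage specificity of StcE is disputed, so a rule for it is very hard to pin down.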

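# Approach:
# The goal is to avoid losing too much sensitivity while still keeping a reasonable level of accuracy.
# Filtering spectra outright by fixed rules would be too complicated, because it is unclear whether the rule should follow enzyme cleavage or the site score.
# Instead, each spectrum is given a weight, and that weight is applied in the loss function.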

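# Details:
# Each training spectrum is weighted by the probability score of its single localized site
# (the original plan was the square of that score; the squaring step is currently commented out in multi()).
# If the site matches the enzyme's cleavage pattern, the probability score is multiplied by 2, with a minimum of 1.
# If the whole peptide has only one possible site, the weight is at least 1.
# Multiple site groups give a weight of 0. A reported site that disagrees with the localized group also gives 0.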

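# Note: in addition, one training strategy is to not deduplicate the spectra.
# Open question: does this count as few-shot learning? What should we answer if a reviewer asks why few-shot learning was not used?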
# Per-enzyme list of peptide positions that weights() compares against
# GlySite. A negative value v is first converted to len(Peptide) + v + 1,
# so -1 refers to len(Peptide) and -2 to len(Peptide) - 1. An empty list
# means no position is checked for that enzyme.
Enzyme_dict = {
    "None": [],
    "Trypsin": [],
    "OgpA": [1],
    "IMPa": [1],
    "SmE": [1, -1],
    "StcE": [1, -2],
    "AMsia": [1, -1],
    "FAsia": [1, -1],
    "WAsia": [1, -1],
    "YAsia": [1, -1],
    "AM0627": [1, -1],
    "F290A": [1, -1],
    "W149A": [1, -1],
    "Y287A": [1, -1],
}
def multi(instance):
    """Turn a raw ``LocalizedSiteGroups`` value into a site probability.

    Args:
        instance: Either a float (presumably the NaN of a missing cell --
            TODO confirm against the caller) or a string. A string without
            ``";"`` is split on commas after removing the braces; the first
            two fields are site labels whose first character is skipped, and
            the last field is the probability, e.g. ``"{J5,J5,0.9}"``
            (the label letter here is only an illustration).

    Returns:
        * ``"not_found"`` when ``instance`` is a float.
        * ``"Poss_multi"`` when the string contains ``";"`` (presumably
          several site groups).
        * The message ``"最后一个元素不是数字"`` ("the last element is not a
          number") when the last field cannot be parsed as a float. Callers
          receive this string instead of an exception.
        * Otherwise the probability as a float, halved when the two site
          labels differ.

    Raises:
        IndexError: If the string has fewer than two comma-separated fields.
    """
    # isinstance also accepts float subclasses, which type(x) == float
    # would let fall through to the string handling below.
    if isinstance(instance, float):
        return "not_found"
    if ";" in instance:
        # A debugger breakpoint used to sit here; it stopped every run that
        # met such an entry (or failed outright when ipdb was missing).
        return "Poss_multi"

    fields = instance.replace("{", "").replace("}", "").split(",")
    first_site = fields[0][1:]
    second_site = fields[1][1:]
    try:
        probability = float(fields[-1])
        # Open question left by the author: should the probability be
        # squared? It is currently used as-is.
    except ValueError:
        return "最后一个元素不是数字"

    # Two different site labels: the probability is split between them.
    if first_site != second_site:
        return probability / 2
    return probability
def checksite(LocalizedSiteGroups, GlySite):
    """Check that the reported site agrees with the localized site group.

    Args:
        LocalizedSiteGroups: Either a float (presumably the NaN of a missing
            cell -- TODO confirm against the caller) or a string. A string
            without ``";"`` is split on commas after removing the braces, and
            the first two fields are read as site labels: one leading
            character followed by an integer position.
        GlySite: The reported site position that is compared with the two
            parsed positions.

    Returns:
        ``True`` when there is nothing to contradict the reported site (the
        value is a float, or the string contains ``";"``), or when
        ``GlySite`` equals either parsed position; ``False`` otherwise.

    Raises:
        IndexError: If the string has fewer than two comma-separated fields.
        ValueError: If a site label does not hold an integer after its first
            character.
    """
    # isinstance also accepts float subclasses (e.g. numpy.float64), which
    # type(x) == float sent on to the ``in`` test below, where they raised
    # TypeError.
    if isinstance(LocalizedSiteGroups, float):
        return True
    if ";" in LocalizedSiteGroups:
        return True

    fields = LocalizedSiteGroups.replace("{", "").replace("}", "").split(",")
    first_site = int(fields[0][1:])
    second_site = int(fields[1][1:])
    return bool(first_site == GlySite or second_site == GlySite)
def weights(instance, Enzyme):
    """Compute the loss weight of one identified spectrum.

    Args:
        instance: Mapping that provides the keys ``"LocalizedSiteGroups"``,
            ``"GlySite"`` and ``"Peptide"`` (presumably one row of a pandas
            DataFrame -- TODO confirm against the caller).
        Enzyme: Key into ``Enzyme_dict`` selecting the positions to check.

    Returns:
        * ``0`` when ``multi`` reports ``"Poss_multi"`` or ``checksite``
          rejects the reported site.
        * When ``GlySite`` equals one of the enzyme's positions: ``1`` if no
          localization was found, otherwise ``max(1, 2 * probability)``.
        * Otherwise ``1`` when the peptide contains exactly one S or T;
          failing that, ``0`` if no localization was found, else the
          probability returned by ``multi``.

    Raises:
        KeyError: If ``Enzyme`` is not a key of ``Enzyme_dict``. This is only
            checked after the two zero-weight cases above.

    NOTE(review): ``multi`` can also return a message string when the
    probability field is not numeric; that value is not special-cased here
    and flows through the numeric branches unchanged.
    """
    site_groups = instance["LocalizedSiteGroups"]
    site = instance["GlySite"]
    peptide = instance["Peptide"]

    score = multi(site_groups)
    if score == "Poss_multi":
        return 0
    if not checksite(site_groups, site):
        return 0
    if Enzyme not in Enzyme_dict:
        raise KeyError("Enzyme not in Enzyme_dict.")

    unlocalized = score == "not_found"

    for offset in Enzyme_dict[Enzyme]:
        # Negative offsets count from the end of the peptide.
        position = offset if offset >= 0 else len(peptide) + offset + 1
        if position == site:
            return 1 if unlocalized else max(1, score * 2)

    # With a single S/T in the peptide the weight is fixed at 1.
    if peptide.count("S") + peptide.count("T") == 1:
        return 1
    return 0 if unlocalized else score
# import pandas as pd
# df=pd.read_csv("/remote-home/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/NO/PXD004590/PXD004590_Chr/pGlycoDB-GP-FDR-Pro-Quant-Site.txt",sep="\t")
# Enzyme = "None"  # or any other enzyme you want to use
# df["weights"] = df.apply(lambda x: weights(x, Enzyme), axis=1)
# import ipdb
# ipdb.set_trace()