# Merged by/BY ion prediction (training script)
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from preprocess import PPeptidePipebyBY
from torch.optim.lr_scheduler import StepLR,ReduceLROnPlateau
from model_gly import *
from Bertmodel import ModelbyBYms2_bert
import ipdb
import pandas as pd
from pathlib import Path
from utils import *
from transformers import BertConfig
# ----------------------- training time begin ------------------------------#
from timeit import default_timer as timer
train_time_start = timer()
import datetime
starttime = datetime.datetime.now()
print(f"starttime {starttime}",end="\n\n")
# ----------------------- model parameter for optimization ------------------------------#
# Hyperparameter tuning; the underlying base model can also be changed.
# NNI can be used for automatic hyperparameter tuning: https://nni.readthedocs.io/zh/stable/index.html
import argparse
import logging
logger = logging.getLogger("my_logger")
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--lr', type=float, default=0.0001, help='learning rate')
parser.add_argument('--warmupsteps', type=int, default=0, help='warmupsteps')
parser.add_argument('--weight_decay', type=float, default=1e-2, help='weight_decay')
parser.add_argument('--device', type=int, default=0, help='cudadevice')
parser.add_argument('--testdata', type=str, default="alltest", help='data for test')
parser.add_argument('--step_size', type=int, default=1000, help='step size')
parser.add_argument('--lr_sche', default="False")
    parser.add_argument('--model_ablation', type=str, default="DeepFLR", help='model for ablation (Transformer, BERT, mouse_human_all, All_base, DeepFLR)')
parser.add_argument('--folder_path', type=str,
default="/remote-home1/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/code2/task_processing/NO/multiple/",
help='the folder path for the training data')
parser.add_argument("--organism",type=str, default="mouse")
parser.add_argument("--task_name",type=str, default="byBY_mouse_five_tissues")
parser.add_argument("--trainpathcsv",type=str,
default="Five_tissues/Mouse_five_tissues_data_1st_redo1_filtered_combine.csv")
parser.add_argument("--pattern",type=str,
default='*_data_1st.csv')
parser.add_argument("--ms2_method",type=str,default='cos_sqrt')
args = parser.parse_args()
return args
args = parse_arguments()
lr=args.lr
lr_sche=args.lr_sche
warmupsteps=args.warmupsteps
weight_decay=args.weight_decay
organism=args.organism
testdata=args.testdata
pattern=args.pattern
device = torch.device('cuda', args.device) if torch.cuda.is_available() else torch.device('cpu')
# ----------------------- model parameter------------------------------#
set_seed(seed)
batch_size=128
task_name=args.task_name
check_point_name=str(starttime)+task_name+testdata+"_checkpoint"
print("GNN_edge_decoder_type ",GNN_edge_decoder_type)
print("GNN_edge_hidden_dim ",GNN_edge_hidden_dim)
print("GNN_edge_num_layers ",GNN_edge_num_layers)
print("GNN_global_hidden_dim ",GNN_global_hidden_dim)
print(f"Project name check_point_name {check_point_name} !",end="\n\n")
print(f"hyper parameter tested lr {lr} !",end="\n\n")
print(f"hyper parameter tested BATCH_SIZE {BATCH_SIZE} !",end="\n\n")
print(f"hyper parameter tested warmupsteps {warmupsteps} !",end="\n\n")
print(f"hyper parameter tested batch_size {batch_size} !",end="\n\n")
print(f"hyper parameter tested weight decay {weight_decay} !",end="\n\n")
# ----------------------- input pre-processing------------------------------#
folder_path = args.folder_path
import folder_walk
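# folder_walk.trainpathcsv_list is assumed to glob the training CSV files matching `pattern` under the given folder (local helper module).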
trainpathcsv_list=[]
for org in organism.split(","):
folder_pathi=folder_path+org+"/"
print(folder_pathi)
trainpathcsv_list+=folder_walk.trainpathcsv_list(folder_path=folder_pathi,pattern=pattern)
# trainpathcsv_list+=folder_walk.trainpathcsv_list(folder_path=folder_pathi,pattern='PXD031025_data_1st.csv')
# trainpathcsv_list = [item for item in trainpathcsv_list if "PXD024995" in item]
# trainpathcsv_list = [item for item in trainpathcsv_list if "process" not in item]
print(f"Please check trainpathcsv_list {trainpathcsv_list}. The {len(trainpathcsv_list)} files contains all the training data!")
# Merge the collected files and export the result; sort by TotalFDR and drop duplicates.
traincsv=pd.DataFrame()
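# Note: `org` below still holds the last organism from the loop above, so the combined CSV is written under that organism's folder.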
trainpathcsv=folder_path+org+"/"+args.trainpathcsv
for x in trainpathcsv_list:
if testdata in x:
print(f"The test data is {x}! It is removed from the training data!")
else:
train=pd.read_csv(x)
traincsv=pd.concat([traincsv,train])
# traincsv.sort_values(by='TotalFDR',ascending=True,inplace=True)
# traincsv.drop_duplicates(subset=['iden_pep'],inplace=True)
traincsv.reset_index(drop=True,inplace=True)
print(f"with the addition of {x}, the combined file contains {len(traincsv)} lines")
# ipdb.set_trace()
# We want the model to put more emphasis on high-weight spectra: either apply a cutoff (drop weights <= 0.5), which also keeps the metric computation from being skewed,
# or square the weights instead.
if "weights" in traincsv.columns:
traincsv=traincsv[traincsv["weights"]>0.5]
traincsv=traincsv[traincsv["ions"]!="{}"]
traincsv.drop_duplicates(inplace=True)
traincsv.reset_index(drop=True,inplace=True)
# Weight correction based on GlySite (commented out below).
# glysite_counts = traincsv['GlySite'].value_counts()
# traincsv['weights'] = traincsv.apply(lambda row: row['weights'] / glysite_counts[row['GlySite']]*max(glysite_counts), axis=1)
print("number of training spectra",len(traincsv))
print("number of training iden_pep",len(traincsv["iden_pep"].drop_duplicates()))
# import ipdb
# ipdb.set_trace()
output_directory = os.path.dirname(trainpathcsv)
# Check if the directory exists, and create it if not
if not os.path.exists(output_directory):
os.makedirs(output_directory)
print(trainpathcsv)
if not os.path.exists(trainpathcsv):
traincsv.to_csv(trainpathcsv,index=False)
traindatajson=trainpathcsv[:-4]+"_byBYprocessed.json"
trainjson=trainpathcsv[:-4]+"_train_byBYprocessed.json"
devjson=trainpathcsv[:-4]+"_dev_byBYprocessed.json"
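# matrixwithdict.py with --split True is expected to write the processed json above together with the
# _train_/_dev_ split files whose names are derived here (assumption based on how they are loaded below).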
# ipdb.set_trace()
traindatajson_path = Path(traindatajson)
if traindatajson_path.exists():
print(f"{traindatajson} exists.")
# df_fp=pd.read_json(traindatajson)
else:
print(f"{traindatajson} does not exist. Begin matrixwithdict to produce result...")
os.system("python matrixwithdict.py \
--do_byBY \
--DDAfile {} \
--outputfile {} \
--split {}".format(trainpathcsv,traindatajson,"True"))
filename=traindatajson
filename_train=trainjson
filename_dev=devjson
# import ipdb
databundle_train=PPeptidePipebyBY(vocab=vocab).process_from_file(paths=filename_train)
databundle_dev=PPeptidePipebyBY(vocab=vocab).process_from_file(paths=filename_dev)
traindata=databundle_train.get_dataset("train")
devdata=databundle_dev.get_dataset("train")
print("totaldata",devdata)
# If totaldata is not deduplicated, it must be split by iden_pep to avoid data leakage between train and dev.
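# Helper: materialize a fastNLP DataSet into a pandas DataFrame, converting the by/BY ion
# intensity tensors to plain lists so a split can be written out and inspected.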
def savingFastnlpdataset_DataFrame(dataset):
dataset_field=dataset.field_arrays.keys()
frame=pd.DataFrame(columns=dataset_field)
for i in range(len(dataset)):
c_list=[]
for name in dataset_field:
target=dataset.field_arrays[name][i]
if name=="ions_by_p" or name=="ions_BY_p":
c_list.append(target.cpu().numpy().tolist())
else:
c_list.append(target)
frame.loc[i]=c_list
return frame
# devframe=savingFastnlpdataset_DataFrame(devdata)
# devframe.to_json(trainpathcsv[:-4]+"_totaldata_devframe.json")
# torch.save(devframe,"20230223_test_model_validata_BY")
# trainframe=savingFastnlpdataset_DataFrame(traindata)
# trainframe.to_json("20230223_test_model_train_data_BY.json")
# torch.save(trainframe,"20230223_test_model_train_data_BY")
# ----------------------- model ------------------------------#
model_ablation=args.model_ablation #DeepFLR
print(f"hyper parameter tested model_ablation {model_ablation} !",end="\n\n")
if model_ablation=="Transformer":
config=BertConfig.from_pretrained("bert-base-uncased")
deepms2=ModelbyBYms2_bert(config)
if model_ablation=="BERT":
pretrainmodel="bert-base-uncased"
# pretrainmodel="/remote-home1/share/hf_cache/huggingface/hub/models--bert-base-uncased"
deepms2=ModelbyBYms2_bert.from_pretrained(pretrainmodel)
if model_ablation=="mouse_human_all":
config=BertConfig.from_pretrained("bert-base-uncased")
logger.warning("get config")
bestmodelpath="/remote-home1/yxwang/test/zzb/DeepGlyco/DeepSweet_v1/data/human/Mouse_human_15_datasets_data_1st_redo1_filtered_combine_byBYprocessed/checkpoints/2023-06-22-01-10-34-746895/epoch-124_step-47988_mediancos-0.927153.pt"
model_sign=bestmodelpath.split("/")[-1]
deepms2=ModelbyBYms2_bert(config)
bestmodel=torch.load(bestmodelpath).state_dict()
logger.warning("get bestmodel")
origin_model=deepms2.state_dict()
    for key in origin_model.keys():
        if key in bestmodel.keys():
            if bestmodel[key].shape != origin_model[key].shape:
                origin_model[key] = bestmodel[key][:origin_model[key].shape[0],]  # truncate along the first dimension where the checkpoint shape does not match
                logger.warning(f"size different key: {key}")
            else:
                origin_model[key] = bestmodel[key]
        else:
            logger.warning(f"not found key: {key}")  # GNN parameters with no match in the checkpoint keep their initial values
logger.warning("starting loading")
deepms2.load_state_dict(origin_model)
if model_ablation=="All_base":
config=BertConfig.from_pretrained("bert-base-uncased")
bestmodelpath="../multiple_data/model/base_model/epoch-100_step-63500_mediancos-0.940425.pt"
model_sign=bestmodelpath.split("/")[-1]
deepms2=ModelbyBYms2_bert(config)
checkpoint = torch.load(bestmodelpath)
checkpoint_state_dict = checkpoint if isinstance(checkpoint, dict) else checkpoint.state_dict()
model_state_dict = deepms2.state_dict()
filtered_state_dict = {
k: v for k, v in checkpoint_state_dict.items()
if k in model_state_dict and v.shape == model_state_dict[k].shape
}
model_state_dict.update(filtered_state_dict)
deepms2.load_state_dict(model_state_dict)
print(f"Successfully loaded {len(filtered_state_dict)} parameters from checkpoint.")
logger.warning("get model")
#model info
from torchinfo import summary
summary(deepms2)
logger.warning("after summary")
# ipdb.set_trace()
# ----------------------- Trainer ------------------------------#
from fastNLP import Const
# pred_by and pred_BY could be merged into a single output.
# metrics=CossimilarityMetricfortest_byBY(savename=None,
# pred_by="pred_by",pred_BY="pred_BY",
# target_by="ions_by_p",target_BY="ions_BY_p",
# seq_len='seq_len',num_col=num_col,
# sequence='sequence',charge="charge",
# decoration="decoration")
metrics=CossimilarityMetricfortest_byBY(savename=None,pred=Const.OUTPUT,target=Const.TARGET,
seq_len='seq_len',num_col=num_col,sequence='sequence',
charge="charge",decoration="decoration",
args=args)
# from fastNLP import MSELoss
from MSELoss_for_byBY import MSELoss_byBY
loss=MSELoss_byBY(pred_by="pred_by",pred_BY="pred_BY",target_by="target_by",target_BY="target_BY")
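# MSELoss_byBY is assumed to combine the MSE computed separately on the predicted by and BY
# intensities against their respective targets (see MSELoss_for_byBY.py).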
# loss=MSELoss(pred=Const.OUTPUT,target=Const.TARGET)
import torch.optim as optim
optimizer=optim.AdamW(deepms2.parameters(),lr=lr,weight_decay=weight_decay)
if lr_sche=="True":
step_size=args.step_size
print(f"hyper parameter tested step_size {step_size} !",end="\n\n")
lr_scheduler = StepLR(optimizer, step_size=step_size,verbose=True)
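    # StepLR with the default gamma=0.1 decays the learning rate by 10x every `step_size` scheduler steps.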
print(f"lr scheduler is used ! lr: {lr}, step_size: {step_size}")
# lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(traindata))
# lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.2, patience=10,
# verbose=False,
# threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0,
# eps=1e-08)
# print(f"lr scheduler is used ! lr: {lr}")
from fastNLP import WarmupCallback,SaveModelCallback,LRScheduler
save_path=filename[:-5]+"/checkpoints"
# save_path=os.path.join(path0,"checkpoints/"+pretrainmodel+"/pretrained_trainall_ss/combinemann_all")
callback=[WarmupCallback(warmupsteps)]
# callback.append(WandbCallback(project="Deepsweet",name=check_point_name,config={"lr":lr,"seed":seed,
# "Batch_size":BATCH_SIZE,"warmupsteps":warmupsteps,"temperature":None,"weight_decay":None}))
callback.append(SaveModelCallback(save_path,top=5))
if lr_sche=="True":
callback.append(LRScheduler(lr_scheduler))
#trainer
from fastNLP import Trainer
if vocab_save:
vocab.save(os.path.join(save_path,"vocab"))
import ipdb
pptrainer=Trainer(model=deepms2, train_data=traindata,
device=device, dev_data=devdata,
save_path=save_path,
loss=loss,metrics=metrics,callbacks=callback,
optimizer=optimizer,n_epochs=N_epochs,batch_size=batch_size,
update_every=int(BATCH_SIZE/batch_size),dev_batch_size=batch_size)
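# update_every above enables gradient accumulation in fastNLP: parameters are updated every
# BATCH_SIZE/batch_size mini-batches, giving an effective batch size of BATCH_SIZE.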
pptrainer.train()
# ----------------------- Time ------------------------------#
train_time_end = timer()
def print_train_time(start: float, end: float, device: torch.device = None):
"""Prints difference between start and end time.
Args:
start (float): Start time of computation (preferred in timeit format).
end (float): End time of computation.
device ([type], optional): Device that compute is running on. Defaults to None.
Returns:
float: time between start and end in seconds (higher is longer).
"""
total_time = end - start
print(f"Train time on {device} for{check_point_name}: {total_time:.3f} seconds")
return total_time
total_train_time_model_2 = print_train_time(start=train_time_start,
end=train_time_end,
device=device)