# -*- coding: utf-8 -*-
"""
Created on Tue May 26 16:49:43 2020

@author: baum_c4

performs K-fold cross validation (10fold) and derives RMSEs to compare different models

derives Shapley values from the whole dataset

"""



import pickle
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
import shap

from func_sorted_stratification import sorted_stratification
with open('delay_DST_learningset.pickle', 'rb') as f:
    [learnvector, timevector] = pickle.load(f)

# refit on the whole dataset later for the Shapley analysis
classifiers = RandomForestRegressor(n_estimators=60, random_state=2)
classifier_noDST = RandomForestRegressor(n_estimators=60, random_state=1)  # not used in this script
X = learnvector  # overwritten below by the stratified folds
Y = timevector
   
# Legacy: hyperparameters were previously loaded from separate pickles
# ('optimized_gp5.pickle', 'optimized_forest5.pickle'); kept for reference.
# with open('optimized_gp5.pickle','rb') as f:
#    res_gp=pickle.load(f)
# gb_model=GradientBoostingRegressor(max_depth=res_gp[0],
#                                   learning_rate = res_gp[1],
#                                   random_state = 42,
#                                   max_features=res_gp[2],
#                                   min_samples_split=res_gp[3],
#                                   min_samples_leaf=res_gp[4],
#                                   n_estimators=res_gp[5]) 
# with open('optimized_forest5.pickle','rb') as f:
#    res_forest=pickle.load(f)
# scopt_model=RandomForestRegressor(max_depth=res_forest[0],
#                                   n_estimators = res_forest[1],
#                                   random_state = 42,
#                                   max_features=res_forest[2],
#                                   min_samples_split=res_forest[3],
#                                   min_samples_leaf=res_forest[4]) 
# split the data into ten folds via sorted stratification
X, Y = sorted_stratification(learnvector, timevector, 10, 1)
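# sorted_stratification (imported above) is not shown here; from its use it is
# assumed to deal the samples into n folds such that each fold covers the full
# range of target delays. A minimal sketch of that idea (hypothetical helper,
# not the imported implementation):
def _sorted_stratification_sketch(X, y, n_folds):
    # sort by the measured delay, then deal samples round-robin into folds
    order = np.argsort(y[:, 2])
    folds = [order[k::n_folds] for k in range(n_folds)]
    return [X[f] for f in folds], [y[f] for f in folds]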

# load the optimized hyperparameters (gradient boosting and random forest)
with open('best_bayes.pickle', 'rb') as f:
    GPparams, RFparams = pickle.load(f)

# train models with the optimized parameters
scopt_model = RandomForestRegressor(max_depth=RFparams['max_depth'],
                                    n_estimators=500,  # not optimized: for random forests, more trees only improve the fit
                                    random_state=42,
                                    max_features=RFparams['max_features'],
                                    min_samples_split=RFparams['min_samples_split'],
                                    min_samples_leaf=RFparams['min_samples_leaf'],
                                    min_impurity_decrease=RFparams['min_impurity_decrease'],
                                    #max_samples=RFparams['max_samples']
                                    )
gb_model = GradientBoostingRegressor(max_depth=GPparams['max_depth'],
                                     n_estimators=GPparams['n_estimators'],
                                     random_state=42,
                                     max_features=GPparams['max_features'],
                                     min_samples_split=GPparams['min_samples_split'],
                                     min_samples_leaf=GPparams['min_samples_leaf'],
                                     learning_rate=GPparams['learning_rate'],
                                     min_impurity_decrease=GPparams['min_impurity_decrease'],
                                     #subsample=GPparams['subsample']
                                     )
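# The parameter dictionaries are assumed to look like the following
# (hypothetical illustrative values; the real ones come from best_bayes.pickle):
# RFparams = {'max_depth': 20, 'max_features': 0.5, 'min_samples_split': 2,
#             'min_samples_leaf': 1, 'min_impurity_decrease': 0.0}
# GPparams adds 'n_estimators' and 'learning_rate' on top of these keys.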


idx = list(range(7))  # indices of the features to use (all seven by default)
#idx.pop(2)  # pop out features here if needed
#idx.pop(5)
#idx.pop(4)


allRMSES = []    # per-fold RMSEs for all compared models and baselines
baseforest = []  # random forest, default parameters
basegp = []      # gradient boosting, default parameters
optgp = []       # gradient boosting, optimized parameters
optforest = []   # random forest, optimized parameters
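# All RMSEs below follow the same pattern: root mean square of the residuals,
# converted from seconds to minutes. An equivalent helper (defined for
# reference only; the loop keeps its inline computations):
def rmse_minutes(pred, truth):
    # RMSE of (pred - truth) in minutes, ignoring NaNs in the sum
    return np.sqrt(np.nansum(np.square(pred - truth))/len(truth))/60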

for j in range(10):
    # fold j is the test set; the remaining nine folds form the training set
    XTrain = np.concatenate([X[i] for i in range(len(X)) if i != j])
    YTrain = np.concatenate([Y[i] for i in range(len(X)) if i != j])
    XTest = X[j]
    YTest = Y[j]

    # random forest with default parameters
    base_model = RandomForestRegressor(random_state=42)
    base_model.fit(XTrain, YTrain[:, 2])
    baseparam = base_model.get_params()
    Y_pred = base_model.predict(XTest)
    deltapred = Y_pred - YTest[:, 2]
    baseRMSEpred = np.sqrt(np.nansum(np.square(deltapred))/len(XTest))/60  # seconds -> minutes
    baseforest.append(baseRMSEpred)

    # gradient boosting with default parameters
    basegp_model = GradientBoostingRegressor(random_state=42)
    basegp_model.fit(XTrain, YTrain[:, 2])
    basegpparam = basegp_model.get_params()
    Y_predgp = basegp_model.predict(XTest)
    deltapredgp = Y_predgp - YTest[:, 2]
    basegpRMSEpred = np.sqrt(np.nansum(np.square(deltapredgp))/len(XTest))/60
    basegp.append(basegpRMSEpred)

    # random forest with optimized parameters
    scopt_model.fit(XTrain, YTrain[:, 2])
    Y_scopt = scopt_model.predict(XTest)
    scoptpred = (Y_scopt - YTest[:, 2])/60
    RMSEscopt = np.sqrt(np.nansum(np.square(scoptpred))/len(XTest))
    optforest.append(RMSEscopt)

    # gradient boosting with optimized parameters
    gb_model.fit(XTrain, YTrain[:, 2])
    Y_gb = gb_model.predict(XTest)
    gppred = (Y_gb - YTest[:, 2])/60
    RMSEboost = np.sqrt(np.nansum(np.square(gppred))/len(XTest))
    optgp.append(RMSEboost)

    # linear regression baseline
    ylinreg = LinearRegression().fit(XTrain, YTrain[:, 2]).predict(XTest)
    deltalinreg = (ylinreg - YTest[:, 2])/60
    RMSElinreg = np.sqrt(np.nansum(np.square(deltalinreg))/len(XTest))

    # baselines from the dataset itself: flat (column 1) and vector (column 0)
    # propagation delay estimates against the measured delay (column 2)
    deltaflat = (YTest[:, 1] - YTest[:, 2])/60
    deltavec = (YTest[:, 0] - YTest[:, 2])/60
    deltapred = (Y_pred - YTest[:, 2])/60  # default random forest residuals, in minutes

    RMSEflat = np.sqrt(np.nansum(np.square(deltaflat))/len(XTest))
    RMSEvec = np.sqrt(np.nansum(np.square(deltavec))/len(XTest))
    RMSEpred = np.sqrt(np.nansum(np.square(deltapred))/len(XTest))

    # maximum absolute errors (computed but not stored)
    maxvec = np.max(np.abs(deltavec))
    maxpred = np.max(np.abs(deltapred))
    maxflat = np.max(np.abs(deltaflat))
    maxlinreg = np.max(np.abs(deltalinreg))

    print(j, RMSEpred, RMSEscopt, RMSEboost, RMSEvec)
    allRMSES.append([RMSEpred, RMSEscopt, RMSEboost, RMSEvec, RMSEflat, RMSElinreg])

allRMSES = np.array(allRMSES)
# columns: [base forest, opt. forest, opt. boosting, vector delay, flat delay, lin. regression]
meanRMSE = np.mean(allRMSES, axis=0)
stdRMSE = np.std(allRMSES, axis=0)
print(meanRMSE)
print(stdRMSE)

features = np.array(['rx', 'ry', 'rz', 'vx', 'vy', 'vz', 'DST'])
features = features[idx]
# refit the forest on the whole dataset for the Shapley analysis
XTrain = learnvector
YTrain = timevector
classifiers.fit(XTrain[:, idx], YTrain[:, 2])
pXTrain = pd.DataFrame(data=XTrain[:, idx], columns=features)

with open('plot_RMSE_kfold.pickle', 'wb') as f:  # export the k-fold RMSE results
    pickle.dump(allRMSES, f)

explainer = shap.TreeExplainer(classifiers, pXTrain)
shap_values = explainer.shap_values(pXTrain)  # derive Shapley values from the whole dataset

shap.summary_plot(shap_values/60, XTrain[:, idx], plot_type="bar", feature_names=features)
shap.summary_plot(shap_values/60, XTrain[:, idx], feature_names=features)
# In a Jupyter notebook, call shap.initjs() first, otherwise force_plot raises an error
shap.force_plot(explainer.expected_value, shap_values, XTrain[:, idx])
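# Note (assumption): outside a notebook the JS-based force_plot does not
# render; shap's matplotlib backend (matplotlib=True) only supports
# single-sample force plots, so this call is presumably run in a notebook.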


shap.dependence_plot("vx", shap_values/60, pXTrain)
shap.dependence_plot("vy", shap_values/60, pXTrain)
shap.dependence_plot("vz", shap_values/60, pXTrain)

shap.dependence_plot("ry", shap_values/60, pXTrain)
shap.dependence_plot("rz", shap_values/60, pXTrain)
shap.dependence_plot("rx", shap_values/60, pXTrain)

shap.dependence_plot("DST", shap_values/60, pXTrain)

with open('plot_Shapley.pickle', 'wb') as f:  # export Shapley data
    pickle.dump(shap_values, f)


with open('plot_optimization_data.pickle', 'wb') as f:  # export optimization comparison results
    pickle.dump([baseforest, basegp, optforest, optgp], f)