# -*- coding: utf-8 -*-
"""
Created on Tue May 26 16:49:43 2020

@author: baum_c4

Performs K-fold cross-validation (10-fold) and derives RMSEs to compare
different models; derives Shapley values from the whole dataset.
"""
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from func_sorted_stratification import sorted_stratification
import shap
#import matplotlib.pyplot as plt
from sklearn.svm import SVR  # imported in the original script but not used below
from sklearn.linear_model import LinearRegression
#from sklearn.linear_model import LogisticRegression

# load the learning set: feature vectors and the corresponding target times
with open('delay_DST_learningset.pickle', 'rb') as f:
    [learnvector, timevector] = pickle.load(f)

classifiers = RandomForestRegressor(n_estimators=60, random_state=2)
classifier_noDST = RandomForestRegressor(n_estimators=60, random_state=1)
X = learnvector
Y = timevector

# legacy loading of earlier optimization results, kept for reference:
# with open('optimized_gp5.pickle', 'rb') as f:
#     res_gp = pickle.load(f)
# gb_model = GradientBoostingRegressor(max_depth=res_gp[0],
#                                      learning_rate=res_gp[1],
#                                      random_state=42,
#                                      max_features=res_gp[2],
#                                      min_samples_split=res_gp[3],
#                                      min_samples_leaf=res_gp[4],
#                                      n_estimators=res_gp[5])
# with open('optimized_forest5.pickle', 'rb') as f:
#     res_forest = pickle.load(f)
# scopt_model = RandomForestRegressor(max_depth=res_forest[0],
#                                     n_estimators=res_forest[1],
#                                     random_state=42,
#                                     max_features=res_forest[2],
#                                     min_samples_split=res_forest[3],
#                                     min_samples_leaf=res_forest[4])

# split the data into 10 sorted-stratified folds
X, Y = sorted_stratification(learnvector, timevector, 10, 1)

# load optimized hyperparameters
with open('best_bayes.pickle', 'rb') as f:
    GPparams, RFparams = pickle.load(f)

# train models with optimized hyperparameters
scopt_model = RandomForestRegressor(max_depth=RFparams['max_depth'],
                                    n_estimators=500,  # does not need to be optimized: more trees are always better
                                    random_state=42,
                                    max_features=RFparams['max_features'],
                                    min_samples_split=RFparams['min_samples_split'],
                                    min_samples_leaf=RFparams['min_samples_leaf'],
                                    min_impurity_decrease=RFparams['min_impurity_decrease'],
                                    #max_samples=RFparams['max_samples']
                                    )
gb_model = GradientBoostingRegressor(max_depth=GPparams['max_depth'],
                                     n_estimators=GPparams['n_estimators'],
                                     random_state=42,
                                     max_features=GPparams['max_features'],
                                     min_samples_split=GPparams['min_samples_split'],
                                     min_samples_leaf=GPparams['min_samples_leaf'],
                                     learning_rate=GPparams['learning_rate'],
                                     min_impurity_decrease=GPparams['min_impurity_decrease'],
                                     #subsample=GPparams['subsample']
                                     )

idx = list(range(7))  # indices of the features to use
#idx.pop(2)  # pop out features if needed
#idx.pop(5)
#idx.pop(4)

allRMSES = []
baseforest = []
basegp = []
optgp = []
optforest = []

# 10-fold cross-validation: fold j is the test set, the remaining folds form the training set
for j in range(10):
    XTrain = np.concatenate([X[i] for i in range(len(X)) if i != j])
    YTrain = np.concatenate([Y[i] for i in range(len(X)) if i != j])
    XTest = X[j]
    YTest = Y[j]

    # baseline random forest with default hyperparameters
    base_model = RandomForestRegressor(random_state=42)
    base_model.fit(XTrain, YTrain[:, 2])
    baseparam = base_model.get_params()
    Y_pred = base_model.predict(XTest)
    deltapred = Y_pred - YTest[:, 2]
    baseRMSEpred = np.sqrt(np.nansum(np.square(deltapred)) / len(XTest)) / 60  # RMSE in minutes
    baseforest.append(baseRMSEpred)

    # baseline gradient boosting with default hyperparameters
    basegp_model = GradientBoostingRegressor(random_state=42)
    basegp_model.fit(XTrain, YTrain[:, 2])
    basegpparam = basegp_model.get_params()
    Y_pred = basegp_model.predict(XTest)  # note: overwrites the forest prediction above
    deltapred = Y_pred - YTest[:, 2]
    basegpRMSEpred = np.sqrt(np.nansum(np.square(deltapred)) / len(XTest)) / 60
    basegp.append(basegpRMSEpred)

    # optimized random forest
    scopt_model.fit(XTrain, YTrain[:, 2])
    Y_scopt = scopt_model.predict(XTest)
    scoptpred = (Y_scopt - YTest[:, 2]) / 60
    RMSEscopt = np.sqrt(np.nansum(np.square(scoptpred)) / len(XTest))
    optforest.append(RMSEscopt)

    # optimized gradient boosting
    gb_model.fit(XTrain, YTrain[:, 2])
    Y_gb = gb_model.predict(XTest)
    gppred = (Y_gb - YTest[:, 2]) / 60
    RMSEboost = np.sqrt(np.nansum(np.square(gppred)) / len(XTest))
    optgp.append(RMSEboost)

    # linear regression for comparison
    ylinreg = LinearRegression().fit(XTrain, YTrain[:, 2]).predict(XTest)
    deltalinreg = (ylinreg - YTest[:, 2]) / 60
    # NOTE: the original script hard-coded the divisor 38 (the fold size) in the
    # RMSEs below; len(XTest) is used instead so they match the expressions above
    RMSElinreg = np.sqrt(np.nansum(np.square(deltalinreg)) / len(XTest))

    # reference errors: vector propagation (column 0) and flat propagation (column 1)
    deltaflat = (YTest[:, 1] - YTest[:, 2]) / 60
    deltavec = (YTest[:, 0] - YTest[:, 2]) / 60
    deltapred = (Y_pred - YTest[:, 2]) / 60  # Y_pred still holds the baseline gradient-boosting prediction
    RMSEflat = np.sqrt(np.nansum(np.square(deltaflat)) / len(XTest))
    RMSEvec = np.sqrt(np.nansum(np.square(deltavec)) / len(XTest))
    RMSEpred = np.sqrt(np.nansum(np.square(deltapred)) / len(XTest))
    maxvec = np.max(np.abs(deltavec))
    maxpred = np.max(np.abs(deltapred))
    maxflat = np.max(np.abs(deltaflat))
    maxlinreg = np.max(np.abs(deltalinreg))

    print(j, RMSEpred, RMSEscopt, RMSEboost, RMSEvec)  # (j, maxpred, maxvec)
    allRMSES.append([RMSEpred, RMSEscopt, RMSEboost, RMSEvec, RMSEflat, RMSElinreg])

allRMSES = np.array(allRMSES)
meanRMSE = np.mean(allRMSES, axis=0)
stdRMSE = np.std(allRMSES, axis=0)
print(meanRMSE)
print(stdRMSE)

#impis = np.array(importances)
features = np.array(['rx', 'ry', 'rz', 'vx', 'vy', 'vz', 'DST'])
features = features[idx]

# retrain on the whole dataset for the Shapley analysis
XTrain = learnvector
YTrain = timevector
classifiers.fit(XTrain[:, idx], YTrain[:, 2])
pXTrain = pd.DataFrame(data=XTrain[:, idx], columns=features)

with open('plot_RMSE_kfold.pickle', 'wb') as f:  # export cross-validation results
    pickle.dump(allRMSES, f)

# derive Shapley values from the whole dataset
# (in a Jupyter notebook, call shap.initjs() first, otherwise force_plot raises an error)
explainer = shap.TreeExplainer(classifiers, pXTrain)
shap_values = explainer.shap_values(pXTrain)
shap.summary_plot(shap_values / 60, XTrain[:, idx], plot_type="bar", feature_names=features)
shap.summary_plot(shap_values / 60, XTrain[:, idx], feature_names=features)
shap.force_plot(explainer.expected_value, shap_values, XTrain[:, idx])
shap.dependence_plot("vx", shap_values / 60, pXTrain)
shap.dependence_plot("vy", shap_values / 60, pXTrain)
shap.dependence_plot("vz", shap_values / 60, pXTrain)
shap.dependence_plot("ry", shap_values / 60, pXTrain)
shap.dependence_plot("rz", shap_values / 60, pXTrain)
shap.dependence_plot("rx", shap_values / 60, pXTrain)
shap.dependence_plot("DST", shap_values / 60, pXTrain)

with open('plot_Shapley.pickle', 'wb') as f:  # export Shapley data
    pickle.dump(shap_values, f)
with open('plot_optimization_data.pickle', 'wb') as f:  # export optimization comparison results
    pickle.dump([baseforest, basegp, optforest, optgp], f)
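
# -----------------------------------------------------------------------------
# Editor's sketch (assumption, not part of the original analysis): a minimal
# example of how the k-fold RMSEs exported above could be read back, e.g. by a
# separate plotting script. The file name matches the dump above; the column
# labels follow the append order of allRMSES in the loop, and values are in
# minutes because of the /60 conversions. The function name and label strings
# are hypothetical, chosen here for illustration only.
# -----------------------------------------------------------------------------
def summarize_kfold_rmse(path='plot_RMSE_kfold.pickle'):
    """Load the exported k-fold RMSEs and print mean +/- std per model."""
    with open(path, 'rb') as f:
        rmses = np.array(pickle.load(f))  # shape (10, 6): one row per fold
    labels = ['RMSEpred', 'RMSEscopt', 'RMSEboost', 'RMSEvec', 'RMSEflat', 'RMSElinreg']
    for label, mean, std in zip(labels, rmses.mean(axis=0), rmses.std(axis=0)):
        print(f'{label}: {mean:.2f} +/- {std:.2f} min')

#summarize_kfold_rmse()  # uncomment to run after the exports above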