# -*- coding: utf-8 -*-
"""
Created on Mon Jun 22 11:04:07 2020

Drop-column feature importance for the delay/DST random-forest model.

For each of 10 sorted-stratification folds, a RandomForestRegressor (with
Bayes-optimized hyperparameters loaded from ``best_bayes.pickle``) is trained
on all 7 features; then, one feature at a time is dropped, a cloned model is
retrained, and the increase in test RMSE over the full-feature baseline is
recorded as that feature's importance. The per-fold importance matrix is
pickled to ``plot_importances.pickle`` for plotting.

@author: baum_c4
"""
import pickle

import numpy as np
import pandas as pd
# NOTE(review): shap / matplotlib / SVR / Linear-LogisticRegression are not
# used in the live code path below; kept in case an interactive session or a
# re-enabled variant relies on them — confirm before removing.
import shap
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR

from func_sorted_stratification import sorted_stratification

RANDOM_STATE = 42  # single seed for every estimator, for reproducibility

# SECURITY NOTE: pickle.load executes arbitrary code if the file is
# untrusted; these are locally produced result files, so acceptable here.
with open('delay_DST_learningset.pickle', 'rb') as f:
    learnvector, timevector = pickle.load(f)

with open('best_bayes.pickle', 'rb') as f:
    GPparams, RFparams = pickle.load(f)

# Forest with the Bayes-optimized hyperparameters (n_estimators fixed at 500).
scopt_model = RandomForestRegressor(
    max_depth=RFparams['max_depth'],
    n_estimators=500,
    random_state=RANDOM_STATE,
    max_features=RFparams['max_features'],
    min_samples_split=RFparams['min_samples_split'],
    min_samples_leaf=RFparams['min_samples_leaf'],
    min_impurity_decrease=RFparams['min_impurity_decrease'],
    # max_samples=RFparams['max_samples']  # intentionally disabled
)

# Retained for the (currently disabled) full-data importance variant.
classifiersfull = RandomForestRegressor(n_estimators=60,
                                        random_state=RANDOM_STATE)

kfoldlen = 10
# Split into kfoldlen folds; X[i]/Y[i] are the feature/target arrays of fold i.
X, Y = sorted_stratification(learnvector, timevector, kfoldlen, 1)

# Feature columns of the learning vectors (position r, velocity v, DST index).
features = np.array(['rx', 'ry', 'rz', 'vx', 'vy', 'vz', 'DST'])
idx = list(range(7))  # column selector: all 7 features

importances = []  # one row of drop-column importances per fold
for j in range(kfoldlen):
    # Fold j is the test set; all other folds form the training set.
    XTrain = np.concatenate([X[i] for i in range(len(X)) if i != j])
    YTrain = np.concatenate([Y[i] for i in range(len(X)) if i != j])
    XTest = X[j]
    YTest = Y[j]

    # Baseline: fit on all 7 features. Predicting on the full XTest is
    # consistent because idx selects every column.
    scopt_model.fit(XTrain[:, idx], YTrain[:, 2])
    Y_pred = scopt_model.predict(XTest)

    # Timing errors in minutes (targets are in seconds); column semantics:
    # presumably 0 = vector model, 1 = flat model, 2 = reference time —
    # TODO confirm against the learning-set builder.
    deltaflat = (YTest[:, 1] - YTest[:, 2]) / 60
    deltavec = (YTest[:, 0] - YTest[:, 2]) / 60
    deltapred = (Y_pred - YTest[:, 2]) / 60

    # nansum ignores NaN residuals but the denominator stays len(XTest),
    # matching the original normalization choice.
    RMSEflat = np.sqrt(np.nansum(np.square(deltaflat)) / len(XTest))
    RMSEvec = np.sqrt(np.nansum(np.square(deltavec)) / len(XTest))
    baseRMSEpred = np.sqrt(np.nansum(np.square(deltapred)) / len(XTest))

    pXTrain = pd.DataFrame(data=XTrain, columns=features)
    pXTest = pd.DataFrame(data=XTest, columns=features)

    # Drop-column importance: retrain without each feature in turn and store
    # the RMSE increase over the all-feature baseline.
    importance = []
    for col in pXTrain:
        model_clone = clone(scopt_model)
        model_clone.random_state = RANDOM_STATE
        model_clone.fit(pXTrain.drop(col, axis=1), YTrain[:, 2])
        clonepred = model_clone.predict(pXTest.drop(col, axis=1))
        deltaclone = (clonepred - YTest[:, 2]) / 60
        cloneRMSEpred = np.sqrt(np.nansum(np.square(deltaclone)) / len(XTest))
        importance.append(cloneRMSEpred - baseRMSEpred)
    importances.append(importance)

# Aggregate across folds: per-feature spread, mean, and total importance.
impis = np.array(importances)
impstd = np.std(impis, axis=0)
impmean = np.mean(impis, axis=0)
sum_mean = np.sum(impmean)

with open('plot_importances.pickle', 'wb') as f:
    pickle.dump(impis, f)

# NOTE(review): two large commented-out exploratory variants (train-set
# importance and full-data importance via `classifiersfull`) were removed
# here as dead code; recover them from version control if needed.