# ml-solarwind / bestbayes_hyperparameter.py
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 27 17:28:57 2020

@author: baum_c4

performs hyperparameter search based on scikit-optimize/skopt

"""
from skopt import BayesSearchCV
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from func_sorted_stratification import sorted_stratification
from sklearn.ensemble import GradientBoostingRegressor
from skopt.space import Real, Integer
from skopt.utils import use_named_args
import pickle
import numpy as np


# Load the precomputed learning set: feature vectors and matching
# target/time vectors produced by an earlier pipeline step.
# NOTE(review): pickle.load is only safe on trusted local files.
with open('delay_DST_learningset.pickle', 'rb') as f:
    [learnvector, timevector] = pickle.load(f)

random = 42  # seed for reproducible model fitting / search

# Split into 10 sorted-stratified folds (project helper); X and Y are
# lists of per-fold arrays.
X, Y = sorted_stratification(learnvector, timevector, 10, 0)

# Hold out fold j as the test set; concatenate the remaining folds
# into the training set.
j = 5
XTrain = np.concatenate([X[i] for i in range(len(X)) if i != j])
YTrain = np.concatenate([Y[i] for i in range(len(Y)) if i != j])
XTest = X[j]
YTest = Y[j]

n_features = XTrain.shape[1]

# Bayesian hyperparameter search for a gradient-boosting regressor.
# log-uniform: understand as search over p = exp(x) by varying x.
# Seeding both the estimator and the search makes the run reproducible
# (the module-level `random` seed was previously defined but unused).
opt = BayesSearchCV(
    GradientBoostingRegressor(random_state=random),
    {
        'learning_rate': (1e-2, 1e0, 'log-uniform'),
        'max_depth': (1, 10),
        'max_features': (1, n_features),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 20),
        'n_estimators': (5, 500),
        # float bounds -> skopt infers a Real space (matches the RF search)
        'min_impurity_decrease': (0.0, 0.3),
    },
    n_iter=100,          # number of parameter settings sampled
    cv=5,                # 5-fold CV on the training folds
    random_state=random,
)

# Fit on column 2 of the target array — presumably the delay/DST target
# of interest; confirm against the learning-set builder.
opt.fit(XTrain, YTrain[:, 2])

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(XTest, YTest[:, 2]))



# Bayesian hyperparameter search for a random-forest regressor.
# The forest size is fixed at 500 trees; only tree-shape parameters
# are tuned. Seeded for reproducibility (the module-level `random`
# seed was previously defined but unused).
optRF = BayesSearchCV(
    RandomForestRegressor(n_estimators=500, random_state=random),
    {
        'max_depth': (1, 20),
        'max_features': (1, n_features),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 20),
        # float bounds -> skopt infers a Real space
        'min_impurity_decrease': (0.0, 0.3),
    },
    n_iter=100,          # number of parameter settings sampled
    cv=5,                # 5-fold CV on the training folds
    random_state=random,
)

# Same target column as the gradient-boosting search above.
optRF.fit(XTrain, YTrain[:, 2])

# Best parameter sets from both searches, pickled at the end of the script.
GPparams = opt.best_params_
RFparams = optRF.best_params_
print("val. score: %s" % optRF.best_score_)
print("test score: %s" % optRF.score(XTest, YTest[:, 2]))

# Persist the winning hyperparameter sets of both searches.
with open('best_bayes.pickle', 'wb') as outfile:
    pickle.dump([GPparams, RFparams], outfile)

# Persist the exact train/test split so later runs can reuse it.
with open('best_bayes_data.pickle', 'wb') as outfile:
    pickle.dump([XTrain, YTrain, XTest, YTest], outfile)