{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } },
 "cells": [
  { "cell_type": "code", "execution_count": null, "metadata": { "id": "Iy_-s80Vcdau" }, "outputs": [], "source": [
   "# Importing the necessary libraries\n",
   "import numpy as np\n",
   "import matplotlib.pyplot as plt\n",
   "import pandas as pd\n",
   "import seaborn as sns\n",
   "from pylab import rcParams\n",
   "from pandas.plotting import register_matplotlib_converters\n",
   "register_matplotlib_converters()\n",
   "sns.set(style='whitegrid', palette='muted', font_scale=1.5)"
  ] },
  { "cell_type": "code", "source": [
   "# Upload dataset.csv into the Colab runtime\n",
   "from google.colab import files\n",
   "uploaded = files.upload()"
  ], "metadata": { "id": "tV7_gsOacmZ1" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# Importing the dataset\n",
   "df = pd.read_csv('dataset.csv', encoding='latin1')\n",
   "df"
  ], "metadata": { "id": "IJgAj4GpcorS" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# Drop the columns that are not used as features, then index the rows by timestamp\n",
   "df = df.drop(['numer_sta', 'lat', 'lon', 'storm name'], axis=1)\n",
   "\n",
   "df['date'] = pd.to_datetime(df['date'])\n",
   "df = df.set_index(\"date\")\n",
   "df"
  ], "metadata": { "id": "_fvD4f81cqx2" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# Dealing with missing values\n",
   "print(df.isnull().sum())"
  ], "metadata": { "id": "32Gf51qQczf8" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "from sklearn.impute import SimpleImputer\n",
   "\n",
   "# Create an imputer object using median as the strategy\n",
   "imputer = SimpleImputer(missing_values=np.nan, strategy='median')\n",
   "\n",
   "# Define the columns where you want to apply the imputation\n",
   "columns_to_impute = ['Temperature', 'Humidity', 'Wind speed', 'Pressure', 'Wave height', 'Wave period']\n",
   "\n",
   "# Apply the imputer to the selected columns of the DataFrame\n",
   "# NOTE(review): the medians are fitted on the full frame, i.e. before the train/test split below\n",
   "df[columns_to_impute] = imputer.fit_transform(df[columns_to_impute])\n",
   "\n",
   "# Checking the DataFrame to ensure no more missing values\n",
   "df.isnull().values.any()"
  ], "metadata": { "id": "99xduElAc4Mp" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# Split the data into training and testing dataset (first 80% of the rows / last 20%)\n",
   "train_size = int(len(df) * 0.80)\n",
   "train, test = df.iloc[0:train_size], df.iloc[train_size:len(df)]\n",
   "print(\"train data: \", len(train))\n",
   "print(\"test data:\", len(test))"
  ], "metadata": { "id": "0BIk417Vc7OS" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# Columns 0-5 are the features, column 6 is the target\n",
   "X_train, y_train = train.iloc[:, [0,1,2,3,4,5]].values, train.iloc[:,6].values\n",
   "X_test, y_test = test.iloc[:, [0,1,2,3,4,5]].values, test.iloc[:,6].values\n",
   "\n",
   "X_train.shape, y_train.shape, X_test.shape, y_test.shape"
  ], "metadata": { "id": "zGP65b45dE6s" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# Normalization: every statistic is computed on the training split only, so the\n",
   "# test split is scaled with exactly the same constants as the training split.\n",
   "temp_training_mean = np.mean(X_train[:, 0])\n",
   "temp_training_std = np.std(X_train[:, 0])\n",
   "\n",
   "hum_training_mean = np.mean(X_train[:, 1])\n",
   "hum_training_std = np.std(X_train[:, 1])\n",
   "\n",
   "wind_training_mean = np.mean(X_train[:, 2])\n",
   "wind_training_std = np.std(X_train[:, 2])\n",
   "\n",
   "pres_training_mean = np.mean(X_train[:, 3])\n",
   "pres_training_std = np.std(X_train[:, 3])\n",
   "\n",
   "waveH_training_mean = np.mean(X_train[:, 4])\n",
   "waveH_training_std = np.std(X_train[:, 4])\n",
   "\n",
   "waveP_training_mean = np.mean(X_train[:, 5])\n",
   "waveP_training_std = np.std(X_train[:, 5])\n",
   "\n",
   "def preprocess(X):\n",
   "    \"\"\"Standardise the six feature columns of X in place; returns None.\n",
   "\n",
   "    Columns 0-5 are scaled with the temp, hum, wind, pres, waveH and waveP\n",
   "    training statistics respectively, each as (value - mean) / std.\n",
   "    \"\"\"\n",
   "    X[:, 0] = (X[:, 0] - temp_training_mean) / temp_training_std\n",
   "    X[:, 1] = (X[:, 1] - hum_training_mean) / hum_training_std\n",
   "    X[:, 2] = (X[:, 2] - wind_training_mean) / wind_training_std\n",
   "    X[:, 3] = (X[:, 3] - pres_training_mean) / pres_training_std\n",
   "    X[:, 4] = (X[:, 4] - waveH_training_mean) / waveH_training_std\n",
   "    # The waveP statistics were computed from column 5, so they scale column 5\n",
   "    X[:, 5] = (X[:, 5] - waveP_training_mean) / waveP_training_std"
  ], "metadata": { "id": "VvgSrwisdFlt" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# In-place scaling: run this cell exactly once per kernel session\n",
   "preprocess(X_train)\n",
   "preprocess(X_test)"
  ], "metadata": { "id": "CCM3zvUhdJ0N" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# Model and performance evaluation\n",
   "from xgboost import XGBClassifier\n",
   "from sklearn.metrics import precision_recall_fscore_support as score\n",
   "# Hyperparameter tuning\n",
   "from sklearn.model_selection import StratifiedKFold, cross_val_score\n",
   "from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, space_eval"
  ], "metadata": { "id": "_5KsDqDQdMmB" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# Space\n",
   "space = {\n",
   "    'learning_rate': hp.choice('learning_rate', [0.0001,0.001, 0.01, 0.1]),\n",
   "    'max_depth' : hp.choice('max_depth', range(3,13,1)),\n",
   "    'gamma' : hp.choice('gamma', [i/10.0 for i in range(0,5)]),\n",
   "    'colsample_bytree' : hp.choice('colsample_bytree', [i/10.0 for i in range(3,10)]),\n",
   "    'reg_alpha' : hp.choice('reg_alpha', [1e-5, 1e-2, 0.1, 1, 10, 100]),\n",
   "    'reg_lambda' : hp.choice('reg_lambda', [1e-5, 1e-2, 0.1, 1, 10, 100])\n",
   "}\n",
   "\n",
   "# Set up the k-fold cross-validation\n",
   "kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)"
  ], "metadata": { "id": "4_i8yFpldQGT" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "def objective(params):\n",
   "    \"\"\"Hyperopt objective: negative mean cross-validated recall for one parameter set.\n",
   "\n",
   "    params is one sample drawn from `space`; the returned dict carries the loss\n",
   "    that fmin minimises together with the parameters and the trial status.\n",
   "    \"\"\"\n",
   "    xgboost = XGBClassifier(seed=0, **params)\n",
   "    scores = cross_val_score(xgboost, X_train, y_train, cv=kfold, scoring='recall', n_jobs=-1)\n",
   "    # Average over the folds: the single best fold is an optimistic, noisy estimate\n",
   "    mean_recall = scores.mean()\n",
   "    # Loss must be minimized\n",
   "    loss = -mean_recall\n",
   "    # Dictionary with information for evaluation\n",
   "    return {'loss': loss, 'params': params, 'status': STATUS_OK}\n",
   "\n",
   "# Trials to track progress\n",
   "bayes_trials = Trials()\n",
   "# Optimize\n",
   "best 
= fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 50, trials = bayes_trials)"
  ], "metadata": { "id": "eF7ixSfxdUXA" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# Print the index of the best parameters\n",
   "print(best)\n",
   "# Print the values of the best parameters\n",
   "print(space_eval(space, best))"
  ], "metadata": { "id": "xlazRZH5dbRj" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# Train model using the best parameters.\n",
   "# `best` holds hp.choice indices; space_eval maps them back to the actual values,\n",
   "# so the final model always matches the search that was just run.\n",
   "xgboost_bo = XGBClassifier(seed=0, **space_eval(space, best)).fit(X_train, y_train)"
  ], "metadata": { "id": "FvzH_hhadecb" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# Make prediction using the best model\n",
   "preds = xgboost_bo.predict(X_test)"
  ], "metadata": { "id": "1kzFkqvcdjgh" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# Get performance metrics (one entry per class; index 1 is reported as the storm class)\n",
   "precision, recall, fscore, support = score(y_test, preds)\n",
   "# Print result\n",
   "print(f'The recall value for the xgboost Bayesian optimization is {recall[1]:.4f}')"
  ], "metadata": { "id": "RAtyASyVdmgX" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# Label every prediction with the timestamp of the test row it was made for,\n",
   "# instead of assuming the test period is daily and starts on a fixed date.\n",
   "preds = pd.DataFrame(preds, columns=[\"storms_prediction\"], index=test.index.rename(\"Date\"))"
  ], "metadata": { "id": "CMeN5p6Zdpdx" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# Attach the predictions row by row (positional), so no index alignment can\n",
   "# introduce NaNs or extra rows; re-running the cell just overwrites the column.\n",
   "test = test.assign(storms_prediction=preds[\"storms_prediction\"].to_numpy())\n",
   "test"
  ], "metadata": { "id": "frJ4unUIdw2l" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "from sklearn.metrics import confusion_matrix\n",
   "conf_matrix = confusion_matrix(y_test, preds)\n",
   "\n",
   "LABELS = [\"No 
storm\",\"Storm\"]\n",
   "\n",
   "plt.figure(figsize=(6, 6))\n",
   "sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt=\"d\");\n",
   "plt.title(\"Confusion matrix\")\n",
   "plt.ylabel('True class')\n",
   "plt.xlabel('Predicted class')\n",
   "plt.show()"
  ], "metadata": { "id": "Z0vQ0NFzd2RI" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# conf_matrix rows are the true class, columns the predicted class\n",
   "TP = conf_matrix[1][1]\n",
   "TN = conf_matrix[0][0]\n",
   "FP = conf_matrix[0][1]\n",
   "FN = conf_matrix[1][0]\n",
   "print('True Positives:', TP)\n",
   "print('True Negatives:', TN)\n",
   "print('False Positives:', FP)\n",
   "print('False Negatives:', FN)"
  ], "metadata": { "id": "2qbrvyefd5ls" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# calculate accuracy\n",
   "conf_accuracy = (float (TP+TN) / float(TP + TN + FP + FN))\n",
   "\n",
   "# calculate mis-classification\n",
   "conf_misclassification = 1- conf_accuracy\n",
   "\n",
   "# calculate the sensitivity (recall): share of real storms that were predicted\n",
   "conf_sensitivity = (TP / float(TP + FN))\n",
   "\n",
   "# calculate the specificity: share of non-storms that were predicted as non-storms\n",
   "conf_specificity = (TN / float(TN + FP))\n",
   "\n",
   "# calculate precision: share of predicted storms that were real storms\n",
   "conf_precision = (TP / float(TP + FP))\n",
   "\n",
   "# calculate f_1 score (harmonic mean of precision and sensitivity)\n",
   "conf_f1 = 2 * ((conf_precision * conf_sensitivity) / (conf_precision + conf_sensitivity))\n",
   "\n",
   "\n",
   "print('-'*50)\n",
   "print(f'Accuracy: {round(conf_accuracy,2)}')\n",
   "print(f'Mis-Classification: {round(conf_misclassification,2)}')\n",
   "print(f'Sensitivity: {round(conf_sensitivity,2)}')\n",
   "print(f'Specificity: {round(conf_specificity,2)}')\n",
   "print(f'Precision: {round(conf_precision,2)}')\n",
   "print(f'f_1 Score: {round(conf_f1,2)}')"
  ], "metadata": { "id": "hr5w7iAad6bZ" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# Compute the ROC curve and ROC area from the predicted probability of class 1;\n",
   "# thresholded 0/1 labels would collapse the curve to a single operating point.\n",
   "from sklearn.metrics import roc_curve, auc\n",
   "storm_scores = xgboost_bo.predict_proba(X_test)[:, 1]\n",
   "fpr, tpr, _ = roc_curve(y_test, storm_scores)\n",
   "roc_auc = auc(fpr, tpr)"
  ], "metadata": { "id": "qVFEULqieAw9" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# A single sized figure (an extra bare plt.figure() would render an empty one)\n",
   "plt.figure(figsize=(20,10))\n",
   "lw = 2\n",
   "plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)\n",
   "plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')\n",
   "plt.xlim([-0.02, 1.0])\n",
   "plt.ylim([0.0, 1.05])\n",
   "plt.xlabel('False Positive Rate')\n",
   "plt.ylabel('True Positive Rate')\n",
   "plt.title('ROC curve')\n",
   "plt.legend(loc=\"lower right\")\n",
   "plt.show()"
  ], "metadata": { "id": "Ux2KU2eKeDrw" }, "execution_count": null, "outputs": [] },
  { "cell_type": "code", "source": [
   "# True vs. predicted storm labels over the test period\n",
   "plt.figure(figsize=(20,8))\n",
   "plt.plot(test.index, test['Storm'], color = 'b', label=\"True storms\", marker='o', markersize=8, linestyle='dashed')\n",
   "plt.plot(test.index, test['storms_prediction'], color = 'lightsalmon', label=\"Predicted storms\", marker='o', markersize=8, linestyle='dashed')\n",
   "plt.title('Prediction of storms using XGBoost model')\n",
   "plt.legend(loc='best', fontsize='large')\n",
   "plt.xticks(fontsize=18)\n",
   "plt.yticks(fontsize=16)\n",
   "plt.xlabel(\"Date time\")\n",
   "plt.ylabel(\"Storms\")\n",
   "plt.show()"
  ], "metadata": { "id": "i4IjAT3LeG5Z" }, "execution_count": null, "outputs": [] }
 ]
}