Prediction_of_storm_occurrence.ipynb · Storm-Prediction

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Iy_-s80Vcdau"
      },
      "outputs": [],
      "source": [
        "#  Importing the necessary libraries\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "import pandas as pd\n",
        "import seaborn as sns\n",
        "from pylab import rcParams\n",
        "from pandas.plotting import register_matplotlib_converters\n",
        "register_matplotlib_converters()\n",
        "sns.set(style='whitegrid', palette='muted', font_scale=1.5)"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from google.colab import files\n",
        "uploaded = files.upload()"
      ],
      "metadata": {
        "id": "tV7_gsOacmZ1"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Importing the dataset\n",
        "df= pd.read_csv('dataset.csv', encoding='latin1')\n",
        "df"
      ],
      "metadata": {
        "id": "IJgAj4GpcorS"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "df = df.drop(['numer_sta', 'lat', 'lon', 'storm name'], axis=1)\n",
        "\n",
        "df['date'] = pd.to_datetime(df['date'])\n",
        "df = df.set_index(\"date\")\n",
        "df"
      ],
      "metadata": {
        "id": "_fvD4f81cqx2"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Dealing with missing values\n",
        "print(df.isnull().sum())"
      ],
      "metadata": {
        "id": "32Gf51qQczf8"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.impute import SimpleImputer\n",
        "\n",
        "# Create an imputer object using median as the strategy\n",
        "imputer = SimpleImputer(missing_values=np.nan, strategy='median')\n",
        "\n",
        "# Define the columns where you want to apply the imputation\n",
        "columns_to_impute = ['Temperature',\t'Humidity',\t'Wind speed',\t'Pressure', 'Wave height', 'Wave period']\n",
        "\n",
        "# Apply the imputer to the selected columns of the DataFrame\n",
        "df[columns_to_impute] = imputer.fit_transform(df[columns_to_impute])\n",
        "\n",
        "# Checking the DataFrame to ensure no more missing values\n",
        "df.isnull().values.any()"
      ],
      "metadata": {
        "id": "99xduElAc4Mp"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Split the data into training and testing dataset\n",
        "train_size = int(len(df) * 0.80)\n",
        "train, test = df.iloc[0:train_size], df.iloc[train_size:len(df)]\n",
        "print(\"train data: \", len(train))\n",
        "print(\"test data:\", len(test))"
      ],
      "metadata": {
        "id": "0BIk417Vc7OS"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "X_train, y_train = train.iloc[:, [0,1,2,3,4,5]].values, train.iloc[:,6].values\n",
        "X_test, y_test = test.iloc[:, [0,1,2,3,4,5]].values, test.iloc[:,6].values\n",
        "\n",
        "X_train.shape, y_train.shape, X_test.shape, y_test.shape"
      ],
      "metadata": {
        "id": "zGP65b45dE6s"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Normalization\n",
        "temp_training_mean = np.mean(X_train[:, 0])\n",
        "temp_training_std = np.std(X_train[:, 0])\n",
        "\n",
        "hum_training_mean = np.mean(X_train[:, 1])\n",
        "hum_training_std = np.std(X_train[:, 1])\n",
        "\n",
        "wind_training_mean = np.mean(X_train[:, 2])\n",
        "wind_training_std = np.std(X_train[:, 2])\n",
        "\n",
        "pres_training_mean = np.mean(X_train[:, 3])\n",
        "pres_training_std = np.std(X_train[:, 3])\n",
        "\n",
        "waveH_training_mean = np.mean(X_train[:, 4])\n",
        "waveH_training_std = np.std(X_train[:, 4])\n",
        "\n",
        "waveP_training_mean = np.mean(X_train[:, 5])\n",
        "waveP_training_std = np.std(X_train[:, 5])\n",
        "\n",
        "def preprocess(X):\n",
        "  X[:, 0] = (X[:, 0] - temp_training_mean) / temp_training_std\n",
        "  X[:, 1] = (X[:, 1] - hum_training_mean) / hum_training_std\n",
        "  X[:, 2] = (X[:, 2] - wind_training_mean) / wind_training_std\n",
        "  X[:, 3] = (X[:, 3] - pres_training_mean) / pres_training_std\n",
        "  X[:, 4] = (X[:, 4] - waveH_training_mean) / waveH_training_std\n",
        "  X[:, 4] = (X[:, 4] - waveP_training_mean) / waveP_training_std"
      ],
      "metadata": {
        "id": "VvgSrwisdFlt"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "preprocess(X_train)\n",
        "preprocess(X_test)"
      ],
      "metadata": {
        "id": "CCM3zvUhdJ0N"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Model and performance evaluation\n",
        "from xgboost import XGBClassifier\n",
        "from sklearn.metrics import precision_recall_fscore_support as score\n",
        "# Hyperparameter tuning\n",
        "from sklearn.model_selection import StratifiedKFold, cross_val_score\n",
        "from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval"
      ],
      "metadata": {
        "id": "_5KsDqDQdMmB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Space\n",
        "space = {\n",
        "    'learning_rate': hp.choice('learning_rate', [0.0001,0.001, 0.01, 0.1]),\n",
        "    'max_depth' : hp.choice('max_depth', range(3,13,1)),\n",
        "    'gamma' : hp.choice('gamma', [i/10.0 for i in range(0,5)]),\n",
        "    'colsample_bytree' : hp.choice('colsample_bytree', [i/10.0 for i in range(3,10)]),\n",
        "    'reg_alpha' : hp.choice('reg_alpha', [1e-5, 1e-2, 0.1, 1, 10, 100]),\n",
        "    'reg_lambda' : hp.choice('reg_lambda', [1e-5, 1e-2, 0.1, 1, 10, 100])\n",
        "}\n",
        "\n",
        "# Set up the k-fold cross-validation\n",
        "kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)"
      ],
      "metadata": {
        "id": "4_i8yFpldQGT"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def objective(params):\n",
        "\n",
        "    xgboost = XGBClassifier(seed=0, **params)\n",
        "    scores = cross_val_score(xgboost, X_train, y_train, cv=kfold, scoring='recall', n_jobs=-1)\n",
        "    # Extract the best score\n",
        "    best_score = max(scores)\n",
        "    # Loss must be minimized\n",
        "    loss = - best_score\n",
        "    # Dictionary with information for evaluation\n",
        "    return {'loss': loss, 'params': params, 'status': STATUS_OK}\n",
        "# Trials to track progress\n",
        "bayes_trials = Trials()\n",
        "# Optimize\n",
        "best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 50, trials = bayes_trials)"
      ],
      "metadata": {
        "id": "eF7ixSfxdUXA"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Print the index of the best parameters\n",
        "print(best)\n",
        "# Print the values of the best parameters\n",
        "print(space_eval(space, best))"
      ],
      "metadata": {
        "id": "xlazRZH5dbRj"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Train model using the best parameters\n",
        "xgboost_bo = XGBClassifier(seed=0,\n",
        "                           colsample_bytree=0.4,\n",
        "                           gamma=0.0,\n",
        "                           learning_rate=0.1,\n",
        "                           max_depth=12,\n",
        "                           reg_alpha=1e-05,\n",
        "                           reg_lambda=0.01\n",
        "                           ).fit(X_train,y_train)"
      ],
      "metadata": {
        "id": "FvzH_hhadecb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Make prediction using the best model\n",
        "preds = xgboost_bo.predict(X_test)"
      ],
      "metadata": {
        "id": "1kzFkqvcdjgh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Get performance metrics\n",
        "precision, recall, fscore, support = score(y_test, preds)\n",
        "# Print result\n",
        "print(f'The recall value for the xgboost Bayesian optimization is {recall[1]:.4f}')"
      ],
      "metadata": {
        "id": "RAtyASyVdmgX"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "preds = pd.DataFrame(preds, columns=[\"storms_prediction\"])\n",
        "preds['Date'] = pd.date_range(start='01/01/2016', periods=len(preds), freq='D')\n",
        "preds.Date = pd.to_datetime(preds.Date)\n",
        "preds = preds.set_index(\"Date\")"
      ],
      "metadata": {
        "id": "CMeN5p6Zdpdx"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "test = pd.concat([test,preds], axis=1)\n",
        "test"
      ],
      "metadata": {
        "id": "frJ4unUIdw2l"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.metrics import confusion_matrix\n",
        "conf_matrix = confusion_matrix(y_test, preds)\n",
        "\n",
        "LABELS = [\"No storm\",\"Storm\"]\n",
        "\n",
        "plt.figure(figsize=(6, 6))\n",
        "sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt=\"d\");\n",
        "plt.title(\"Confusion matrix\")\n",
        "plt.ylabel('True class')\n",
        "plt.xlabel('Predicted class')\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "Z0vQ0NFzd2RI"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "TP = conf_matrix[1][1]\n",
        "TN = conf_matrix[0][0]\n",
        "FP = conf_matrix[0][1]\n",
        "FN = conf_matrix[1][0]\n",
        "print('True Positives:', TP)\n",
        "print('True Negatives:', TN)\n",
        "print('False Positives:', FP)\n",
        "print('False Negatives:', FN)"
      ],
      "metadata": {
        "id": "2qbrvyefd5ls"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# calculate accuracy\n",
        "conf_accuracy = (float (TP+TN) / float(TP + TN + FP + FN))\n",
        "\n",
        "# calculate mis-classification\n",
        "conf_misclassification = 1- conf_accuracy\n",
        "\n",
        "# calculate the sensitivity\n",
        "conf_sensitivity = (TP / float(TP + FN))\n",
        "\n",
        "# calculate the specificity\n",
        "conf_specificity = (TN / float(TN + FP))\n",
        "\n",
        "# calculate precision\n",
        "conf_precision = (TN / float(TN + FP))\n",
        "\n",
        "# calculate f_1 score\n",
        "conf_f1 = 2 * ((conf_precision * conf_sensitivity) / (conf_precision + conf_sensitivity))\n",
        "\n",
        "\n",
        "print('-'*50)\n",
        "print(f'Accuracy: {round(conf_accuracy,2)}')\n",
        "print(f'Mis-Classification: {round(conf_misclassification,2)}')\n",
        "print(f'Sensitivity: {round(conf_sensitivity,2)}')\n",
        "print(f'Specificity: {round(conf_specificity,2)}')\n",
        "print(f'Precision: {round(conf_precision,2)}')\n",
        "print(f'f_1 Score: {round(conf_f1,2)}')"
      ],
      "metadata": {
        "id": "hr5w7iAad6bZ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Compute micro-average ROC curve and ROC area\n",
        "from sklearn.metrics import roc_curve, auc\n",
        "fpr, tpr, _ = roc_curve(y_test, preds)\n",
        "roc_auc = auc(fpr, tpr)"
      ],
      "metadata": {
        "id": "qVFEULqieAw9"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "plt.figure()\n",
        "plt.figure(figsize=(20,10))\n",
        "lw = 2\n",
        "plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)\n",
        "plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')\n",
        "plt.xlim([-0.02, 1.0])\n",
        "plt.ylim([0.0, 1.05])\n",
        "plt.xlabel('False Positive Rate')\n",
        "plt.ylabel('True Positive Rate')\n",
        "plt.title('ROC curve')\n",
        "plt.legend(loc=\"lower right\")\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "Ux2KU2eKeDrw"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "plt.figure(figsize=(20,8))\n",
        "plt.plot(test.index, test['Storm'], color = 'b', label=\"True storms\", marker='o', markersize=8, linestyle='dashed')\n",
        "plt.plot(test.index, test['storms_prediction'], color = 'lightsalmon', label=\"Predicted storms\", marker='o', markersize=8, linestyle='dashed')\n",
        "plt.title('Prediction of storms using XGBoost model')\n",
        "plt.legend(loc='best', fontsize='large')\n",
        "plt.xticks(fontsize=18)\n",
        "plt.yticks(fontsize=16)\n",
        "plt.xlabel(\"Date time\")\n",
        "plt.ylabel(\"Storms\")\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "i4IjAT3LeG5Z"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}