Data-Science-Assignment / data_preprocessing_cardiovascular.ipynb
data_preprocessing_cardiovascular.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "8a6a621a",
   "metadata": {},
   "source": [
    "### <font color='#289C4E'>Table of contents</font><a class='anchor' id='top'></a>\n",
    "1. [Data Cleaning](#Data-cleaning)\n",
    "2. [Data Scaling](#Data-scaling)\n",
    "3. [Data Transformation](#Data-Transformation)\n",
    "4. [Data Reduction](#Data-reduction)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0d0582c1",
   "metadata": {},
   "source": [
    "# Data preprocessing"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "00afed32",
   "metadata": {},
   "source": [
    "## Importing dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "aae84b5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import libraries\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Read the dataset.\n",
    "# The path is written with forward slashes, which Windows accepts as a separator.\n",
    "# The backslash-separated form relied on invalid string escape sequences, which\n",
    "# recent Python versions warn about and which break as soon as a folder name\n",
    "# starts with a letter such as n or t.\n",
    "# NOTE(review): this is an absolute path on the author's machine; adjust it\n",
    "# before running the notebook elsewhere.\n",
    "dataset_path = \"C:/My/Top-up Degree/Data Science/Data Science - Assignment/Data set/Kaggle/Cardiovascular Diseases Risk Prediction Dataset/CVD_cleaned.csv\"\n",
    "data = pd.read_csv(dataset_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b1bb101",
   "metadata": {},
   "source": [
    "## Data cleaning"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4de2bc46",
   "metadata": {},
   "source": [
    "### Fixing any missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6551a76e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'No missing values found'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from IPython.display import display\n",
    "\n",
    "# Share of missing entries per column, expressed as a percentage\n",
    "missing_data_ratio = data.isna().mean().mul(100)\n",
    "\n",
    "# One-column DataFrame so the ratios render as a table\n",
    "missing_data_df = missing_data_ratio.to_frame(name='Missing Data Ratio (%)')\n",
    "\n",
    "# True when at least one column has a non-zero missing ratio\n",
    "has_missing_values = missing_data_df['Missing Data Ratio (%)'].any()\n",
    "\n",
    "if not has_missing_values:\n",
    "    # Nothing to report\n",
    "    display('No missing values found')\n",
    "else:\n",
    "    # Lift the row limit so the ratio of every column is shown\n",
    "    with pd.option_context('display.max_rows', None):\n",
    "        display(missing_data_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c1ead635",
   "metadata": {},
   "source": [
    "### Removing duplicates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "03ea64c1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of duplicates: 80\n"
     ]
    }
   ],
   "source": [
    "# Number of rows that exactly repeat an earlier row\n",
    "duplicate_count = data.duplicated().sum()\n",
    "\n",
    "print(f\"Number of duplicates: {duplicate_count}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c5988257",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the first occurrence of each row and discard its exact repeats\n",
    "data = data.drop_duplicates(keep='first')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "248702e3",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of duplicates: 0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(308774, 19)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Re-count after the removal step\n",
    "duplicate_count = data.duplicated().sum()\n",
    "\n",
    "print(f\"Number of duplicates: {duplicate_count}\")\n",
    "\n",
    "# (rows, columns) of the de-duplicated data\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22db0341",
   "metadata": {},
   "source": [
    "### Dealing with outliers (Quantile Capping for columns with outliers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "5f7291ba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(308774, 19)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lower_percentile = 1\n",
    "upper_percentile = 99\n",
    "\n",
    "# Columns whose extreme values are capped\n",
    "capped_columns = ['Height_(cm)', 'Weight_(kg)', 'BMI']\n",
    "\n",
    "# Per-column cut-offs at the lower and upper percentiles\n",
    "lower_bound_dia = data[capped_columns].quantile(lower_percentile / 100)\n",
    "upper_bound_dia = data[capped_columns].quantile(upper_percentile / 100)\n",
    "\n",
    "# Clip every column to its own bounds; axis=1 aligns the bounds with the column labels\n",
    "data[capped_columns] = data[capped_columns].clip(lower=lower_bound_dia, upper=upper_bound_dia, axis=1)\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "343bda42",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Height_(cm)</th>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>308774.000000</td>\n",
       "      <td>308774.000000</td>\n",
       "      <td>308774.000000</td>\n",
       "      <td>308774.000000</td>\n",
       "      <td>308774.000000</td>\n",
       "      <td>308774.000000</td>\n",
       "      <td>308774.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>170.625367</td>\n",
       "      <td>83.432232</td>\n",
       "      <td>28.575602</td>\n",
       "      <td>5.097557</td>\n",
       "      <td>29.834290</td>\n",
       "      <td>15.109517</td>\n",
       "      <td>6.297237</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>10.382607</td>\n",
       "      <td>20.509949</td>\n",
       "      <td>6.223295</td>\n",
       "      <td>8.200434</td>\n",
       "      <td>24.877812</td>\n",
       "      <td>14.926912</td>\n",
       "      <td>8.583837</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>150.000000</td>\n",
       "      <td>47.630000</td>\n",
       "      <td>18.010000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>163.000000</td>\n",
       "      <td>68.040000</td>\n",
       "      <td>24.210000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>170.000000</td>\n",
       "      <td>81.650000</td>\n",
       "      <td>27.440000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>30.000000</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>4.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>178.000000</td>\n",
       "      <td>95.250000</td>\n",
       "      <td>31.850000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>30.000000</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>8.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>193.000000</td>\n",
       "      <td>147.420000</td>\n",
       "      <td>49.492700</td>\n",
       "      <td>30.000000</td>\n",
       "      <td>120.000000</td>\n",
       "      <td>128.000000</td>\n",
       "      <td>128.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         Height_(cm)    Weight_(kg)            BMI  Alcohol_Consumption  \\\n",
       "count  308774.000000  308774.000000  308774.000000        308774.000000   \n",
       "mean      170.625367      83.432232      28.575602             5.097557   \n",
       "std        10.382607      20.509949       6.223295             8.200434   \n",
       "min       150.000000      47.630000      18.010000             0.000000   \n",
       "25%       163.000000      68.040000      24.210000             0.000000   \n",
       "50%       170.000000      81.650000      27.440000             1.000000   \n",
       "75%       178.000000      95.250000      31.850000             6.000000   \n",
       "max       193.000000     147.420000      49.492700            30.000000   \n",
       "\n",
       "       Fruit_Consumption  Green_Vegetables_Consumption  \\\n",
       "count      308774.000000                 308774.000000   \n",
       "mean           29.834290                     15.109517   \n",
       "std            24.877812                     14.926912   \n",
       "min             0.000000                      0.000000   \n",
       "25%            12.000000                      4.000000   \n",
       "50%            30.000000                     12.000000   \n",
       "75%            30.000000                     20.000000   \n",
       "max           120.000000                    128.000000   \n",
       "\n",
       "       FriedPotato_Consumption  \n",
       "count            308774.000000  \n",
       "mean                  6.297237  \n",
       "std                   8.583837  \n",
       "min                   0.000000  \n",
       "25%                   2.000000  \n",
       "50%                   4.000000  \n",
       "75%                   8.000000  \n",
       "max                 128.000000  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics of the numeric columns after capping\n",
    "data.describe(percentiles=[0.25, 0.5, 0.75])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8b1ad678",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>General_Health</th>\n",
       "      <th>Checkup</th>\n",
       "      <th>Exercise</th>\n",
       "      <th>Heart_Disease</th>\n",
       "      <th>Skin_Cancer</th>\n",
       "      <th>Other_Cancer</th>\n",
       "      <th>Depression</th>\n",
       "      <th>Diabetes</th>\n",
       "      <th>Arthritis</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age_Category</th>\n",
       "      <th>Height_(cm)</th>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Smoking_History</th>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Poor</td>\n",
       "      <td>Within the past 2 years</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Female</td>\n",
       "      <td>70-74</td>\n",
       "      <td>150.0</td>\n",
       "      <td>47.63</td>\n",
       "      <td>18.01</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>16.0</td>\n",
       "      <td>12.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Very Good</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Female</td>\n",
       "      <td>70-74</td>\n",
       "      <td>165.0</td>\n",
       "      <td>77.11</td>\n",
       "      <td>28.29</td>\n",
       "      <td>No</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Very Good</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Female</td>\n",
       "      <td>60-64</td>\n",
       "      <td>163.0</td>\n",
       "      <td>88.45</td>\n",
       "      <td>33.47</td>\n",
       "      <td>No</td>\n",
       "      <td>4.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>16.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Poor</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Male</td>\n",
       "      <td>75-79</td>\n",
       "      <td>180.0</td>\n",
       "      <td>93.44</td>\n",
       "      <td>28.73</td>\n",
       "      <td>No</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Good</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Male</td>\n",
       "      <td>80+</td>\n",
       "      <td>191.0</td>\n",
       "      <td>88.45</td>\n",
       "      <td>24.37</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  General_Health                  Checkup Exercise Heart_Disease Skin_Cancer  \\\n",
       "0           Poor  Within the past 2 years       No            No          No   \n",
       "1      Very Good     Within the past year       No           Yes          No   \n",
       "2      Very Good     Within the past year      Yes            No          No   \n",
       "3           Poor     Within the past year      Yes           Yes          No   \n",
       "4           Good     Within the past year       No            No          No   \n",
       "\n",
       "  Other_Cancer Depression Diabetes Arthritis     Sex Age_Category  \\\n",
       "0           No         No       No       Yes  Female        70-74   \n",
       "1           No         No      Yes        No  Female        70-74   \n",
       "2           No         No      Yes        No  Female        60-64   \n",
       "3           No         No      Yes        No    Male        75-79   \n",
       "4           No         No       No        No    Male          80+   \n",
       "\n",
       "   Height_(cm)  Weight_(kg)    BMI Smoking_History  Alcohol_Consumption  \\\n",
       "0        150.0        47.63  18.01             Yes                  0.0   \n",
       "1        165.0        77.11  28.29              No                  0.0   \n",
       "2        163.0        88.45  33.47              No                  4.0   \n",
       "3        180.0        93.44  28.73              No                  0.0   \n",
       "4        191.0        88.45  24.37             Yes                  0.0   \n",
       "\n",
       "   Fruit_Consumption  Green_Vegetables_Consumption  FriedPotato_Consumption  \n",
       "0               30.0                          16.0                     12.0  \n",
       "1               30.0                           0.0                      4.0  \n",
       "2               12.0                           3.0                     16.0  \n",
       "3               30.0                          30.0                      8.0  \n",
       "4                8.0                           4.0                      0.0  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the cleaned data\n",
    "data.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b4f827ea",
   "metadata": {},
   "source": [
    "#### Saving the cleaned Data for visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "afecdb55",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the cleaned data next to the notebook's working directory, without the row index\n",
    "cleaned_csv_path = 'cvd_data_cleaned.csv'\n",
    "data.to_csv(cleaned_csv_path, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "83dee627",
   "metadata": {},
   "source": [
    "## Data scaling"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9d2e68f2",
   "metadata": {},
   "source": [
    "### Using Normalization (MinMaxScaling)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "687fccfc",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-block alert-info\">\n",
    "<b>Note:</b> This can help improve the performance of some machine learning algorithms that operate on a linear space or use a distance metric, such as KNN, linear regression, or K-means.\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "7c347866",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>General_Health</th>\n",
       "      <th>Checkup</th>\n",
       "      <th>Exercise</th>\n",
       "      <th>Heart_Disease</th>\n",
       "      <th>Skin_Cancer</th>\n",
       "      <th>Other_Cancer</th>\n",
       "      <th>Depression</th>\n",
       "      <th>Diabetes</th>\n",
       "      <th>Arthritis</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age_Category</th>\n",
       "      <th>Height_(cm)</th>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Smoking_History</th>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Poor</td>\n",
       "      <td>Within the past 2 years</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Female</td>\n",
       "      <td>70-74</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.09375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Very Good</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Female</td>\n",
       "      <td>70-74</td>\n",
       "      <td>0.348837</td>\n",
       "      <td>0.295420</td>\n",
       "      <td>0.326529</td>\n",
       "      <td>No</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.03125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Very Good</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Female</td>\n",
       "      <td>60-64</td>\n",
       "      <td>0.302326</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.491063</td>\n",
       "      <td>No</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.023438</td>\n",
       "      <td>0.12500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Poor</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Male</td>\n",
       "      <td>75-79</td>\n",
       "      <td>0.697674</td>\n",
       "      <td>0.459064</td>\n",
       "      <td>0.340504</td>\n",
       "      <td>No</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.234375</td>\n",
       "      <td>0.06250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Good</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Male</td>\n",
       "      <td>80+</td>\n",
       "      <td>0.953488</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.202016</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.066667</td>\n",
       "      <td>0.031250</td>\n",
       "      <td>0.00000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  General_Health                  Checkup Exercise Heart_Disease Skin_Cancer  \\\n",
       "0           Poor  Within the past 2 years       No            No          No   \n",
       "1      Very Good     Within the past year       No           Yes          No   \n",
       "2      Very Good     Within the past year      Yes            No          No   \n",
       "3           Poor     Within the past year      Yes           Yes          No   \n",
       "4           Good     Within the past year       No            No          No   \n",
       "\n",
       "  Other_Cancer Depression Diabetes Arthritis     Sex Age_Category  \\\n",
       "0           No         No       No       Yes  Female        70-74   \n",
       "1           No         No      Yes        No  Female        70-74   \n",
       "2           No         No      Yes        No  Female        60-64   \n",
       "3           No         No      Yes        No    Male        75-79   \n",
       "4           No         No       No        No    Male          80+   \n",
       "\n",
       "   Height_(cm)  Weight_(kg)       BMI Smoking_History  Alcohol_Consumption  \\\n",
       "0     0.000000     0.000000  0.000000             Yes             0.000000   \n",
       "1     0.348837     0.295420  0.326529              No             0.000000   \n",
       "2     0.302326     0.409059  0.491063              No             0.133333   \n",
       "3     0.697674     0.459064  0.340504              No             0.000000   \n",
       "4     0.953488     0.409059  0.202016             Yes             0.000000   \n",
       "\n",
       "   Fruit_Consumption  Green_Vegetables_Consumption  FriedPotato_Consumption  \n",
       "0           0.250000                      0.125000                  0.09375  \n",
       "1           0.250000                      0.000000                  0.03125  \n",
       "2           0.100000                      0.023438                  0.12500  \n",
       "3           0.250000                      0.234375                  0.06250  \n",
       "4           0.066667                      0.031250                  0.00000  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "# Numeric columns to rescale; all other columns are left untouched\n",
    "numerical_features = [\n",
    "    'Height_(cm)',\n",
    "    'Weight_(kg)',\n",
    "    'BMI',\n",
    "    'Alcohol_Consumption',\n",
    "    'Fruit_Consumption',\n",
    "    'Green_Vegetables_Consumption',\n",
    "    'FriedPotato_Consumption',\n",
    "]\n",
    "\n",
    "# Work on a copy so that `data` keeps its original values\n",
    "data_minmax_scaled = data.copy()\n",
    "\n",
    "# Learn the minimum and maximum of each column, then rescale the values with them\n",
    "scaler = MinMaxScaler().fit(data_minmax_scaled[numerical_features])\n",
    "data_minmax_scaled[numerical_features] = scaler.transform(data_minmax_scaled[numerical_features])\n",
    "\n",
    "data_minmax_scaled.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7fe1c75f",
   "metadata": {},
   "source": [
    "### Using standardization"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42aa8620",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-block alert-info\">\n",
    "<b>Note:</b> Standardization rescales each feature to zero mean and unit variance. It does not by itself remove outliers, but it is less compressed by extreme values than min-max scaling, and it can improve the performance of machine learning algorithms that are sensitive to feature scale, such as SVM, logistic regression, or PCA.\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "99b8ea8e",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>General_Health</th>\n",
       "      <th>Checkup</th>\n",
       "      <th>Exercise</th>\n",
       "      <th>Heart_Disease</th>\n",
       "      <th>Skin_Cancer</th>\n",
       "      <th>Other_Cancer</th>\n",
       "      <th>Depression</th>\n",
       "      <th>Diabetes</th>\n",
       "      <th>Arthritis</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age_Category</th>\n",
       "      <th>Height_(cm)</th>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Smoking_History</th>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Poor</td>\n",
       "      <td>Within the past 2 years</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Female</td>\n",
       "      <td>70-74</td>\n",
       "      <td>-1.986534</td>\n",
       "      <td>-1.745606</td>\n",
       "      <td>-1.697753</td>\n",
       "      <td>Yes</td>\n",
       "      <td>-0.621621</td>\n",
       "      <td>0.006661</td>\n",
       "      <td>0.059656</td>\n",
       "      <td>0.664362</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Very Good</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Female</td>\n",
       "      <td>70-74</td>\n",
       "      <td>-0.541808</td>\n",
       "      <td>-0.308252</td>\n",
       "      <td>-0.045893</td>\n",
       "      <td>No</td>\n",
       "      <td>-0.621621</td>\n",
       "      <td>0.006661</td>\n",
       "      <td>-1.012235</td>\n",
       "      <td>-0.267624</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Very Good</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Female</td>\n",
       "      <td>60-64</td>\n",
       "      <td>-0.734438</td>\n",
       "      <td>0.244651</td>\n",
       "      <td>0.786465</td>\n",
       "      <td>No</td>\n",
       "      <td>-0.133842</td>\n",
       "      <td>-0.716876</td>\n",
       "      <td>-0.811255</td>\n",
       "      <td>1.130355</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Poor</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Male</td>\n",
       "      <td>75-79</td>\n",
       "      <td>0.902919</td>\n",
       "      <td>0.487948</td>\n",
       "      <td>0.024810</td>\n",
       "      <td>No</td>\n",
       "      <td>-0.621621</td>\n",
       "      <td>0.006661</td>\n",
       "      <td>0.997561</td>\n",
       "      <td>0.198369</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Good</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Male</td>\n",
       "      <td>80+</td>\n",
       "      <td>1.962384</td>\n",
       "      <td>0.244651</td>\n",
       "      <td>-0.675785</td>\n",
       "      <td>Yes</td>\n",
       "      <td>-0.621621</td>\n",
       "      <td>-0.877663</td>\n",
       "      <td>-0.744262</td>\n",
       "      <td>-0.733617</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  General_Health                  Checkup Exercise Heart_Disease Skin_Cancer  \\\n",
       "0           Poor  Within the past 2 years       No            No          No   \n",
       "1      Very Good     Within the past year       No           Yes          No   \n",
       "2      Very Good     Within the past year      Yes            No          No   \n",
       "3           Poor     Within the past year      Yes           Yes          No   \n",
       "4           Good     Within the past year       No            No          No   \n",
       "\n",
       "  Other_Cancer Depression Diabetes Arthritis     Sex Age_Category  \\\n",
       "0           No         No       No       Yes  Female        70-74   \n",
       "1           No         No      Yes        No  Female        70-74   \n",
       "2           No         No      Yes        No  Female        60-64   \n",
       "3           No         No      Yes        No    Male        75-79   \n",
       "4           No         No       No        No    Male          80+   \n",
       "\n",
       "   Height_(cm)  Weight_(kg)       BMI Smoking_History  Alcohol_Consumption  \\\n",
       "0    -1.986534    -1.745606 -1.697753             Yes            -0.621621   \n",
       "1    -0.541808    -0.308252 -0.045893              No            -0.621621   \n",
       "2    -0.734438     0.244651  0.786465              No            -0.133842   \n",
       "3     0.902919     0.487948  0.024810              No            -0.621621   \n",
       "4     1.962384     0.244651 -0.675785             Yes            -0.621621   \n",
       "\n",
       "   Fruit_Consumption  Green_Vegetables_Consumption  FriedPotato_Consumption  \n",
       "0           0.006661                      0.059656                 0.664362  \n",
       "1           0.006661                     -1.012235                -0.267624  \n",
       "2          -0.716876                     -0.811255                 1.130355  \n",
       "3           0.006661                      0.997561                 0.198369  \n",
       "4          -0.877663                     -0.744262                -0.733617  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Numerical columns to standardise; every other column is left as it is\n",
    "numerical_features = [\n",
    "    'Height_(cm)',\n",
    "    'Weight_(kg)',\n",
    "    'BMI',\n",
    "    'Alcohol_Consumption',\n",
    "    'Fruit_Consumption',\n",
    "    'Green_Vegetables_Consumption',\n",
    "    'FriedPotato_Consumption',\n",
    "]\n",
    "\n",
    "# Scale a copy so that `data` itself is not modified by this cell\n",
    "data_standard_scaled = data.copy()\n",
    "\n",
    "# StandardScaler removes each column's mean and divides by its standard deviation\n",
    "scaler = StandardScaler()\n",
    "data_standard_scaled[numerical_features] = scaler.fit_transform(data[numerical_features])\n",
    "\n",
    "data_standard_scaled.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9b606d7",
   "metadata": {},
   "source": [
    "### Using Robust Scaling technique"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c992841a",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-block alert-info\">\n",
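    "<b>Note:</b> Robust scaling centers each feature on its median and scales it by its interquartile range, so extreme values affect the result less than they do with standard or min-max scaling. This can help improve the performance of machine learning algorithms that are sensitive to outliers, such as DBSCAN, linear models with regularization, or neural networks.\n",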
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "cdb11ee9",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>General_Health</th>\n",
       "      <th>Checkup</th>\n",
       "      <th>Exercise</th>\n",
       "      <th>Heart_Disease</th>\n",
       "      <th>Skin_Cancer</th>\n",
       "      <th>Other_Cancer</th>\n",
       "      <th>Depression</th>\n",
       "      <th>Diabetes</th>\n",
       "      <th>Arthritis</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age_Category</th>\n",
       "      <th>Height_(cm)</th>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Smoking_History</th>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Poor</td>\n",
       "      <td>Within the past 2 years</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Female</td>\n",
       "      <td>70-74</td>\n",
       "      <td>-1.333333</td>\n",
       "      <td>-1.250276</td>\n",
       "      <td>-1.234293</td>\n",
       "      <td>Yes</td>\n",
       "      <td>-0.166667</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.2500</td>\n",
       "      <td>1.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Very Good</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Female</td>\n",
       "      <td>70-74</td>\n",
       "      <td>-0.333333</td>\n",
       "      <td>-0.166850</td>\n",
       "      <td>0.111257</td>\n",
       "      <td>No</td>\n",
       "      <td>-0.166667</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.7500</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Very Good</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Female</td>\n",
       "      <td>60-64</td>\n",
       "      <td>-0.466667</td>\n",
       "      <td>0.249908</td>\n",
       "      <td>0.789267</td>\n",
       "      <td>No</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>-0.5625</td>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Poor</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Male</td>\n",
       "      <td>75-79</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.433297</td>\n",
       "      <td>0.168848</td>\n",
       "      <td>No</td>\n",
       "      <td>-0.166667</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.1250</td>\n",
       "      <td>0.666667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Good</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Male</td>\n",
       "      <td>80+</td>\n",
       "      <td>1.400000</td>\n",
       "      <td>0.249908</td>\n",
       "      <td>-0.401832</td>\n",
       "      <td>Yes</td>\n",
       "      <td>-0.166667</td>\n",
       "      <td>-1.222222</td>\n",
       "      <td>-0.5000</td>\n",
       "      <td>-0.666667</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  General_Health                  Checkup Exercise Heart_Disease Skin_Cancer  \\\n",
       "0           Poor  Within the past 2 years       No            No          No   \n",
       "1      Very Good     Within the past year       No           Yes          No   \n",
       "2      Very Good     Within the past year      Yes            No          No   \n",
       "3           Poor     Within the past year      Yes           Yes          No   \n",
       "4           Good     Within the past year       No            No          No   \n",
       "\n",
       "  Other_Cancer Depression Diabetes Arthritis     Sex Age_Category  \\\n",
       "0           No         No       No       Yes  Female        70-74   \n",
       "1           No         No      Yes        No  Female        70-74   \n",
       "2           No         No      Yes        No  Female        60-64   \n",
       "3           No         No      Yes        No    Male        75-79   \n",
       "4           No         No       No        No    Male          80+   \n",
       "\n",
       "   Height_(cm)  Weight_(kg)       BMI Smoking_History  Alcohol_Consumption  \\\n",
       "0    -1.333333    -1.250276 -1.234293             Yes            -0.166667   \n",
       "1    -0.333333    -0.166850  0.111257              No            -0.166667   \n",
       "2    -0.466667     0.249908  0.789267              No             0.500000   \n",
       "3     0.666667     0.433297  0.168848              No            -0.166667   \n",
       "4     1.400000     0.249908 -0.401832             Yes            -0.166667   \n",
       "\n",
       "   Fruit_Consumption  Green_Vegetables_Consumption  FriedPotato_Consumption  \n",
       "0           0.000000                        0.2500                 1.333333  \n",
       "1           0.000000                       -0.7500                 0.000000  \n",
       "2          -1.000000                       -0.5625                 2.000000  \n",
       "3           0.000000                        1.1250                 0.666667  \n",
       "4          -1.222222                       -0.5000                -0.666667  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Import pandas and sklearn libraries\n",
    "import pandas as pd\n",
    "from sklearn.preprocessing import RobustScaler\n",
    "\n",
    "# Numerical columns to scale; every other column is left as it is\n",
    "numerical_features = [\n",
    "    'Height_(cm)',\n",
    "    'Weight_(kg)',\n",
    "    'BMI',\n",
    "    'Alcohol_Consumption',\n",
    "    'Fruit_Consumption',\n",
    "    'Green_Vegetables_Consumption',\n",
    "    'FriedPotato_Consumption',\n",
    "]\n",
    "\n",
    "# Create a new dataframe for robust scaling so that `data` itself is not modified by this cell\n",
    "data_robust_scaled = data.copy()\n",
    "\n",
    "# RobustScaler subtracts each column's median and divides by its interquartile range,\n",
    "# which makes the result less sensitive to outliers than mean / standard-deviation scaling\n",
    "scaler = RobustScaler()\n",
    "data_robust_scaled[numerical_features] = scaler.fit_transform(data_robust_scaled[numerical_features])\n",
    "\n",
    "data_robust_scaled.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2b60c5fc",
   "metadata": {},
   "source": [
    "### Using the chosen scaling technique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "473e8878",
   "metadata": {},
   "outputs": [],
   "source": [
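    "# Overwrite the original data dataframe with the Min-Max scaled dataframe (the chosen technique).\n",
    "# Note: once this has run, re-running the Standard/Robust scaling cells above would scale the already-scaled values.\n",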
    "data = data_minmax_scaled"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e706df4e",
   "metadata": {},
   "source": [
    "## Data Transformation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9b959c9",
   "metadata": {},
   "source": [
    "### Checking the abnormal data before transforming them"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "5a28862c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Very Good    110351\n",
      "Good          95355\n",
      "Excellent     55929\n",
      "Fair          35808\n",
      "Poor          11331\n",
      "Name: General_Health, dtype: int64\n",
      "\n",
      "Within the past year       239295\n",
      "Within the past 2 years     37210\n",
      "Within the past 5 years     17442\n",
      "5 or more years ago         13420\n",
      "Never                        1407\n",
      "Name: Checkup, dtype: int64\n",
      "\n",
      "Yes    239305\n",
      "No      69469\n",
      "Name: Exercise, dtype: int64\n",
      "\n",
      "No     283803\n",
      "Yes     24971\n",
      "Name: Heart_Disease, dtype: int64\n",
      "\n",
      "No     278782\n",
      "Yes     29992\n",
      "Name: Skin_Cancer, dtype: int64\n",
      "\n",
      "No     278897\n",
      "Yes     29877\n",
      "Name: Other_Cancer, dtype: int64\n",
      "\n",
      "No     246875\n",
      "Yes     61899\n",
      "Name: Depression, dtype: int64\n",
      "\n",
      "No                                            259062\n",
      "Yes                                            40170\n",
      "No, pre-diabetes or borderline diabetes         6896\n",
      "Yes, but female told only during pregnancy      2646\n",
      "Name: Diabetes, dtype: int64\n",
      "\n",
      "No     207711\n",
      "Yes    101063\n",
      "Name: Arthritis, dtype: int64\n",
      "\n",
      "Female    160155\n",
      "Male      148619\n",
      "Name: Sex, dtype: int64\n",
      "\n",
      "65-69    33425\n",
      "60-64    32409\n",
      "70-74    31099\n",
      "55-59    28048\n",
      "50-54    25090\n",
      "80+      22269\n",
      "40-44    21587\n",
      "45-49    20963\n",
      "75-79    20699\n",
      "35-39    20598\n",
      "18-24    18670\n",
      "30-34    18425\n",
      "25-29    15492\n",
      "Name: Age_Category, dtype: int64\n",
      "\n",
      "No     183516\n",
      "Yes    125258\n",
      "Name: Smoking_History, dtype: int64\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Categorical columns whose value frequencies are inspected before encoding\n",
    "cols = [\n",
    "    'General_Health', 'Checkup', 'Exercise', 'Heart_Disease',\n",
    "    'Skin_Cancer', 'Other_Cancer', 'Depression', 'Diabetes',\n",
    "    'Arthritis', 'Sex', 'Age_Category', 'Smoking_History',\n",
    "]\n",
    "\n",
    "# Print the value counts of every listed column, each followed by a blank line\n",
    "for column_name in cols:\n",
    "    print(data[column_name].value_counts())\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d5c057cf",
   "metadata": {},
   "source": [
    "### Data transformation using OrdinalEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "609fc321",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>General_Health</th>\n",
       "      <th>Checkup</th>\n",
       "      <th>Exercise</th>\n",
       "      <th>Heart_Disease</th>\n",
       "      <th>Skin_Cancer</th>\n",
       "      <th>Other_Cancer</th>\n",
       "      <th>Depression</th>\n",
       "      <th>Diabetes</th>\n",
       "      <th>Arthritis</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age_Category</th>\n",
       "      <th>Height_(cm)</th>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Smoking_History</th>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>Within the past 2 years</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>No</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>70-74</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.093750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>70-74</td>\n",
       "      <td>0.348837</td>\n",
       "      <td>0.295420</td>\n",
       "      <td>0.326529</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.031250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>60-64</td>\n",
       "      <td>0.302326</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.491063</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.023438</td>\n",
       "      <td>0.125000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>75-79</td>\n",
       "      <td>0.697674</td>\n",
       "      <td>0.459064</td>\n",
       "      <td>0.340504</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.234375</td>\n",
       "      <td>0.062500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>No</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>80+</td>\n",
       "      <td>0.953488</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.202016</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.066667</td>\n",
       "      <td>0.031250</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>No</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>60-64</td>\n",
       "      <td>0.767442</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.892554</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.093750</td>\n",
       "      <td>0.093750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>No</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>60-64</td>\n",
       "      <td>0.581395</td>\n",
       "      <td>0.222668</td>\n",
       "      <td>0.150241</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>No</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>65-69</td>\n",
       "      <td>0.348837</td>\n",
       "      <td>0.613589</td>\n",
       "      <td>0.696573</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.062500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>No</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>65-69</td>\n",
       "      <td>0.302326</td>\n",
       "      <td>0.249925</td>\n",
       "      <td>0.300165</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.093750</td>\n",
       "      <td>0.031250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Yes</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>70-74</td>\n",
       "      <td>0.302326</td>\n",
       "      <td>0.440926</td>\n",
       "      <td>0.529180</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.093750</td>\n",
       "      <td>0.007812</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   General_Health                  Checkup  Exercise  Heart_Disease  \\\n",
       "0             0.0  Within the past 2 years       0.0            0.0   \n",
       "1             3.0     Within the past year       0.0            1.0   \n",
       "2             3.0     Within the past year       1.0            0.0   \n",
       "3             0.0     Within the past year       1.0            1.0   \n",
       "4             2.0     Within the past year       0.0            0.0   \n",
       "5             2.0     Within the past year       0.0            0.0   \n",
       "6             1.0     Within the past year       1.0            1.0   \n",
       "7             2.0     Within the past year       1.0            0.0   \n",
       "8             1.0     Within the past year       0.0            0.0   \n",
       "9             1.0     Within the past year       0.0            0.0   \n",
       "\n",
       "   Skin_Cancer  Other_Cancer  Depression Diabetes  Arthritis  Sex  \\\n",
       "0          0.0           0.0         0.0       No        1.0  0.0   \n",
       "1          0.0           0.0         0.0      Yes        0.0  0.0   \n",
       "2          0.0           0.0         0.0      Yes        0.0  0.0   \n",
       "3          0.0           0.0         0.0      Yes        0.0  1.0   \n",
       "4          0.0           0.0         0.0       No        0.0  1.0   \n",
       "5          0.0           0.0         1.0       No        1.0  1.0   \n",
       "6          0.0           0.0         0.0       No        1.0  1.0   \n",
       "7          0.0           0.0         0.0       No        1.0  0.0   \n",
       "8          0.0           0.0         1.0       No        0.0  0.0   \n",
       "9          0.0           0.0         0.0      Yes        1.0  0.0   \n",
       "\n",
       "  Age_Category  Height_(cm)  Weight_(kg)       BMI  Smoking_History  \\\n",
       "0        70-74     0.000000     0.000000  0.000000              1.0   \n",
       "1        70-74     0.348837     0.295420  0.326529              0.0   \n",
       "2        60-64     0.302326     0.409059  0.491063              0.0   \n",
       "3        75-79     0.697674     0.459064  0.340504              0.0   \n",
       "4          80+     0.953488     0.409059  0.202016              1.0   \n",
       "5        60-64     0.767442     1.000000  0.892554              0.0   \n",
       "6        60-64     0.581395     0.222668  0.150241              1.0   \n",
       "7        65-69     0.348837     0.613589  0.696573              1.0   \n",
       "8        65-69     0.302326     0.249925  0.300165              1.0   \n",
       "9        70-74     0.302326     0.440926  0.529180              0.0   \n",
       "\n",
       "   Alcohol_Consumption  Fruit_Consumption  Green_Vegetables_Consumption  \\\n",
       "0             0.000000           0.250000                      0.125000   \n",
       "1             0.000000           0.250000                      0.000000   \n",
       "2             0.133333           0.100000                      0.023438   \n",
       "3             0.000000           0.250000                      0.234375   \n",
       "4             0.000000           0.066667                      0.031250   \n",
       "5             0.000000           0.100000                      0.093750   \n",
       "6             0.000000           0.133333                      0.062500   \n",
       "7             0.100000           0.250000                      0.062500   \n",
       "8             0.000000           0.100000                      0.093750   \n",
       "9             0.000000           0.100000                      0.093750   \n",
       "\n",
       "   FriedPotato_Consumption  \n",
       "0                 0.093750  \n",
       "1                 0.031250  \n",
       "2                 0.125000  \n",
       "3                 0.062500  \n",
       "4                 0.000000  \n",
       "5                 0.093750  \n",
       "6                 0.000000  \n",
       "7                 0.062500  \n",
       "8                 0.031250  \n",
       "9                 0.007812  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import OrdinalEncoder\n",
    "\n",
    "# Encode a copy so that `data` keeps its original category labels\n",
    "data_ordinal_encode = data.copy()\n",
    "\n",
    "# Two-valued columns: the default OrdinalEncoder numbers the sorted labels,\n",
    "# so 'No' -> 0.0 / 'Yes' -> 1.0 and 'Female' -> 0.0 / 'Male' -> 1.0\n",
    "binary_columns = [\n",
    "    'Exercise',\n",
    "    'Heart_Disease',\n",
    "    'Skin_Cancer',\n",
    "    'Other_Cancer',\n",
    "    'Depression',\n",
    "    'Sex',\n",
    "    'Arthritis',\n",
    "    'Smoking_History',\n",
    "]\n",
    "enc = OrdinalEncoder()\n",
    "for column_name in binary_columns:\n",
    "    # The encoder is re-fitted on every column, one column at a time\n",
    "    data_ordinal_encode[column_name] = enc.fit_transform(data_ordinal_encode[[column_name]])\n",
    "\n",
    "# General_Health has a natural order, so its ranking is given explicitly (Poor = 0.0 ... Excellent = 4.0)\n",
    "rank = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']\n",
    "oe = OrdinalEncoder(categories=[rank])\n",
    "data_ordinal_encode['General_Health'] = oe.fit_transform(data_ordinal_encode[['General_Health']])\n",
    "\n",
    "data_ordinal_encode.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "1f3bf4d0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>General_Health</th>\n",
       "      <th>Checkup</th>\n",
       "      <th>Exercise</th>\n",
       "      <th>Heart_Disease</th>\n",
       "      <th>Skin_Cancer</th>\n",
       "      <th>Other_Cancer</th>\n",
       "      <th>Depression</th>\n",
       "      <th>Diabetes</th>\n",
       "      <th>Arthritis</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age_Category</th>\n",
       "      <th>Height_(cm)</th>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Smoking_History</th>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>Within the past 2 years</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>70-74</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.09375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>70-74</td>\n",
       "      <td>0.348837</td>\n",
       "      <td>0.295420</td>\n",
       "      <td>0.326529</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.03125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>60-64</td>\n",
       "      <td>0.302326</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.491063</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.023438</td>\n",
       "      <td>0.12500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>75-79</td>\n",
       "      <td>0.697674</td>\n",
       "      <td>0.459064</td>\n",
       "      <td>0.340504</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.234375</td>\n",
       "      <td>0.06250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>80+</td>\n",
       "      <td>0.953488</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.202016</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.066667</td>\n",
       "      <td>0.031250</td>\n",
       "      <td>0.00000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   General_Health                  Checkup  Exercise  Heart_Disease  \\\n",
       "0             0.0  Within the past 2 years       0.0            0.0   \n",
       "1             3.0     Within the past year       0.0            1.0   \n",
       "2             3.0     Within the past year       1.0            0.0   \n",
       "3             0.0     Within the past year       1.0            1.0   \n",
       "4             2.0     Within the past year       0.0            0.0   \n",
       "\n",
       "   Skin_Cancer  Other_Cancer  Depression  Diabetes  Arthritis  Sex  \\\n",
       "0          0.0           0.0         0.0         0        1.0  0.0   \n",
       "1          0.0           0.0         0.0         1        0.0  0.0   \n",
       "2          0.0           0.0         0.0         1        0.0  0.0   \n",
       "3          0.0           0.0         0.0         1        0.0  1.0   \n",
       "4          0.0           0.0         0.0         0        0.0  1.0   \n",
       "\n",
       "  Age_Category  Height_(cm)  Weight_(kg)       BMI  Smoking_History  \\\n",
       "0        70-74     0.000000     0.000000  0.000000              1.0   \n",
       "1        70-74     0.348837     0.295420  0.326529              0.0   \n",
       "2        60-64     0.302326     0.409059  0.491063              0.0   \n",
       "3        75-79     0.697674     0.459064  0.340504              0.0   \n",
       "4          80+     0.953488     0.409059  0.202016              1.0   \n",
       "\n",
       "   Alcohol_Consumption  Fruit_Consumption  Green_Vegetables_Consumption  \\\n",
       "0             0.000000           0.250000                      0.125000   \n",
       "1             0.000000           0.250000                      0.000000   \n",
       "2             0.133333           0.100000                      0.023438   \n",
       "3             0.000000           0.250000                      0.234375   \n",
       "4             0.000000           0.066667                      0.031250   \n",
       "\n",
       "   FriedPotato_Consumption  \n",
       "0                  0.09375  \n",
       "1                  0.03125  \n",
       "2                  0.12500  \n",
       "3                  0.06250  \n",
       "4                  0.00000  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collapse the four Diabetes answers into a binary flag (1 = yes, 0 = no).\n",
    "# Pre-diabetes/borderline answers are grouped with 'No'; the pregnancy-only\n",
    "# answer is grouped with 'Yes'.\n",
    "diabetes_to_binary = {\n",
    "    'No': 0,\n",
    "    'No, pre-diabetes or borderline diabetes': 0,\n",
    "    'Yes': 1,\n",
    "    'Yes, but female told only during pregnancy': 1,\n",
    "}\n",
    "data_ordinal_encode['Diabetes'] = data_ordinal_encode['Diabetes'].replace(diabetes_to_binary)\n",
    "\n",
    "# Preview the frame with the recoded Diabetes column.\n",
    "data_ordinal_encode.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "3d272aa5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "65-69    33425\n",
       "60-64    32409\n",
       "70-74    31099\n",
       "55-59    28048\n",
       "50-54    25090\n",
       "80+      22269\n",
       "40-44    21587\n",
       "45-49    20963\n",
       "75-79    20699\n",
       "35-39    20598\n",
       "18-24    18670\n",
       "30-34    18425\n",
       "25-29    15492\n",
       "Name: Age_Category, dtype: int64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count how many rows carry each Age_Category label.\n",
    "data_ordinal_encode['Age_Category'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "14468652",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>General_Health</th>\n",
       "      <th>Checkup</th>\n",
       "      <th>Exercise</th>\n",
       "      <th>Heart_Disease</th>\n",
       "      <th>Skin_Cancer</th>\n",
       "      <th>Other_Cancer</th>\n",
       "      <th>Depression</th>\n",
       "      <th>Diabetes</th>\n",
       "      <th>Arthritis</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age_Category</th>\n",
       "      <th>Height_(cm)</th>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Smoking_History</th>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>Within the past 2 years</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.09375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.348837</td>\n",
       "      <td>0.295420</td>\n",
       "      <td>0.326529</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.03125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>0.302326</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.491063</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.023438</td>\n",
       "      <td>0.12500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>11.0</td>\n",
       "      <td>0.697674</td>\n",
       "      <td>0.459064</td>\n",
       "      <td>0.340504</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.234375</td>\n",
       "      <td>0.06250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2.0</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>0.953488</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.202016</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.066667</td>\n",
       "      <td>0.031250</td>\n",
       "      <td>0.00000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   General_Health                  Checkup  Exercise  Heart_Disease  \\\n",
       "0             0.0  Within the past 2 years       0.0            0.0   \n",
       "1             3.0     Within the past year       0.0            1.0   \n",
       "2             3.0     Within the past year       1.0            0.0   \n",
       "3             0.0     Within the past year       1.0            1.0   \n",
       "4             2.0     Within the past year       0.0            0.0   \n",
       "\n",
       "   Skin_Cancer  Other_Cancer  Depression  Diabetes  Arthritis  Sex  \\\n",
       "0          0.0           0.0         0.0         0        1.0  0.0   \n",
       "1          0.0           0.0         0.0         1        0.0  0.0   \n",
       "2          0.0           0.0         0.0         1        0.0  0.0   \n",
       "3          0.0           0.0         0.0         1        0.0  1.0   \n",
       "4          0.0           0.0         0.0         0        0.0  1.0   \n",
       "\n",
       "   Age_Category  Height_(cm)  Weight_(kg)       BMI  Smoking_History  \\\n",
       "0          10.0     0.000000     0.000000  0.000000              1.0   \n",
       "1          10.0     0.348837     0.295420  0.326529              0.0   \n",
       "2           8.0     0.302326     0.409059  0.491063              0.0   \n",
       "3          11.0     0.697674     0.459064  0.340504              0.0   \n",
       "4          12.0     0.953488     0.409059  0.202016              1.0   \n",
       "\n",
       "   Alcohol_Consumption  Fruit_Consumption  Green_Vegetables_Consumption  \\\n",
       "0             0.000000           0.250000                      0.125000   \n",
       "1             0.000000           0.250000                      0.000000   \n",
       "2             0.133333           0.100000                      0.023438   \n",
       "3             0.000000           0.250000                      0.234375   \n",
       "4             0.000000           0.066667                      0.031250   \n",
       "\n",
       "   FriedPotato_Consumption  \n",
       "0                  0.09375  \n",
       "1                  0.03125  \n",
       "2                  0.12500  \n",
       "3                  0.06250  \n",
       "4                  0.00000  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Encode Age_Category as ordinal codes. No explicit category order is given,\n",
    "# so OrdinalEncoder derives the order from the labels found in the column\n",
    "# (the resulting mapping is available in encoder.categories_).\n",
    "encoder = OrdinalEncoder()\n",
    "age_codes = encoder.fit_transform(data_ordinal_encode[['Age_Category']])\n",
    "\n",
    "# Overwrite the string labels with their numeric codes and preview the frame.\n",
    "data_ordinal_encode['Age_Category'] = age_codes\n",
    "data_ordinal_encode.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "8e8fb07f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 -> 18-24\n",
      "1 -> 25-29\n",
      "2 -> 30-34\n",
      "3 -> 35-39\n",
      "4 -> 40-44\n",
      "5 -> 45-49\n",
      "6 -> 50-54\n",
      "7 -> 55-59\n",
      "8 -> 60-64\n",
      "9 -> 65-69\n",
      "10 -> 70-74\n",
      "11 -> 75-79\n",
      "12 -> 80+\n"
     ]
    }
   ],
   "source": [
    "# Print each integer code next to the category label it stands for.\n",
    "for code, label in enumerate(encoder.categories_[0]):\n",
    "    print(f\"{code} -> {label}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "c1eddf5c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[array(['Never', 'Within the past year', 'Within the past 2 years',\n",
      "       'Within the past 5 years', '5 or more years ago'], dtype=object)]\n"
     ]
    }
   ],
   "source": [
    "# Encode Checkup by how long ago the last checkup took place, most recent first.\n",
    "# 'Never' goes last: listing it first would give it code 0, directly next to\n",
    "# 'Within the past year', although it is the opposite end of the scale.\n",
    "checkup_order = [\n",
    "    'Within the past year',\n",
    "    'Within the past 2 years',\n",
    "    'Within the past 5 years',\n",
    "    '5 or more years ago',\n",
    "    'Never',\n",
    "]\n",
    "encoder = OrdinalEncoder(categories=[checkup_order])\n",
    "\n",
    "# fit and transform the column (the encoder expects a 2-D input, hence the reshape)\n",
    "encoded_column = encoder.fit_transform(data_ordinal_encode['Checkup'].values.reshape(-1, 1))\n",
    "\n",
    "# assign the encoded column back to the dataframe\n",
    "data_ordinal_encode['Checkup'] = encoded_column\n",
    "\n",
    "# show the category order the encoder used (position in the list == code)\n",
    "print(encoder.categories_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "d3cc376d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 -> Never\n",
      "1 -> Within the past year\n",
      "2 -> Within the past 2 years\n",
      "3 -> Within the past 5 years\n",
      "4 -> 5 or more years ago\n"
     ]
    }
   ],
   "source": [
    "# Print each integer code next to the category label it stands for.\n",
    "for code, label in enumerate(encoder.categories_[0]):\n",
    "    print(f\"{code} -> {label}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "fd72478a",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>General_Health</th>\n",
       "      <th>Checkup</th>\n",
       "      <th>Exercise</th>\n",
       "      <th>Heart_Disease</th>\n",
       "      <th>Skin_Cancer</th>\n",
       "      <th>Other_Cancer</th>\n",
       "      <th>Depression</th>\n",
       "      <th>Diabetes</th>\n",
       "      <th>Arthritis</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age_Category</th>\n",
       "      <th>Height_(cm)</th>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Smoking_History</th>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.09375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.348837</td>\n",
       "      <td>0.295420</td>\n",
       "      <td>0.326529</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.03125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>0.302326</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.491063</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.023438</td>\n",
       "      <td>0.12500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>11.0</td>\n",
       "      <td>0.697674</td>\n",
       "      <td>0.459064</td>\n",
       "      <td>0.340504</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.234375</td>\n",
       "      <td>0.06250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>0.953488</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.202016</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.066667</td>\n",
       "      <td>0.031250</td>\n",
       "      <td>0.00000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   General_Health  Checkup  Exercise  Heart_Disease  Skin_Cancer  \\\n",
       "0             0.0      2.0       0.0            0.0          0.0   \n",
       "1             3.0      1.0       0.0            1.0          0.0   \n",
       "2             3.0      1.0       1.0            0.0          0.0   \n",
       "3             0.0      1.0       1.0            1.0          0.0   \n",
       "4             2.0      1.0       0.0            0.0          0.0   \n",
       "\n",
       "   Other_Cancer  Depression  Diabetes  Arthritis  Sex  Age_Category  \\\n",
       "0           0.0         0.0         0        1.0  0.0          10.0   \n",
       "1           0.0         0.0         1        0.0  0.0          10.0   \n",
       "2           0.0         0.0         1        0.0  0.0           8.0   \n",
       "3           0.0         0.0         1        0.0  1.0          11.0   \n",
       "4           0.0         0.0         0        0.0  1.0          12.0   \n",
       "\n",
       "   Height_(cm)  Weight_(kg)       BMI  Smoking_History  Alcohol_Consumption  \\\n",
       "0     0.000000     0.000000  0.000000              1.0             0.000000   \n",
       "1     0.348837     0.295420  0.326529              0.0             0.000000   \n",
       "2     0.302326     0.409059  0.491063              0.0             0.133333   \n",
       "3     0.697674     0.459064  0.340504              0.0             0.000000   \n",
       "4     0.953488     0.409059  0.202016              1.0             0.000000   \n",
       "\n",
       "   Fruit_Consumption  Green_Vegetables_Consumption  FriedPotato_Consumption  \n",
       "0           0.250000                      0.125000                  0.09375  \n",
       "1           0.250000                      0.000000                  0.03125  \n",
       "2           0.100000                      0.023438                  0.12500  \n",
       "3           0.250000                      0.234375                  0.06250  \n",
       "4           0.066667                      0.031250                  0.00000  "
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the encoded frame.\n",
    "data_ordinal_encode.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "cf1a70b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# This cell previously held a fully commented-out copy of the ordinal-encoding\n",
    "# steps (binary columns, General_Health, Diabetes, Checkup). It executed no code\n",
    "# and has been condensed to this note to keep the notebook readable."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "55dfc75a",
   "metadata": {},
   "source": [
    "### Data transformation using One-Hot Encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "ff70948e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Height_(cm)</th>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "      <th>Exercise_Yes</th>\n",
       "      <th>Age_Category_25-29</th>\n",
       "      <th>Age_Category_30-34</th>\n",
       "      <th>...</th>\n",
       "      <th>General_Health_Good</th>\n",
       "      <th>General_Health_Poor</th>\n",
       "      <th>General_Health_Very Good</th>\n",
       "      <th>Diabetes_No, pre-diabetes or borderline diabetes</th>\n",
       "      <th>Diabetes_Yes</th>\n",
       "      <th>Diabetes_Yes, but female told only during pregnancy</th>\n",
       "      <th>Checkup_Never</th>\n",
       "      <th>Checkup_Within the past 2 years</th>\n",
       "      <th>Checkup_Within the past 5 years</th>\n",
       "      <th>Checkup_Within the past year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.09375</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.348837</td>\n",
       "      <td>0.295420</td>\n",
       "      <td>0.326529</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.03125</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.302326</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.491063</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.023438</td>\n",
       "      <td>0.12500</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.697674</td>\n",
       "      <td>0.459064</td>\n",
       "      <td>0.340504</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.234375</td>\n",
       "      <td>0.06250</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.953488</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.202016</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.066667</td>\n",
       "      <td>0.031250</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 38 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Height_(cm)  Weight_(kg)       BMI  Alcohol_Consumption  Fruit_Consumption  \\\n",
       "0     0.000000     0.000000  0.000000             0.000000           0.250000   \n",
       "1     0.348837     0.295420  0.326529             0.000000           0.250000   \n",
       "2     0.302326     0.409059  0.491063             0.133333           0.100000   \n",
       "3     0.697674     0.459064  0.340504             0.000000           0.250000   \n",
       "4     0.953488     0.409059  0.202016             0.000000           0.066667   \n",
       "\n",
       "   Green_Vegetables_Consumption  FriedPotato_Consumption  Exercise_Yes  \\\n",
       "0                      0.125000                  0.09375             0   \n",
       "1                      0.000000                  0.03125             0   \n",
       "2                      0.023438                  0.12500             1   \n",
       "3                      0.234375                  0.06250             1   \n",
       "4                      0.031250                  0.00000             0   \n",
       "\n",
       "   Age_Category_25-29  Age_Category_30-34  ...  General_Health_Good  \\\n",
       "0                   0                   0  ...                    0   \n",
       "1                   0                   0  ...                    0   \n",
       "2                   0                   0  ...                    0   \n",
       "3                   0                   0  ...                    0   \n",
       "4                   0                   0  ...                    1   \n",
       "\n",
       "   General_Health_Poor  General_Health_Very Good  \\\n",
       "0                    1                         0   \n",
       "1                    0                         1   \n",
       "2                    0                         1   \n",
       "3                    1                         0   \n",
       "4                    0                         0   \n",
       "\n",
       "   Diabetes_No, pre-diabetes or borderline diabetes  Diabetes_Yes  \\\n",
       "0                                                 0             0   \n",
       "1                                                 0             1   \n",
       "2                                                 0             1   \n",
       "3                                                 0             1   \n",
       "4                                                 0             0   \n",
       "\n",
       "   Diabetes_Yes, but female told only during pregnancy  Checkup_Never  \\\n",
       "0                                                  0                0   \n",
       "1                                                  0                0   \n",
       "2                                                  0                0   \n",
       "3                                                  0                0   \n",
       "4                                                  0                0   \n",
       "\n",
       "   Checkup_Within the past 2 years  Checkup_Within the past 5 years  \\\n",
       "0                                1                                0   \n",
       "1                                0                                0   \n",
       "2                                0                                0   \n",
       "3                                0                                0   \n",
       "4                                0                                0   \n",
       "\n",
       "   Checkup_Within the past year  \n",
       "0                             0  \n",
       "1                             1  \n",
       "2                             1  \n",
       "3                             1  \n",
       "4                             1  \n",
       "\n",
       "[5 rows x 38 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Assuming 'data' is your original DataFrame\n",
    "data_onehot = data.copy()\n",
    "\n",
    "# List of columns to one-hot encode\n",
    "onehot_cols = [\n",
    "    'Exercise', 'Age_Category', 'Heart_Disease', 'Skin_Cancer', 'Other_Cancer',\n",
    "    'Depression', 'Sex', 'Arthritis', 'Smoking_History', 'General_Health',\n",
    "    'Diabetes', 'Checkup'\n",
    "]\n",
    "\n",
    "# Apply one-hot encoding using pandas.get_dummies()\n",
    "data_onehot = pd.get_dummies(data_onehot, columns=onehot_cols, drop_first=True)\n",
    "\n",
    "# Print the one-hot encoded DataFrame\n",
    "display(data_onehot.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0bf96744",
   "metadata": {},
   "source": [
    "## Data reduction"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "da203f0f",
   "metadata": {},
   "source": [
    "### Feature Selection\n",
    "#### Using correlation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "2a353aef",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>General_Health</th>\n",
       "      <th>Checkup</th>\n",
       "      <th>Exercise</th>\n",
       "      <th>Heart_Disease</th>\n",
       "      <th>Skin_Cancer</th>\n",
       "      <th>Other_Cancer</th>\n",
       "      <th>Depression</th>\n",
       "      <th>Diabetes</th>\n",
       "      <th>Arthritis</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age_Category</th>\n",
       "      <th>Height_(cm)</th>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Smoking_History</th>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>General_Health</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.088730</td>\n",
       "      <td>0.276080</td>\n",
       "      <td>-0.232484</td>\n",
       "      <td>-0.047079</td>\n",
       "      <td>-0.145614</td>\n",
       "      <td>-0.207533</td>\n",
       "      <td>-0.262990</td>\n",
       "      <td>-0.265911</td>\n",
       "      <td>0.018939</td>\n",
       "      <td>-0.167350</td>\n",
       "      <td>0.065967</td>\n",
       "      <td>-0.184094</td>\n",
       "      <td>-0.249477</td>\n",
       "      <td>-0.167538</td>\n",
       "      <td>0.118333</td>\n",
       "      <td>0.102602</td>\n",
       "      <td>0.119738</td>\n",
       "      <td>-0.031816</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Checkup</th>\n",
       "      <td>0.088730</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.035781</td>\n",
       "      <td>-0.084920</td>\n",
       "      <td>-0.081912</td>\n",
       "      <td>-0.088945</td>\n",
       "      <td>-0.033724</td>\n",
       "      <td>-0.129419</td>\n",
       "      <td>-0.155182</td>\n",
       "      <td>0.103296</td>\n",
       "      <td>-0.235454</td>\n",
       "      <td>0.096012</td>\n",
       "      <td>-0.009579</td>\n",
       "      <td>-0.062799</td>\n",
       "      <td>0.010875</td>\n",
       "      <td>0.048410</td>\n",
       "      <td>-0.042106</td>\n",
       "      <td>-0.036992</td>\n",
       "      <td>0.059084</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Exercise</th>\n",
       "      <td>0.276080</td>\n",
       "      <td>0.035781</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.096321</td>\n",
       "      <td>-0.003963</td>\n",
       "      <td>-0.054363</td>\n",
       "      <td>-0.084673</td>\n",
       "      <td>-0.138379</td>\n",
       "      <td>-0.124785</td>\n",
       "      <td>0.059355</td>\n",
       "      <td>-0.122334</td>\n",
       "      <td>0.091628</td>\n",
       "      <td>-0.088641</td>\n",
       "      <td>-0.156225</td>\n",
       "      <td>-0.093241</td>\n",
       "      <td>0.095028</td>\n",
       "      <td>0.136782</td>\n",
       "      <td>0.124983</td>\n",
       "      <td>-0.036904</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Heart_Disease</th>\n",
       "      <td>-0.232484</td>\n",
       "      <td>-0.084920</td>\n",
       "      <td>-0.096321</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.090835</td>\n",
       "      <td>0.092369</td>\n",
       "      <td>0.032494</td>\n",
       "      <td>0.172183</td>\n",
       "      <td>0.153891</td>\n",
       "      <td>0.072606</td>\n",
       "      <td>0.229027</td>\n",
       "      <td>0.016824</td>\n",
       "      <td>0.047409</td>\n",
       "      <td>0.044433</td>\n",
       "      <td>0.107757</td>\n",
       "      <td>-0.036614</td>\n",
       "      <td>-0.020045</td>\n",
       "      <td>-0.024027</td>\n",
       "      <td>-0.009249</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Skin_Cancer</th>\n",
       "      <td>-0.047079</td>\n",
       "      <td>-0.081912</td>\n",
       "      <td>-0.003963</td>\n",
       "      <td>0.090835</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.150781</td>\n",
       "      <td>-0.013041</td>\n",
       "      <td>0.034466</td>\n",
       "      <td>0.136146</td>\n",
       "      <td>0.009658</td>\n",
       "      <td>0.272075</td>\n",
       "      <td>0.006679</td>\n",
       "      <td>-0.028367</td>\n",
       "      <td>-0.037468</td>\n",
       "      <td>0.032793</td>\n",
       "      <td>0.042734</td>\n",
       "      <td>0.024143</td>\n",
       "      <td>0.012894</td>\n",
       "      <td>-0.038945</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Other_Cancer</th>\n",
       "      <td>-0.145614</td>\n",
       "      <td>-0.088945</td>\n",
       "      <td>-0.054363</td>\n",
       "      <td>0.092369</td>\n",
       "      <td>0.150781</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.015861</td>\n",
       "      <td>0.066032</td>\n",
       "      <td>0.129320</td>\n",
       "      <td>-0.042061</td>\n",
       "      <td>0.234464</td>\n",
       "      <td>-0.044170</td>\n",
       "      <td>-0.020410</td>\n",
       "      <td>0.001875</td>\n",
       "      <td>0.053390</td>\n",
       "      <td>-0.008704</td>\n",
       "      <td>0.007992</td>\n",
       "      <td>-0.003215</td>\n",
       "      <td>-0.033326</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Depression</th>\n",
       "      <td>-0.207533</td>\n",
       "      <td>-0.033724</td>\n",
       "      <td>-0.084673</td>\n",
       "      <td>0.032494</td>\n",
       "      <td>-0.013041</td>\n",
       "      <td>0.015861</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.048940</td>\n",
       "      <td>0.121562</td>\n",
       "      <td>-0.141457</td>\n",
       "      <td>-0.103195</td>\n",
       "      <td>-0.093348</td>\n",
       "      <td>0.046350</td>\n",
       "      <td>0.109431</td>\n",
       "      <td>0.100215</td>\n",
       "      <td>-0.028200</td>\n",
       "      <td>-0.039938</td>\n",
       "      <td>-0.051134</td>\n",
       "      <td>0.018108</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Diabetes</th>\n",
       "      <td>-0.262990</td>\n",
       "      <td>-0.129419</td>\n",
       "      <td>-0.138379</td>\n",
       "      <td>0.172183</td>\n",
       "      <td>0.034466</td>\n",
       "      <td>0.066032</td>\n",
       "      <td>0.048940</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.133271</td>\n",
       "      <td>-0.000829</td>\n",
       "      <td>0.196006</td>\n",
       "      <td>-0.031031</td>\n",
       "      <td>0.160771</td>\n",
       "      <td>0.203496</td>\n",
       "      <td>0.054567</td>\n",
       "      <td>-0.113292</td>\n",
       "      <td>-0.018707</td>\n",
       "      <td>-0.028606</td>\n",
       "      <td>-0.002870</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Arthritis</th>\n",
       "      <td>-0.265911</td>\n",
       "      <td>-0.155182</td>\n",
       "      <td>-0.124785</td>\n",
       "      <td>0.153891</td>\n",
       "      <td>0.136146</td>\n",
       "      <td>0.129320</td>\n",
       "      <td>0.121562</td>\n",
       "      <td>0.133271</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.100047</td>\n",
       "      <td>0.370996</td>\n",
       "      <td>-0.098835</td>\n",
       "      <td>0.074759</td>\n",
       "      <td>0.140789</td>\n",
       "      <td>0.123128</td>\n",
       "      <td>-0.024968</td>\n",
       "      <td>-0.001983</td>\n",
       "      <td>-0.018803</td>\n",
       "      <td>-0.050994</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Sex</th>\n",
       "      <td>0.018939</td>\n",
       "      <td>0.103296</td>\n",
       "      <td>0.059355</td>\n",
       "      <td>0.072606</td>\n",
       "      <td>0.009658</td>\n",
       "      <td>-0.042061</td>\n",
       "      <td>-0.141457</td>\n",
       "      <td>-0.000829</td>\n",
       "      <td>-0.100047</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.060234</td>\n",
       "      <td>0.708497</td>\n",
       "      <td>0.362122</td>\n",
       "      <td>0.013432</td>\n",
       "      <td>0.073407</td>\n",
       "      <td>0.129311</td>\n",
       "      <td>-0.092486</td>\n",
       "      <td>-0.069169</td>\n",
       "      <td>0.130049</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Age_Category</th>\n",
       "      <td>-0.167350</td>\n",
       "      <td>-0.235454</td>\n",
       "      <td>-0.122334</td>\n",
       "      <td>0.229027</td>\n",
       "      <td>0.272075</td>\n",
       "      <td>0.234464</td>\n",
       "      <td>-0.103195</td>\n",
       "      <td>0.196006</td>\n",
       "      <td>0.370996</td>\n",
       "      <td>-0.060234</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.121451</td>\n",
       "      <td>-0.060041</td>\n",
       "      <td>-0.004520</td>\n",
       "      <td>0.133155</td>\n",
       "      <td>0.012833</td>\n",
       "      <td>0.043661</td>\n",
       "      <td>0.036030</td>\n",
       "      <td>-0.142761</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Height_(cm)</th>\n",
       "      <td>0.065967</td>\n",
       "      <td>0.096012</td>\n",
       "      <td>0.091628</td>\n",
       "      <td>0.016824</td>\n",
       "      <td>0.006679</td>\n",
       "      <td>-0.044170</td>\n",
       "      <td>-0.093348</td>\n",
       "      <td>-0.031031</td>\n",
       "      <td>-0.098835</td>\n",
       "      <td>0.708497</td>\n",
       "      <td>-0.121451</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.483410</td>\n",
       "      <td>-0.013834</td>\n",
       "      <td>0.052032</td>\n",
       "      <td>0.129726</td>\n",
       "      <td>-0.046750</td>\n",
       "      <td>-0.030994</td>\n",
       "      <td>0.109709</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <td>-0.184094</td>\n",
       "      <td>-0.009579</td>\n",
       "      <td>-0.088641</td>\n",
       "      <td>0.047409</td>\n",
       "      <td>-0.028367</td>\n",
       "      <td>-0.020410</td>\n",
       "      <td>0.046350</td>\n",
       "      <td>0.160771</td>\n",
       "      <td>0.074759</td>\n",
       "      <td>0.362122</td>\n",
       "      <td>-0.060041</td>\n",
       "      <td>0.483410</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.859020</td>\n",
       "      <td>0.048845</td>\n",
       "      <td>-0.031117</td>\n",
       "      <td>-0.092119</td>\n",
       "      <td>-0.076765</td>\n",
       "      <td>0.097414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BMI</th>\n",
       "      <td>-0.249477</td>\n",
       "      <td>-0.062799</td>\n",
       "      <td>-0.156225</td>\n",
       "      <td>0.044433</td>\n",
       "      <td>-0.037468</td>\n",
       "      <td>0.001875</td>\n",
       "      <td>0.109431</td>\n",
       "      <td>0.203496</td>\n",
       "      <td>0.140789</td>\n",
       "      <td>0.013432</td>\n",
       "      <td>-0.004520</td>\n",
       "      <td>-0.013834</td>\n",
       "      <td>0.859020</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.025862</td>\n",
       "      <td>-0.110340</td>\n",
       "      <td>-0.078849</td>\n",
       "      <td>-0.072224</td>\n",
       "      <td>0.049331</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Smoking_History</th>\n",
       "      <td>-0.167538</td>\n",
       "      <td>0.010875</td>\n",
       "      <td>-0.093241</td>\n",
       "      <td>0.107757</td>\n",
       "      <td>0.032793</td>\n",
       "      <td>0.053390</td>\n",
       "      <td>0.100215</td>\n",
       "      <td>0.054567</td>\n",
       "      <td>0.123128</td>\n",
       "      <td>0.073407</td>\n",
       "      <td>0.133155</td>\n",
       "      <td>0.052032</td>\n",
       "      <td>0.048845</td>\n",
       "      <td>0.025862</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.100553</td>\n",
       "      <td>-0.093626</td>\n",
       "      <td>-0.034371</td>\n",
       "      <td>0.035824</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <td>0.118333</td>\n",
       "      <td>0.048410</td>\n",
       "      <td>0.095028</td>\n",
       "      <td>-0.036614</td>\n",
       "      <td>0.042734</td>\n",
       "      <td>-0.008704</td>\n",
       "      <td>-0.028200</td>\n",
       "      <td>-0.113292</td>\n",
       "      <td>-0.024968</td>\n",
       "      <td>0.129311</td>\n",
       "      <td>0.012833</td>\n",
       "      <td>0.129726</td>\n",
       "      <td>-0.031117</td>\n",
       "      <td>-0.110340</td>\n",
       "      <td>0.100553</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.012542</td>\n",
       "      <td>0.060088</td>\n",
       "      <td>0.020503</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <td>0.102602</td>\n",
       "      <td>-0.042106</td>\n",
       "      <td>0.136782</td>\n",
       "      <td>-0.020045</td>\n",
       "      <td>0.024143</td>\n",
       "      <td>0.007992</td>\n",
       "      <td>-0.039938</td>\n",
       "      <td>-0.018707</td>\n",
       "      <td>-0.001983</td>\n",
       "      <td>-0.092486</td>\n",
       "      <td>0.043661</td>\n",
       "      <td>-0.046750</td>\n",
       "      <td>-0.092119</td>\n",
       "      <td>-0.078849</td>\n",
       "      <td>-0.093626</td>\n",
       "      <td>-0.012542</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.270426</td>\n",
       "      <td>-0.060302</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <td>0.119738</td>\n",
       "      <td>-0.036992</td>\n",
       "      <td>0.124983</td>\n",
       "      <td>-0.024027</td>\n",
       "      <td>0.012894</td>\n",
       "      <td>-0.003215</td>\n",
       "      <td>-0.051134</td>\n",
       "      <td>-0.028606</td>\n",
       "      <td>-0.018803</td>\n",
       "      <td>-0.069169</td>\n",
       "      <td>0.036030</td>\n",
       "      <td>-0.030994</td>\n",
       "      <td>-0.076765</td>\n",
       "      <td>-0.072224</td>\n",
       "      <td>-0.034371</td>\n",
       "      <td>0.060088</td>\n",
       "      <td>0.270426</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.003209</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "      <td>-0.031816</td>\n",
       "      <td>0.059084</td>\n",
       "      <td>-0.036904</td>\n",
       "      <td>-0.009249</td>\n",
       "      <td>-0.038945</td>\n",
       "      <td>-0.033326</td>\n",
       "      <td>0.018108</td>\n",
       "      <td>-0.002870</td>\n",
       "      <td>-0.050994</td>\n",
       "      <td>0.130049</td>\n",
       "      <td>-0.142761</td>\n",
       "      <td>0.109709</td>\n",
       "      <td>0.097414</td>\n",
       "      <td>0.049331</td>\n",
       "      <td>0.035824</td>\n",
       "      <td>0.020503</td>\n",
       "      <td>-0.060302</td>\n",
       "      <td>0.003209</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                              General_Health   Checkup  Exercise  \\\n",
       "General_Health                      1.000000  0.088730  0.276080   \n",
       "Checkup                             0.088730  1.000000  0.035781   \n",
       "Exercise                            0.276080  0.035781  1.000000   \n",
       "Heart_Disease                      -0.232484 -0.084920 -0.096321   \n",
       "Skin_Cancer                        -0.047079 -0.081912 -0.003963   \n",
       "Other_Cancer                       -0.145614 -0.088945 -0.054363   \n",
       "Depression                         -0.207533 -0.033724 -0.084673   \n",
       "Diabetes                           -0.262990 -0.129419 -0.138379   \n",
       "Arthritis                          -0.265911 -0.155182 -0.124785   \n",
       "Sex                                 0.018939  0.103296  0.059355   \n",
       "Age_Category                       -0.167350 -0.235454 -0.122334   \n",
       "Height_(cm)                         0.065967  0.096012  0.091628   \n",
       "Weight_(kg)                        -0.184094 -0.009579 -0.088641   \n",
       "BMI                                -0.249477 -0.062799 -0.156225   \n",
       "Smoking_History                    -0.167538  0.010875 -0.093241   \n",
       "Alcohol_Consumption                 0.118333  0.048410  0.095028   \n",
       "Fruit_Consumption                   0.102602 -0.042106  0.136782   \n",
       "Green_Vegetables_Consumption        0.119738 -0.036992  0.124983   \n",
       "FriedPotato_Consumption            -0.031816  0.059084 -0.036904   \n",
       "\n",
       "                              Heart_Disease  Skin_Cancer  Other_Cancer  \\\n",
       "General_Health                    -0.232484    -0.047079     -0.145614   \n",
       "Checkup                           -0.084920    -0.081912     -0.088945   \n",
       "Exercise                          -0.096321    -0.003963     -0.054363   \n",
       "Heart_Disease                      1.000000     0.090835      0.092369   \n",
       "Skin_Cancer                        0.090835     1.000000      0.150781   \n",
       "Other_Cancer                       0.092369     0.150781      1.000000   \n",
       "Depression                         0.032494    -0.013041      0.015861   \n",
       "Diabetes                           0.172183     0.034466      0.066032   \n",
       "Arthritis                          0.153891     0.136146      0.129320   \n",
       "Sex                                0.072606     0.009658     -0.042061   \n",
       "Age_Category                       0.229027     0.272075      0.234464   \n",
       "Height_(cm)                        0.016824     0.006679     -0.044170   \n",
       "Weight_(kg)                        0.047409    -0.028367     -0.020410   \n",
       "BMI                                0.044433    -0.037468      0.001875   \n",
       "Smoking_History                    0.107757     0.032793      0.053390   \n",
       "Alcohol_Consumption               -0.036614     0.042734     -0.008704   \n",
       "Fruit_Consumption                 -0.020045     0.024143      0.007992   \n",
       "Green_Vegetables_Consumption      -0.024027     0.012894     -0.003215   \n",
       "FriedPotato_Consumption           -0.009249    -0.038945     -0.033326   \n",
       "\n",
       "                              Depression  Diabetes  Arthritis       Sex  \\\n",
       "General_Health                 -0.207533 -0.262990  -0.265911  0.018939   \n",
       "Checkup                        -0.033724 -0.129419  -0.155182  0.103296   \n",
       "Exercise                       -0.084673 -0.138379  -0.124785  0.059355   \n",
       "Heart_Disease                   0.032494  0.172183   0.153891  0.072606   \n",
       "Skin_Cancer                    -0.013041  0.034466   0.136146  0.009658   \n",
       "Other_Cancer                    0.015861  0.066032   0.129320 -0.042061   \n",
       "Depression                      1.000000  0.048940   0.121562 -0.141457   \n",
       "Diabetes                        0.048940  1.000000   0.133271 -0.000829   \n",
       "Arthritis                       0.121562  0.133271   1.000000 -0.100047   \n",
       "Sex                            -0.141457 -0.000829  -0.100047  1.000000   \n",
       "Age_Category                   -0.103195  0.196006   0.370996 -0.060234   \n",
       "Height_(cm)                    -0.093348 -0.031031  -0.098835  0.708497   \n",
       "Weight_(kg)                     0.046350  0.160771   0.074759  0.362122   \n",
       "BMI                             0.109431  0.203496   0.140789  0.013432   \n",
       "Smoking_History                 0.100215  0.054567   0.123128  0.073407   \n",
       "Alcohol_Consumption            -0.028200 -0.113292  -0.024968  0.129311   \n",
       "Fruit_Consumption              -0.039938 -0.018707  -0.001983 -0.092486   \n",
       "Green_Vegetables_Consumption   -0.051134 -0.028606  -0.018803 -0.069169   \n",
       "FriedPotato_Consumption         0.018108 -0.002870  -0.050994  0.130049   \n",
       "\n",
       "                              Age_Category  Height_(cm)  Weight_(kg)  \\\n",
       "General_Health                   -0.167350     0.065967    -0.184094   \n",
       "Checkup                          -0.235454     0.096012    -0.009579   \n",
       "Exercise                         -0.122334     0.091628    -0.088641   \n",
       "Heart_Disease                     0.229027     0.016824     0.047409   \n",
       "Skin_Cancer                       0.272075     0.006679    -0.028367   \n",
       "Other_Cancer                      0.234464    -0.044170    -0.020410   \n",
       "Depression                       -0.103195    -0.093348     0.046350   \n",
       "Diabetes                          0.196006    -0.031031     0.160771   \n",
       "Arthritis                         0.370996    -0.098835     0.074759   \n",
       "Sex                              -0.060234     0.708497     0.362122   \n",
       "Age_Category                      1.000000    -0.121451    -0.060041   \n",
       "Height_(cm)                      -0.121451     1.000000     0.483410   \n",
       "Weight_(kg)                      -0.060041     0.483410     1.000000   \n",
       "BMI                              -0.004520    -0.013834     0.859020   \n",
       "Smoking_History                   0.133155     0.052032     0.048845   \n",
       "Alcohol_Consumption               0.012833     0.129726    -0.031117   \n",
       "Fruit_Consumption                 0.043661    -0.046750    -0.092119   \n",
       "Green_Vegetables_Consumption      0.036030    -0.030994    -0.076765   \n",
       "FriedPotato_Consumption          -0.142761     0.109709     0.097414   \n",
       "\n",
       "                                   BMI  Smoking_History  Alcohol_Consumption  \\\n",
       "General_Health               -0.249477        -0.167538             0.118333   \n",
       "Checkup                      -0.062799         0.010875             0.048410   \n",
       "Exercise                     -0.156225        -0.093241             0.095028   \n",
       "Heart_Disease                 0.044433         0.107757            -0.036614   \n",
       "Skin_Cancer                  -0.037468         0.032793             0.042734   \n",
       "Other_Cancer                  0.001875         0.053390            -0.008704   \n",
       "Depression                    0.109431         0.100215            -0.028200   \n",
       "Diabetes                      0.203496         0.054567            -0.113292   \n",
       "Arthritis                     0.140789         0.123128            -0.024968   \n",
       "Sex                           0.013432         0.073407             0.129311   \n",
       "Age_Category                 -0.004520         0.133155             0.012833   \n",
       "Height_(cm)                  -0.013834         0.052032             0.129726   \n",
       "Weight_(kg)                   0.859020         0.048845            -0.031117   \n",
       "BMI                           1.000000         0.025862            -0.110340   \n",
       "Smoking_History               0.025862         1.000000             0.100553   \n",
       "Alcohol_Consumption          -0.110340         0.100553             1.000000   \n",
       "Fruit_Consumption            -0.078849        -0.093626            -0.012542   \n",
       "Green_Vegetables_Consumption -0.072224        -0.034371             0.060088   \n",
       "FriedPotato_Consumption       0.049331         0.035824             0.020503   \n",
       "\n",
       "                              Fruit_Consumption  Green_Vegetables_Consumption  \\\n",
       "General_Health                         0.102602                      0.119738   \n",
       "Checkup                               -0.042106                     -0.036992   \n",
       "Exercise                               0.136782                      0.124983   \n",
       "Heart_Disease                         -0.020045                     -0.024027   \n",
       "Skin_Cancer                            0.024143                      0.012894   \n",
       "Other_Cancer                           0.007992                     -0.003215   \n",
       "Depression                            -0.039938                     -0.051134   \n",
       "Diabetes                              -0.018707                     -0.028606   \n",
       "Arthritis                             -0.001983                     -0.018803   \n",
       "Sex                                   -0.092486                     -0.069169   \n",
       "Age_Category                           0.043661                      0.036030   \n",
       "Height_(cm)                           -0.046750                     -0.030994   \n",
       "Weight_(kg)                           -0.092119                     -0.076765   \n",
       "BMI                                   -0.078849                     -0.072224   \n",
       "Smoking_History                       -0.093626                     -0.034371   \n",
       "Alcohol_Consumption                   -0.012542                      0.060088   \n",
       "Fruit_Consumption                      1.000000                      0.270426   \n",
       "Green_Vegetables_Consumption           0.270426                      1.000000   \n",
       "FriedPotato_Consumption               -0.060302                      0.003209   \n",
       "\n",
       "                              FriedPotato_Consumption  \n",
       "General_Health                              -0.031816  \n",
       "Checkup                                      0.059084  \n",
       "Exercise                                    -0.036904  \n",
       "Heart_Disease                               -0.009249  \n",
       "Skin_Cancer                                 -0.038945  \n",
       "Other_Cancer                                -0.033326  \n",
       "Depression                                   0.018108  \n",
       "Diabetes                                    -0.002870  \n",
       "Arthritis                                   -0.050994  \n",
       "Sex                                          0.130049  \n",
       "Age_Category                                -0.142761  \n",
       "Height_(cm)                                  0.109709  \n",
       "Weight_(kg)                                  0.097414  \n",
       "BMI                                          0.049331  \n",
       "Smoking_History                              0.035824  \n",
       "Alcohol_Consumption                          0.020503  \n",
       "Fruit_Consumption                           -0.060302  \n",
       "Green_Vegetables_Consumption                 0.003209  \n",
       "FriedPotato_Consumption                      1.000000  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Calculate the correlation matrix\n",
    "corr_matrix = data_ordinal_encode.corr()\n",
    "\n",
    "# Display the correlation matrix\n",
    "display(corr_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "bf1f2555",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 1000x1000 with 2 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Import matplotlib and seaborn libraries\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Calculate the correlation matrix\n",
    "corr_matrix = data_ordinal_encode.corr()\n",
    "\n",
    "# Create a heatmap of the correlation matrix\n",
    "plt.figure(figsize=(10,10))\n",
    "sns.heatmap(corr_matrix, annot=True, cmap=\"RdBu\")\n",
    "plt.title(\"Correlation Matrix of Data Ordinal Encode\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "139714f2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>General_Health</th>\n",
       "      <th>Checkup</th>\n",
       "      <th>Exercise</th>\n",
       "      <th>Heart_Disease</th>\n",
       "      <th>Skin_Cancer</th>\n",
       "      <th>Other_Cancer</th>\n",
       "      <th>Depression</th>\n",
       "      <th>Diabetes</th>\n",
       "      <th>Arthritis</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age_Category</th>\n",
       "      <th>Height_(cm)</th>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Smoking_History</th>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.093750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.348837</td>\n",
       "      <td>0.295420</td>\n",
       "      <td>0.326529</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.031250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>0.302326</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.491063</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.023438</td>\n",
       "      <td>0.125000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>11.0</td>\n",
       "      <td>0.697674</td>\n",
       "      <td>0.459064</td>\n",
       "      <td>0.340504</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.234375</td>\n",
       "      <td>0.062500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>0.953488</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.202016</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.066667</td>\n",
       "      <td>0.031250</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>308849</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.418605</td>\n",
       "      <td>0.340916</td>\n",
       "      <td>0.350669</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>308850</th>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>0.697674</td>\n",
       "      <td>0.222668</td>\n",
       "      <td>0.110219</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.266667</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.468750</td>\n",
       "      <td>0.031250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>308851</th>\n",
       "      <td>3.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.162791</td>\n",
       "      <td>0.136286</td>\n",
       "      <td>0.212180</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.031250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>308852</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>0.767442</td>\n",
       "      <td>0.318168</td>\n",
       "      <td>0.181687</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.093750</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>308853</th>\n",
       "      <td>4.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0.232558</td>\n",
       "      <td>0.336306</td>\n",
       "      <td>0.435160</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.033333</td>\n",
       "      <td>0.041667</td>\n",
       "      <td>0.093750</td>\n",
       "      <td>0.007812</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>308774 rows × 19 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        General_Health  Checkup  Exercise  Heart_Disease  Skin_Cancer  \\\n",
       "0                  0.0      2.0       0.0            0.0          0.0   \n",
       "1                  3.0      1.0       0.0            1.0          0.0   \n",
       "2                  3.0      1.0       1.0            0.0          0.0   \n",
       "3                  0.0      1.0       1.0            1.0          0.0   \n",
       "4                  2.0      1.0       0.0            0.0          0.0   \n",
       "...                ...      ...       ...            ...          ...   \n",
       "308849             3.0      1.0       1.0            0.0          0.0   \n",
       "308850             1.0      3.0       1.0            0.0          0.0   \n",
       "308851             3.0      4.0       1.0            0.0          0.0   \n",
       "308852             3.0      1.0       1.0            0.0          0.0   \n",
       "308853             4.0      1.0       1.0            0.0          0.0   \n",
       "\n",
       "        Other_Cancer  Depression  Diabetes  Arthritis  Sex  Age_Category  \\\n",
       "0                0.0         0.0         0        1.0  0.0          10.0   \n",
       "1                0.0         0.0         1        0.0  0.0          10.0   \n",
       "2                0.0         0.0         1        0.0  0.0           8.0   \n",
       "3                0.0         0.0         1        0.0  1.0          11.0   \n",
       "4                0.0         0.0         0        0.0  1.0          12.0   \n",
       "...              ...         ...       ...        ...  ...           ...   \n",
       "308849           0.0         0.0         0        0.0  1.0           1.0   \n",
       "308850           0.0         0.0         1        0.0  1.0           9.0   \n",
       "308851           0.0         1.0         1        0.0  0.0           2.0   \n",
       "308852           0.0         0.0         0        0.0  1.0           9.0   \n",
       "308853           0.0         0.0         0        0.0  0.0           5.0   \n",
       "\n",
       "        Height_(cm)  Weight_(kg)       BMI  Smoking_History  \\\n",
       "0          0.000000     0.000000  0.000000              1.0   \n",
       "1          0.348837     0.295420  0.326529              0.0   \n",
       "2          0.302326     0.409059  0.491063              0.0   \n",
       "3          0.697674     0.459064  0.340504              0.0   \n",
       "4          0.953488     0.409059  0.202016              1.0   \n",
       "...             ...          ...       ...              ...   \n",
       "308849     0.418605     0.340916  0.350669              0.0   \n",
       "308850     0.697674     0.222668  0.110219              0.0   \n",
       "308851     0.162791     0.136286  0.212180              1.0   \n",
       "308852     0.767442     0.318168  0.181687              0.0   \n",
       "308853     0.232558     0.336306  0.435160              0.0   \n",
       "\n",
       "        Alcohol_Consumption  Fruit_Consumption  Green_Vegetables_Consumption  \\\n",
       "0                  0.000000           0.250000                      0.125000   \n",
       "1                  0.000000           0.250000                      0.000000   \n",
       "2                  0.133333           0.100000                      0.023438   \n",
       "3                  0.000000           0.250000                      0.234375   \n",
       "4                  0.000000           0.066667                      0.031250   \n",
       "...                     ...                ...                           ...   \n",
       "308849             0.133333           0.250000                      0.062500   \n",
       "308850             0.266667           0.125000                      0.468750   \n",
       "308851             0.133333           0.333333                      0.062500   \n",
       "308852             0.100000           0.250000                      0.093750   \n",
       "308853             0.033333           0.041667                      0.093750   \n",
       "\n",
       "        FriedPotato_Consumption  \n",
       "0                      0.093750  \n",
       "1                      0.031250  \n",
       "2                      0.125000  \n",
       "3                      0.062500  \n",
       "4                      0.000000  \n",
       "...                         ...  \n",
       "308849                 0.000000  \n",
       "308850                 0.031250  \n",
       "308851                 0.031250  \n",
       "308852                 0.000000  \n",
       "308853                 0.007812  \n",
       "\n",
       "[308774 rows x 19 columns]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_ordinal_encode"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c77616d6",
   "metadata": {},
   "source": [
    "## Feature importance with Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "8bd37d3b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 1000x600 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "import numpy as np\n",
    "\n",
    "# Defining the features (X) and the target (y)\n",
    "X = data_ordinal_encode.drop(\"Heart_Disease\", axis=1)\n",
    "y = data_ordinal_encode[\"Heart_Disease\"]\n",
    "\n",
    "# Performing the train-test split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "\n",
    "\n",
    "# Random Forest for feature importance\n",
    "rf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
    "rf.fit(X_train, y_train)\n",
    "\n",
    "# Get feature importances\n",
    "importances = rf.feature_importances_\n",
    "\n",
    "# Get feature names\n",
    "feature_names = list(X_train.columns)\n",
    "\n",
    "# Sort feature importances in descending order\n",
    "indices = np.argsort(importances)[::-1]\n",
    "\n",
    "# Plot the feature importances\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.title(\"Feature Importance\")\n",
    "plt.bar(range(X_train.shape[1]), importances[indices])\n",
    "plt.xticks(range(X_train.shape[1]), np.array(feature_names)[indices], rotation=90)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "3a61353f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>General_Health</th>\n",
       "      <th>Heart_Disease</th>\n",
       "      <th>Age_Category</th>\n",
       "      <th>Height_(cm)</th>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.093750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.348837</td>\n",
       "      <td>0.295420</td>\n",
       "      <td>0.326529</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.031250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>0.302326</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.491063</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.023438</td>\n",
       "      <td>0.125000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>11.0</td>\n",
       "      <td>0.697674</td>\n",
       "      <td>0.459064</td>\n",
       "      <td>0.340504</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.234375</td>\n",
       "      <td>0.062500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>0.953488</td>\n",
       "      <td>0.409059</td>\n",
       "      <td>0.202016</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.066667</td>\n",
       "      <td>0.031250</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>308849</th>\n",
       "      <td>3.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.418605</td>\n",
       "      <td>0.340916</td>\n",
       "      <td>0.350669</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>308850</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>0.697674</td>\n",
       "      <td>0.222668</td>\n",
       "      <td>0.110219</td>\n",
       "      <td>0.266667</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.468750</td>\n",
       "      <td>0.031250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>308851</th>\n",
       "      <td>3.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.162791</td>\n",
       "      <td>0.136286</td>\n",
       "      <td>0.212180</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>0.031250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>308852</th>\n",
       "      <td>3.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>0.767442</td>\n",
       "      <td>0.318168</td>\n",
       "      <td>0.181687</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.093750</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>308853</th>\n",
       "      <td>4.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0.232558</td>\n",
       "      <td>0.336306</td>\n",
       "      <td>0.435160</td>\n",
       "      <td>0.033333</td>\n",
       "      <td>0.041667</td>\n",
       "      <td>0.093750</td>\n",
       "      <td>0.007812</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>308774 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        General_Health  Heart_Disease  Age_Category  Height_(cm)  Weight_(kg)  \\\n",
       "0                  0.0            0.0          10.0     0.000000     0.000000   \n",
       "1                  3.0            1.0          10.0     0.348837     0.295420   \n",
       "2                  3.0            0.0           8.0     0.302326     0.409059   \n",
       "3                  0.0            1.0          11.0     0.697674     0.459064   \n",
       "4                  2.0            0.0          12.0     0.953488     0.409059   \n",
       "...                ...            ...           ...          ...          ...   \n",
       "308849             3.0            0.0           1.0     0.418605     0.340916   \n",
       "308850             1.0            0.0           9.0     0.697674     0.222668   \n",
       "308851             3.0            0.0           2.0     0.162791     0.136286   \n",
       "308852             3.0            0.0           9.0     0.767442     0.318168   \n",
       "308853             4.0            0.0           5.0     0.232558     0.336306   \n",
       "\n",
       "             BMI  Alcohol_Consumption  Fruit_Consumption  \\\n",
       "0       0.000000             0.000000           0.250000   \n",
       "1       0.326529             0.000000           0.250000   \n",
       "2       0.491063             0.133333           0.100000   \n",
       "3       0.340504             0.000000           0.250000   \n",
       "4       0.202016             0.000000           0.066667   \n",
       "...          ...                  ...                ...   \n",
       "308849  0.350669             0.133333           0.250000   \n",
       "308850  0.110219             0.266667           0.125000   \n",
       "308851  0.212180             0.133333           0.333333   \n",
       "308852  0.181687             0.100000           0.250000   \n",
       "308853  0.435160             0.033333           0.041667   \n",
       "\n",
       "        Green_Vegetables_Consumption  FriedPotato_Consumption  \n",
       "0                           0.125000                 0.093750  \n",
       "1                           0.000000                 0.031250  \n",
       "2                           0.023438                 0.125000  \n",
       "3                           0.234375                 0.062500  \n",
       "4                           0.031250                 0.000000  \n",
       "...                              ...                      ...  \n",
       "308849                      0.062500                 0.000000  \n",
       "308850                      0.468750                 0.031250  \n",
       "308851                      0.062500                 0.031250  \n",
       "308852                      0.093750                 0.000000  \n",
       "308853                      0.093750                 0.007812  \n",
       "\n",
       "[308774 rows x 10 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_dropped_columns = data_ordinal_encode.copy()\n",
    "data_dropped_columns = data_dropped_columns.drop(['Sex', 'Skin_Cancer','Smoking_History', 'Other_Cancer', 'Checkup', 'Depression', 'Exercise', 'Arthritis', 'Diabetes'], axis=1)\n",
    "data_dropped_columns"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3736ed42",
   "metadata": {},
   "source": [
    "### SelectKBest Method for feature selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "f3e51e5d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                         feature         score        p-value\n",
      "0                 General_Health  17642.352807   0.000000e+00\n",
      "9                   Age_Category  17092.766758   0.000000e+00\n",
      "6                       Diabetes   9433.881563   0.000000e+00\n",
      "7                      Arthritis   7489.831691   0.000000e+00\n",
      "13               Smoking_History   3627.435019   0.000000e+00\n",
      "2                       Exercise   2891.549432   0.000000e+00\n",
      "4                   Other_Cancer   2657.117300   0.000000e+00\n",
      "3                    Skin_Cancer   2568.860485   0.000000e+00\n",
      "1                        Checkup   2242.836339   0.000000e+00\n",
      "8                            Sex   1636.347832   0.000000e+00\n",
      "11                   Weight_(kg)    695.571837  4.064439e-153\n",
      "12                           BMI    610.820147  1.004189e-134\n",
      "14           Alcohol_Consumption    414.490985   4.438285e-92\n",
      "5                     Depression    326.364158   6.491542e-73\n",
      "16  Green_Vegetables_Consumption    178.362099   1.133331e-40\n",
      "15             Fruit_Consumption    124.114486   8.053439e-29\n",
      "10                   Height_(cm)     87.422775   8.819184e-21\n",
      "17       FriedPotato_Consumption     26.414514   2.756301e-07\n"
     ]
    }
   ],
   "source": [
    "# Score every feature against Heart_Disease with the ANOVA F-test\n",
    "import pandas as pd\n",
    "from sklearn.feature_selection import SelectKBest, f_classif\n",
    "\n",
    "# Target column first, then everything else as predictors\n",
    "y = data_ordinal_encode[\"Heart_Disease\"]\n",
    "X = data_ordinal_encode.drop(columns=\"Heart_Disease\")\n",
    "\n",
    "# k=\"all\" retains every column, so the selector only does the scoring\n",
    "selector = SelectKBest(score_func=f_classif, k=\"all\").fit(X, y)\n",
    "scores, pvalues = selector.scores_, selector.pvalues_\n",
    "\n",
    "# One row per feature, highest F-score at the top\n",
    "feature_df = pd.DataFrame(\n",
    "    {\"feature\": X.columns, \"score\": scores, \"p-value\": pvalues}\n",
    ").sort_values(by=\"score\", ascending=False)\n",
    "\n",
    "print(feature_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "c2c10503",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                         feature     score\n",
      "2                       Exercise  0.061817\n",
      "8                            Sex  0.041916\n",
      "0                 General_Health  0.041514\n",
      "1                        Checkup  0.036737\n",
      "9                   Age_Category  0.036575\n",
      "13               Smoking_History  0.032005\n",
      "7                      Arthritis  0.027391\n",
      "6                       Diabetes  0.013933\n",
      "15             Fruit_Consumption  0.011096\n",
      "16  Green_Vegetables_Consumption  0.008065\n",
      "5                     Depression  0.007394\n",
      "3                    Skin_Cancer  0.005377\n",
      "14           Alcohol_Consumption  0.005152\n",
      "11                   Weight_(kg)  0.005096\n",
      "10                   Height_(cm)  0.004626\n",
      "4                   Other_Cancer  0.003841\n",
      "12                           BMI  0.003536\n",
      "17       FriedPotato_Consumption  0.003392\n"
     ]
    }
   ],
   "source": [
    "# Rank features by their mutual information with the Heart_Disease target\n",
    "from functools import partial\n",
    "\n",
    "import pandas as pd\n",
    "from sklearn.feature_selection import SelectKBest, mutual_info_classif\n",
    "\n",
    "# Separate the features (X) and the target (y)\n",
    "X = data_ordinal_encode.drop(\"Heart_Disease\", axis=1)\n",
    "y = data_ordinal_encode[\"Heart_Disease\"]\n",
    "\n",
    "# mutual_info_classif perturbs continuous features with random noise, so an\n",
    "# unseeded call returns different scores (and a different ranking) on every\n",
    "# run. Pinning random_state makes the table below reproducible.\n",
    "seeded_mutual_info = partial(mutual_info_classif, random_state=42)\n",
    "\n",
    "# k=\"all\" keeps every column; the selector is only used to score features\n",
    "selector = SelectKBest(score_func=seeded_mutual_info, k=\"all\")\n",
    "selector.fit(X, y)\n",
    "\n",
    "# This scorer yields scores only (no p-values), so only scores are tabulated\n",
    "scores = selector.scores_\n",
    "feature_df = pd.DataFrame({\"feature\": X.columns, \"score\": scores})\n",
    "\n",
    "# Sort the dataframe by score in descending order\n",
    "feature_df = feature_df.sort_values(by=\"score\", ascending=False)\n",
    "\n",
    "print(feature_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7fe60ee2",
   "metadata": {},
   "source": [
    "### Using cross validation to compare feature selection techniques"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "75a25416",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(308774, 19)"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_ordinal_encode.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 328,
   "id": "ba32cb71",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SelectKBest_f_classif:\n",
      "Number of features selected: 18\n",
      "| feature                      |      score |      p-value |\n",
      "|:-----------------------------|-----------:|-------------:|\n",
      "| General_Health               | 17642.4    | 0            |\n",
      "| Age_Category                 | 17092.8    | 0            |\n",
      "| Diabetes                     |  9433.88   | 0            |\n",
      "| Arthritis                    |  7489.83   | 0            |\n",
      "| Smoking_History              |  3627.44   | 0            |\n",
      "| Exercise                     |  2891.55   | 0            |\n",
      "| Other_Cancer                 |  2657.12   | 0            |\n",
      "| Skin_Cancer                  |  2568.86   | 0            |\n",
      "| Checkup                      |  2242.84   | 0            |\n",
      "| Sex                          |  1636.35   | 0            |\n",
      "| Weight_(kg)                  |   695.572  | 4.06444e-153 |\n",
      "| BMI                          |   610.82   | 1.00419e-134 |\n",
      "| Alcohol_Consumption          |   414.491  | 4.43828e-92  |\n",
      "| Depression                   |   326.364  | 6.49154e-73  |\n",
      "| Green_Vegetables_Consumption |   178.362  | 1.13333e-40  |\n",
      "| Fruit_Consumption            |   124.114  | 8.05344e-29  |\n",
      "| Height_(cm)                  |    87.4228 | 8.81918e-21  |\n",
      "| FriedPotato_Consumption      |    26.4145 | 2.7563e-07   |\n",
      "SelectKBest_mutual_info_classif:\n",
      "Number of features selected: 18\n",
      "| feature                      |      score | p-value   |\n",
      "|:-----------------------------|-----------:|:----------|\n",
      "| Exercise                     | 0.0625872  |           |\n",
      "| Sex                          | 0.0419496  |           |\n",
      "| General_Health               | 0.0408466  |           |\n",
      "| Checkup                      | 0.0364467  |           |\n",
      "| Age_Category                 | 0.0359838  |           |\n",
      "| Smoking_History              | 0.0313306  |           |\n",
      "| Arthritis                    | 0.0269889  |           |\n",
      "| Diabetes                     | 0.0137668  |           |\n",
      "| Alcohol_Consumption          | 0.0136969  |           |\n",
      "| Depression                   | 0.00663824 |           |\n",
      "| Skin_Cancer                  | 0.00516473 |           |\n",
      "| FriedPotato_Consumption      | 0.00503667 |           |\n",
      "| Other_Cancer                 | 0.00451621 |           |\n",
      "| Fruit_Consumption            | 0.00405717 |           |\n",
      "| Green_Vegetables_Consumption | 0.00386828 |           |\n",
      "| Weight_(kg)                  | 0.0036666  |           |\n",
      "| Height_(cm)                  | 0.00315128 |           |\n",
      "| BMI                          | 0.00236402 |           |\n",
      "SelectKBest_f_regression:\n",
      "Number of features selected: 18\n",
      "| feature                      |      score |      p-value |\n",
      "|:-----------------------------|-----------:|-------------:|\n",
      "| General_Health               | 17642.4    | 0            |\n",
      "| Age_Category                 | 17092.8    | 0            |\n",
      "| Diabetes                     |  9433.88   | 0            |\n",
      "| Arthritis                    |  7489.83   | 0            |\n",
      "| Smoking_History              |  3627.44   | 0            |\n",
      "| Exercise                     |  2891.55   | 0            |\n",
      "| Other_Cancer                 |  2657.12   | 0            |\n",
      "| Skin_Cancer                  |  2568.86   | 0            |\n",
      "| Checkup                      |  2242.84   | 0            |\n",
      "| Sex                          |  1636.35   | 0            |\n",
      "| Weight_(kg)                  |   695.572  | 4.06444e-153 |\n",
      "| BMI                          |   610.82   | 1.00419e-134 |\n",
      "| Alcohol_Consumption          |   414.491  | 4.43828e-92  |\n",
      "| Depression                   |   326.364  | 6.49154e-73  |\n",
      "| Green_Vegetables_Consumption |   178.362  | 1.13333e-40  |\n",
      "| Fruit_Consumption            |   124.114  | 8.05344e-29  |\n",
      "| Height_(cm)                  |    87.4228 | 8.81918e-21  |\n",
      "| FriedPotato_Consumption      |    26.4145 | 2.7563e-07   |\n"
     ]
    }
   ],
   "source": [
    "# Compare three univariate scoring functions on the same feature matrix\n",
    "from functools import partial\n",
    "\n",
    "import pandas as pd\n",
    "from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression\n",
    "\n",
    "# Separate the features (X) and the target (y)\n",
    "X = data_ordinal_encode.drop(\"Heart_Disease\", axis=1)\n",
    "y = data_ordinal_encode[\"Heart_Disease\"]\n",
    "\n",
    "# Define feature selection methods.\n",
    "# mutual_info_classif is seeded because it adds random noise to continuous\n",
    "# features; without a fixed random_state its scores change on every run.\n",
    "methods = {\n",
    "    \"SelectKBest_f_classif\": SelectKBest(score_func=f_classif, k=\"all\"),\n",
    "    \"SelectKBest_mutual_info_classif\": SelectKBest(\n",
    "        score_func=partial(mutual_info_classif, random_state=42), k=\"all\"\n",
    "    ),\n",
    "    \"SelectKBest_f_regression\": SelectKBest(score_func=f_regression, k=\"all\"),\n",
    "}\n",
    "\n",
    "# Fit each selector and keep only what the report needs: the number of\n",
    "# retained features, the scores and the p-values. The transformed matrices\n",
    "# are not stored; holding three full copies of X just to read their width\n",
    "# wastes memory.\n",
    "results = {}\n",
    "for name, method in methods.items():\n",
    "    method.fit(X, y)\n",
    "    n_selected = int(method.get_support().sum())\n",
    "    results[name] = (n_selected, method.scores_, method.pvalues_)\n",
    "\n",
    "# Print the results for each feature selection method\n",
    "for name, (n_selected, scores, pvalues) in results.items():\n",
    "    print(f\"{name}:\")\n",
    "    print(f\"Number of features selected: {n_selected}\")\n",
    "    # pvalues is None for scorers that return scores only (mutual information),\n",
    "    # which leaves the p-value column empty in the printed table\n",
    "    feature_df = pd.DataFrame({\"feature\": X.columns, \"score\": scores, \"p-value\": pvalues})\n",
    "    # Sort the dataframe by score in descending order\n",
    "    feature_df = feature_df.sort_values(by=\"score\", ascending=False)\n",
    "    # Print the dataframe with proper formatting\n",
    "    print(feature_df.to_markdown(index=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 254,
   "id": "24e978c2",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Feature Ranking (CFS):\n",
      "| feature                      |      score |\n",
      "|:-----------------------------|-----------:|\n",
      "| General_Health               | 17642.4    |\n",
      "| Age_Category                 | 17092.8    |\n",
      "| Diabetes                     |  9433.88   |\n",
      "| Arthritis                    |  7489.83   |\n",
      "| Smoking_History              |  3627.44   |\n",
      "| Exercise                     |  2891.55   |\n",
      "| Other_Cancer                 |  2657.12   |\n",
      "| Skin_Cancer                  |  2568.86   |\n",
      "| Checkup                      |  2242.84   |\n",
      "| Sex                          |  1636.35   |\n",
      "| Weight_(kg)                  |   695.572  |\n",
      "| BMI                          |   610.82   |\n",
      "| Alcohol_Consumption          |   414.491  |\n",
      "| Depression                   |   326.364  |\n",
      "| Green_Vegetables_Consumption |   178.362  |\n",
      "| Fruit_Consumption            |   124.114  |\n",
      "| Height_(cm)                  |    87.4228 |\n",
      "| FriedPotato_Consumption      |    26.4145 |\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.feature_selection import SelectKBest, f_classif\n",
    "\n",
    "# Separate the features (X) and the target (y)\n",
    "X = data_ordinal_encode.drop(\"Heart_Disease\", axis=1)  # Replace \"Heart_Disease\" with any target column name\n",
    "y = data_ordinal_encode[\"Heart_Disease\"]\n",
    "\n",
    "# NOTE: SelectKBest with f_classif is a univariate ANOVA F-test ranking.\n",
    "# It scores each feature on its own and does not account for redundancy\n",
    "# between features, so it is not Correlation-based Feature Selection (CFS).\n",
    "# The selector name and printed title are labelled accordingly.\n",
    "anova_selector = SelectKBest(score_func=f_classif, k=\"all\")\n",
    "\n",
    "# Only the scores are needed, so fit without transforming X\n",
    "anova_selector.fit(X, y)\n",
    "scores = anova_selector.scores_\n",
    "\n",
    "# Create a dataframe of the feature names and scores\n",
    "feature_df = pd.DataFrame({\"feature\": X.columns, \"score\": scores})\n",
    "\n",
    "# Sort the dataframe by score in descending order\n",
    "feature_df = feature_df.sort_values(by=\"score\", ascending=False)\n",
    "\n",
    "# Print the sorted dataframe\n",
    "print(\"Feature Ranking (ANOVA F-test):\")\n",
    "print(feature_df.to_markdown(index=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "9526b4a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# (Disabled) Drop the feature with the lowest F-score, 'FriedPotato_Consumption', from the encoded dataframe\n",
    "# data_ordinal_encode = data_ordinal_encode.drop(['FriedPotato_Consumption'], axis=1)\n",
    "# data_ordinal_encode.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d92e830d",
   "metadata": {},
   "source": [
    "#### Saving preprocessed data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "447d5875",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_ordinal_encode.to_csv('cvd_data_preprocessed.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "be932438",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(308774, 19)"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_ordinal_encode.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 325,
   "id": "084e3ca5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy with RFE-selected features: 0.9188243866893369\n",
      "Accuracy with Lasso-selected features: 0.9189701238766091\n",
      "Accuracy with Random Forest-selected features: 0.9179985426281273\n"
     ]
    }
   ],
   "source": [
    "# Import necessary libraries\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_selection import SelectFromModel, RFE\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "X = data_ordinal_encode.drop(\"Heart_Disease\", axis=1)\n",
    "y = data_ordinal_encode[\"Heart_Disease\"]\n",
    "\n",
    "# Split the data into training and testing sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "\n",
    "# Method 1: Recursive Feature Elimination (RFE)\n",
    "model_rfe = LogisticRegression(solver='liblinear')\n",
    "rfe = RFE(model_rfe, n_features_to_select=10)\n",
    "X_train_rfe = rfe.fit_transform(X_train, y_train)\n",
    "X_test_rfe = rfe.transform(X_test)\n",
    "\n",
    "# Method 2: L1-regularised (lasso-style) logistic regression\n",
    "# SelectFromModel clones and fits the estimator itself when prefit is left\n",
    "# at its default, so fitting model_lasso by hand first only doubled the\n",
    "# training cost. The selector now does the single fit.\n",
    "model_lasso = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)\n",
    "sfm = SelectFromModel(model_lasso, threshold=0.25)\n",
    "X_train_lasso = sfm.fit_transform(X_train, y_train)\n",
    "X_test_lasso = sfm.transform(X_test)\n",
    "\n",
    "# Method 3: Random Forest Feature Importance\n",
    "# Same reasoning as above: let the selector perform the one (expensive) fit.\n",
    "model_rf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
    "sfm_rf = SelectFromModel(model_rf, threshold=0.05)\n",
    "X_train_rf = sfm_rf.fit_transform(X_train, y_train)\n",
    "X_test_rf = sfm_rf.transform(X_test)\n",
    "# Importances of the forest that the selector actually fitted\n",
    "importances = sfm_rf.estimator_.feature_importances_\n",
    "\n",
    "# Create and evaluate a simple model using selected features\n",
    "def evaluate_model(X_train, X_test, y_train, y_test):\n",
    "    \"\"\"Fit a liblinear logistic regression and return its test accuracy.\n",
    "\n",
    "    NOTE(review): the classification reports elsewhere in this notebook show\n",
    "    roughly 92% negatives in an equally sized test split, so an accuracy\n",
    "    near 0.92 is about what a majority-class predictor would score. Treat\n",
    "    the values as a sanity check rather than a ranking of the methods.\n",
    "    \"\"\"\n",
    "    model = LogisticRegression(solver='liblinear')\n",
    "    model.fit(X_train, y_train)\n",
    "    y_pred = model.predict(X_test)\n",
    "    return accuracy_score(y_test, y_pred)\n",
    "\n",
    "# Evaluate the models\n",
    "accuracy_rfe = evaluate_model(X_train_rfe, X_test_rfe, y_train, y_test)\n",
    "accuracy_lasso = evaluate_model(X_train_lasso, X_test_lasso, y_train, y_test)\n",
    "accuracy_rf = evaluate_model(X_train_rf, X_test_rf, y_train, y_test)\n",
    "\n",
    "# Print the results\n",
    "print(\"Accuracy with RFE-selected features:\", accuracy_rfe)\n",
    "print(\"Accuracy with Lasso-selected features:\", accuracy_lasso)\n",
    "print(\"Accuracy with Random Forest-selected features:\", accuracy_rf)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "id": "c3a26a13",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SelectFromModel_LogisticRegression: Mean accuracy = 0.9170, Std deviation = 0.0052\n",
      "SelectFromModel_RandomForest: Mean accuracy = 0.9125, Std deviation = 0.0051\n",
      "SelectFromModel_SVM: Mean accuracy = 0.9158, Std deviation = 0.0048\n"
     ]
    }
   ],
   "source": [
    "# Import pandas and sklearn libraries\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import cross_val_score, ShuffleSplit\n",
    "from sklearn.feature_selection import SelectFromModel\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "# Use a reproducible 20% sample of the data.\n",
    "# The sample is bound to its own name: re-assigning data_ordinal_encode here\n",
    "# would shrink the dataset again on every re-run of this cell and leave every\n",
    "# later cell working on the sample instead of the full data.\n",
    "data_cv_sample = data_ordinal_encode.sample(frac=0.2, random_state=42)\n",
    "\n",
    "# Separate the features (X) and the target (y)\n",
    "X = data_cv_sample.drop(\"Heart_Disease\", axis=1)\n",
    "y = data_cv_sample[\"Heart_Disease\"]\n",
    "\n",
    "# Define models using a pipeline so that feature selection is re-fitted\n",
    "# inside every cross-validation split\n",
    "models = {\n",
    "    \"SelectFromModel_LogisticRegression\": Pipeline([\n",
    "        ('feature_selector', SelectFromModel(LogisticRegression(max_iter=1000))),\n",
    "        ('classifier', LogisticRegression(max_iter=1000))  # Adjust max_iter as needed\n",
    "    ]),\n",
    "    # Forests are seeded so the reported mean/std can be reproduced\n",
    "    \"SelectFromModel_RandomForest\": Pipeline([\n",
    "        ('feature_selector', SelectFromModel(RandomForestClassifier(random_state=42))),\n",
    "        ('classifier', RandomForestClassifier(random_state=42))\n",
    "    ]),\n",
    "    # SVM with a linear kernel (coef_ is what SelectFromModel reads)\n",
    "    \"SelectFromModel_SVM\": Pipeline([\n",
    "        ('feature_selector', SelectFromModel(SVC(kernel='linear'))),\n",
    "        ('classifier', SVC(kernel='linear'))\n",
    "    ])\n",
    "}\n",
    "\n",
    "# Define cross-validation settings\n",
    "cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)\n",
    "\n",
    "# Perform cross-validation and compare methods using parallelization\n",
    "results = {}\n",
    "for name, model in models.items():\n",
    "    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)\n",
    "    results[name] = scores\n",
    "\n",
    "# Print the cross-validation results\n",
    "for name, scores in results.items():\n",
    "    print(f\"{name}: Mean accuracy = {scores.mean():.4f}, Std deviation = {scores.std():.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "37705221",
   "metadata": {},
   "source": [
    "# Testing with only high-importance features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "cbdca9a4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: Logistic Regression\n",
      "F1 Score: 0.07117960007338102\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.92      1.00      0.96     56677\n",
      "         1.0       0.52      0.04      0.07      5078\n",
      "\n",
      "    accuracy                           0.92     61755\n",
      "   macro avg       0.72      0.52      0.51     61755\n",
      "weighted avg       0.89      0.92      0.88     61755\n",
      "\n",
      "==========================================================\n",
      "Model: Decision Tree\n",
      "F1 Score: 0.20049893744802735\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.93      0.92      0.92     56677\n",
      "         1.0       0.19      0.21      0.20      5078\n",
      "\n",
      "    accuracy                           0.86     61755\n",
      "   macro avg       0.56      0.57      0.56     61755\n",
      "weighted avg       0.87      0.86      0.86     61755\n",
      "\n",
      "==========================================================\n",
      "Model: Random Forest\n",
      "F1 Score: 0.09153713298791019\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.92      0.99      0.96     56677\n",
      "         1.0       0.37      0.05      0.09      5078\n",
      "\n",
      "    accuracy                           0.91     61755\n",
      "   macro avg       0.65      0.52      0.52     61755\n",
      "weighted avg       0.88      0.91      0.88     61755\n",
      "\n",
      "==========================================================\n",
      "Model: XGBoost\n",
      "F1 Score: 0.04165103189493434\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.92      1.00      0.96     56677\n",
      "         1.0       0.44      0.02      0.04      5078\n",
      "\n",
      "    accuracy                           0.92     61755\n",
      "   macro avg       0.68      0.51      0.50     61755\n",
      "weighted avg       0.88      0.92      0.88     61755\n",
      "\n",
      "==========================================================\n"
     ]
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 800x600 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Baseline comparison: four classifiers trained on the reduced feature set\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn import preprocessing\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Estimators and metrics for model development and evaluation\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.metrics import (\n",
    "    accuracy_score, precision_score, recall_score, f1_score,\n",
    "    roc_auc_score, roc_curve, classification_report,\n",
    ")\n",
    "\n",
    "# Target (y) and features (X)\n",
    "y = data_dropped_columns[\"Heart_Disease\"]\n",
    "X = data_dropped_columns.drop(columns=\"Heart_Disease\")\n",
    "\n",
    "# 80/20 hold-out split with a fixed seed\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "\n",
    "\n",
    "def apply_model(model, X_train, y_train, X_test, y_test, name):\n",
    "    \"\"\"Fit `model`, print its test-set report and return metrics plus ROC data.\n",
    "\n",
    "    Returns the tuple (accuracy, precision, recall, f1, fpr, tpr, roc_auc).\n",
    "    \"\"\"\n",
    "    model.fit(X_train, y_train)\n",
    "    predicted = model.predict(X_test)\n",
    "\n",
    "    # Threshold-based metrics from the hard predictions\n",
    "    acc = accuracy_score(y_test, predicted)\n",
    "    prec = precision_score(y_test, predicted)\n",
    "    rec = recall_score(y_test, predicted)\n",
    "    f1 = f1_score(y_test, predicted)\n",
    "\n",
    "    print(f\"Model: {name}\")\n",
    "    print(f\"F1 Score: {f1}\")\n",
    "    print(classification_report(y_test, predicted))\n",
    "    print('==========================================================')\n",
    "\n",
    "    # ROC curve and AUC use the second probability column\n",
    "    positive_proba = model.predict_proba(X_test)[:, 1]\n",
    "    fpr, tpr, _ = roc_curve(y_test, positive_proba)\n",
    "    auc_value = roc_auc_score(y_test, positive_proba)\n",
    "\n",
    "    return acc, prec, rec, f1, fpr, tpr, auc_value\n",
    "\n",
    "\n",
    "# Candidate models, all seeded for repeatability\n",
    "models = [\n",
    "    (\"Logistic Regression\", LogisticRegression(random_state=42, max_iter=500)),\n",
    "    (\"Decision Tree\", DecisionTreeClassifier(random_state=42)),\n",
    "    (\"Random Forest\", RandomForestClassifier(random_state=42)),\n",
    "    (\"XGBoost\", XGBClassifier(random_state=42)),\n",
    "]\n",
    "\n",
    "# Run every model once; keep scalar metrics and ROC data in separate lists\n",
    "results, roc_curves = [], []\n",
    "for name, model in models:\n",
    "    accuracy, precision, recall, f1, fpr, tpr, roc_auc = apply_model(\n",
    "        model, X_train, y_train, X_test, y_test, name\n",
    "    )\n",
    "    results.append((name, accuracy, precision, recall, f1))\n",
    "    roc_curves.append((name, fpr, tpr, roc_auc))\n",
    "\n",
    "# Overlay all ROC curves on one figure; the dashed diagonal is drawn first\n",
    "plt.figure(figsize=(8, 6))\n",
    "plt.plot([0, 1], [0, 1], 'k--')\n",
    "for name, fpr, tpr, roc_auc in roc_curves:\n",
    "    plt.plot(fpr, tpr, label=f\"{name} (area = {roc_auc:.4f})\")\n",
    "plt.xlabel('False positive rate')\n",
    "plt.ylabel('True positive rate')\n",
    "plt.title('ROC curve comparison')\n",
    "plt.legend(loc='best')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2d6385b6",
   "metadata": {},
   "source": [
    "## With balanced class weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "b7f8c6b5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: Logistic Regression\n",
      "F1 Score: 0.31468065943532914\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.97      0.71      0.82     56677\n",
      "         1.0       0.20      0.79      0.31      5078\n",
      "\n",
      "    accuracy                           0.72     61755\n",
      "   macro avg       0.59      0.75      0.57     61755\n",
      "weighted avg       0.91      0.72      0.78     61755\n",
      "\n",
      "==========================================================\n",
      "Model: Decision Tree\n",
      "F1 Score: 0.19173718267794923\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.93      0.93      0.93     56677\n",
      "         1.0       0.19      0.19      0.19      5078\n",
      "\n",
      "    accuracy                           0.87     61755\n",
      "   macro avg       0.56      0.56      0.56     61755\n",
      "weighted avg       0.87      0.87      0.87     61755\n",
      "\n",
      "==========================================================\n",
      "Model: Random Forest\n",
      "F1 Score: 0.06391382405745062\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.92      0.99      0.96     56677\n",
      "         1.0       0.36      0.04      0.06      5078\n",
      "\n",
      "    accuracy                           0.92     61755\n",
      "   macro avg       0.64      0.51      0.51     61755\n",
      "weighted avg       0.87      0.92      0.88     61755\n",
      "\n",
      "==========================================================\n",
      "Model: XGBoost\n",
      "F1 Score: 0.04165103189493434\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.92      1.00      0.96     56677\n",
      "         1.0       0.44      0.02      0.04      5078\n",
      "\n",
      "    accuracy                           0.92     61755\n",
      "   macro avg       0.68      0.51      0.50     61755\n",
      "weighted avg       0.88      0.92      0.88     61755\n",
      "\n",
      "==========================================================\n"
     ]
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 800x600 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "import numpy as np\n",
    "# import libraries\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn import preprocessing\n",
    "\n",
    "# Importing necessary libraries for model development and evaluation\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "\n",
    "# Defining the features (X) and the target (y)\n",
    "X = data_dropped_columns.drop(\"Heart_Disease\", axis=1)\n",
    "y = data_dropped_columns[\"Heart_Disease\"]\n",
    "\n",
    "# Performing the train-test split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "\n",
    "# Defining the function to apply models\n",
    "def apply_model(model, X_train, y_train, X_test, y_test, name):\n",
    "    \"\"\"Fit `model`, print its test-set report and return metrics plus ROC data.\n",
    "\n",
    "    Returns the tuple (accuracy, precision, recall, f1, fpr, tpr, roc_auc).\n",
    "    \"\"\"\n",
    "    # Fit the model\n",
    "    model.fit(X_train, y_train)\n",
    "\n",
    "    # Make predictions\n",
    "    y_pred = model.predict(X_test)\n",
    "    \n",
    "    # Calculate performance metrics\n",
    "    accuracy = accuracy_score(y_test, y_pred)\n",
    "    precision = precision_score(y_test, y_pred)\n",
    "    recall = recall_score(y_test, y_pred)\n",
    "    f1 = f1_score(y_test, y_pred)\n",
    "\n",
    "    print(f\"Model: {name}\")\n",
    "    print(f\"F1 Score: {f1}\")\n",
    "    print(classification_report(y_test, y_pred))\n",
    "    print('==========================================================')\n",
    "    \n",
    "    # Compute ROC curve and ROC area from the second probability column\n",
    "    y_pred_proba = model.predict_proba(X_test)[:, 1]\n",
    "    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)\n",
    "    roc_auc = roc_auc_score(y_test, y_pred_proba)\n",
    "\n",
    "    return accuracy, precision, recall, f1, fpr, tpr, roc_auc\n",
    "\n",
    "# XGBoost has no class_weight='balanced' option; the equivalent knob is\n",
    "# scale_pos_weight, conventionally set to (#negatives / #positives) of the\n",
    "# training labels. The previous value of 1 is the library default, so XGBoost\n",
    "# was not balanced at all and reproduced the unweighted F1 score exactly.\n",
    "negative_count = int((y_train == 0).sum())\n",
    "positive_count = int((y_train == 1).sum())\n",
    "xgb_pos_weight = negative_count / positive_count\n",
    "\n",
    "# Defining the models (every model now compensates for class imbalance)\n",
    "models = [\n",
    "    (\"Logistic Regression\", LogisticRegression(class_weight='balanced', random_state=42, max_iter=500)),\n",
    "    (\"Decision Tree\", DecisionTreeClassifier(class_weight='balanced', random_state=42)),\n",
    "    (\"Random Forest\", RandomForestClassifier(class_weight='balanced', random_state=42)),\n",
    "    (\"XGBoost\", XGBClassifier(scale_pos_weight=xgb_pos_weight, eval_metric='logloss', random_state=42)),\n",
    "]\n",
    "\n",
    "# Applying the models and storing the results\n",
    "results = []\n",
    "roc_curves = []\n",
    "\n",
    "for name, model in models:\n",
    "    accuracy, precision, recall, f1, fpr, tpr, roc_auc = apply_model(model, X_train, y_train, X_test, y_test, name)\n",
    "    results.append((name, accuracy, precision, recall, f1))\n",
    "    roc_curves.append((name, fpr, tpr, roc_auc))\n",
    "\n",
    "# Plotting the ROC curves for each model\n",
    "plt.figure(figsize=(8, 6))\n",
    "plt.plot([0, 1], [0, 1], 'k--')\n",
    "plt.xlabel('False positive rate')\n",
    "plt.ylabel('True positive rate')\n",
    "plt.title('ROC curve comparison')\n",
    "for name, fpr, tpr, roc_auc in roc_curves:\n",
    "    plt.plot(fpr, tpr, label=f\"{name} (area = {roc_auc:.4f})\")\n",
    "plt.legend(loc='best')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e448e76",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}