Data-Science-Assignment / World Happiness report / Data Science Assignments.ipynb
Data Science Assignments.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "f1cdad04",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    Unnamed: 0_level_0       Unnamed: 1_level_0       Unnamed: 2_level_0  \\\n",
      "    Unnamed: 0_level_1 Total populationa (000s) Total populationa (000s)   \n",
      "1                  NaN                     Male                   Female   \n",
      "2  Countries and areas                     2021                      NaN   \n",
      "3          Afghanistan                    20255                    19845   \n",
      "4              Albania                     1426                     1429   \n",
      "5              Algeria                    22497                    21681   \n",
      "\n",
      "        Unnamed: 3_level_0                Unnamed: 4_level_0  \\\n",
      "  Total populationa (000s) Life expectancy at birthb (years)   \n",
      "1               Both sexes                              Male   \n",
      "2                      NaN                              2019   \n",
      "3                    40099                              63.3   \n",
      "4                     2855                              76.3   \n",
      "5                    44178                              76.2   \n",
      "\n",
      "                 Unnamed: 5_level_0                Unnamed: 6_level_0  \\\n",
      "  Life expectancy at birthb (years) Life expectancy at birthb (years)   \n",
      "1                            Female                        Both sexes   \n",
      "2                               NaN                               NaN   \n",
      "3                              63.2                              63.2   \n",
      "4                              79.9                                78   \n",
      "5                              78.1                              77.1   \n",
      "\n",
      "                         Unnamed: 7_level_0  \\\n",
      "  Healthy life expectancy at birthb (years)   \n",
      "1                                      Male   \n",
      "2                                      2019   \n",
      "3                                      54.7   \n",
      "4                                        68   \n",
      "5                                      66.7   \n",
      "\n",
      "                         Unnamed: 8_level_0  \\\n",
      "  Healthy life expectancy at birthb (years)   \n",
      "1                                    Female   \n",
      "2                                       NaN   \n",
      "3                                      53.2   \n",
      "4                                      70.3   \n",
      "5                                      66.1   \n",
      "\n",
      "                         Unnamed: 9_level_0  ...  \\\n",
      "  Healthy life expectancy at birthb (years)  ...   \n",
      "1                                Both sexes  ...   \n",
      "2                                       NaN  ...   \n",
      "3                                      53.9  ...   \n",
      "4                                      69.1  ...   \n",
      "5                                      66.4  ...   \n",
      "\n",
      "                                                             3.1  \\\n",
      "  Proportion of births attended by skilled health personneld (%)   \n",
      "1                                                NaN               \n",
      "2                                          2013–2022               \n",
      "3                                                 62               \n",
      "4                                                100               \n",
      "5                                                 99               \n",
      "\n",
      "                                                                    \\\n",
      "  Proportion of births attended by skilled health personneld (%).1   \n",
      "1                                                NaN                 \n",
      "2                                                NaN                 \n",
      "3                                                NaN                 \n",
      "4                                                NaN                 \n",
      "5                                                NaN                 \n",
      "\n",
      "                                                3.2  \\\n",
      "  Under-five mortality ratee (per 1000 live births)   \n",
      "1                                               NaN   \n",
      "2                                              2021   \n",
      "3                                                56   \n",
      "4                                                 9   \n",
      "5                                                22   \n",
      "\n",
      "                                                   \\\n",
      "  Neonatal mortality ratee (per 1000 live births)   \n",
      "1                                             NaN   \n",
      "2                                            2021   \n",
      "3                                              34   \n",
      "4                                               7   \n",
      "5                                              16   \n",
      "\n",
      "                                                   3.3  \\\n",
      "  New HIV infectionsf (per 1000 uninfected population)   \n",
      "1                                                NaN     \n",
      "2                                               2021     \n",
      "3                                               0.04     \n",
      "4                                               0.03     \n",
      "5                                               0.04     \n",
      "\n",
      "                                                    \\\n",
      "  Tuberculosis incidenceg (per 100 000 population)   \n",
      "1                                              NaN   \n",
      "2                                             2021   \n",
      "3                                              189   \n",
      "4                                               17   \n",
      "5                                               54   \n",
      "\n",
      "                                                    \\\n",
      "  Malaria incidenceh (per 1000 population at risk)   \n",
      "1                                              NaN   \n",
      "2                                             2021   \n",
      "3                                              6.3   \n",
      "4                                                -   \n",
      "5                                              0.0   \n",
      "\n",
      "                                                      \\\n",
      "  Malaria incidenceh (per 1000 population at risk).1   \n",
      "1                                                NaN   \n",
      "2                                                NaN   \n",
      "3                                                NaN   \n",
      "4                                                NaN   \n",
      "5                                                NaN   \n",
      "\n",
      "                                                                                    \\\n",
      "  Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%)   \n",
      "1                                                NaN                                 \n",
      "2                                               2020                                 \n",
      "3                                               0.39                                 \n",
      "4                                               0.29                                 \n",
      "5                                               0.08                                 \n",
      "\n",
      "                                                                   \n",
      "  Reported number of people requiring interventions against NTDsj  \n",
      "1                                                NaN               \n",
      "2                                               2021               \n",
      "3                                         14 367 281               \n",
      "4                                                 21               \n",
      "5                                              7 081               \n",
      "\n",
      "[5 rows x 21 columns]\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "# Read the Excel file with a multi-level header\n",
    "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", sheet_name=\"Annex 1-1\", header=[0, 1])\n",
    "# Drop the first row\n",
    "df = df.drop(0)\n",
    "\n",
    "\n",
    "# # Rename columns\n",
    "# df = df.rename(columns={\"Unnamed: 0\": \"Country\", \"Unnamed: 1\": \"Total population (000s)\", \"Unnamed: 2\": \"Male population (000s)\", \"Unnamed: 3\": \"Female population (000s)\", \"Unnamed: 4\": \"Life expectancy at birth (male)\", \"Unnamed: 5\": \"Life expectancy at birth (female)\", \"Unnamed: 6\": \"Life expectancy at birth (both sexes)\", \"Unnamed: 7\": \"Healthy life expectancy at birth (male)\", \"Unnamed: 8\": \"Healthy life expectancy at birth (female)\", \"Unnamed: 9\": \"Healthy life expectancy at birth (both sexes)\", \"3.1\": \"Proportion of births attended by skilled health personnel (%)\", \"Unnamed: 11\": \"Primary data (%)\", \"Unnamed: 12\": \"Year\", \"3.2\": \"Under-five mortality rate (per 1000 live births)\", \"Unnamed: 14\": \"Neonatal mortality rate (per 1000 live births)\", \"3.3\": \"New HIV infections (per 1000 uninfected population)\", \"Unnamed: 16\": \"Tuberculosis incidence (per 100 000 population)\", \"Unnamed: 17\": \"Malaria incidence (per 1000 population at risk)\", # Drop the empty column\n",
    "#                 #\"\": float,\n",
    "#                 # Drop the column with mixed values\n",
    "#                 #\"Reported number of people requiring interventions against neglected tropical diseases\": int,\n",
    "#                 })\n",
    "\n",
    "# # Create a dictionary mapping of the column names and data types\n",
    "# data_types = {\"Total population (000s)\": int, \n",
    "#               \"Male population (000s)\": int, \n",
    "#               \"Female population (000s)\": int, \n",
    "#               \"Life expectancy at birth (male)\": float, \n",
    "#               \"Life expectancy at birth (female)\": float, \n",
    "#               \"Life expectancy at birth (both sexes)\": float, \n",
    "#               \"Healthy life expectancy at birth (male)\": float, \n",
    "#               \"Healthy life expectancy at birth (female)\": float, \n",
    "#               \"Healthy life expectancy at birth (both sexes)\": float, \n",
    "#               \"Proportion of births attended by skilled health personneld (%)\": float}\n",
    "\n",
    "# # Convert data types using the dictionary mapping\n",
    "# df = df.astype(data_types)\n",
    "\n",
    "\n",
    "# # Replace dashes with zero\n",
    "# df = df.replace(\"-\", 0)\n",
    "\n",
    "# # Filter out rows with dashes\n",
    "# df = df.query(\"Malaria incidence (per 1000 population at risk) != '-'\")\n",
    "\n",
    "# Inspect the DataFrame\n",
    "print(df.head())\n",
    "# print(df.info())\n",
    "# print(df.describe())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "df0a91e2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index([                                                              ['Unnamed: 0_level_0'],\n",
      "                                                               ['Total populationa (000s)'],\n",
      "                                                               ['Total populationa (000s)'],\n",
      "                                                               ['Total populationa (000s)'],\n",
      "                                                      ['Life expectancy at birthb (years)'],\n",
      "                                                      ['Life expectancy at birthb (years)'],\n",
      "                                                      ['Life expectancy at birthb (years)'],\n",
      "                                              ['Healthy life expectancy at birthb (years)'],\n",
      "                                              ['Healthy life expectancy at birthb (years)'],\n",
      "                                              ['Healthy life expectancy at birthb (years)'],\n",
      "                                    ['Maternal mortality ratioc (per 100 000 live births)'],\n",
      "                         ['Proportion of births attended by skilled health personneld (%)'],\n",
      "                         ['Proportion of births attended by skilled health personneld (%)'],\n",
      "                                      ['Under-five mortality ratee (per 1000 live births)'],\n",
      "                                        ['Neonatal mortality ratee (per 1000 live births)'],\n",
      "                                   ['New HIV infectionsf (per 1000 uninfected population)'],\n",
      "                                       ['Tuberculosis incidenceg (per 100 000 population)'],\n",
      "                                       ['Malaria incidenceh (per 1000 population at risk)'],\n",
      "                                       ['Malaria incidenceh (per 1000 population at risk)'],\n",
      "       ['Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%)'],\n",
      "                        ['Reported number of people requiring interventions against NTDsj']],\n",
      "      dtype='object') \n",
      "\n",
      "Index([           ['Data type'], ['Comparable estimates'],\n",
      "       ['Comparable estimates'], ['Comparable estimates'],\n",
      "       ['Comparable estimates'], ['Comparable estimates'],\n",
      "       ['Comparable estimates'], ['Comparable estimates'],\n",
      "       ['Comparable estimates'], ['Comparable estimates'],\n",
      "       ['Comparable estimates'],         ['Primary data'],\n",
      "               ['Primary data'], ['Comparable estimates'],\n",
      "       ['Comparable estimates'], ['Comparable estimates'],\n",
      "       ['Comparable estimates'], ['Comparable estimates'],\n",
      "       ['Comparable estimates'], ['Comparable estimates'],\n",
      "               ['Primary data']],\n",
      "      dtype='object') \n",
      "\n",
      "Index([ ['Unnamed: 0_level_2'],                ['Male'],\n",
      "                    ['Female'],          ['Both sexes'],\n",
      "                      ['Male'],              ['Female'],\n",
      "                ['Both sexes'],                ['Male'],\n",
      "                    ['Female'],          ['Both sexes'],\n",
      "       ['Unnamed: 10_level_2'], ['Unnamed: 11_level_2'],\n",
      "       ['Unnamed: 12_level_2'], ['Unnamed: 13_level_2'],\n",
      "       ['Unnamed: 14_level_2'], ['Unnamed: 15_level_2'],\n",
      "       ['Unnamed: 16_level_2'], ['Unnamed: 17_level_2'],\n",
      "       ['Unnamed: 18_level_2'], ['Unnamed: 19_level_2'],\n",
      "       ['Unnamed: 20_level_2']],\n",
      "      dtype='object') \n",
      "\n",
      "Index([['Countries and areas'],                     nan,\n",
      "        ['Unnamed: 2_level_3'],  ['Unnamed: 3_level_3'],\n",
      "                           nan,  ['Unnamed: 5_level_3'],\n",
      "        ['Unnamed: 6_level_3'],                     nan,\n",
      "        ['Unnamed: 8_level_3'],  ['Unnamed: 9_level_3'],\n",
      "                           nan,           ['2013–2022'],\n",
      "                 ['2013–2022'],                     nan,\n",
      "                           nan,                     nan,\n",
      "                           nan,                     nan,\n",
      "                           nan,                     nan,\n",
      "                           nan],\n",
      "      dtype='object') \n",
      "\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "# Read the Excel file with a multi-level header\n",
    "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", sheet_name=\"Annex 1-1\", header=[1, 2, 3, 4])\n",
    "\n",
    "# Inspect the DataFrame\n",
    "# print(df.head())\n",
    "\n",
    "# Get the first level of the column header as an Index\n",
    "col_level0 = df.columns.get_level_values(0)\n",
    "col_level1 = df.columns.get_level_values(1)\n",
    "col_level2 = df.columns.get_level_values(2)\n",
    "col_level3 = df.columns.get_level_values(3)\n",
    "\n",
    "# Split the first level by ';'\n",
    "col_level0_split = col_level0.str.split(';')\n",
    "col_level1_split = col_level1.str.split(';')\n",
    "col_level2_split = col_level2.str.split(';')\n",
    "col_level3_split = col_level3.str.split(';')\n",
    "\n",
    "# Inspect the result\n",
    "print(col_level0_split, \"\\n\")\n",
    "print(col_level1_split, \"\\n\")\n",
    "print(col_level2_split, \"\\n\")\n",
    "print(col_level3_split, \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "4a249153",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['Unnamed: 0_level_0_Data type_Unnamed: 0_level_2_Countries and areas',\n",
      "       'Total populationa (000s)_Comparable estimates_Male_2021',\n",
      "       'Total populationa (000s)_Comparable estimates_Female_Unnamed: 2_level_3',\n",
      "       'Total populationa (000s)_Comparable estimates_Both sexes_Unnamed: 3_level_3',\n",
      "       'Life expectancy at birthb (years)_Comparable estimates_Male_2019',\n",
      "       'Life expectancy at birthb (years)_Comparable estimates_Female_Unnamed: 5_level_3',\n",
      "       'Life expectancy at birthb (years)_Comparable estimates_Both sexes_Unnamed: 6_level_3',\n",
      "       'Healthy life expectancy at birthb (years)_Comparable estimates_Male_2019',\n",
      "       'Healthy life expectancy at birthb (years)_Comparable estimates_Female_Unnamed: 8_level_3',\n",
      "       'Healthy life expectancy at birthb (years)_Comparable estimates_Both sexes_Unnamed: 9_level_3',\n",
      "       'Maternal mortality ratioc (per 100 000 live births)_Comparable estimates_Unnamed: 10_level_2_2020',\n",
      "       'Proportion of births attended by skilled health personneld (%)_Primary data_Unnamed: 11_level_2_2013–2022',\n",
      "       'Proportion of births attended by skilled health personneld (%)_Primary data_Unnamed: 12_level_2_2013–2022',\n",
      "       'Under-five mortality ratee (per 1000 live births)_Comparable estimates_Unnamed: 13_level_2_2021',\n",
      "       'Neonatal mortality ratee (per 1000 live births)_Comparable estimates_Unnamed: 14_level_2_2021',\n",
      "       'New HIV infectionsf (per 1000 uninfected population)_Comparable estimates_Unnamed: 15_level_2_2021',\n",
      "       'Tuberculosis incidenceg (per 100 000 population)_Comparable estimates_Unnamed: 16_level_2_2021',\n",
      "       'Malaria incidenceh (per 1000 population at risk)_Comparable estimates_Unnamed: 17_level_2_2021',\n",
      "       'Malaria incidenceh (per 1000 population at risk)_Comparable estimates_Unnamed: 18_level_2_2021',\n",
      "       'Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%)_Comparable estimates_Unnamed: 19_level_2_2020',\n",
      "       'Reported number of people requiring interventions against NTDsj_Primary data_Unnamed: 20_level_2_2021'],\n",
      "      dtype='object')\n"
     ]
    },
    {
     "ename": "KeyError",
     "evalue": "['Country']",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[32], line 24\u001b[0m\n\u001b[0;32m     21\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39mcolumns)\n\u001b[0;32m     23\u001b[0m \u001b[38;5;66;03m# Drop the last row with NaN values\u001b[39;00m\n\u001b[1;32m---> 24\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdropna\u001b[49m\u001b[43m(\u001b[49m\u001b[43msubset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mCountry\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     26\u001b[0m \u001b[38;5;66;03m# Inspect unique values in each column\u001b[39;00m\n\u001b[0;32m     27\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m df\u001b[38;5;241m.\u001b[39mcolumns:\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\util\\_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m    325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[0;32m    326\u001b[0m     warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m    327\u001b[0m         msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m    328\u001b[0m         \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[0;32m    329\u001b[0m         stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[0;32m    330\u001b[0m     )\n\u001b[1;32m--> 331\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:6561\u001b[0m, in \u001b[0;36mDataFrame.dropna\u001b[1;34m(self, axis, how, thresh, subset, inplace)\u001b[0m\n\u001b[0;32m   6559\u001b[0m     check \u001b[38;5;241m=\u001b[39m indices \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m\n\u001b[0;32m   6560\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m check\u001b[38;5;241m.\u001b[39many():\n\u001b[1;32m-> 6561\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(np\u001b[38;5;241m.\u001b[39marray(subset)[check]\u001b[38;5;241m.\u001b[39mtolist())\n\u001b[0;32m   6562\u001b[0m     agg_obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(indices, axis\u001b[38;5;241m=\u001b[39magg_axis)\n\u001b[0;32m   6564\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m thresh \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m no_default:\n",
      "\u001b[1;31mKeyError\u001b[0m: ['Country']"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Read the Excel file with a multi-level header\n",
    "file_path = r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\"\n",
    "df = pd.read_excel(file_path, sheet_name=\"Annex 1-1\", header=[1, 2, 3, 4])\n",
    "\n",
    "# Drop the first row\n",
    "df = df.drop(0)\n",
    "\n",
    "# Combine MultiIndex levels into a single level using list comprehension\n",
    "df.columns = ['_'.join(filter(None, map(str, col))) for col in df.columns]\n",
    "\n",
    "# Rename the first column\n",
    "df.rename(columns={'nan_nan_nan_nan': 'Country'}, inplace=True)\n",
    "\n",
    "# Convert numeric columns to float type\n",
    "numeric_cols = df.columns.difference(['Country'])\n",
    "df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')\n",
    "\n",
    "# Print the column names to check if 'Country' exists\n",
    "print(df.columns)\n",
    "\n",
    "# Drop the last row with NaN values\n",
    "df = df.dropna(subset=['Country'])\n",
    "\n",
    "# Inspect unique values in each column\n",
    "for col in df.columns:\n",
    "    unique_values = df[col].unique()\n",
    "    print(f\"{col}: {unique_values}\")\n",
    "\n",
    "# Inspect the result\n",
    "print(df.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "5f71ae9e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country</th>\n",
       "      <th>Total populationa (000s)</th>\n",
       "      <th>Total populationa (000s)</th>\n",
       "      <th>Total populationa (000s)</th>\n",
       "      <th>Life expectancy at birthb (years)</th>\n",
       "      <th>Life expectancy at birthb (years)</th>\n",
       "      <th>Life expectancy at birthb (years)</th>\n",
       "      <th>Healthy life expectancy at birthb (years)</th>\n",
       "      <th>Healthy life expectancy at birthb (years)</th>\n",
       "      <th>Healthy life expectancy at birthb (years)</th>\n",
       "      <th>...</th>\n",
       "      <th>Proportion of births attended by skilled health personneld (%)</th>\n",
       "      <th>Proportion of births attended by skilled health personneld (%)</th>\n",
       "      <th>Under-five mortality ratee (per 1000 live births)</th>\n",
       "      <th>Neonatal mortality ratee (per 1000 live births)</th>\n",
       "      <th>New HIV infectionsf (per 1000 uninfected population)</th>\n",
       "      <th>Tuberculosis incidenceg (per 100 000 population)</th>\n",
       "      <th>Malaria incidenceh (per 1000 population at risk)</th>\n",
       "      <th>Malaria incidenceh (per 1000 population at risk)</th>\n",
       "      <th>Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%)</th>\n",
       "      <th>Reported number of people requiring interventions against NTDsj</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>20255</td>\n",
       "      <td>19845</td>\n",
       "      <td>40099.0</td>\n",
       "      <td>63.3</td>\n",
       "      <td>63.2</td>\n",
       "      <td>63.2</td>\n",
       "      <td>54.7</td>\n",
       "      <td>53.2</td>\n",
       "      <td>53.9</td>\n",
       "      <td>...</td>\n",
       "      <td>62</td>\n",
       "      <td>NaN</td>\n",
       "      <td>56</td>\n",
       "      <td>34</td>\n",
       "      <td>0.04</td>\n",
       "      <td>189.0</td>\n",
       "      <td>6.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.39</td>\n",
       "      <td>14 367 281</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>1426</td>\n",
       "      <td>1429</td>\n",
       "      <td>2855.0</td>\n",
       "      <td>76.3</td>\n",
       "      <td>79.9</td>\n",
       "      <td>78</td>\n",
       "      <td>68</td>\n",
       "      <td>70.3</td>\n",
       "      <td>69.1</td>\n",
       "      <td>...</td>\n",
       "      <td>100</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9</td>\n",
       "      <td>7</td>\n",
       "      <td>0.03</td>\n",
       "      <td>17.0</td>\n",
       "      <td>-</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.29</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>22497</td>\n",
       "      <td>21681</td>\n",
       "      <td>44178.0</td>\n",
       "      <td>76.2</td>\n",
       "      <td>78.1</td>\n",
       "      <td>77.1</td>\n",
       "      <td>66.7</td>\n",
       "      <td>66.1</td>\n",
       "      <td>66.4</td>\n",
       "      <td>...</td>\n",
       "      <td>99</td>\n",
       "      <td>NaN</td>\n",
       "      <td>22</td>\n",
       "      <td>16</td>\n",
       "      <td>0.04</td>\n",
       "      <td>54.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.08</td>\n",
       "      <td>7 081</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>79.0</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>...</td>\n",
       "      <td>100</td>\n",
       "      <td>ak</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>-</td>\n",
       "      <td>2.9</td>\n",
       "      <td>-</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.02</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>17051</td>\n",
       "      <td>17452</td>\n",
       "      <td>34504.0</td>\n",
       "      <td>60.7</td>\n",
       "      <td>65.5</td>\n",
       "      <td>63.1</td>\n",
       "      <td>53.6</td>\n",
       "      <td>56.2</td>\n",
       "      <td>54.8</td>\n",
       "      <td>...</td>\n",
       "      <td>50</td>\n",
       "      <td>NaN</td>\n",
       "      <td>69</td>\n",
       "      <td>27</td>\n",
       "      <td>0.52</td>\n",
       "      <td>325.0</td>\n",
       "      <td>254.9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.57</td>\n",
       "      <td>8 558 522</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       Country  Total populationa (000s)  Total populationa (000s)  \\\n",
       "0  Afghanistan                     20255                     19845   \n",
       "1      Albania                      1426                      1429   \n",
       "2      Algeria                     22497                     21681   \n",
       "3      Andorra                         -                         -   \n",
       "4       Angola                     17051                     17452   \n",
       "\n",
       "   Total populationa (000s) Life expectancy at birthb (years)  \\\n",
       "0                   40099.0                              63.3   \n",
       "1                    2855.0                              76.3   \n",
       "2                   44178.0                              76.2   \n",
       "3                      79.0                                 -   \n",
       "4                   34504.0                              60.7   \n",
       "\n",
       "  Life expectancy at birthb (years) Life expectancy at birthb (years)  \\\n",
       "0                              63.2                              63.2   \n",
       "1                              79.9                                78   \n",
       "2                              78.1                              77.1   \n",
       "3                                 -                                 -   \n",
       "4                              65.5                              63.1   \n",
       "\n",
       "  Healthy life expectancy at birthb (years)  \\\n",
       "0                                      54.7   \n",
       "1                                        68   \n",
       "2                                      66.7   \n",
       "3                                         -   \n",
       "4                                      53.6   \n",
       "\n",
       "  Healthy life expectancy at birthb (years)  \\\n",
       "0                                      53.2   \n",
       "1                                      70.3   \n",
       "2                                      66.1   \n",
       "3                                         -   \n",
       "4                                      56.2   \n",
       "\n",
       "  Healthy life expectancy at birthb (years)  ...  \\\n",
       "0                                      53.9  ...   \n",
       "1                                      69.1  ...   \n",
       "2                                      66.4  ...   \n",
       "3                                         -  ...   \n",
       "4                                      54.8  ...   \n",
       "\n",
       "  Proportion of births attended by skilled health personneld (%)  \\\n",
       "0                                                 62               \n",
       "1                                                100               \n",
       "2                                                 99               \n",
       "3                                                100               \n",
       "4                                                 50               \n",
       "\n",
       "  Proportion of births attended by skilled health personneld (%)  \\\n",
       "0                                                NaN               \n",
       "1                                                NaN               \n",
       "2                                                NaN               \n",
       "3                                                 ak               \n",
       "4                                                NaN               \n",
       "\n",
       "  Under-five mortality ratee (per 1000 live births)  \\\n",
       "0                                                56   \n",
       "1                                                 9   \n",
       "2                                                22   \n",
       "3                                                 3   \n",
       "4                                                69   \n",
       "\n",
       "  Neonatal mortality ratee (per 1000 live births)  \\\n",
       "0                                              34   \n",
       "1                                               7   \n",
       "2                                              16   \n",
       "3                                               1   \n",
       "4                                              27   \n",
       "\n",
       "  New HIV infectionsf (per 1000 uninfected population)  \\\n",
       "0                                               0.04     \n",
       "1                                               0.03     \n",
       "2                                               0.04     \n",
       "3                                                  -     \n",
       "4                                               0.52     \n",
       "\n",
       "  Tuberculosis incidenceg (per 100 000 population)  \\\n",
       "0                                            189.0   \n",
       "1                                             17.0   \n",
       "2                                             54.0   \n",
       "3                                              2.9   \n",
       "4                                            325.0   \n",
       "\n",
       "  Malaria incidenceh (per 1000 population at risk)  \\\n",
       "0                                              6.3   \n",
       "1                                                -   \n",
       "2                                              0.0   \n",
       "3                                                -   \n",
       "4                                            254.9   \n",
       "\n",
       "  Malaria incidenceh (per 1000 population at risk)  \\\n",
       "0                                              NaN   \n",
       "1                                              NaN   \n",
       "2                                              NaN   \n",
       "3                                              NaN   \n",
       "4                                              NaN   \n",
       "\n",
       "   Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%)  \\\n",
       "0                                               0.39                                  \n",
       "1                                               0.29                                  \n",
       "2                                               0.08                                  \n",
       "3                                               0.02                                  \n",
       "4                                               4.57                                  \n",
       "\n",
       "  Reported number of people requiring interventions against NTDsj  \n",
       "0                                         14 367 281               \n",
       "1                                                 21               \n",
       "2                                              7 081               \n",
       "3                                                  0               \n",
       "4                                          8 558 522               \n",
       "\n",
       "[5 rows x 21 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "# Read the Excel file with a multi-level header\n",
    "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", sheet_name=\"Annex 1-1\", header=[1, 2, 3, 4])\n",
    "\n",
    "df = df.rename(columns={\"Unnamed: 0_level_0\": \"Country\", \"Unnamed: 1_level_0\": \"Population\", \"Unnamed: 4_level_0\": \"Life expectancy\"})\n",
    "df.columns = df.columns.droplevel([1, 2, 3])\n",
    "\n",
    "display(df.head())\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7c272335",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\ziyan\\AppData\\Local\\Temp\\ipykernel_11820\\2273432563.py:10: PerformanceWarning: dropping on a non-lexsorted multi-index without a level parameter may impact performance.\n",
      "  df = df.drop(columns=['Unnamed: 0_level_0', 'Unnamed: 0_level_1', 'Unnamed: 0_level_2'])\n"
     ]
    },
    {
     "ename": "KeyError",
     "evalue": "'Unnamed: 0_level_1'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\base.py:3802\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m   3801\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 3802\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   3803\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\_libs\\index.pyx:138\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\_libs\\index.pyx:165\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "File \u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi:5745\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "File \u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi:5753\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;31mKeyError\u001b[0m: 'Unnamed: 0_level_1'",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[3], line 10\u001b[0m\n\u001b[0;32m      7\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_excel(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mC:\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mMy\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mTop-up Degree\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mData Science\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mData Science - Assignment\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mData set\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mWHO\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mwhs2023_annex1.xlsx\u001b[39m\u001b[38;5;124m\"\u001b[39m, sheet_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAnnex 1-1\u001b[39m\u001b[38;5;124m\"\u001b[39m, header\u001b[38;5;241m=\u001b[39m[\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m3\u001b[39m])\n\u001b[0;32m      9\u001b[0m \u001b[38;5;66;03m# Drop columns that aren't useful\u001b[39;00m\n\u001b[1;32m---> 10\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mUnnamed: 0_level_0\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mUnnamed: 0_level_1\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mUnnamed: 0_level_2\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     12\u001b[0m \u001b[38;5;66;03m# Drop rows with missing values\u001b[39;00m\n\u001b[0;32m     13\u001b[0m df \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39mdropna()\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\util\\_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m    325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[0;32m    326\u001b[0m     warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m    327\u001b[0m         msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m    328\u001b[0m         \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[0;32m    329\u001b[0m         stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[0;32m    330\u001b[0m     )\n\u001b[1;32m--> 331\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:5399\u001b[0m, in \u001b[0;36mDataFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m   5251\u001b[0m \u001b[38;5;129m@deprecate_nonkeyword_arguments\u001b[39m(version\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, allowed_args\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mself\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlabels\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[0;32m   5252\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdrop\u001b[39m(  \u001b[38;5;66;03m# type: ignore[override]\u001b[39;00m\n\u001b[0;32m   5253\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   5260\u001b[0m     errors: IgnoreRaise \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m   5261\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m   5262\u001b[0m     \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   5263\u001b[0m \u001b[38;5;124;03m    Drop specified labels from rows or columns.\u001b[39;00m\n\u001b[0;32m   5264\u001b[0m \n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   5397\u001b[0m \u001b[38;5;124;03m            weight  1.0     0.8\u001b[39;00m\n\u001b[0;32m   5398\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m-> 5399\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   5400\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5401\u001b[0m \u001b[43m        \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5402\u001b[0m \u001b[43m        \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5403\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5404\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5405\u001b[0m \u001b[43m        \u001b[49m\u001b[43minplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minplace\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5406\u001b[0m \u001b[43m        \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5407\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\util\\_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m    325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[0;32m    326\u001b[0m     warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m    327\u001b[0m         msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m    328\u001b[0m         \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[0;32m    329\u001b[0m         stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[0;32m    330\u001b[0m     )\n\u001b[1;32m--> 331\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\generic.py:4505\u001b[0m, in \u001b[0;36mNDFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m   4503\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m axis, labels \u001b[38;5;129;01min\u001b[39;00m axes\u001b[38;5;241m.\u001b[39mitems():\n\u001b[0;32m   4504\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 4505\u001b[0m         obj \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_drop_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   4507\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inplace:\n\u001b[0;32m   4508\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_inplace(obj)\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\generic.py:4546\u001b[0m, in \u001b[0;36mNDFrame._drop_axis\u001b[1;34m(self, labels, axis, level, errors, only_slice)\u001b[0m\n\u001b[0;32m   4544\u001b[0m         new_axis \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mdrop(labels, level\u001b[38;5;241m=\u001b[39mlevel, errors\u001b[38;5;241m=\u001b[39merrors)\n\u001b[0;32m   4545\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 4546\u001b[0m         new_axis \u001b[38;5;241m=\u001b[39m \u001b[43maxis\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   4547\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mget_indexer(new_axis)\n\u001b[0;32m   4549\u001b[0m \u001b[38;5;66;03m# Case for non-unique axis\u001b[39;00m\n\u001b[0;32m   4550\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\multi.py:2270\u001b[0m, in \u001b[0;36mMultiIndex.drop\u001b[1;34m(self, codes, level, errors)\u001b[0m\n\u001b[0;32m   2268\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m level_codes \u001b[38;5;129;01min\u001b[39;00m codes:\n\u001b[0;32m   2269\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 2270\u001b[0m         loc \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlevel_codes\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   2271\u001b[0m         \u001b[38;5;66;03m# get_loc returns either an integer, a slice, or a boolean\u001b[39;00m\n\u001b[0;32m   2272\u001b[0m         \u001b[38;5;66;03m# mask\u001b[39;00m\n\u001b[0;32m   2273\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(loc, \u001b[38;5;28mint\u001b[39m):\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\multi.py:2916\u001b[0m, in \u001b[0;36mMultiIndex.get_loc\u001b[1;34m(self, key, method)\u001b[0m\n\u001b[0;32m   2913\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m mask\n\u001b[0;32m   2915\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[1;32m-> 2916\u001b[0m     loc \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_level_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m   2917\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m _maybe_to_slice(loc)\n\u001b[0;32m   2919\u001b[0m keylen \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(key)\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\multi.py:3263\u001b[0m, in \u001b[0;36mMultiIndex._get_level_indexer\u001b[1;34m(self, key, level, indexer)\u001b[0m\n\u001b[0;32m   3259\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mslice\u001b[39m(i, j, step)\n\u001b[0;32m   3261\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 3263\u001b[0m     idx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_loc_single_level_index\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlevel_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   3265\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m level \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lexsort_depth \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m   3266\u001b[0m         \u001b[38;5;66;03m# Desired level is not sorted\u001b[39;00m\n\u001b[0;32m   3267\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(idx, \u001b[38;5;28mslice\u001b[39m):\n\u001b[0;32m   3268\u001b[0m             \u001b[38;5;66;03m# test_get_loc_partial_timestamp_multiindex\u001b[39;00m\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\multi.py:2849\u001b[0m, in \u001b[0;36mMultiIndex._get_loc_single_level_index\u001b[1;34m(self, level_index, key)\u001b[0m\n\u001b[0;32m   2847\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m\n\u001b[0;32m   2848\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 2849\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlevel_index\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\base.py:3804\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m   3802\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine\u001b[38;5;241m.\u001b[39mget_loc(casted_key)\n\u001b[0;32m   3803\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m-> 3804\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[0;32m   3805\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[0;32m   3806\u001b[0m     \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[0;32m   3807\u001b[0m     \u001b[38;5;66;03m#  InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[0;32m   3808\u001b[0m     \u001b[38;5;66;03m#  the TypeError.\u001b[39;00m\n\u001b[0;32m   3809\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n",
      "\u001b[1;31mKeyError\u001b[0m: 'Unnamed: 0_level_1'"
     ]
    }
   ],
   "source": [
    "# Import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Read the Excel file with a multi-level header\n",
    "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", sheet_name=\"Annex 1-1\", header=[1, 2, 3])\n",
    "\n",
    "# Drop columns that aren't useful\n",
    "df = df.drop(columns=['Unnamed: 0_level_0', 'Unnamed: 0_level_1', 'Unnamed: 0_level_2'])\n",
    "\n",
    "# Drop rows with missing values\n",
    "df = df.dropna()\n",
    "\n",
    "# Create dummy variables for categorical variables\n",
    "df = pd.get_dummies(df)\n",
    "\n",
    "# Convert the DataFrame to NumPy array\n",
    "X = df.values\n",
    "\n",
    "# Scale the numerical variables using standardization\n",
    "scaler = StandardScaler()\n",
    "X_scaled = scaler.fit_transform(X)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "61825f75",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country</th>\n",
       "      <th>Total populationa (000s)</th>\n",
       "      <th>Total populationa (000s)</th>\n",
       "      <th>Total populationa (000s)</th>\n",
       "      <th>Life expectancy at birthb (years)</th>\n",
       "      <th>Life expectancy at birthb (years)</th>\n",
       "      <th>Life expectancy at birthb (years)</th>\n",
       "      <th>Healthy life expectancy at birthb (years)</th>\n",
       "      <th>Healthy life expectancy at birthb (years)</th>\n",
       "      <th>Healthy life expectancy at birthb (years)</th>\n",
       "      <th>...</th>\n",
       "      <th>Proportion of births attended by skilled health personneld (%)</th>\n",
       "      <th>Proportion of births attended by skilled health personneld (%)</th>\n",
       "      <th>Under-five mortality ratee (per 1000 live births)</th>\n",
       "      <th>Neonatal mortality ratee (per 1000 live births)</th>\n",
       "      <th>New HIV infectionsf (per 1000 uninfected population)</th>\n",
       "      <th>Tuberculosis incidenceg (per 100 000 population)</th>\n",
       "      <th>Malaria incidenceh (per 1000 population at risk)</th>\n",
       "      <th>Malaria incidenceh (per 1000 population at risk)</th>\n",
       "      <th>Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%)</th>\n",
       "      <th>Reported number of people requiring interventions against NTDsj</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>20255</td>\n",
       "      <td>19845</td>\n",
       "      <td>40099.0</td>\n",
       "      <td>63.3</td>\n",
       "      <td>63.2</td>\n",
       "      <td>63.2</td>\n",
       "      <td>54.7</td>\n",
       "      <td>53.2</td>\n",
       "      <td>53.9</td>\n",
       "      <td>...</td>\n",
       "      <td>62</td>\n",
       "      <td>NaN</td>\n",
       "      <td>56</td>\n",
       "      <td>34</td>\n",
       "      <td>0.04</td>\n",
       "      <td>189.0</td>\n",
       "      <td>6.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.39</td>\n",
       "      <td>14 367 281</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>1426</td>\n",
       "      <td>1429</td>\n",
       "      <td>2855.0</td>\n",
       "      <td>76.3</td>\n",
       "      <td>79.9</td>\n",
       "      <td>78</td>\n",
       "      <td>68</td>\n",
       "      <td>70.3</td>\n",
       "      <td>69.1</td>\n",
       "      <td>...</td>\n",
       "      <td>100</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9</td>\n",
       "      <td>7</td>\n",
       "      <td>0.03</td>\n",
       "      <td>17.0</td>\n",
       "      <td>-</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.29</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>22497</td>\n",
       "      <td>21681</td>\n",
       "      <td>44178.0</td>\n",
       "      <td>76.2</td>\n",
       "      <td>78.1</td>\n",
       "      <td>77.1</td>\n",
       "      <td>66.7</td>\n",
       "      <td>66.1</td>\n",
       "      <td>66.4</td>\n",
       "      <td>...</td>\n",
       "      <td>99</td>\n",
       "      <td>NaN</td>\n",
       "      <td>22</td>\n",
       "      <td>16</td>\n",
       "      <td>0.04</td>\n",
       "      <td>54.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.08</td>\n",
       "      <td>7 081</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>79.0</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>...</td>\n",
       "      <td>100</td>\n",
       "      <td>ak</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>-</td>\n",
       "      <td>2.9</td>\n",
       "      <td>-</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.02</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>17051</td>\n",
       "      <td>17452</td>\n",
       "      <td>34504.0</td>\n",
       "      <td>60.7</td>\n",
       "      <td>65.5</td>\n",
       "      <td>63.1</td>\n",
       "      <td>53.6</td>\n",
       "      <td>56.2</td>\n",
       "      <td>54.8</td>\n",
       "      <td>...</td>\n",
       "      <td>50</td>\n",
       "      <td>NaN</td>\n",
       "      <td>69</td>\n",
       "      <td>27</td>\n",
       "      <td>0.52</td>\n",
       "      <td>325.0</td>\n",
       "      <td>254.9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.57</td>\n",
       "      <td>8 558 522</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>203</th>\n",
       "      <td>European Region</td>\n",
       "      <td>451859</td>\n",
       "      <td>478950</td>\n",
       "      <td>930809.0</td>\n",
       "      <td>75.1</td>\n",
       "      <td>81.3</td>\n",
       "      <td>78.2</td>\n",
       "      <td>66.6</td>\n",
       "      <td>70</td>\n",
       "      <td>68.3</td>\n",
       "      <td>...</td>\n",
       "      <td>99</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>4</td>\n",
       "      <td>0.20</td>\n",
       "      <td>25.0</td>\n",
       "      <td>-</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.26</td>\n",
       "      <td>5 951 752</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>204</th>\n",
       "      <td>Eastern Mediterranean Region</td>\n",
       "      <td>392503</td>\n",
       "      <td>374038</td>\n",
       "      <td>766542.0</td>\n",
       "      <td>68.3</td>\n",
       "      <td>71.3</td>\n",
       "      <td>69.7</td>\n",
       "      <td>60.2</td>\n",
       "      <td>60.7</td>\n",
       "      <td>60.4</td>\n",
       "      <td>...</td>\n",
       "      <td>85</td>\n",
       "      <td>NaN</td>\n",
       "      <td>45</td>\n",
       "      <td>25</td>\n",
       "      <td>0.06</td>\n",
       "      <td>112.0</td>\n",
       "      <td>11.6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.84</td>\n",
       "      <td>72 120 780</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>205</th>\n",
       "      <td>Western Pacific Region</td>\n",
       "      <td>980226</td>\n",
       "      <td>952584</td>\n",
       "      <td>1932809.0</td>\n",
       "      <td>74.8</td>\n",
       "      <td>80.8</td>\n",
       "      <td>77.7</td>\n",
       "      <td>67</td>\n",
       "      <td>70.2</td>\n",
       "      <td>68.6</td>\n",
       "      <td>...</td>\n",
       "      <td>97</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12</td>\n",
       "      <td>6</td>\n",
       "      <td>0.06</td>\n",
       "      <td>98.0</td>\n",
       "      <td>1.9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.3</td>\n",
       "      <td>72 372 509</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>206</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>207</th>\n",
       "      <td>Global</td>\n",
       "      <td>3973370</td>\n",
       "      <td>3929291</td>\n",
       "      <td>7902660.0</td>\n",
       "      <td>70.8</td>\n",
       "      <td>75.9</td>\n",
       "      <td>73.3</td>\n",
       "      <td>62.5</td>\n",
       "      <td>64.9</td>\n",
       "      <td>63.7</td>\n",
       "      <td>...</td>\n",
       "      <td>86</td>\n",
       "      <td>NaN</td>\n",
       "      <td>38</td>\n",
       "      <td>18</td>\n",
       "      <td>0.19</td>\n",
       "      <td>134.0</td>\n",
       "      <td>59.2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.94</td>\n",
       "      <td>1 653 289 348</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>208 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                          Country  Total populationa (000s)  \\\n",
       "0                     Afghanistan                     20255   \n",
       "1                         Albania                      1426   \n",
       "2                         Algeria                     22497   \n",
       "3                         Andorra                         -   \n",
       "4                          Angola                     17051   \n",
       "..                            ...                       ...   \n",
       "203               European Region                    451859   \n",
       "204  Eastern Mediterranean Region                    392503   \n",
       "205        Western Pacific Region                    980226   \n",
       "206                           NaN                       NaN   \n",
       "207                        Global                   3973370   \n",
       "\n",
       "     Total populationa (000s)  Total populationa (000s)  \\\n",
       "0                       19845                   40099.0   \n",
       "1                        1429                    2855.0   \n",
       "2                       21681                   44178.0   \n",
       "3                           -                      79.0   \n",
       "4                       17452                   34504.0   \n",
       "..                        ...                       ...   \n",
       "203                    478950                  930809.0   \n",
       "204                    374038                  766542.0   \n",
       "205                    952584                 1932809.0   \n",
       "206                       NaN                       NaN   \n",
       "207                   3929291                 7902660.0   \n",
       "\n",
       "    Life expectancy at birthb (years) Life expectancy at birthb (years)  \\\n",
       "0                                63.3                              63.2   \n",
       "1                                76.3                              79.9   \n",
       "2                                76.2                              78.1   \n",
       "3                                   -                                 -   \n",
       "4                                60.7                              65.5   \n",
       "..                                ...                               ...   \n",
       "203                              75.1                              81.3   \n",
       "204                              68.3                              71.3   \n",
       "205                              74.8                              80.8   \n",
       "206                               NaN                               NaN   \n",
       "207                              70.8                              75.9   \n",
       "\n",
       "    Life expectancy at birthb (years)  \\\n",
       "0                                63.2   \n",
       "1                                  78   \n",
       "2                                77.1   \n",
       "3                                   -   \n",
       "4                                63.1   \n",
       "..                                ...   \n",
       "203                              78.2   \n",
       "204                              69.7   \n",
       "205                              77.7   \n",
       "206                               NaN   \n",
       "207                              73.3   \n",
       "\n",
       "    Healthy life expectancy at birthb (years)  \\\n",
       "0                                        54.7   \n",
       "1                                          68   \n",
       "2                                        66.7   \n",
       "3                                           -   \n",
       "4                                        53.6   \n",
       "..                                        ...   \n",
       "203                                      66.6   \n",
       "204                                      60.2   \n",
       "205                                        67   \n",
       "206                                       NaN   \n",
       "207                                      62.5   \n",
       "\n",
       "    Healthy life expectancy at birthb (years)  \\\n",
       "0                                        53.2   \n",
       "1                                        70.3   \n",
       "2                                        66.1   \n",
       "3                                           -   \n",
       "4                                        56.2   \n",
       "..                                        ...   \n",
       "203                                        70   \n",
       "204                                      60.7   \n",
       "205                                      70.2   \n",
       "206                                       NaN   \n",
       "207                                      64.9   \n",
       "\n",
       "    Healthy life expectancy at birthb (years)  ...  \\\n",
       "0                                        53.9  ...   \n",
       "1                                        69.1  ...   \n",
       "2                                        66.4  ...   \n",
       "3                                           -  ...   \n",
       "4                                        54.8  ...   \n",
       "..                                        ...  ...   \n",
       "203                                      68.3  ...   \n",
       "204                                      60.4  ...   \n",
       "205                                      68.6  ...   \n",
       "206                                       NaN  ...   \n",
       "207                                      63.7  ...   \n",
       "\n",
       "    Proportion of births attended by skilled health personneld (%)  \\\n",
       "0                                                   62               \n",
       "1                                                  100               \n",
       "2                                                   99               \n",
       "3                                                  100               \n",
       "4                                                   50               \n",
       "..                                                 ...               \n",
       "203                                                 99               \n",
       "204                                                 85               \n",
       "205                                                 97               \n",
       "206                                                NaN               \n",
       "207                                                 86               \n",
       "\n",
       "    Proportion of births attended by skilled health personneld (%)  \\\n",
       "0                                                  NaN               \n",
       "1                                                  NaN               \n",
       "2                                                  NaN               \n",
       "3                                                   ak               \n",
       "4                                                  NaN               \n",
       "..                                                 ...               \n",
       "203                                                NaN               \n",
       "204                                                NaN               \n",
       "205                                                NaN               \n",
       "206                                                NaN               \n",
       "207                                                NaN               \n",
       "\n",
       "    Under-five mortality ratee (per 1000 live births)  \\\n",
       "0                                                  56   \n",
       "1                                                   9   \n",
       "2                                                  22   \n",
       "3                                                   3   \n",
       "4                                                  69   \n",
       "..                                                ...   \n",
       "203                                                 8   \n",
       "204                                                45   \n",
       "205                                                12   \n",
       "206                                               NaN   \n",
       "207                                                38   \n",
       "\n",
       "    Neonatal mortality ratee (per 1000 live births)  \\\n",
       "0                                                34   \n",
       "1                                                 7   \n",
       "2                                                16   \n",
       "3                                                 1   \n",
       "4                                                27   \n",
       "..                                              ...   \n",
       "203                                               4   \n",
       "204                                              25   \n",
       "205                                               6   \n",
       "206                                             NaN   \n",
       "207                                              18   \n",
       "\n",
       "    New HIV infectionsf (per 1000 uninfected population)  \\\n",
       "0                                                 0.04     \n",
       "1                                                 0.03     \n",
       "2                                                 0.04     \n",
       "3                                                    -     \n",
       "4                                                 0.52     \n",
       "..                                                 ...     \n",
       "203                                               0.20     \n",
       "204                                               0.06     \n",
       "205                                               0.06     \n",
       "206                                                NaN     \n",
       "207                                               0.19     \n",
       "\n",
       "    Tuberculosis incidenceg (per 100 000 population)  \\\n",
       "0                                              189.0   \n",
       "1                                               17.0   \n",
       "2                                               54.0   \n",
       "3                                                2.9   \n",
       "4                                              325.0   \n",
       "..                                               ...   \n",
       "203                                             25.0   \n",
       "204                                            112.0   \n",
       "205                                             98.0   \n",
       "206                                              NaN   \n",
       "207                                            134.0   \n",
       "\n",
       "    Malaria incidenceh (per 1000 population at risk)  \\\n",
       "0                                                6.3   \n",
       "1                                                  -   \n",
       "2                                                0.0   \n",
       "3                                                  -   \n",
       "4                                              254.9   \n",
       "..                                               ...   \n",
       "203                                                -   \n",
       "204                                             11.6   \n",
       "205                                              1.9   \n",
       "206                                              NaN   \n",
       "207                                             59.2   \n",
       "\n",
       "    Malaria incidenceh (per 1000 population at risk)  \\\n",
       "0                                                NaN   \n",
       "1                                                NaN   \n",
       "2                                                NaN   \n",
       "3                                                NaN   \n",
       "4                                                NaN   \n",
       "..                                               ...   \n",
       "203                                              NaN   \n",
       "204                                              NaN   \n",
       "205                                              NaN   \n",
       "206                                              NaN   \n",
       "207                                              NaN   \n",
       "\n",
       "     Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%)  \\\n",
       "0                                                 0.39                                  \n",
       "1                                                 0.29                                  \n",
       "2                                                 0.08                                  \n",
       "3                                                 0.02                                  \n",
       "4                                                 4.57                                  \n",
       "..                                                 ...                                  \n",
       "203                                               0.26                                  \n",
       "204                                               0.84                                  \n",
       "205                                                0.3                                  \n",
       "206                                                NaN                                  \n",
       "207                                               0.94                                  \n",
       "\n",
       "    Reported number of people requiring interventions against NTDsj  \n",
       "0                                           14 367 281               \n",
       "1                                                   21               \n",
       "2                                                7 081               \n",
       "3                                                    0               \n",
       "4                                            8 558 522               \n",
       "..                                                 ...               \n",
       "203                                          5 951 752               \n",
       "204                                         72 120 780               \n",
       "205                                         72 372 509               \n",
       "206                                                NaN               \n",
       "207                                      1 653 289 348               \n",
       "\n",
       "[208 rows x 21 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "# Read the Excel file with a multi-level header\n",
    "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", \n",
    "                   sheet_name=\"Annex 1-1\", header=[1, 2, 3, 4])\n",
    "\n",
    "df = df.rename(columns={\"Unnamed: 0_level_0\": \"Country\", \"Unnamed: 1_level_0\": \"Population\", \"Unnamed: 4_level_0\": \"Life expectancy\"})\n",
    "df.columns = df.columns.droplevel([1, 2, 3])\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e8255f7f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Country                                                                              object\n",
      "Total populationa (000s)                                                             object\n",
      "Total populationa (000s)                                                             object\n",
      "Total populationa (000s)                                                            float64\n",
      "Life expectancy at birthb (years)                                                    object\n",
      "Life expectancy at birthb (years)                                                    object\n",
      "Life expectancy at birthb (years)                                                    object\n",
      "Healthy life expectancy at birthb (years)                                            object\n",
      "Healthy life expectancy at birthb (years)                                            object\n",
      "Healthy life expectancy at birthb (years)                                            object\n",
      "Maternal mortality ratioc (per 100 000 live births)                                  object\n",
      "Proportion of births attended by skilled health personneld (%)                       object\n",
      "Proportion of births attended by skilled health personneld (%)                       object\n",
      "Under-five mortality ratee (per 1000 live births)                                    object\n",
      "Neonatal mortality ratee (per 1000 live births)                                      object\n",
      "New HIV infectionsf (per 1000 uninfected population)                                 object\n",
      "Tuberculosis incidenceg (per 100 000 population)                                    float64\n",
      "Malaria incidenceh (per 1000 population at risk)                                     object\n",
      "Malaria incidenceh (per 1000 population at risk)                                    float64\n",
      "Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%)     object\n",
      "Reported number of people requiring interventions against NTDsj                      object\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "print(df.dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "1351a113",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0_level_0</th>\n",
       "      <th>Probability of dying from any of CVD, cancer, diabetes, CRD between age 30 and exact age 70k (%)</th>\n",
       "      <th>Suicide mortality ratek (per 100 000 population)</th>\n",
       "      <th>Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol)</th>\n",
       "      <th>Road traffic mortality ratek,ay (per 100 000 population)</th>\n",
       "      <th colspan=\"2\" halign=\"left\">Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%)</th>\n",
       "      <th>Adolescent birth raten (per 1000 women aged 15–19 years)</th>\n",
       "      <th>Adolescent birth raten (per 1000 women aged 10–14 years)</th>\n",
       "      <th>UHC: Service coverage indexo</th>\n",
       "      <th>Population with household expenditures on health &gt; 10% of total household expenditure or incomep (%)</th>\n",
       "      <th>Population with household expenditures on health &gt; 25% of total household expenditure or incomep (%)</th>\n",
       "      <th>Age-standardized mortality rate attributed to household and ambient air pollutionq  (per 100 000 population)</th>\n",
       "      <th>Mortality rate attributed to exposure to unsafe WASH servicesr (per 100 000 population)</th>\n",
       "      <th>Mortality rate from unintentional poisoningk (per 100 000 population)</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>Data type</th>\n",
       "      <th>Comparable estimates</th>\n",
       "      <th>Comparable estimates</th>\n",
       "      <th>Comparable estimates</th>\n",
       "      <th>Comparable estimates</th>\n",
       "      <th colspan=\"2\" halign=\"left\">Primary data</th>\n",
       "      <th>Primary data</th>\n",
       "      <th>Primary data</th>\n",
       "      <th>Comparable estimates</th>\n",
       "      <th>Primary data</th>\n",
       "      <th>Primary data</th>\n",
       "      <th>Comparable estimates</th>\n",
       "      <th>Comparable estimates</th>\n",
       "      <th>Comparable estimates</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0_level_2</th>\n",
       "      <th>Unnamed: 1_level_2</th>\n",
       "      <th>Unnamed: 2_level_2</th>\n",
       "      <th>Unnamed: 3_level_2</th>\n",
       "      <th>Unnamed: 4_level_2</th>\n",
       "      <th>Unnamed: 5_level_2</th>\n",
       "      <th>Unnamed: 6_level_2</th>\n",
       "      <th>Unnamed: 7_level_2</th>\n",
       "      <th>Unnamed: 8_level_2</th>\n",
       "      <th>Unnamed: 9_level_2</th>\n",
       "      <th>Unnamed: 10_level_2</th>\n",
       "      <th>Unnamed: 11_level_2</th>\n",
       "      <th>Unnamed: 12_level_2</th>\n",
       "      <th>Unnamed: 13_level_2</th>\n",
       "      <th>Unnamed: 14_level_2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>Countries and areas</th>\n",
       "      <th>2019</th>\n",
       "      <th>2019</th>\n",
       "      <th>2019</th>\n",
       "      <th>2019</th>\n",
       "      <th>2013–2021</th>\n",
       "      <th>2013–2021</th>\n",
       "      <th>2013–2021</th>\n",
       "      <th>Unnamed: 8_level_3</th>\n",
       "      <th>2021</th>\n",
       "      <th>2013–2021</th>\n",
       "      <th>2013–2021</th>\n",
       "      <th>2019</th>\n",
       "      <th>2019</th>\n",
       "      <th>2019</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>35.3</td>\n",
       "      <td>4.1</td>\n",
       "      <td>&lt;0.1</td>\n",
       "      <td>15.9</td>\n",
       "      <td>42.1</td>\n",
       "      <td>am</td>\n",
       "      <td>62.1</td>\n",
       "      <td>0.2</td>\n",
       "      <td>41</td>\n",
       "      <td>26.1</td>\n",
       "      <td>8.0</td>\n",
       "      <td>265.7</td>\n",
       "      <td>16.6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>11.4</td>\n",
       "      <td>4.3</td>\n",
       "      <td>5.1</td>\n",
       "      <td>11.7</td>\n",
       "      <td>6.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13.2</td>\n",
       "      <td>0.3</td>\n",
       "      <td>64</td>\n",
       "      <td>8.9</td>\n",
       "      <td>1.4</td>\n",
       "      <td>92.5</td>\n",
       "      <td>3.2</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>13.9</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.6</td>\n",
       "      <td>20.9</td>\n",
       "      <td>66.3</td>\n",
       "      <td>am</td>\n",
       "      <td>12.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>74</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>49.7</td>\n",
       "      <td>4.1</td>\n",
       "      <td>0.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>11.1</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>79</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>22.2</td>\n",
       "      <td>6.1</td>\n",
       "      <td>6.2</td>\n",
       "      <td>26.1</td>\n",
       "      <td>29.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>162.7</td>\n",
       "      <td>10.7</td>\n",
       "      <td>37</td>\n",
       "      <td>35.5</td>\n",
       "      <td>12.5</td>\n",
       "      <td>142.8</td>\n",
       "      <td>48.9</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>203</th>\n",
       "      <td>European Region</td>\n",
       "      <td>16.3</td>\n",
       "      <td>12.8</td>\n",
       "      <td>9.2</td>\n",
       "      <td>7.4</td>\n",
       "      <td>76.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13.1</td>\n",
       "      <td>0.1</td>\n",
       "      <td>81</td>\n",
       "      <td>7.94</td>\n",
       "      <td>1.3</td>\n",
       "      <td>44.5</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>204</th>\n",
       "      <td>Eastern Mediterranean Region</td>\n",
       "      <td>24.5</td>\n",
       "      <td>5.8</td>\n",
       "      <td>0.3</td>\n",
       "      <td>17.8</td>\n",
       "      <td>62.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>45.8</td>\n",
       "      <td>1.1</td>\n",
       "      <td>57</td>\n",
       "      <td>12.11</td>\n",
       "      <td>2.23</td>\n",
       "      <td>136.0</td>\n",
       "      <td>18.4</td>\n",
       "      <td>1.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>205</th>\n",
       "      <td>Western Pacific Region</td>\n",
       "      <td>15.6</td>\n",
       "      <td>8.7</td>\n",
       "      <td>6.1</td>\n",
       "      <td>16.4</td>\n",
       "      <td>87.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16.9</td>\n",
       "      <td>0.3</td>\n",
       "      <td>79</td>\n",
       "      <td>19.8</td>\n",
       "      <td>5.26</td>\n",
       "      <td>94.0</td>\n",
       "      <td>4.3</td>\n",
       "      <td>1.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>206</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>207</th>\n",
       "      <td>Global</td>\n",
       "      <td>17.8</td>\n",
       "      <td>9.2</td>\n",
       "      <td>5.5</td>\n",
       "      <td>16.7</td>\n",
       "      <td>77.5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>41.9</td>\n",
       "      <td>1.5</td>\n",
       "      <td>68</td>\n",
       "      <td>13.46</td>\n",
       "      <td>3.77</td>\n",
       "      <td>103.6</td>\n",
       "      <td>18.3</td>\n",
       "      <td>1.1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>208 rows × 15 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               Unnamed: 0_level_0  \\\n",
       "                        Data type   \n",
       "               Unnamed: 0_level_2   \n",
       "              Countries and areas   \n",
       "0                     Afghanistan   \n",
       "1                         Albania   \n",
       "2                         Algeria   \n",
       "3                         Andorra   \n",
       "4                          Angola   \n",
       "..                            ...   \n",
       "203               European Region   \n",
       "204  Eastern Mediterranean Region   \n",
       "205        Western Pacific Region   \n",
       "206                           NaN   \n",
       "207                        Global   \n",
       "\n",
       "    Probability of dying from any of CVD, cancer, diabetes, CRD between age 30 and exact age 70k (%)  \\\n",
       "                                                                                Comparable estimates   \n",
       "                                                                                  Unnamed: 1_level_2   \n",
       "                                                                                                2019   \n",
       "0                                                 35.3                                                 \n",
       "1                                                 11.4                                                 \n",
       "2                                                 13.9                                                 \n",
       "3                                                    -                                                 \n",
       "4                                                 22.2                                                 \n",
       "..                                                 ...                                                 \n",
       "203                                               16.3                                                 \n",
       "204                                               24.5                                                 \n",
       "205                                               15.6                                                 \n",
       "206                                                NaN                                                 \n",
       "207                                               17.8                                                 \n",
       "\n",
       "    Suicide mortality ratek (per 100 000 population)  \\\n",
       "                                Comparable estimates   \n",
       "                                  Unnamed: 2_level_2   \n",
       "                                                2019   \n",
       "0                                                4.1   \n",
       "1                                                4.3   \n",
       "2                                                2.5   \n",
       "3                                                  -   \n",
       "4                                                6.1   \n",
       "..                                               ...   \n",
       "203                                             12.8   \n",
       "204                                              5.8   \n",
       "205                                              8.7   \n",
       "206                                              NaN   \n",
       "207                                              9.2   \n",
       "\n",
       "    Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol)  \\\n",
       "                                                                  Comparable estimates   \n",
       "                                                                    Unnamed: 3_level_2   \n",
       "                                                                                  2019   \n",
       "0                                                 <0.1                                   \n",
       "1                                                  5.1                                   \n",
       "2                                                  0.6                                   \n",
       "3                                                 11.1                                   \n",
       "4                                                  6.2                                   \n",
       "..                                                 ...                                   \n",
       "203                                                9.2                                   \n",
       "204                                                0.3                                   \n",
       "205                                                6.1                                   \n",
       "206                                                NaN                                   \n",
       "207                                                5.5                                   \n",
       "\n",
       "    Road traffic mortality ratek,ay (per 100 000 population)  \\\n",
       "                                        Comparable estimates   \n",
       "                                          Unnamed: 4_level_2   \n",
       "                                                        2019   \n",
       "0                                                 15.9         \n",
       "1                                                 11.7         \n",
       "2                                                 20.9         \n",
       "3                                                    -         \n",
       "4                                                 26.1         \n",
       "..                                                 ...         \n",
       "203                                                7.4         \n",
       "204                                               17.8         \n",
       "205                                               16.4         \n",
       "206                                                NaN         \n",
       "207                                               16.7         \n",
       "\n",
       "    Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%)  \\\n",
       "                                                                                                          Primary data   \n",
       "                                                                                                    Unnamed: 5_level_2   \n",
       "                                                                                                             2013–2021   \n",
       "0                                                 42.1                                                                   \n",
       "1                                                  6.3                                                                   \n",
       "2                                                 66.3                                                                   \n",
       "3                                                    -                                                                   \n",
       "4                                                 29.8                                                                   \n",
       "..                                                 ...                                                                   \n",
       "203                                               76.8                                                                   \n",
       "204                                               62.3                                                                   \n",
       "205                                               87.3                                                                   \n",
       "206                                                NaN                                                                   \n",
       "207                                               77.5                                                                   \n",
       "\n",
       "                        \\\n",
       "                         \n",
       "    Unnamed: 6_level_2   \n",
       "             2013–2021   \n",
       "0                   am   \n",
       "1                  NaN   \n",
       "2                   am   \n",
       "3                  NaN   \n",
       "4                  NaN   \n",
       "..                 ...   \n",
       "203                NaN   \n",
       "204                NaN   \n",
       "205                NaN   \n",
       "206                NaN   \n",
       "207                NaN   \n",
       "\n",
       "    Adolescent birth raten (per 1000 women aged 15–19 years)  \\\n",
       "                                                Primary data   \n",
       "                                          Unnamed: 7_level_2   \n",
       "                                                   2013–2021   \n",
       "0                                                 62.1         \n",
       "1                                                 13.2         \n",
       "2                                                 12.0         \n",
       "3                                                  0.5         \n",
       "4                                                162.7         \n",
       "..                                                 ...         \n",
       "203                                               13.1         \n",
       "204                                               45.8         \n",
       "205                                               16.9         \n",
       "206                                                NaN         \n",
       "207                                               41.9         \n",
       "\n",
       "    Adolescent birth raten (per 1000 women aged 10–14 years)  \\\n",
       "                                                Primary data   \n",
       "                                          Unnamed: 8_level_2   \n",
       "                                          Unnamed: 8_level_3   \n",
       "0                                                  0.2         \n",
       "1                                                  0.3         \n",
       "2                                                  0.0         \n",
       "3                                                  0.0         \n",
       "4                                                 10.7         \n",
       "..                                                 ...         \n",
       "203                                                0.1         \n",
       "204                                                1.1         \n",
       "205                                                0.3         \n",
       "206                                                NaN         \n",
       "207                                                1.5         \n",
       "\n",
       "    UHC: Service coverage indexo  \\\n",
       "            Comparable estimates   \n",
       "              Unnamed: 9_level_2   \n",
       "                            2021   \n",
       "0                             41   \n",
       "1                             64   \n",
       "2                             74   \n",
       "3                             79   \n",
       "4                             37   \n",
       "..                           ...   \n",
       "203                           81   \n",
       "204                           57   \n",
       "205                           79   \n",
       "206                          NaN   \n",
       "207                           68   \n",
       "\n",
       "    Population with household expenditures on health > 10% of total household expenditure or incomep (%)  \\\n",
       "                                                                                            Primary data   \n",
       "                                                                                     Unnamed: 10_level_2   \n",
       "                                                                                               2013–2021   \n",
       "0                                                 26.1                                                     \n",
       "1                                                  8.9                                                     \n",
       "2                                                    -                                                     \n",
       "3                                                    -                                                     \n",
       "4                                                 35.5                                                     \n",
       "..                                                 ...                                                     \n",
       "203                                               7.94                                                     \n",
       "204                                              12.11                                                     \n",
       "205                                               19.8                                                     \n",
       "206                                                NaN                                                     \n",
       "207                                              13.46                                                     \n",
       "\n",
       "    Population with household expenditures on health > 25% of total household expenditure or incomep (%)  \\\n",
       "                                                                                            Primary data   \n",
       "                                                                                     Unnamed: 11_level_2   \n",
       "                                                                                               2013–2021   \n",
       "0                                                  8.0                                                     \n",
       "1                                                  1.4                                                     \n",
       "2                                                    -                                                     \n",
       "3                                                    -                                                     \n",
       "4                                                 12.5                                                     \n",
       "..                                                 ...                                                     \n",
       "203                                                1.3                                                     \n",
       "204                                               2.23                                                     \n",
       "205                                               5.26                                                     \n",
       "206                                                NaN                                                     \n",
       "207                                               3.77                                                     \n",
       "\n",
       "    Age-standardized mortality rate attributed to household and ambient air pollutionq  (per 100 000 population)   \\\n",
       "                                                                                             Comparable estimates   \n",
       "                                                                                              Unnamed: 12_level_2   \n",
       "                                                                                                             2019   \n",
       "0                                                265.7                                                              \n",
       "1                                                 92.5                                                              \n",
       "2                                                 49.7                                                              \n",
       "3                                                    -                                                              \n",
       "4                                                142.8                                                              \n",
       "..                                                 ...                                                              \n",
       "203                                               44.5                                                              \n",
       "204                                              136.0                                                              \n",
       "205                                               94.0                                                              \n",
       "206                                                NaN                                                              \n",
       "207                                              103.6                                                              \n",
       "\n",
       "    Mortality rate attributed to exposure to unsafe WASH servicesr (per 100 000 population)  \\\n",
       "                                                                       Comparable estimates   \n",
       "                                                                        Unnamed: 13_level_2   \n",
       "                                                                                       2019   \n",
       "0                                                 16.6                                        \n",
       "1                                                  3.2                                        \n",
       "2                                                  4.1                                        \n",
       "3                                                    -                                        \n",
       "4                                                 48.9                                        \n",
       "..                                                 ...                                        \n",
       "203                                                3.6                                        \n",
       "204                                               18.4                                        \n",
       "205                                                4.3                                        \n",
       "206                                                NaN                                        \n",
       "207                                               18.3                                        \n",
       "\n",
       "    Mortality rate from unintentional poisoningk (per 100 000 population)  \n",
       "                                                     Comparable estimates  \n",
       "                                                      Unnamed: 14_level_2  \n",
       "                                                                     2019  \n",
       "0                                                    1                     \n",
       "1                                                  0.3                     \n",
       "2                                                  0.7                     \n",
       "3                                                    -                     \n",
       "4                                                    2                     \n",
       "..                                                 ...                     \n",
       "203                                                1.1                     \n",
       "204                                                1.1                     \n",
       "205                                                1.4                     \n",
       "206                                                NaN                     \n",
       "207                                                1.1                     \n",
       "\n",
       "[208 rows x 15 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "# Read the Excel file with a multi-level header\n",
    "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", \n",
    "                   sheet_name=\"Annex 1-2\", header=[1, 2, 3, 4])\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "9a91c485",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "['country_and_areas']",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[19], line 2\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;66;03m# Drop the rows that have NaN values in the country_and_areas column\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdropna\u001b[49m\u001b[43m(\u001b[49m\u001b[43msubset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcountry_and_areas\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m      4\u001b[0m \u001b[38;5;66;03m# Drop the columns that have more than 50% missing values\u001b[39;00m\n\u001b[0;32m      5\u001b[0m df \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39mdropna(thresh\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(df)\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m0.5\u001b[39m, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\util\\_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m    325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[0;32m    326\u001b[0m     warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m    327\u001b[0m         msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m    328\u001b[0m         \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[0;32m    329\u001b[0m         stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[0;32m    330\u001b[0m     )\n\u001b[1;32m--> 331\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:6561\u001b[0m, in \u001b[0;36mDataFrame.dropna\u001b[1;34m(self, axis, how, thresh, subset, inplace)\u001b[0m\n\u001b[0;32m   6559\u001b[0m     check \u001b[38;5;241m=\u001b[39m indices \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m\n\u001b[0;32m   6560\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m check\u001b[38;5;241m.\u001b[39many():\n\u001b[1;32m-> 6561\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(np\u001b[38;5;241m.\u001b[39marray(subset)[check]\u001b[38;5;241m.\u001b[39mtolist())\n\u001b[0;32m   6562\u001b[0m     agg_obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(indices, axis\u001b[38;5;241m=\u001b[39magg_axis)\n\u001b[0;32m   6564\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m thresh \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m no_default:\n",
      "\u001b[1;31mKeyError\u001b[0m: ['country_and_areas']"
     ]
    }
   ],
   "source": [
    "# Drop the rows that have NaN values in the country_and_areas column\n",
    "df = df.dropna(subset=[\"country_and_areas\"])\n",
    "\n",
    "# Drop the columns that have more than 50% missing values\n",
    "df = df.dropna(thresh=len(df)*0.5, axis=1)\n",
    "\n",
    "# Replace the remaining missing values with the mean or median of the column\n",
    "df = df.fillna(df.mean())\n",
    "\n",
    "# Rename the columns to make them shorter and more descriptive\n",
    "df = df.rename(columns={\"Probability of dying from any of CVD, cancer, diabetes, CRD between age 30 and exact age 70k (%)\": \"cvd_cancer_death_rate\",\n",
    "                        \"Suicide mortality ratek (per 100 000 population)\": \"suicide_rate\",\n",
    "                        \"Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol)\": \"alcohol_consumption\",\n",
    "                        \"Road traffic mortality ratek,ay (per 100 000 population)\": \"road_traffic_death_rate\",\n",
    "                        \"Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%)\": \"family_planning_satisfaction\",\n",
    "                        \"Adolescent birth raten (per 1000 women aged 15–19 years)\": \"adolescent_birth_rate_15_19\",\n",
    "                        \"Adolescent birth raten (per 1000 women aged 10–14 years)\": \"adolescent_birth_rate_10_14\",\n",
    "                        \"UHC: Service coverage indexo\": \"uhc_service_coverage_index\",\n",
    "                        \"Population with household expenditures on health > 10% of total household expenditure or incomep (%)\": \"health_expenditure_over_10_percent\",\n",
    "                        \"Population with household expenditures on health > 25% of total household expenditure or incomep (%)\": \"health_expenditure_over_25_percent\",\n",
    "                        \"Age-standardized mortality rate attributed to household and ambient air pollutionq (per 100 000 population)\": \"air_pollution_death_rate\",\n",
    "                        \"Mortality rate attributed to exposure to unsafe WASH servicesr (per 100 000 population)\": \"wash_services_death_rate\",\n",
    "                        \"Mortality rate from unintentional poisoningk (per 100 000 population)\": \"poisoning_death_rate\"})\n",
    "\n",
    "# Check for outliers and remove or transform them if necessary\n",
    "# You can use boxplots, histograms, or z-scores to identify outliers\n",
    "# You can use methods such as winsorization, log transformation, or standardization to deal with outliers\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3556f039",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country_and_areas</th>\n",
       "      <th>cvd_cancer_death_rate</th>\n",
       "      <th>suicide_rate</th>\n",
       "      <th>Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol)</th>\n",
       "      <th>road_traffic_death_rate</th>\n",
       "      <th>family_planning_satisfaction</th>\n",
       "      <th>adolescent_birth_rate_15_19</th>\n",
       "      <th>adolescent_birth_rate_10_14</th>\n",
       "      <th>uhc_service_coverage_index</th>\n",
       "      <th>health_expenditure_over_10_percent</th>\n",
       "      <th>health_expenditure_over_25_percent</th>\n",
       "      <th>Age-standardized mortality rate attributed to household and ambient air pollutionq  (per 100 000 population)</th>\n",
       "      <th>wash_services_death_rate</th>\n",
       "      <th>poisoning_death_rate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>206</td>\n",
       "      <td>206</td>\n",
       "      <td>206</td>\n",
       "      <td>206</td>\n",
       "      <td>206</td>\n",
       "      <td>206</td>\n",
       "      <td>206</td>\n",
       "      <td>206</td>\n",
       "      <td>206</td>\n",
       "      <td>206</td>\n",
       "      <td>206</td>\n",
       "      <td>206</td>\n",
       "      <td>206</td>\n",
       "      <td>206.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>206</td>\n",
       "      <td>130</td>\n",
       "      <td>118</td>\n",
       "      <td>103</td>\n",
       "      <td>146</td>\n",
       "      <td>110</td>\n",
       "      <td>164</td>\n",
       "      <td>41</td>\n",
       "      <td>60</td>\n",
       "      <td>105</td>\n",
       "      <td>61</td>\n",
       "      <td>182</td>\n",
       "      <td>129</td>\n",
       "      <td>39.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>75</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>0.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>1</td>\n",
       "      <td>15</td>\n",
       "      <td>15</td>\n",
       "      <td>9</td>\n",
       "      <td>15</td>\n",
       "      <td>90</td>\n",
       "      <td>23</td>\n",
       "      <td>32</td>\n",
       "      <td>8</td>\n",
       "      <td>66</td>\n",
       "      <td>66</td>\n",
       "      <td>15</td>\n",
       "      <td>15</td>\n",
       "      <td>21.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       country_and_areas cvd_cancer_death_rate suicide_rate  \\\n",
       "count                206                   206          206   \n",
       "unique               206                   130          118   \n",
       "top          Afghanistan                     -            -   \n",
       "freq                   1                    15           15   \n",
       "\n",
       "       Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol)  \\\n",
       "count                                                 206                                   \n",
       "unique                                                103                                   \n",
       "top                                                     -                                   \n",
       "freq                                                    9                                   \n",
       "\n",
       "       road_traffic_death_rate family_planning_satisfaction  \\\n",
       "count                      206                          206   \n",
       "unique                     146                          110   \n",
       "top                          -                            -   \n",
       "freq                        15                           90   \n",
       "\n",
       "       adolescent_birth_rate_15_19 adolescent_birth_rate_10_14  \\\n",
       "count                          206                         206   \n",
       "unique                         164                          41   \n",
       "top                              -                           -   \n",
       "freq                            23                          32   \n",
       "\n",
       "        uhc_service_coverage_index health_expenditure_over_10_percent  \\\n",
       "count                          206                                206   \n",
       "unique                          60                                105   \n",
       "top                             75                                  -   \n",
       "freq                             8                                 66   \n",
       "\n",
       "       health_expenditure_over_25_percent  \\\n",
       "count                                 206   \n",
       "unique                                 61   \n",
       "top                                     -   \n",
       "freq                                   66   \n",
       "\n",
       "       Age-standardized mortality rate attributed to household and ambient air pollutionq  (per 100 000 population)   \\\n",
       "count                                                 206                                                              \n",
       "unique                                                182                                                              \n",
       "top                                                     -                                                              \n",
       "freq                                                   15                                                              \n",
       "\n",
       "       wash_services_death_rate  poisoning_death_rate  \n",
       "count                       206                 206.0  \n",
       "unique                      129                  39.0  \n",
       "top                           -                   0.4  \n",
       "freq                         15                  21.0  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "60920e90",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country_and_areas</th>\n",
       "      <th>Probability of dying from any of CVD, cancer, diabetes, CRD between age 30 and exact age 70k (%)</th>\n",
       "      <th>Suicide mortality ratek (per 100 000 population)</th>\n",
       "      <th>Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol)</th>\n",
       "      <th>Road traffic mortality ratek,ay (per 100 000 population)</th>\n",
       "      <th>Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%)</th>\n",
       "      <th>Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%)</th>\n",
       "      <th>Adolescent birth raten (per 1000 women aged 15–19 years)</th>\n",
       "      <th>Adolescent birth raten (per 1000 women aged 10–14 years)</th>\n",
       "      <th>UHC: Service coverage indexo</th>\n",
       "      <th>Population with household expenditures on health &gt; 10% of total household expenditure or incomep (%)</th>\n",
       "      <th>Population with household expenditures on health &gt; 25% of total household expenditure or incomep (%)</th>\n",
       "      <th>Age-standardized mortality rate attributed to household and ambient air pollutionq  (per 100 000 population)</th>\n",
       "      <th>Mortality rate attributed to exposure to unsafe WASH servicesr (per 100 000 population)</th>\n",
       "      <th>Mortality rate from unintentional poisoningk (per 100 000 population)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>35.3</td>\n",
       "      <td>4.1</td>\n",
       "      <td>&lt;0.1</td>\n",
       "      <td>15.9</td>\n",
       "      <td>42.1</td>\n",
       "      <td>am</td>\n",
       "      <td>62.1</td>\n",
       "      <td>0.2</td>\n",
       "      <td>41</td>\n",
       "      <td>26.1</td>\n",
       "      <td>8.0</td>\n",
       "      <td>265.7</td>\n",
       "      <td>16.6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>11.4</td>\n",
       "      <td>4.3</td>\n",
       "      <td>5.1</td>\n",
       "      <td>11.7</td>\n",
       "      <td>6.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13.2</td>\n",
       "      <td>0.3</td>\n",
       "      <td>64</td>\n",
       "      <td>8.9</td>\n",
       "      <td>1.4</td>\n",
       "      <td>92.5</td>\n",
       "      <td>3.2</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>13.9</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.6</td>\n",
       "      <td>20.9</td>\n",
       "      <td>66.3</td>\n",
       "      <td>am</td>\n",
       "      <td>12.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>74</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>49.7</td>\n",
       "      <td>4.1</td>\n",
       "      <td>0.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>11.1</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>79</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>22.2</td>\n",
       "      <td>6.1</td>\n",
       "      <td>6.2</td>\n",
       "      <td>26.1</td>\n",
       "      <td>29.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>162.7</td>\n",
       "      <td>10.7</td>\n",
       "      <td>37</td>\n",
       "      <td>35.5</td>\n",
       "      <td>12.5</td>\n",
       "      <td>142.8</td>\n",
       "      <td>48.9</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>203</th>\n",
       "      <td>European Region</td>\n",
       "      <td>16.3</td>\n",
       "      <td>12.8</td>\n",
       "      <td>9.2</td>\n",
       "      <td>7.4</td>\n",
       "      <td>76.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13.1</td>\n",
       "      <td>0.1</td>\n",
       "      <td>81</td>\n",
       "      <td>7.94</td>\n",
       "      <td>1.3</td>\n",
       "      <td>44.5</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>204</th>\n",
       "      <td>Eastern Mediterranean Region</td>\n",
       "      <td>24.5</td>\n",
       "      <td>5.8</td>\n",
       "      <td>0.3</td>\n",
       "      <td>17.8</td>\n",
       "      <td>62.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>45.8</td>\n",
       "      <td>1.1</td>\n",
       "      <td>57</td>\n",
       "      <td>12.11</td>\n",
       "      <td>2.23</td>\n",
       "      <td>136.0</td>\n",
       "      <td>18.4</td>\n",
       "      <td>1.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>205</th>\n",
       "      <td>Western Pacific Region</td>\n",
       "      <td>15.6</td>\n",
       "      <td>8.7</td>\n",
       "      <td>6.1</td>\n",
       "      <td>16.4</td>\n",
       "      <td>87.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16.9</td>\n",
       "      <td>0.3</td>\n",
       "      <td>79</td>\n",
       "      <td>19.8</td>\n",
       "      <td>5.26</td>\n",
       "      <td>94.0</td>\n",
       "      <td>4.3</td>\n",
       "      <td>1.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>206</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>207</th>\n",
       "      <td>Global</td>\n",
       "      <td>17.8</td>\n",
       "      <td>9.2</td>\n",
       "      <td>5.5</td>\n",
       "      <td>16.7</td>\n",
       "      <td>77.5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>41.9</td>\n",
       "      <td>1.5</td>\n",
       "      <td>68</td>\n",
       "      <td>13.46</td>\n",
       "      <td>3.77</td>\n",
       "      <td>103.6</td>\n",
       "      <td>18.3</td>\n",
       "      <td>1.1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>208 rows × 15 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                country_and_areas  \\\n",
       "0                     Afghanistan   \n",
       "1                         Albania   \n",
       "2                         Algeria   \n",
       "3                         Andorra   \n",
       "4                          Angola   \n",
       "..                            ...   \n",
       "203               European Region   \n",
       "204  Eastern Mediterranean Region   \n",
       "205        Western Pacific Region   \n",
       "206                           NaN   \n",
       "207                        Global   \n",
       "\n",
       "    Probability of dying from any of CVD, cancer, diabetes, CRD between age 30 and exact age 70k (%)  \\\n",
       "0                                                 35.3                                                 \n",
       "1                                                 11.4                                                 \n",
       "2                                                 13.9                                                 \n",
       "3                                                    -                                                 \n",
       "4                                                 22.2                                                 \n",
       "..                                                 ...                                                 \n",
       "203                                               16.3                                                 \n",
       "204                                               24.5                                                 \n",
       "205                                               15.6                                                 \n",
       "206                                                NaN                                                 \n",
       "207                                               17.8                                                 \n",
       "\n",
       "    Suicide mortality ratek (per 100 000 population)  \\\n",
       "0                                                4.1   \n",
       "1                                                4.3   \n",
       "2                                                2.5   \n",
       "3                                                  -   \n",
       "4                                                6.1   \n",
       "..                                               ...   \n",
       "203                                             12.8   \n",
       "204                                              5.8   \n",
       "205                                              8.7   \n",
       "206                                              NaN   \n",
       "207                                              9.2   \n",
       "\n",
       "    Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol)  \\\n",
       "0                                                 <0.1                                   \n",
       "1                                                  5.1                                   \n",
       "2                                                  0.6                                   \n",
       "3                                                 11.1                                   \n",
       "4                                                  6.2                                   \n",
       "..                                                 ...                                   \n",
       "203                                                9.2                                   \n",
       "204                                                0.3                                   \n",
       "205                                                6.1                                   \n",
       "206                                                NaN                                   \n",
       "207                                                5.5                                   \n",
       "\n",
       "    Road traffic mortality ratek,ay (per 100 000 population)  \\\n",
       "0                                                 15.9         \n",
       "1                                                 11.7         \n",
       "2                                                 20.9         \n",
       "3                                                    -         \n",
       "4                                                 26.1         \n",
       "..                                                 ...         \n",
       "203                                                7.4         \n",
       "204                                               17.8         \n",
       "205                                               16.4         \n",
       "206                                                NaN         \n",
       "207                                               16.7         \n",
       "\n",
       "    Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%)  \\\n",
       "0                                                 42.1                                                                   \n",
       "1                                                  6.3                                                                   \n",
       "2                                                 66.3                                                                   \n",
       "3                                                    -                                                                   \n",
       "4                                                 29.8                                                                   \n",
       "..                                                 ...                                                                   \n",
       "203                                               76.8                                                                   \n",
       "204                                               62.3                                                                   \n",
       "205                                               87.3                                                                   \n",
       "206                                                NaN                                                                   \n",
       "207                                               77.5                                                                   \n",
       "\n",
       "    Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%)  \\\n",
       "0                                                   am                                                                   \n",
       "1                                                  NaN                                                                   \n",
       "2                                                   am                                                                   \n",
       "3                                                  NaN                                                                   \n",
       "4                                                  NaN                                                                   \n",
       "..                                                 ...                                                                   \n",
       "203                                                NaN                                                                   \n",
       "204                                                NaN                                                                   \n",
       "205                                                NaN                                                                   \n",
       "206                                                NaN                                                                   \n",
       "207                                                NaN                                                                   \n",
       "\n",
       "    Adolescent birth raten (per 1000 women aged 15–19 years)  \\\n",
       "0                                                 62.1         \n",
       "1                                                 13.2         \n",
       "2                                                 12.0         \n",
       "3                                                  0.5         \n",
       "4                                                162.7         \n",
       "..                                                 ...         \n",
       "203                                               13.1         \n",
       "204                                               45.8         \n",
       "205                                               16.9         \n",
       "206                                                NaN         \n",
       "207                                               41.9         \n",
       "\n",
       "    Adolescent birth raten (per 1000 women aged 10–14 years)  \\\n",
       "0                                                  0.2         \n",
       "1                                                  0.3         \n",
       "2                                                  0.0         \n",
       "3                                                  0.0         \n",
       "4                                                 10.7         \n",
       "..                                                 ...         \n",
       "203                                                0.1         \n",
       "204                                                1.1         \n",
       "205                                                0.3         \n",
       "206                                                NaN         \n",
       "207                                                1.5         \n",
       "\n",
       "    UHC: Service coverage indexo  \\\n",
       "0                             41   \n",
       "1                             64   \n",
       "2                             74   \n",
       "3                             79   \n",
       "4                             37   \n",
       "..                           ...   \n",
       "203                           81   \n",
       "204                           57   \n",
       "205                           79   \n",
       "206                          NaN   \n",
       "207                           68   \n",
       "\n",
       "    Population with household expenditures on health > 10% of total household expenditure or incomep (%)  \\\n",
       "0                                                 26.1                                                     \n",
       "1                                                  8.9                                                     \n",
       "2                                                    -                                                     \n",
       "3                                                    -                                                     \n",
       "4                                                 35.5                                                     \n",
       "..                                                 ...                                                     \n",
       "203                                               7.94                                                     \n",
       "204                                              12.11                                                     \n",
       "205                                               19.8                                                     \n",
       "206                                                NaN                                                     \n",
       "207                                              13.46                                                     \n",
       "\n",
       "    Population with household expenditures on health > 25% of total household expenditure or incomep (%)  \\\n",
       "0                                                  8.0                                                     \n",
       "1                                                  1.4                                                     \n",
       "2                                                    -                                                     \n",
       "3                                                    -                                                     \n",
       "4                                                 12.5                                                     \n",
       "..                                                 ...                                                     \n",
       "203                                                1.3                                                     \n",
       "204                                               2.23                                                     \n",
       "205                                               5.26                                                     \n",
       "206                                                NaN                                                     \n",
       "207                                               3.77                                                     \n",
       "\n",
       "    Age-standardized mortality rate attributed to household and ambient air pollutionq  (per 100 000 population)   \\\n",
       "0                                                265.7                                                              \n",
       "1                                                 92.5                                                              \n",
       "2                                                 49.7                                                              \n",
       "3                                                    -                                                              \n",
       "4                                                142.8                                                              \n",
       "..                                                 ...                                                              \n",
       "203                                               44.5                                                              \n",
       "204                                              136.0                                                              \n",
       "205                                               94.0                                                              \n",
       "206                                                NaN                                                              \n",
       "207                                              103.6                                                              \n",
       "\n",
       "    Mortality rate attributed to exposure to unsafe WASH servicesr (per 100 000 population)  \\\n",
       "0                                                 16.6                                        \n",
       "1                                                  3.2                                        \n",
       "2                                                  4.1                                        \n",
       "3                                                    -                                        \n",
       "4                                                 48.9                                        \n",
       "..                                                 ...                                        \n",
       "203                                                3.6                                        \n",
       "204                                               18.4                                        \n",
       "205                                                4.3                                        \n",
       "206                                                NaN                                        \n",
       "207                                               18.3                                        \n",
       "\n",
       "    Mortality rate from unintentional poisoningk (per 100 000 population)  \n",
       "0                                                    1                     \n",
       "1                                                  0.3                     \n",
       "2                                                  0.7                     \n",
       "3                                                    -                     \n",
       "4                                                    2                     \n",
       "..                                                 ...                     \n",
       "203                                                1.1                     \n",
       "204                                                1.1                     \n",
       "205                                                1.4                     \n",
       "206                                                NaN                     \n",
       "207                                                1.1                     \n",
       "\n",
       "[208 rows x 15 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "# Read the Excel file with a multi-level header\n",
    "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", \n",
    "                   sheet_name=\"Annex 1-2\", header=[1, 2, 3, 4])\n",
    "\n",
    "df = df.rename(columns={\"Unnamed: 0_level_0\": \"country_and_areas\"})\n",
    "df.columns = df.columns.droplevel([1, 2, 3])\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "612714a4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\ziyan\\AppData\\Local\\Temp\\ipykernel_9072\\1376035441.py:18: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.\n",
      "  df = df.fillna(df.mean())\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "country_and_areas                     object\n",
       "cvd_cancer_death_rate                 object\n",
       "suicide_rate                          object\n",
       "alcohol_consumption                   object\n",
       "road_traffic_death_rate               object\n",
       "family_planning_satisfaction          object\n",
       "adolescent_birth_rate_15_19           object\n",
       "adolescent_birth_rate_10_14           object\n",
       "uhc_service_coverage_index            object\n",
       "health_expenditure_over_10_percent    object\n",
       "health_expenditure_over_25_percent    object\n",
       "air_pollution_death_rate              object\n",
       "wash_services_death_rate              object\n",
       "poisoning_death_rate                  object\n",
       "dtype: object"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "# Read the Excel file with a multi-level header\n",
    "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", \n",
    "                   sheet_name=\"Annex 1-2\", header=[1, 2, 3, 4])\n",
    "\n",
    "df.dtypes\n",
    "\n",
    "df = df.rename(columns={\"Unnamed: 0_level_0\": \"country_and_areas\"})\n",
    "df.columns = df.columns.droplevel([1, 2, 3])\n",
    "\n",
    "# Drop the rows that have NaN values in the country_and_areas column\n",
    "df = df.dropna(subset=[\"country_and_areas\"])\n",
    "\n",
    "# Drop the columns that have more than 50% missing values\n",
    "df = df.dropna(thresh=len(df)*0.5, axis=1)\n",
    "\n",
    "# Replace the remaining missing values with the mean or median of the column\n",
    "df = df.fillna(df.mean())\n",
    "\n",
    "# Rename the columns to make them shorter and more descriptive\n",
    "df = df.rename(columns={\"Probability of dying from any of CVD, cancer, diabetes, CRD between age 30 and exact age 70k (%)\": \"cvd_cancer_death_rate\",\n",
    "                        \"Suicide mortality ratek (per 100 000 population)\": \"suicide_rate\",\n",
    "                        \"Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol)\": \"alcohol_consumption\",\n",
    "                        \"Road traffic mortality ratek,ay (per 100 000 population)\": \"road_traffic_death_rate\",\n",
    "                        \"Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%)\": \"family_planning_satisfaction\",\n",
    "                        \"Adolescent birth raten (per 1000 women aged 15–19 years)\": \"adolescent_birth_rate_15_19\",\n",
    "                        \"Adolescent birth raten (per 1000 women aged 10–14 years)\": \"adolescent_birth_rate_10_14\",\n",
    "                        \"UHC: Service coverage indexo\": \"uhc_service_coverage_index\",\n",
    "                        \"Population with household expenditures on health > 10% of total household expenditure or incomep (%)\": \"health_expenditure_over_10_percent\",\n",
    "                        \"Population with household expenditures on health > 25% of total household expenditure or incomep (%)\": \"health_expenditure_over_25_percent\",\n",
    "                        \"Age-standardized mortality rate attributed to household and ambient air pollutionq  (per 100 000 population) \": \"air_pollution_death_rate\",\n",
    "                        \"Mortality rate attributed to exposure to unsafe WASH servicesr (per 100 000 population)\": \"wash_services_death_rate\",\n",
    "                        \"Mortality rate from unintentional poisoningk (per 100 000 population)\": \"poisoning_death_rate\"})\n",
    "\n",
    "# Check for outliers and remove or transform them if necessary\n",
    "# You can use boxplots, histograms, or z-scores to identify outliers\n",
    "# You can use methods such as winsorization, log transformation, or standardization to deal with outliers\n",
    "# display(df)\n",
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "09c9a6cd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country_and_areas</th>\n",
       "      <th>cvd_cancer_death_rate</th>\n",
       "      <th>suicide_rate</th>\n",
       "      <th>alcohol_consumption</th>\n",
       "      <th>road_traffic_death_rate</th>\n",
       "      <th>family_planning_satisfaction</th>\n",
       "      <th>adolescent_birth_rate_15_19</th>\n",
       "      <th>adolescent_birth_rate_10_14</th>\n",
       "      <th>uhc_service_coverage_index</th>\n",
       "      <th>health_expenditure_over_10_percent</th>\n",
       "      <th>health_expenditure_over_25_percent</th>\n",
       "      <th>air_pollution_death_rate</th>\n",
       "      <th>wash_services_death_rate</th>\n",
       "      <th>poisoning_death_rate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>35.3</td>\n",
       "      <td>4.1</td>\n",
       "      <td>&lt;0.1</td>\n",
       "      <td>15.9</td>\n",
       "      <td>42.1</td>\n",
       "      <td>62.1</td>\n",
       "      <td>0.2</td>\n",
       "      <td>41</td>\n",
       "      <td>26.1</td>\n",
       "      <td>8.0</td>\n",
       "      <td>265.7</td>\n",
       "      <td>16.6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>11.4</td>\n",
       "      <td>4.3</td>\n",
       "      <td>5.1</td>\n",
       "      <td>11.7</td>\n",
       "      <td>6.3</td>\n",
       "      <td>13.2</td>\n",
       "      <td>0.3</td>\n",
       "      <td>64</td>\n",
       "      <td>8.9</td>\n",
       "      <td>1.4</td>\n",
       "      <td>92.5</td>\n",
       "      <td>3.2</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>13.9</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.6</td>\n",
       "      <td>20.9</td>\n",
       "      <td>66.3</td>\n",
       "      <td>12.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>74</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>49.7</td>\n",
       "      <td>4.1</td>\n",
       "      <td>0.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>11.1</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>79</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>22.2</td>\n",
       "      <td>6.1</td>\n",
       "      <td>6.2</td>\n",
       "      <td>26.1</td>\n",
       "      <td>29.8</td>\n",
       "      <td>162.7</td>\n",
       "      <td>10.7</td>\n",
       "      <td>37</td>\n",
       "      <td>35.5</td>\n",
       "      <td>12.5</td>\n",
       "      <td>142.8</td>\n",
       "      <td>48.9</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202</th>\n",
       "      <td>South-East Asia Region</td>\n",
       "      <td>21.6</td>\n",
       "      <td>10.1</td>\n",
       "      <td>3.8</td>\n",
       "      <td>15.8</td>\n",
       "      <td>77.7</td>\n",
       "      <td>25.2</td>\n",
       "      <td>0.5</td>\n",
       "      <td>62</td>\n",
       "      <td>16.14</td>\n",
       "      <td>5.89</td>\n",
       "      <td>132.8</td>\n",
       "      <td>29.6</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>203</th>\n",
       "      <td>European Region</td>\n",
       "      <td>16.3</td>\n",
       "      <td>12.8</td>\n",
       "      <td>9.2</td>\n",
       "      <td>7.4</td>\n",
       "      <td>76.8</td>\n",
       "      <td>13.1</td>\n",
       "      <td>0.1</td>\n",
       "      <td>81</td>\n",
       "      <td>7.94</td>\n",
       "      <td>1.3</td>\n",
       "      <td>44.5</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>204</th>\n",
       "      <td>Eastern Mediterranean Region</td>\n",
       "      <td>24.5</td>\n",
       "      <td>5.8</td>\n",
       "      <td>0.3</td>\n",
       "      <td>17.8</td>\n",
       "      <td>62.3</td>\n",
       "      <td>45.8</td>\n",
       "      <td>1.1</td>\n",
       "      <td>57</td>\n",
       "      <td>12.11</td>\n",
       "      <td>2.23</td>\n",
       "      <td>136.0</td>\n",
       "      <td>18.4</td>\n",
       "      <td>1.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>205</th>\n",
       "      <td>Western Pacific Region</td>\n",
       "      <td>15.6</td>\n",
       "      <td>8.7</td>\n",
       "      <td>6.1</td>\n",
       "      <td>16.4</td>\n",
       "      <td>87.3</td>\n",
       "      <td>16.9</td>\n",
       "      <td>0.3</td>\n",
       "      <td>79</td>\n",
       "      <td>19.8</td>\n",
       "      <td>5.26</td>\n",
       "      <td>94.0</td>\n",
       "      <td>4.3</td>\n",
       "      <td>1.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>207</th>\n",
       "      <td>Global</td>\n",
       "      <td>17.8</td>\n",
       "      <td>9.2</td>\n",
       "      <td>5.5</td>\n",
       "      <td>16.7</td>\n",
       "      <td>77.5</td>\n",
       "      <td>41.9</td>\n",
       "      <td>1.5</td>\n",
       "      <td>68</td>\n",
       "      <td>13.46</td>\n",
       "      <td>3.77</td>\n",
       "      <td>103.6</td>\n",
       "      <td>18.3</td>\n",
       "      <td>1.1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>206 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                country_and_areas cvd_cancer_death_rate suicide_rate  \\\n",
       "0                     Afghanistan                  35.3          4.1   \n",
       "1                         Albania                  11.4          4.3   \n",
       "2                         Algeria                  13.9          2.5   \n",
       "3                         Andorra                     -            -   \n",
       "4                          Angola                  22.2          6.1   \n",
       "..                            ...                   ...          ...   \n",
       "202        South-East Asia Region                  21.6         10.1   \n",
       "203               European Region                  16.3         12.8   \n",
       "204  Eastern Mediterranean Region                  24.5          5.8   \n",
       "205        Western Pacific Region                  15.6          8.7   \n",
       "207                        Global                  17.8          9.2   \n",
       "\n",
       "    alcohol_consumption road_traffic_death_rate family_planning_satisfaction  \\\n",
       "0                  <0.1                    15.9                         42.1   \n",
       "1                   5.1                    11.7                          6.3   \n",
       "2                   0.6                    20.9                         66.3   \n",
       "3                  11.1                       -                            -   \n",
       "4                   6.2                    26.1                         29.8   \n",
       "..                  ...                     ...                          ...   \n",
       "202                 3.8                    15.8                         77.7   \n",
       "203                 9.2                     7.4                         76.8   \n",
       "204                 0.3                    17.8                         62.3   \n",
       "205                 6.1                    16.4                         87.3   \n",
       "207                 5.5                    16.7                         77.5   \n",
       "\n",
       "    adolescent_birth_rate_15_19 adolescent_birth_rate_10_14  \\\n",
       "0                          62.1                         0.2   \n",
       "1                          13.2                         0.3   \n",
       "2                          12.0                         0.0   \n",
       "3                           0.5                         0.0   \n",
       "4                         162.7                        10.7   \n",
       "..                          ...                         ...   \n",
       "202                        25.2                         0.5   \n",
       "203                        13.1                         0.1   \n",
       "204                        45.8                         1.1   \n",
       "205                        16.9                         0.3   \n",
       "207                        41.9                         1.5   \n",
       "\n",
       "    uhc_service_coverage_index health_expenditure_over_10_percent  \\\n",
       "0                           41                               26.1   \n",
       "1                           64                                8.9   \n",
       "2                           74                                  -   \n",
       "3                           79                                  -   \n",
       "4                           37                               35.5   \n",
       "..                         ...                                ...   \n",
       "202                         62                              16.14   \n",
       "203                         81                               7.94   \n",
       "204                         57                              12.11   \n",
       "205                         79                               19.8   \n",
       "207                         68                              13.46   \n",
       "\n",
       "    health_expenditure_over_25_percent air_pollution_death_rate  \\\n",
       "0                                  8.0                    265.7   \n",
       "1                                  1.4                     92.5   \n",
       "2                                    -                     49.7   \n",
       "3                                    -                        -   \n",
       "4                                 12.5                    142.8   \n",
       "..                                 ...                      ...   \n",
       "202                               5.89                    132.8   \n",
       "203                                1.3                     44.5   \n",
       "204                               2.23                    136.0   \n",
       "205                               5.26                     94.0   \n",
       "207                               3.77                    103.6   \n",
       "\n",
       "    wash_services_death_rate poisoning_death_rate  \n",
       "0                       16.6                    1  \n",
       "1                        3.2                  0.3  \n",
       "2                        4.1                  0.7  \n",
       "3                          -                    -  \n",
       "4                       48.9                    2  \n",
       "..                       ...                  ...  \n",
       "202                     29.6                  0.3  \n",
       "203                      3.6                  1.1  \n",
       "204                     18.4                  1.1  \n",
       "205                      4.3                  1.4  \n",
       "207                     18.3                  1.1  \n",
       "\n",
       "[206 rows x 14 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "227fa343",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country_and_areas</th>\n",
       "      <th>cvd_cancer_death_rate</th>\n",
       "      <th>suicide_rate</th>\n",
       "      <th>alcohol_consumption</th>\n",
       "      <th>road_traffic_death_rate</th>\n",
       "      <th>family_planning_satisfaction</th>\n",
       "      <th>adolescent_birth_rate_15_19</th>\n",
       "      <th>adolescent_birth_rate_10_14</th>\n",
       "      <th>uhc_service_coverage_index</th>\n",
       "      <th>health_expenditure_over_10_percent</th>\n",
       "      <th>health_expenditure_over_25_percent</th>\n",
       "      <th>air_pollution_death_rate</th>\n",
       "      <th>wash_services_death_rate</th>\n",
       "      <th>poisoning_death_rate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>35.3</td>\n",
       "      <td>4.1</td>\n",
       "      <td>0.1</td>\n",
       "      <td>15.9</td>\n",
       "      <td>42.1</td>\n",
       "      <td>62.1</td>\n",
       "      <td>0.2</td>\n",
       "      <td>41.0</td>\n",
       "      <td>26.10</td>\n",
       "      <td>8.00</td>\n",
       "      <td>265.7</td>\n",
       "      <td>16.6</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>11.4</td>\n",
       "      <td>4.3</td>\n",
       "      <td>5.1</td>\n",
       "      <td>11.7</td>\n",
       "      <td>6.3</td>\n",
       "      <td>13.2</td>\n",
       "      <td>0.3</td>\n",
       "      <td>64.0</td>\n",
       "      <td>8.90</td>\n",
       "      <td>1.40</td>\n",
       "      <td>92.5</td>\n",
       "      <td>3.2</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>13.9</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.6</td>\n",
       "      <td>20.9</td>\n",
       "      <td>66.3</td>\n",
       "      <td>12.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>49.7</td>\n",
       "      <td>4.1</td>\n",
       "      <td>0.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>79.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>22.2</td>\n",
       "      <td>6.1</td>\n",
       "      <td>6.2</td>\n",
       "      <td>26.1</td>\n",
       "      <td>29.8</td>\n",
       "      <td>162.7</td>\n",
       "      <td>10.7</td>\n",
       "      <td>37.0</td>\n",
       "      <td>35.50</td>\n",
       "      <td>12.50</td>\n",
       "      <td>142.8</td>\n",
       "      <td>48.9</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202</th>\n",
       "      <td>South-East Asia Region</td>\n",
       "      <td>21.6</td>\n",
       "      <td>10.1</td>\n",
       "      <td>3.8</td>\n",
       "      <td>15.8</td>\n",
       "      <td>77.7</td>\n",
       "      <td>25.2</td>\n",
       "      <td>0.5</td>\n",
       "      <td>62.0</td>\n",
       "      <td>16.14</td>\n",
       "      <td>5.89</td>\n",
       "      <td>132.8</td>\n",
       "      <td>29.6</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>203</th>\n",
       "      <td>European Region</td>\n",
       "      <td>16.3</td>\n",
       "      <td>12.8</td>\n",
       "      <td>9.2</td>\n",
       "      <td>7.4</td>\n",
       "      <td>76.8</td>\n",
       "      <td>13.1</td>\n",
       "      <td>0.1</td>\n",
       "      <td>81.0</td>\n",
       "      <td>7.94</td>\n",
       "      <td>1.30</td>\n",
       "      <td>44.5</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>204</th>\n",
       "      <td>Eastern Mediterranean Region</td>\n",
       "      <td>24.5</td>\n",
       "      <td>5.8</td>\n",
       "      <td>0.3</td>\n",
       "      <td>17.8</td>\n",
       "      <td>62.3</td>\n",
       "      <td>45.8</td>\n",
       "      <td>1.1</td>\n",
       "      <td>57.0</td>\n",
       "      <td>12.11</td>\n",
       "      <td>2.23</td>\n",
       "      <td>136.0</td>\n",
       "      <td>18.4</td>\n",
       "      <td>1.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>205</th>\n",
       "      <td>Western Pacific Region</td>\n",
       "      <td>15.6</td>\n",
       "      <td>8.7</td>\n",
       "      <td>6.1</td>\n",
       "      <td>16.4</td>\n",
       "      <td>87.3</td>\n",
       "      <td>16.9</td>\n",
       "      <td>0.3</td>\n",
       "      <td>79.0</td>\n",
       "      <td>19.80</td>\n",
       "      <td>5.26</td>\n",
       "      <td>94.0</td>\n",
       "      <td>4.3</td>\n",
       "      <td>1.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>207</th>\n",
       "      <td>Global</td>\n",
       "      <td>17.8</td>\n",
       "      <td>9.2</td>\n",
       "      <td>5.5</td>\n",
       "      <td>16.7</td>\n",
       "      <td>77.5</td>\n",
       "      <td>41.9</td>\n",
       "      <td>1.5</td>\n",
       "      <td>68.0</td>\n",
       "      <td>13.46</td>\n",
       "      <td>3.77</td>\n",
       "      <td>103.6</td>\n",
       "      <td>18.3</td>\n",
       "      <td>1.1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>206 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                country_and_areas  cvd_cancer_death_rate  suicide_rate  \\\n",
       "0                     Afghanistan                   35.3           4.1   \n",
       "1                         Albania                   11.4           4.3   \n",
       "2                         Algeria                   13.9           2.5   \n",
       "3                         Andorra                    NaN           NaN   \n",
       "4                          Angola                   22.2           6.1   \n",
       "..                            ...                    ...           ...   \n",
       "202        South-East Asia Region                   21.6          10.1   \n",
       "203               European Region                   16.3          12.8   \n",
       "204  Eastern Mediterranean Region                   24.5           5.8   \n",
       "205        Western Pacific Region                   15.6           8.7   \n",
       "207                        Global                   17.8           9.2   \n",
       "\n",
       "     alcohol_consumption  road_traffic_death_rate  \\\n",
       "0                    0.1                     15.9   \n",
       "1                    5.1                     11.7   \n",
       "2                    0.6                     20.9   \n",
       "3                   11.1                      NaN   \n",
       "4                    6.2                     26.1   \n",
       "..                   ...                      ...   \n",
       "202                  3.8                     15.8   \n",
       "203                  9.2                      7.4   \n",
       "204                  0.3                     17.8   \n",
       "205                  6.1                     16.4   \n",
       "207                  5.5                     16.7   \n",
       "\n",
       "     family_planning_satisfaction  adolescent_birth_rate_15_19  \\\n",
       "0                            42.1                         62.1   \n",
       "1                             6.3                         13.2   \n",
       "2                            66.3                         12.0   \n",
       "3                             NaN                          0.5   \n",
       "4                            29.8                        162.7   \n",
       "..                            ...                          ...   \n",
       "202                          77.7                         25.2   \n",
       "203                          76.8                         13.1   \n",
       "204                          62.3                         45.8   \n",
       "205                          87.3                         16.9   \n",
       "207                          77.5                         41.9   \n",
       "\n",
       "     adolescent_birth_rate_10_14  uhc_service_coverage_index  \\\n",
       "0                            0.2                        41.0   \n",
       "1                            0.3                        64.0   \n",
       "2                            0.0                        74.0   \n",
       "3                            0.0                        79.0   \n",
       "4                           10.7                        37.0   \n",
       "..                           ...                         ...   \n",
       "202                          0.5                        62.0   \n",
       "203                          0.1                        81.0   \n",
       "204                          1.1                        57.0   \n",
       "205                          0.3                        79.0   \n",
       "207                          1.5                        68.0   \n",
       "\n",
       "     health_expenditure_over_10_percent  health_expenditure_over_25_percent  \\\n",
       "0                                 26.10                                8.00   \n",
       "1                                  8.90                                1.40   \n",
       "2                                   NaN                                 NaN   \n",
       "3                                   NaN                                 NaN   \n",
       "4                                 35.50                               12.50   \n",
       "..                                  ...                                 ...   \n",
       "202                               16.14                                5.89   \n",
       "203                                7.94                                1.30   \n",
       "204                               12.11                                2.23   \n",
       "205                               19.80                                5.26   \n",
       "207                               13.46                                3.77   \n",
       "\n",
       "     air_pollution_death_rate  wash_services_death_rate  poisoning_death_rate  \n",
       "0                       265.7                      16.6                   1.0  \n",
       "1                        92.5                       3.2                   0.3  \n",
       "2                        49.7                       4.1                   0.7  \n",
       "3                         NaN                       NaN                   NaN  \n",
       "4                       142.8                      48.9                   2.0  \n",
       "..                        ...                       ...                   ...  \n",
       "202                     132.8                      29.6                   0.3  \n",
       "203                      44.5                       3.6                   1.1  \n",
       "204                     136.0                      18.4                   1.1  \n",
       "205                      94.0                       4.3                   1.4  \n",
       "207                     103.6                      18.3                   1.1  \n",
       "\n",
       "[206 rows x 14 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Function to convert columns to numeric and handle special values\n",
    "def convert_to_numeric(value):\n",
    "    if isinstance(value, str):\n",
    "        # Remove special characters and convert to numeric\n",
    "        value = pd.to_numeric(value.replace('<', '').replace('>', '').replace('%', ''), errors='coerce')\n",
    "    return value\n",
    "\n",
    "# Convert all columns (except 'country_and_areas') to numeric\n",
    "columns_to_convert = df.columns.drop('country_and_areas')\n",
    "df[columns_to_convert] = df[columns_to_convert].applymap(convert_to_numeric)\n",
    "\n",
    "# Display the preprocessed DataFrame\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "e9656da6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "country_and_areas                      object\n",
       "cvd_cancer_death_rate                 float64\n",
       "suicide_rate                          float64\n",
       "alcohol_consumption                   float64\n",
       "road_traffic_death_rate               float64\n",
       "family_planning_satisfaction          float64\n",
       "adolescent_birth_rate_15_19           float64\n",
       "adolescent_birth_rate_10_14           float64\n",
       "uhc_service_coverage_index            float64\n",
       "health_expenditure_over_10_percent    float64\n",
       "health_expenditure_over_25_percent    float64\n",
       "air_pollution_death_rate              float64\n",
       "wash_services_death_rate              float64\n",
       "poisoning_death_rate                  float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "b8a184cf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country_and_areas</th>\n",
       "      <th>cvd_cancer_death_rate</th>\n",
       "      <th>suicide_rate</th>\n",
       "      <th>alcohol_consumption</th>\n",
       "      <th>road_traffic_death_rate</th>\n",
       "      <th>family_planning_satisfaction</th>\n",
       "      <th>adolescent_birth_rate_15_19</th>\n",
       "      <th>adolescent_birth_rate_10_14</th>\n",
       "      <th>uhc_service_coverage_index</th>\n",
       "      <th>health_expenditure_over_10_percent</th>\n",
       "      <th>health_expenditure_over_25_percent</th>\n",
       "      <th>air_pollution_death_rate</th>\n",
       "      <th>wash_services_death_rate</th>\n",
       "      <th>poisoning_death_rate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>35.3</td>\n",
       "      <td>4.1</td>\n",
       "      <td>0.1</td>\n",
       "      <td>15.9</td>\n",
       "      <td>42.1</td>\n",
       "      <td>62.1</td>\n",
       "      <td>0.2</td>\n",
       "      <td>41.0</td>\n",
       "      <td>26.10</td>\n",
       "      <td>8.00</td>\n",
       "      <td>265.7</td>\n",
       "      <td>16.6</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>11.4</td>\n",
       "      <td>4.3</td>\n",
       "      <td>5.1</td>\n",
       "      <td>11.7</td>\n",
       "      <td>6.3</td>\n",
       "      <td>13.2</td>\n",
       "      <td>0.3</td>\n",
       "      <td>64.0</td>\n",
       "      <td>8.90</td>\n",
       "      <td>1.40</td>\n",
       "      <td>92.5</td>\n",
       "      <td>3.2</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>13.9</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.6</td>\n",
       "      <td>20.9</td>\n",
       "      <td>66.3</td>\n",
       "      <td>12.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>49.7</td>\n",
       "      <td>4.1</td>\n",
       "      <td>0.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>79.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>22.2</td>\n",
       "      <td>6.1</td>\n",
       "      <td>6.2</td>\n",
       "      <td>26.1</td>\n",
       "      <td>29.8</td>\n",
       "      <td>162.7</td>\n",
       "      <td>10.7</td>\n",
       "      <td>37.0</td>\n",
       "      <td>35.50</td>\n",
       "      <td>12.50</td>\n",
       "      <td>142.8</td>\n",
       "      <td>48.9</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202</th>\n",
       "      <td>South-East Asia Region</td>\n",
       "      <td>21.6</td>\n",
       "      <td>10.1</td>\n",
       "      <td>3.8</td>\n",
       "      <td>15.8</td>\n",
       "      <td>77.7</td>\n",
       "      <td>25.2</td>\n",
       "      <td>0.5</td>\n",
       "      <td>62.0</td>\n",
       "      <td>16.14</td>\n",
       "      <td>5.89</td>\n",
       "      <td>132.8</td>\n",
       "      <td>29.6</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>203</th>\n",
       "      <td>European Region</td>\n",
       "      <td>16.3</td>\n",
       "      <td>12.8</td>\n",
       "      <td>9.2</td>\n",
       "      <td>7.4</td>\n",
       "      <td>76.8</td>\n",
       "      <td>13.1</td>\n",
       "      <td>0.1</td>\n",
       "      <td>81.0</td>\n",
       "      <td>7.94</td>\n",
       "      <td>1.30</td>\n",
       "      <td>44.5</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>204</th>\n",
       "      <td>Eastern Mediterranean Region</td>\n",
       "      <td>24.5</td>\n",
       "      <td>5.8</td>\n",
       "      <td>0.3</td>\n",
       "      <td>17.8</td>\n",
       "      <td>62.3</td>\n",
       "      <td>45.8</td>\n",
       "      <td>1.1</td>\n",
       "      <td>57.0</td>\n",
       "      <td>12.11</td>\n",
       "      <td>2.23</td>\n",
       "      <td>136.0</td>\n",
       "      <td>18.4</td>\n",
       "      <td>1.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>205</th>\n",
       "      <td>Western Pacific Region</td>\n",
       "      <td>15.6</td>\n",
       "      <td>8.7</td>\n",
       "      <td>6.1</td>\n",
       "      <td>16.4</td>\n",
       "      <td>87.3</td>\n",
       "      <td>16.9</td>\n",
       "      <td>0.3</td>\n",
       "      <td>79.0</td>\n",
       "      <td>19.80</td>\n",
       "      <td>5.26</td>\n",
       "      <td>94.0</td>\n",
       "      <td>4.3</td>\n",
       "      <td>1.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>207</th>\n",
       "      <td>Global</td>\n",
       "      <td>17.8</td>\n",
       "      <td>9.2</td>\n",
       "      <td>5.5</td>\n",
       "      <td>16.7</td>\n",
       "      <td>77.5</td>\n",
       "      <td>41.9</td>\n",
       "      <td>1.5</td>\n",
       "      <td>68.0</td>\n",
       "      <td>13.46</td>\n",
       "      <td>3.77</td>\n",
       "      <td>103.6</td>\n",
       "      <td>18.3</td>\n",
       "      <td>1.1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>206 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                country_and_areas  cvd_cancer_death_rate  suicide_rate  \\\n",
       "0                     Afghanistan                   35.3           4.1   \n",
       "1                         Albania                   11.4           4.3   \n",
       "2                         Algeria                   13.9           2.5   \n",
       "3                         Andorra                    NaN           NaN   \n",
       "4                          Angola                   22.2           6.1   \n",
       "..                            ...                    ...           ...   \n",
       "202        South-East Asia Region                   21.6          10.1   \n",
       "203               European Region                   16.3          12.8   \n",
       "204  Eastern Mediterranean Region                   24.5           5.8   \n",
       "205        Western Pacific Region                   15.6           8.7   \n",
       "207                        Global                   17.8           9.2   \n",
       "\n",
       "     alcohol_consumption  road_traffic_death_rate  \\\n",
       "0                    0.1                     15.9   \n",
       "1                    5.1                     11.7   \n",
       "2                    0.6                     20.9   \n",
       "3                   11.1                      NaN   \n",
       "4                    6.2                     26.1   \n",
       "..                   ...                      ...   \n",
       "202                  3.8                     15.8   \n",
       "203                  9.2                      7.4   \n",
       "204                  0.3                     17.8   \n",
       "205                  6.1                     16.4   \n",
       "207                  5.5                     16.7   \n",
       "\n",
       "     family_planning_satisfaction  adolescent_birth_rate_15_19  \\\n",
       "0                            42.1                         62.1   \n",
       "1                             6.3                         13.2   \n",
       "2                            66.3                         12.0   \n",
       "3                             NaN                          0.5   \n",
       "4                            29.8                        162.7   \n",
       "..                            ...                          ...   \n",
       "202                          77.7                         25.2   \n",
       "203                          76.8                         13.1   \n",
       "204                          62.3                         45.8   \n",
       "205                          87.3                         16.9   \n",
       "207                          77.5                         41.9   \n",
       "\n",
       "     adolescent_birth_rate_10_14  uhc_service_coverage_index  \\\n",
       "0                            0.2                        41.0   \n",
       "1                            0.3                        64.0   \n",
       "2                            0.0                        74.0   \n",
       "3                            0.0                        79.0   \n",
       "4                           10.7                        37.0   \n",
       "..                           ...                         ...   \n",
       "202                          0.5                        62.0   \n",
       "203                          0.1                        81.0   \n",
       "204                          1.1                        57.0   \n",
       "205                          0.3                        79.0   \n",
       "207                          1.5                        68.0   \n",
       "\n",
       "     health_expenditure_over_10_percent  health_expenditure_over_25_percent  \\\n",
       "0                                 26.10                                8.00   \n",
       "1                                  8.90                                1.40   \n",
       "2                                   NaN                                 NaN   \n",
       "3                                   NaN                                 NaN   \n",
       "4                                 35.50                               12.50   \n",
       "..                                  ...                                 ...   \n",
       "202                               16.14                                5.89   \n",
       "203                                7.94                                1.30   \n",
       "204                               12.11                                2.23   \n",
       "205                               19.80                                5.26   \n",
       "207                               13.46                                3.77   \n",
       "\n",
       "     air_pollution_death_rate  wash_services_death_rate  poisoning_death_rate  \n",
       "0                       265.7                      16.6                   1.0  \n",
       "1                        92.5                       3.2                   0.3  \n",
       "2                        49.7                       4.1                   0.7  \n",
       "3                         NaN                       NaN                   NaN  \n",
       "4                       142.8                      48.9                   2.0  \n",
       "..                        ...                       ...                   ...  \n",
       "202                     132.8                      29.6                   0.3  \n",
       "203                      44.5                       3.6                   1.1  \n",
       "204                     136.0                      18.4                   1.1  \n",
       "205                      94.0                       4.3                   1.4  \n",
       "207                     103.6                      18.3                   1.1  \n",
       "\n",
       "[206 rows x 14 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5ca0201e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "country_and_areas                      0\n",
      "cvd_cancer_death_rate                 15\n",
      "suicide_rate                          15\n",
      "alcohol_consumption                    9\n",
      "road_traffic_death_rate               15\n",
      "family_planning_satisfaction          90\n",
      "adolescent_birth_rate_15_19           23\n",
      "adolescent_birth_rate_10_14           32\n",
      "uhc_service_coverage_index             4\n",
      "health_expenditure_over_10_percent    66\n",
      "health_expenditure_over_25_percent    66\n",
      "air_pollution_death_rate              15\n",
      "wash_services_death_rate              15\n",
      "poisoning_death_rate                  15\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Check for missing values\n",
    "print(df.isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4dd8fb3",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}