{ "cells": [ { "cell_type": "code", "execution_count": 8, "id": "f1cdad04", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0 \\\n", " Unnamed: 0_level_1 Total populationa (000s) Total populationa (000s) \n", "1 NaN Male Female \n", "2 Countries and areas 2021 NaN \n", "3 Afghanistan 20255 19845 \n", "4 Albania 1426 1429 \n", "5 Algeria 22497 21681 \n", "\n", " Unnamed: 3_level_0 Unnamed: 4_level_0 \\\n", " Total populationa (000s) Life expectancy at birthb (years) \n", "1 Both sexes Male \n", "2 NaN 2019 \n", "3 40099 63.3 \n", "4 2855 76.3 \n", "5 44178 76.2 \n", "\n", " Unnamed: 5_level_0 Unnamed: 6_level_0 \\\n", " Life expectancy at birthb (years) Life expectancy at birthb (years) \n", "1 Female Both sexes \n", "2 NaN NaN \n", "3 63.2 63.2 \n", "4 79.9 78 \n", "5 78.1 77.1 \n", "\n", " Unnamed: 7_level_0 \\\n", " Healthy life expectancy at birthb (years) \n", "1 Male \n", "2 2019 \n", "3 54.7 \n", "4 68 \n", "5 66.7 \n", "\n", " Unnamed: 8_level_0 \\\n", " Healthy life expectancy at birthb (years) \n", "1 Female \n", "2 NaN \n", "3 53.2 \n", "4 70.3 \n", "5 66.1 \n", "\n", " Unnamed: 9_level_0 ... \\\n", " Healthy life expectancy at birthb (years) ... \n", "1 Both sexes ... \n", "2 NaN ... \n", "3 53.9 ... \n", "4 69.1 ... \n", "5 66.4 ... \n", "\n", " 3.1 \\\n", " Proportion of births attended by skilled health personneld (%) \n", "1 NaN \n", "2 2013–2022 \n", "3 62 \n", "4 100 \n", "5 99 \n", "\n", " \\\n", " Proportion of births attended by skilled health personneld (%).1 \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "5 NaN \n", "\n", " 3.2 \\\n", " Under-five mortality ratee (per 1000 live births) \n", "1 NaN \n", "2 2021 \n", "3 56 \n", "4 9 \n", "5 22 \n", "\n", " \\\n", " Neonatal mortality ratee (per 1000 live births) \n", "1 NaN \n", "2 2021 \n", "3 34 \n", "4 7 \n", "5 16 \n", "\n", " 3.3 \\\n", " New HIV infectionsf (per 1000 uninfected population) \n", "1 NaN \n", "2 2021 \n", "3 0.04 \n", "4 0.03 \n", "5 0.04 \n", "\n", " \\\n", " Tuberculosis incidenceg (per 100 000 population) \n", "1 NaN \n", "2 2021 \n", "3 189 \n", "4 17 \n", "5 54 \n", "\n", " \\\n", " Malaria incidenceh (per 1000 population at risk) \n", "1 NaN \n", "2 2021 \n", "3 6.3 \n", "4 - \n", "5 0.0 \n", "\n", " \\\n", " Malaria incidenceh (per 1000 population at risk).1 \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "5 NaN \n", "\n", " \\\n", " Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%) \n", "1 NaN \n", "2 2020 \n", "3 0.39 \n", "4 0.29 \n", "5 0.08 \n", "\n", " \n", " Reported number of people requiring interventions against NTDsj \n", "1 NaN \n", "2 2021 \n", "3 14 367 281 \n", "4 21 \n", "5 7 081 \n", "\n", "[5 rows x 21 columns]\n" ] } ], "source": [ "import pandas as pd\n", "# Read the Excel file with a multi-level header\n", "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", sheet_name=\"Annex 1-1\", header=[0, 1])\n", "# Drop the first row\n", "df = df.drop(0)\n", "\n", "\n", "# # Rename columns\n", "# df = df.rename(columns={\"Unnamed: 0\": \"Country\", \"Unnamed: 1\": \"Total population (000s)\", \"Unnamed: 2\": \"Male population (000s)\", \"Unnamed: 3\": \"Female population (000s)\", \"Unnamed: 4\": \"Life expectancy at birth (male)\", \"Unnamed: 5\": \"Life expectancy at birth (female)\", \"Unnamed: 6\": \"Life expectancy at birth (both sexes)\", \"Unnamed: 7\": \"Healthy life expectancy at birth (male)\", \"Unnamed: 8\": \"Healthy life expectancy at birth (female)\", \"Unnamed: 9\": \"Healthy life expectancy at birth (both sexes)\", \"3.1\": \"Proportion of births attended by skilled health personnel (%)\", \"Unnamed: 11\": \"Primary data (%)\", \"Unnamed: 12\": \"Year\", \"3.2\": \"Under-five mortality rate (per 1000 live births)\", \"Unnamed: 14\": \"Neonatal mortality rate (per 1000 live births)\", \"3.3\": \"New HIV infections (per 1000 uninfected population)\", \"Unnamed: 16\": \"Tuberculosis incidence (per 100 000 population)\", \"Unnamed: 17\": \"Malaria incidence (per 1000 population at risk)\", # Drop the empty column\n", "# #\"\": float,\n", "# # Drop the column with mixed values\n", "# #\"Reported number of people requiring interventions against neglected tropical diseases\": int,\n", "# })\n", "\n", "# # Create a dictionary mapping of the column names and data types\n", "# data_types = {\"Total population (000s)\": int, \n", "# \"Male population (000s)\": int, \n", "# \"Female population (000s)\": int, \n", "# \"Life expectancy at birth (male)\": float, \n", "# \"Life expectancy at birth (female)\": float, \n", "# \"Life expectancy at birth (both sexes)\": float, \n", "# \"Healthy life expectancy at birth (male)\": float, \n", "# \"Healthy life expectancy at birth (female)\": float, \n", "# \"Healthy life expectancy at birth (both sexes)\": float, \n", "# \"Proportion of births attended by skilled health personneld (%)\": float}\n", "\n", "# # Convert data types using the dictionary mapping\n", "# df = df.astype(data_types)\n", "\n", "\n", "# # Replace dashes with zero\n", "# df = df.replace(\"-\", 0)\n", "\n", "# # Filter out rows with dashes\n", "# df = df.query(\"Malaria incidence (per 1000 population at risk) != '-'\")\n", "\n", "# Inspect the DataFrame\n", "print(df.head())\n", "# print(df.info())\n", "# print(df.describe())\n" ] }, { "cell_type": "code", "execution_count": 20, "id": "df0a91e2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index([ ['Unnamed: 0_level_0'],\n", " ['Total populationa (000s)'],\n", " ['Total populationa (000s)'],\n", " ['Total populationa (000s)'],\n", " ['Life expectancy at birthb (years)'],\n", " ['Life expectancy at birthb (years)'],\n", " ['Life expectancy at birthb (years)'],\n", " ['Healthy life expectancy at birthb (years)'],\n", " ['Healthy life expectancy at birthb (years)'],\n", " ['Healthy life expectancy at birthb (years)'],\n", " ['Maternal mortality ratioc (per 100 000 live births)'],\n", " ['Proportion of births attended by skilled health personneld (%)'],\n", " ['Proportion of births attended by skilled health personneld (%)'],\n", " ['Under-five mortality ratee (per 1000 live births)'],\n", " ['Neonatal mortality ratee (per 1000 live births)'],\n", " ['New HIV infectionsf (per 1000 uninfected population)'],\n", " ['Tuberculosis incidenceg (per 100 000 population)'],\n", " ['Malaria incidenceh (per 1000 population at risk)'],\n", " ['Malaria incidenceh (per 1000 population at risk)'],\n", " ['Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%)'],\n", " ['Reported number of people requiring interventions against NTDsj']],\n", " dtype='object') \n", "\n", "Index([ ['Data type'], ['Comparable estimates'],\n", " ['Comparable estimates'], ['Comparable estimates'],\n", " ['Comparable estimates'], ['Comparable estimates'],\n", " ['Comparable estimates'], ['Comparable estimates'],\n", " ['Comparable estimates'], ['Comparable estimates'],\n", " ['Comparable estimates'], ['Primary data'],\n", " ['Primary data'], ['Comparable estimates'],\n", " ['Comparable estimates'], ['Comparable estimates'],\n", " ['Comparable estimates'], ['Comparable estimates'],\n", " ['Comparable estimates'], ['Comparable estimates'],\n", " ['Primary data']],\n", " dtype='object') \n", "\n", "Index([ ['Unnamed: 0_level_2'], ['Male'],\n", " ['Female'], ['Both sexes'],\n", " ['Male'], ['Female'],\n", " ['Both sexes'], ['Male'],\n", " ['Female'], ['Both sexes'],\n", " ['Unnamed: 10_level_2'], ['Unnamed: 11_level_2'],\n", " ['Unnamed: 12_level_2'], ['Unnamed: 13_level_2'],\n", " ['Unnamed: 14_level_2'], ['Unnamed: 15_level_2'],\n", " ['Unnamed: 16_level_2'], ['Unnamed: 17_level_2'],\n", " ['Unnamed: 18_level_2'], ['Unnamed: 19_level_2'],\n", " ['Unnamed: 20_level_2']],\n", " dtype='object') \n", "\n", "Index([['Countries and areas'], nan,\n", " ['Unnamed: 2_level_3'], ['Unnamed: 3_level_3'],\n", " nan, ['Unnamed: 5_level_3'],\n", " ['Unnamed: 6_level_3'], nan,\n", " ['Unnamed: 8_level_3'], ['Unnamed: 9_level_3'],\n", " nan, ['2013–2022'],\n", " ['2013–2022'], nan,\n", " nan, nan,\n", " nan, nan,\n", " nan, nan,\n", " nan],\n", " dtype='object') \n", "\n" ] } ], "source": [ "import pandas as pd\n", "# Read the Excel file with a multi-level header\n", "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", sheet_name=\"Annex 1-1\", header=[1, 2, 3, 4])\n", "\n", "# Inspect the DataFrame\n", "# print(df.head())\n", "\n", "# Get the first level of the column header as an Index\n", "col_level0 = df.columns.get_level_values(0)\n", "col_level1 = df.columns.get_level_values(1)\n", "col_level2 = df.columns.get_level_values(2)\n", "col_level3 = df.columns.get_level_values(3)\n", "\n", "# Split the first level by ';'\n", "col_level0_split = col_level0.str.split(';')\n", "col_level1_split = col_level1.str.split(';')\n", "col_level2_split = col_level2.str.split(';')\n", "col_level3_split = col_level3.str.split(';')\n", "\n", "# Inspect the result\n", "print(col_level0_split, \"\\n\")\n", "print(col_level1_split, \"\\n\")\n", "print(col_level2_split, \"\\n\")\n", "print(col_level3_split, \"\\n\")" ] }, { "cell_type": "code", "execution_count": 32, "id": "4a249153", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['Unnamed: 0_level_0_Data type_Unnamed: 0_level_2_Countries and areas',\n", " 'Total populationa (000s)_Comparable estimates_Male_2021',\n", " 'Total populationa (000s)_Comparable estimates_Female_Unnamed: 2_level_3',\n", " 'Total populationa (000s)_Comparable estimates_Both sexes_Unnamed: 3_level_3',\n", " 'Life expectancy at birthb (years)_Comparable estimates_Male_2019',\n", " 'Life expectancy at birthb (years)_Comparable estimates_Female_Unnamed: 5_level_3',\n", " 'Life expectancy at birthb (years)_Comparable estimates_Both sexes_Unnamed: 6_level_3',\n", " 'Healthy life expectancy at birthb (years)_Comparable estimates_Male_2019',\n", " 'Healthy life expectancy at birthb (years)_Comparable estimates_Female_Unnamed: 8_level_3',\n", " 'Healthy life expectancy at birthb (years)_Comparable estimates_Both sexes_Unnamed: 9_level_3',\n", " 'Maternal mortality ratioc (per 100 000 live births)_Comparable estimates_Unnamed: 10_level_2_2020',\n", " 'Proportion of births attended by skilled health personneld (%)_Primary data_Unnamed: 11_level_2_2013–2022',\n", " 'Proportion of births attended by skilled health personneld (%)_Primary data_Unnamed: 12_level_2_2013–2022',\n", " 'Under-five mortality ratee (per 1000 live births)_Comparable estimates_Unnamed: 13_level_2_2021',\n", " 'Neonatal mortality ratee (per 1000 live births)_Comparable estimates_Unnamed: 14_level_2_2021',\n", " 'New HIV infectionsf (per 1000 uninfected population)_Comparable estimates_Unnamed: 15_level_2_2021',\n", " 'Tuberculosis incidenceg (per 100 000 population)_Comparable estimates_Unnamed: 16_level_2_2021',\n", " 'Malaria incidenceh (per 1000 population at risk)_Comparable estimates_Unnamed: 17_level_2_2021',\n", " 'Malaria incidenceh (per 1000 population at risk)_Comparable estimates_Unnamed: 18_level_2_2021',\n", " 'Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%)_Comparable estimates_Unnamed: 19_level_2_2020',\n", " 'Reported number of people requiring interventions against NTDsj_Primary data_Unnamed: 20_level_2_2021'],\n", " dtype='object')\n" ] }, { "ename": "KeyError", "evalue": "['Country']", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[32], line 24\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39mcolumns)\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# Drop the last row with NaN values\u001b[39;00m\n\u001b[1;32m---> 24\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdropna\u001b[49m\u001b[43m(\u001b[49m\u001b[43msubset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mCountry\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 26\u001b[0m \u001b[38;5;66;03m# Inspect unique values in each column\u001b[39;00m\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m df\u001b[38;5;241m.\u001b[39mcolumns:\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\util\\_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[0;32m 326\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 327\u001b[0m msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m 328\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[0;32m 329\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[0;32m 330\u001b[0m )\n\u001b[1;32m--> 331\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:6561\u001b[0m, in \u001b[0;36mDataFrame.dropna\u001b[1;34m(self, axis, how, thresh, subset, inplace)\u001b[0m\n\u001b[0;32m 6559\u001b[0m check \u001b[38;5;241m=\u001b[39m indices \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 6560\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check\u001b[38;5;241m.\u001b[39many():\n\u001b[1;32m-> 6561\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(np\u001b[38;5;241m.\u001b[39marray(subset)[check]\u001b[38;5;241m.\u001b[39mtolist())\n\u001b[0;32m 6562\u001b[0m agg_obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(indices, axis\u001b[38;5;241m=\u001b[39magg_axis)\n\u001b[0;32m 6564\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m thresh \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m no_default:\n", "\u001b[1;31mKeyError\u001b[0m: ['Country']" ] } ], "source": [ "import pandas as pd\n", "\n", "# Read the Excel file with a multi-level header\n", "file_path = r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\"\n", "df = pd.read_excel(file_path, sheet_name=\"Annex 1-1\", header=[1, 2, 3, 4])\n", "\n", "# Drop the first row\n", "df = df.drop(0)\n", "\n", "# Combine MultiIndex levels into a single level using list comprehension\n", "df.columns = ['_'.join(filter(None, map(str, col))) for col in df.columns]\n", "\n", "# Rename the first column\n", "df.rename(columns={'nan_nan_nan_nan': 'Country'}, inplace=True)\n", "\n", "# Convert numeric columns to float type\n", "numeric_cols = df.columns.difference(['Country'])\n", "df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')\n", "\n", "# Print the column names to check if 'Country' exists\n", "print(df.columns)\n", "\n", "# Drop the last row with NaN values\n", "df = df.dropna(subset=['Country'])\n", "\n", "# Inspect unique values in each column\n", "for col in df.columns:\n", " unique_values = df[col].unique()\n", " print(f\"{col}: {unique_values}\")\n", "\n", "# Inspect the result\n", "print(df.head())\n" ] }, { "cell_type": "code", "execution_count": 18, "id": "5f71ae9e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Country</th>\n", " <th>Total populationa (000s)</th>\n", " <th>Total populationa (000s)</th>\n", " <th>Total populationa (000s)</th>\n", " <th>Life expectancy at birthb (years)</th>\n", " <th>Life expectancy at birthb (years)</th>\n", " <th>Life expectancy at birthb (years)</th>\n", " <th>Healthy life expectancy at birthb (years)</th>\n", " <th>Healthy life expectancy at birthb (years)</th>\n", " <th>Healthy life expectancy at birthb (years)</th>\n", " <th>...</th>\n", " <th>Proportion of births attended by skilled health personneld (%)</th>\n", " <th>Proportion of births attended by skilled health personneld (%)</th>\n", " <th>Under-five mortality ratee (per 1000 live births)</th>\n", " <th>Neonatal mortality ratee (per 1000 live births)</th>\n", " <th>New HIV infectionsf (per 1000 uninfected population)</th>\n", " <th>Tuberculosis incidenceg (per 100 000 population)</th>\n", " <th>Malaria incidenceh (per 1000 population at risk)</th>\n", " <th>Malaria incidenceh (per 1000 population at risk)</th>\n", " <th>Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%)</th>\n", " <th>Reported number of people requiring interventions against NTDsj</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>20255</td>\n", " <td>19845</td>\n", " <td>40099.0</td>\n", " <td>63.3</td>\n", " <td>63.2</td>\n", " <td>63.2</td>\n", " <td>54.7</td>\n", " <td>53.2</td>\n", " <td>53.9</td>\n", " <td>...</td>\n", " <td>62</td>\n", " <td>NaN</td>\n", " <td>56</td>\n", " <td>34</td>\n", " <td>0.04</td>\n", " <td>189.0</td>\n", " <td>6.3</td>\n", " <td>NaN</td>\n", " <td>0.39</td>\n", " <td>14 367 281</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>1426</td>\n", " <td>1429</td>\n", " <td>2855.0</td>\n", " <td>76.3</td>\n", " <td>79.9</td>\n", " <td>78</td>\n", " <td>68</td>\n", " <td>70.3</td>\n", " <td>69.1</td>\n", " <td>...</td>\n", " <td>100</td>\n", " <td>NaN</td>\n", " <td>9</td>\n", " <td>7</td>\n", " <td>0.03</td>\n", " <td>17.0</td>\n", " <td>-</td>\n", " <td>NaN</td>\n", " <td>0.29</td>\n", " <td>21</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>22497</td>\n", " <td>21681</td>\n", " <td>44178.0</td>\n", " <td>76.2</td>\n", " <td>78.1</td>\n", " <td>77.1</td>\n", " <td>66.7</td>\n", " <td>66.1</td>\n", " <td>66.4</td>\n", " <td>...</td>\n", " <td>99</td>\n", " <td>NaN</td>\n", " <td>22</td>\n", " <td>16</td>\n", " <td>0.04</td>\n", " <td>54.0</td>\n", " <td>0.0</td>\n", " <td>NaN</td>\n", " <td>0.08</td>\n", " <td>7 081</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>79.0</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>...</td>\n", " <td>100</td>\n", " <td>ak</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>-</td>\n", " <td>2.9</td>\n", " <td>-</td>\n", " <td>NaN</td>\n", " <td>0.02</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>17051</td>\n", " <td>17452</td>\n", " <td>34504.0</td>\n", " <td>60.7</td>\n", " <td>65.5</td>\n", " <td>63.1</td>\n", " <td>53.6</td>\n", " <td>56.2</td>\n", " <td>54.8</td>\n", " <td>...</td>\n", " <td>50</td>\n", " <td>NaN</td>\n", " <td>69</td>\n", " <td>27</td>\n", " <td>0.52</td>\n", " <td>325.0</td>\n", " <td>254.9</td>\n", " <td>NaN</td>\n", " <td>4.57</td>\n", " <td>8 558 522</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>5 rows × 21 columns</p>\n", "</div>" ], "text/plain": [ " Country Total populationa (000s) Total populationa (000s) \\\n", "0 Afghanistan 20255 19845 \n", "1 Albania 1426 1429 \n", "2 Algeria 22497 21681 \n", "3 Andorra - - \n", "4 Angola 17051 17452 \n", "\n", " Total populationa (000s) Life expectancy at birthb (years) \\\n", "0 40099.0 63.3 \n", "1 2855.0 76.3 \n", "2 44178.0 76.2 \n", "3 79.0 - \n", "4 34504.0 60.7 \n", "\n", " Life expectancy at birthb (years) Life expectancy at birthb (years) \\\n", "0 63.2 63.2 \n", "1 79.9 78 \n", "2 78.1 77.1 \n", "3 - - \n", "4 65.5 63.1 \n", "\n", " Healthy life expectancy at birthb (years) \\\n", "0 54.7 \n", "1 68 \n", "2 66.7 \n", "3 - \n", "4 53.6 \n", "\n", " Healthy life expectancy at birthb (years) \\\n", "0 53.2 \n", "1 70.3 \n", "2 66.1 \n", "3 - \n", "4 56.2 \n", "\n", " Healthy life expectancy at birthb (years) ... \\\n", "0 53.9 ... \n", "1 69.1 ... \n", "2 66.4 ... \n", "3 - ... \n", "4 54.8 ... \n", "\n", " Proportion of births attended by skilled health personneld (%) \\\n", "0 62 \n", "1 100 \n", "2 99 \n", "3 100 \n", "4 50 \n", "\n", " Proportion of births attended by skilled health personneld (%) \\\n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 ak \n", "4 NaN \n", "\n", " Under-five mortality ratee (per 1000 live births) \\\n", "0 56 \n", "1 9 \n", "2 22 \n", "3 3 \n", "4 69 \n", "\n", " Neonatal mortality ratee (per 1000 live births) \\\n", "0 34 \n", "1 7 \n", "2 16 \n", "3 1 \n", "4 27 \n", "\n", " New HIV infectionsf (per 1000 uninfected population) \\\n", "0 0.04 \n", "1 0.03 \n", "2 0.04 \n", "3 - \n", "4 0.52 \n", "\n", " Tuberculosis incidenceg (per 100 000 population) \\\n", "0 189.0 \n", "1 17.0 \n", "2 54.0 \n", "3 2.9 \n", "4 325.0 \n", "\n", " Malaria incidenceh (per 1000 population at risk) \\\n", "0 6.3 \n", "1 - \n", "2 0.0 \n", "3 - \n", "4 254.9 \n", "\n", " Malaria incidenceh (per 1000 population at risk) \\\n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "\n", " Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%) \\\n", "0 0.39 \n", "1 0.29 \n", "2 0.08 \n", "3 0.02 \n", "4 4.57 \n", "\n", " Reported number of people requiring interventions against NTDsj \n", "0 14 367 281 \n", "1 21 \n", "2 7 081 \n", "3 0 \n", "4 8 558 522 \n", "\n", "[5 rows x 21 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "# Read the Excel file with a multi-level header\n", "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", sheet_name=\"Annex 1-1\", header=[1, 2, 3, 4])\n", "\n", "df = df.rename(columns={\"Unnamed: 0_level_0\": \"Country\", \"Unnamed: 1_level_0\": \"Population\", \"Unnamed: 4_level_0\": \"Life expectancy\"})\n", "df.columns = df.columns.droplevel([1, 2, 3])\n", "\n", "display(df.head())\n", "\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "7c272335", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\ziyan\\AppData\\Local\\Temp\\ipykernel_11820\\2273432563.py:10: PerformanceWarning: dropping on a non-lexsorted multi-index without a level parameter may impact performance.\n", " df = df.drop(columns=['Unnamed: 0_level_0', 'Unnamed: 0_level_1', 'Unnamed: 0_level_2'])\n" ] }, { "ename": "KeyError", "evalue": "'Unnamed: 0_level_1'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\base.py:3802\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m 3801\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 3802\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3803\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\_libs\\index.pyx:138\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\_libs\\index.pyx:165\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n", "File \u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi:5745\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n", "File \u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi:5753\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n", "\u001b[1;31mKeyError\u001b[0m: 'Unnamed: 0_level_1'", "\nThe above exception was the direct cause of the following exception:\n", "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[3], line 10\u001b[0m\n\u001b[0;32m 7\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_excel(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mC:\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mMy\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mTop-up Degree\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mData Science\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mData Science - Assignment\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mData set\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mWHO\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mwhs2023_annex1.xlsx\u001b[39m\u001b[38;5;124m\"\u001b[39m, sheet_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAnnex 1-1\u001b[39m\u001b[38;5;124m\"\u001b[39m, header\u001b[38;5;241m=\u001b[39m[\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m3\u001b[39m])\n\u001b[0;32m 9\u001b[0m \u001b[38;5;66;03m# Drop columns that aren't useful\u001b[39;00m\n\u001b[1;32m---> 10\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mUnnamed: 0_level_0\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mUnnamed: 0_level_1\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mUnnamed: 0_level_2\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# Drop rows with missing values\u001b[39;00m\n\u001b[0;32m 13\u001b[0m df \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39mdropna()\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\util\\_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[0;32m 326\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 327\u001b[0m msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m 328\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[0;32m 329\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[0;32m 330\u001b[0m )\n\u001b[1;32m--> 331\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:5399\u001b[0m, in \u001b[0;36mDataFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m 5251\u001b[0m \u001b[38;5;129m@deprecate_nonkeyword_arguments\u001b[39m(version\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, allowed_args\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mself\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlabels\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[0;32m 5252\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdrop\u001b[39m( \u001b[38;5;66;03m# type: ignore[override]\u001b[39;00m\n\u001b[0;32m 5253\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 5260\u001b[0m errors: IgnoreRaise \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 5261\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 5262\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 5263\u001b[0m \u001b[38;5;124;03m Drop specified labels from rows or columns.\u001b[39;00m\n\u001b[0;32m 5264\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 5397\u001b[0m \u001b[38;5;124;03m weight 1.0 0.8\u001b[39;00m\n\u001b[0;32m 5398\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 5399\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 5400\u001b[0m \u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5401\u001b[0m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5402\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5403\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5404\u001b[0m \u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5405\u001b[0m \u001b[43m \u001b[49m\u001b[43minplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minplace\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5406\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5407\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\util\\_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[0;32m 326\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 327\u001b[0m msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m 328\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[0;32m 329\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[0;32m 330\u001b[0m )\n\u001b[1;32m--> 331\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\generic.py:4505\u001b[0m, in \u001b[0;36mNDFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m 4503\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m axis, labels \u001b[38;5;129;01min\u001b[39;00m axes\u001b[38;5;241m.\u001b[39mitems():\n\u001b[0;32m 4504\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 4505\u001b[0m obj \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_drop_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4507\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inplace:\n\u001b[0;32m 4508\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_inplace(obj)\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\generic.py:4546\u001b[0m, in \u001b[0;36mNDFrame._drop_axis\u001b[1;34m(self, labels, axis, level, errors, only_slice)\u001b[0m\n\u001b[0;32m 4544\u001b[0m new_axis \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mdrop(labels, level\u001b[38;5;241m=\u001b[39mlevel, errors\u001b[38;5;241m=\u001b[39merrors)\n\u001b[0;32m 4545\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 4546\u001b[0m new_axis \u001b[38;5;241m=\u001b[39m \u001b[43maxis\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4547\u001b[0m indexer \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mget_indexer(new_axis)\n\u001b[0;32m 4549\u001b[0m \u001b[38;5;66;03m# Case for non-unique axis\u001b[39;00m\n\u001b[0;32m 4550\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\multi.py:2270\u001b[0m, in \u001b[0;36mMultiIndex.drop\u001b[1;34m(self, codes, level, errors)\u001b[0m\n\u001b[0;32m 2268\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m level_codes \u001b[38;5;129;01min\u001b[39;00m codes:\n\u001b[0;32m 2269\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 2270\u001b[0m loc \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlevel_codes\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2271\u001b[0m \u001b[38;5;66;03m# get_loc returns either an integer, a slice, or a boolean\u001b[39;00m\n\u001b[0;32m 2272\u001b[0m \u001b[38;5;66;03m# mask\u001b[39;00m\n\u001b[0;32m 2273\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(loc, \u001b[38;5;28mint\u001b[39m):\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\multi.py:2916\u001b[0m, in \u001b[0;36mMultiIndex.get_loc\u001b[1;34m(self, key, method)\u001b[0m\n\u001b[0;32m 2913\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m mask\n\u001b[0;32m 2915\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[1;32m-> 2916\u001b[0m loc \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_level_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2917\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _maybe_to_slice(loc)\n\u001b[0;32m 2919\u001b[0m keylen \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(key)\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\multi.py:3263\u001b[0m, in \u001b[0;36mMultiIndex._get_level_indexer\u001b[1;34m(self, key, level, indexer)\u001b[0m\n\u001b[0;32m 3259\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mslice\u001b[39m(i, j, step)\n\u001b[0;32m 3261\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 3263\u001b[0m idx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_loc_single_level_index\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlevel_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3265\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m level \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lexsort_depth \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 3266\u001b[0m \u001b[38;5;66;03m# Desired level is not sorted\u001b[39;00m\n\u001b[0;32m 3267\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(idx, \u001b[38;5;28mslice\u001b[39m):\n\u001b[0;32m 3268\u001b[0m \u001b[38;5;66;03m# test_get_loc_partial_timestamp_multiindex\u001b[39;00m\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\multi.py:2849\u001b[0m, in \u001b[0;36mMultiIndex._get_loc_single_level_index\u001b[1;34m(self, level_index, key)\u001b[0m\n\u001b[0;32m 2847\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 2848\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 2849\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlevel_index\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\base.py:3804\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m 3802\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine\u001b[38;5;241m.\u001b[39mget_loc(casted_key)\n\u001b[0;32m 3803\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m-> 3804\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[0;32m 3805\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[0;32m 3806\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[0;32m 3807\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[0;32m 3808\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[0;32m 3809\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", "\u001b[1;31mKeyError\u001b[0m: 'Unnamed: 0_level_1'" ] } ], "source": [ "# Import libraries\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "# Read the Excel file with a multi-level header\n", "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", sheet_name=\"Annex 1-1\", header=[1, 2, 3])\n", "\n", "# Drop columns that aren't useful\n", "df = df.drop(columns=['Unnamed: 0_level_0', 'Unnamed: 0_level_1', 'Unnamed: 0_level_2'])\n", "\n", "# Drop rows with missing values\n", "df = df.dropna()\n", "\n", "# Create dummy variables for categorical variables\n", "df = pd.get_dummies(df)\n", "\n", "# Convert the DataFrame to NumPy array\n", "X = df.values\n", "\n", "# Scale the numerical variables using standardization\n", "scaler = StandardScaler()\n", "X_scaled = scaler.fit_transform(X)\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "61825f75", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Country</th>\n", " <th>Total populationa (000s)</th>\n", " <th>Total populationa (000s)</th>\n", " <th>Total populationa (000s)</th>\n", " <th>Life expectancy at birthb (years)</th>\n", " <th>Life expectancy at birthb (years)</th>\n", " <th>Life expectancy at birthb (years)</th>\n", " <th>Healthy life expectancy at birthb (years)</th>\n", " <th>Healthy life expectancy at birthb (years)</th>\n", " <th>Healthy life expectancy at birthb (years)</th>\n", " <th>...</th>\n", " <th>Proportion of births attended by skilled health personneld (%)</th>\n", " <th>Proportion of births attended by skilled health personneld (%)</th>\n", " <th>Under-five mortality ratee (per 1000 live births)</th>\n", " <th>Neonatal mortality ratee (per 1000 live births)</th>\n", " <th>New HIV infectionsf (per 1000 uninfected population)</th>\n", " <th>Tuberculosis incidenceg (per 100 000 population)</th>\n", " <th>Malaria incidenceh (per 1000 population at risk)</th>\n", " <th>Malaria incidenceh (per 1000 population at risk)</th>\n", " <th>Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%)</th>\n", " <th>Reported number of people requiring interventions against NTDsj</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>20255</td>\n", " <td>19845</td>\n", " <td>40099.0</td>\n", " <td>63.3</td>\n", " <td>63.2</td>\n", " <td>63.2</td>\n", " <td>54.7</td>\n", " <td>53.2</td>\n", " <td>53.9</td>\n", " <td>...</td>\n", " <td>62</td>\n", " <td>NaN</td>\n", " <td>56</td>\n", " <td>34</td>\n", " <td>0.04</td>\n", " <td>189.0</td>\n", " <td>6.3</td>\n", " <td>NaN</td>\n", " <td>0.39</td>\n", " <td>14 367 281</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>1426</td>\n", " <td>1429</td>\n", " <td>2855.0</td>\n", " <td>76.3</td>\n", " <td>79.9</td>\n", " <td>78</td>\n", " <td>68</td>\n", " <td>70.3</td>\n", " <td>69.1</td>\n", " <td>...</td>\n", " <td>100</td>\n", " <td>NaN</td>\n", " <td>9</td>\n", " <td>7</td>\n", " <td>0.03</td>\n", " <td>17.0</td>\n", " <td>-</td>\n", " <td>NaN</td>\n", " <td>0.29</td>\n", " <td>21</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>22497</td>\n", " <td>21681</td>\n", " <td>44178.0</td>\n", " <td>76.2</td>\n", " <td>78.1</td>\n", " <td>77.1</td>\n", " <td>66.7</td>\n", " <td>66.1</td>\n", " <td>66.4</td>\n", " <td>...</td>\n", " <td>99</td>\n", " <td>NaN</td>\n", " <td>22</td>\n", " <td>16</td>\n", " <td>0.04</td>\n", " <td>54.0</td>\n", " <td>0.0</td>\n", " <td>NaN</td>\n", " <td>0.08</td>\n", " <td>7 081</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>79.0</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>...</td>\n", " <td>100</td>\n", " <td>ak</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>-</td>\n", " <td>2.9</td>\n", " <td>-</td>\n", " <td>NaN</td>\n", " <td>0.02</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>17051</td>\n", " <td>17452</td>\n", " <td>34504.0</td>\n", " <td>60.7</td>\n", " <td>65.5</td>\n", " <td>63.1</td>\n", " <td>53.6</td>\n", " <td>56.2</td>\n", " <td>54.8</td>\n", " <td>...</td>\n", " <td>50</td>\n", " <td>NaN</td>\n", " <td>69</td>\n", " <td>27</td>\n", " <td>0.52</td>\n", " <td>325.0</td>\n", " <td>254.9</td>\n", " <td>NaN</td>\n", " <td>4.57</td>\n", " <td>8 558 522</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>203</th>\n", " <td>European Region</td>\n", " <td>451859</td>\n", " <td>478950</td>\n", " <td>930809.0</td>\n", " <td>75.1</td>\n", " <td>81.3</td>\n", " <td>78.2</td>\n", " <td>66.6</td>\n", " <td>70</td>\n", " <td>68.3</td>\n", " <td>...</td>\n", " <td>99</td>\n", " <td>NaN</td>\n", " <td>8</td>\n", " <td>4</td>\n", " <td>0.20</td>\n", " <td>25.0</td>\n", " <td>-</td>\n", " <td>NaN</td>\n", " <td>0.26</td>\n", " <td>5 951 752</td>\n", " </tr>\n", " <tr>\n", " <th>204</th>\n", " <td>Eastern Mediterranean Region</td>\n", " <td>392503</td>\n", " <td>374038</td>\n", " <td>766542.0</td>\n", " <td>68.3</td>\n", " <td>71.3</td>\n", " <td>69.7</td>\n", " <td>60.2</td>\n", " <td>60.7</td>\n", " <td>60.4</td>\n", " <td>...</td>\n", " <td>85</td>\n", " <td>NaN</td>\n", " <td>45</td>\n", " <td>25</td>\n", " <td>0.06</td>\n", " <td>112.0</td>\n", " <td>11.6</td>\n", " <td>NaN</td>\n", " <td>0.84</td>\n", " <td>72 120 780</td>\n", " </tr>\n", " <tr>\n", " <th>205</th>\n", " <td>Western Pacific Region</td>\n", " <td>980226</td>\n", " <td>952584</td>\n", " <td>1932809.0</td>\n", " <td>74.8</td>\n", " <td>80.8</td>\n", " <td>77.7</td>\n", " <td>67</td>\n", " <td>70.2</td>\n", " <td>68.6</td>\n", " <td>...</td>\n", " <td>97</td>\n", " <td>NaN</td>\n", " <td>12</td>\n", " <td>6</td>\n", " <td>0.06</td>\n", " <td>98.0</td>\n", " <td>1.9</td>\n", " <td>NaN</td>\n", " <td>0.3</td>\n", " <td>72 372 509</td>\n", " </tr>\n", " <tr>\n", " <th>206</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>...</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>207</th>\n", " <td>Global</td>\n", " <td>3973370</td>\n", " <td>3929291</td>\n", " <td>7902660.0</td>\n", " <td>70.8</td>\n", " <td>75.9</td>\n", " <td>73.3</td>\n", " <td>62.5</td>\n", " <td>64.9</td>\n", " <td>63.7</td>\n", " <td>...</td>\n", " <td>86</td>\n", " <td>NaN</td>\n", " <td>38</td>\n", " <td>18</td>\n", " <td>0.19</td>\n", " <td>134.0</td>\n", " <td>59.2</td>\n", " <td>NaN</td>\n", " <td>0.94</td>\n", " <td>1 653 289 348</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>208 rows × 21 columns</p>\n", "</div>" ], "text/plain": [ " Country Total populationa (000s) \\\n", "0 Afghanistan 20255 \n", "1 Albania 1426 \n", "2 Algeria 22497 \n", "3 Andorra - \n", "4 Angola 17051 \n", ".. ... ... \n", "203 European Region 451859 \n", "204 Eastern Mediterranean Region 392503 \n", "205 Western Pacific Region 980226 \n", "206 NaN NaN \n", "207 Global 3973370 \n", "\n", " Total populationa (000s) Total populationa (000s) \\\n", "0 19845 40099.0 \n", "1 1429 2855.0 \n", "2 21681 44178.0 \n", "3 - 79.0 \n", "4 17452 34504.0 \n", ".. ... ... \n", "203 478950 930809.0 \n", "204 374038 766542.0 \n", "205 952584 1932809.0 \n", "206 NaN NaN \n", "207 3929291 7902660.0 \n", "\n", " Life expectancy at birthb (years) Life expectancy at birthb (years) \\\n", "0 63.3 63.2 \n", "1 76.3 79.9 \n", "2 76.2 78.1 \n", "3 - - \n", "4 60.7 65.5 \n", ".. ... ... \n", "203 75.1 81.3 \n", "204 68.3 71.3 \n", "205 74.8 80.8 \n", "206 NaN NaN \n", "207 70.8 75.9 \n", "\n", " Life expectancy at birthb (years) \\\n", "0 63.2 \n", "1 78 \n", "2 77.1 \n", "3 - \n", "4 63.1 \n", ".. ... \n", "203 78.2 \n", "204 69.7 \n", "205 77.7 \n", "206 NaN \n", "207 73.3 \n", "\n", " Healthy life expectancy at birthb (years) \\\n", "0 54.7 \n", "1 68 \n", "2 66.7 \n", "3 - \n", "4 53.6 \n", ".. ... \n", "203 66.6 \n", "204 60.2 \n", "205 67 \n", "206 NaN \n", "207 62.5 \n", "\n", " Healthy life expectancy at birthb (years) \\\n", "0 53.2 \n", "1 70.3 \n", "2 66.1 \n", "3 - \n", "4 56.2 \n", ".. ... \n", "203 70 \n", "204 60.7 \n", "205 70.2 \n", "206 NaN \n", "207 64.9 \n", "\n", " Healthy life expectancy at birthb (years) ... \\\n", "0 53.9 ... \n", "1 69.1 ... \n", "2 66.4 ... \n", "3 - ... \n", "4 54.8 ... \n", ".. ... ... \n", "203 68.3 ... \n", "204 60.4 ... \n", "205 68.6 ... \n", "206 NaN ... \n", "207 63.7 ... \n", "\n", " Proportion of births attended by skilled health personneld (%) \\\n", "0 62 \n", "1 100 \n", "2 99 \n", "3 100 \n", "4 50 \n", ".. ... \n", "203 99 \n", "204 85 \n", "205 97 \n", "206 NaN \n", "207 86 \n", "\n", " Proportion of births attended by skilled health personneld (%) \\\n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 ak \n", "4 NaN \n", ".. ... \n", "203 NaN \n", "204 NaN \n", "205 NaN \n", "206 NaN \n", "207 NaN \n", "\n", " Under-five mortality ratee (per 1000 live births) \\\n", "0 56 \n", "1 9 \n", "2 22 \n", "3 3 \n", "4 69 \n", ".. ... \n", "203 8 \n", "204 45 \n", "205 12 \n", "206 NaN \n", "207 38 \n", "\n", " Neonatal mortality ratee (per 1000 live births) \\\n", "0 34 \n", "1 7 \n", "2 16 \n", "3 1 \n", "4 27 \n", ".. ... \n", "203 4 \n", "204 25 \n", "205 6 \n", "206 NaN \n", "207 18 \n", "\n", " New HIV infectionsf (per 1000 uninfected population) \\\n", "0 0.04 \n", "1 0.03 \n", "2 0.04 \n", "3 - \n", "4 0.52 \n", ".. ... \n", "203 0.20 \n", "204 0.06 \n", "205 0.06 \n", "206 NaN \n", "207 0.19 \n", "\n", " Tuberculosis incidenceg (per 100 000 population) \\\n", "0 189.0 \n", "1 17.0 \n", "2 54.0 \n", "3 2.9 \n", "4 325.0 \n", ".. ... \n", "203 25.0 \n", "204 112.0 \n", "205 98.0 \n", "206 NaN \n", "207 134.0 \n", "\n", " Malaria incidenceh (per 1000 population at risk) \\\n", "0 6.3 \n", "1 - \n", "2 0.0 \n", "3 - \n", "4 254.9 \n", ".. ... \n", "203 - \n", "204 11.6 \n", "205 1.9 \n", "206 NaN \n", "207 59.2 \n", "\n", " Malaria incidenceh (per 1000 population at risk) \\\n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", ".. ... \n", "203 NaN \n", "204 NaN \n", "205 NaN \n", "206 NaN \n", "207 NaN \n", "\n", " Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%) \\\n", "0 0.39 \n", "1 0.29 \n", "2 0.08 \n", "3 0.02 \n", "4 4.57 \n", ".. ... \n", "203 0.26 \n", "204 0.84 \n", "205 0.3 \n", "206 NaN \n", "207 0.94 \n", "\n", " Reported number of people requiring interventions against NTDsj \n", "0 14 367 281 \n", "1 21 \n", "2 7 081 \n", "3 0 \n", "4 8 558 522 \n", ".. ... \n", "203 5 951 752 \n", "204 72 120 780 \n", "205 72 372 509 \n", "206 NaN \n", "207 1 653 289 348 \n", "\n", "[208 rows x 21 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "# Read the Excel file with a multi-level header\n", "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", \n", " sheet_name=\"Annex 1-1\", header=[1, 2, 3, 4])\n", "\n", "df = df.rename(columns={\"Unnamed: 0_level_0\": \"Country\", \"Unnamed: 1_level_0\": \"Population\", \"Unnamed: 4_level_0\": \"Life expectancy\"})\n", "df.columns = df.columns.droplevel([1, 2, 3])\n", "display(df)" ] }, { "cell_type": "code", "execution_count": 5, "id": "e8255f7f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Country object\n", "Total populationa (000s) object\n", "Total populationa (000s) object\n", "Total populationa (000s) float64\n", "Life expectancy at birthb (years) object\n", "Life expectancy at birthb (years) object\n", "Life expectancy at birthb (years) object\n", "Healthy life expectancy at birthb (years) object\n", "Healthy life expectancy at birthb (years) object\n", "Healthy life expectancy at birthb (years) object\n", "Maternal mortality ratioc (per 100 000 live births) object\n", "Proportion of births attended by skilled health personneld (%) object\n", "Proportion of births attended by skilled health personneld (%) object\n", "Under-five mortality ratee (per 1000 live births) object\n", "Neonatal mortality ratee (per 1000 live births) object\n", "New HIV infectionsf (per 1000 uninfected population) object\n", "Tuberculosis incidenceg (per 100 000 population) float64\n", "Malaria incidenceh (per 1000 population at risk) object\n", "Malaria incidenceh (per 1000 population at risk) float64\n", "Hepatitis B surface antigen (HBsAg) prevalence among children under 5 yearsi (%) object\n", "Reported number of people requiring interventions against NTDsj object\n", "dtype: object\n" ] } ], "source": [ "print(df.dtypes)" ] }, { "cell_type": "code", "execution_count": 18, "id": "1351a113", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead tr th {\n", " text-align: left;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr>\n", " <th></th>\n", " <th>Unnamed: 0_level_0</th>\n", " <th>Probability of dying from any of CVD, cancer, diabetes, CRD between age 30 and exact age 70k (%)</th>\n", " <th>Suicide mortality ratek (per 100 000 population)</th>\n", " <th>Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol)</th>\n", " <th>Road traffic mortality ratek,ay (per 100 000 population)</th>\n", " <th colspan=\"2\" halign=\"left\">Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%)</th>\n", " <th>Adolescent birth raten (per 1000 women aged 15–19 years)</th>\n", " <th>Adolescent birth raten (per 1000 women aged 10–14 years)</th>\n", " <th>UHC: Service coverage indexo</th>\n", " <th>Population with household expenditures on health > 10% of total household expenditure or incomep (%)</th>\n", " <th>Population with household expenditures on health > 25% of total household expenditure or incomep (%)</th>\n", " <th>Age-standardized mortality rate attributed to household and ambient air pollutionq (per 100 000 population)</th>\n", " <th>Mortality rate attributed to exposure to unsafe WASH servicesr (per 100 000 population)</th>\n", " <th>Mortality rate from unintentional poisoningk (per 100 000 population)</th>\n", " </tr>\n", " <tr>\n", " <th></th>\n", " <th>Data type</th>\n", " <th>Comparable estimates</th>\n", " <th>Comparable estimates</th>\n", " <th>Comparable estimates</th>\n", " <th>Comparable estimates</th>\n", " <th colspan=\"2\" halign=\"left\">Primary data</th>\n", " <th>Primary data</th>\n", " <th>Primary data</th>\n", " <th>Comparable estimates</th>\n", " <th>Primary data</th>\n", " <th>Primary data</th>\n", " <th>Comparable estimates</th>\n", " <th>Comparable estimates</th>\n", " <th>Comparable estimates</th>\n", " </tr>\n", " <tr>\n", " <th></th>\n", " <th>Unnamed: 0_level_2</th>\n", " <th>Unnamed: 1_level_2</th>\n", " <th>Unnamed: 2_level_2</th>\n", " <th>Unnamed: 3_level_2</th>\n", " <th>Unnamed: 4_level_2</th>\n", " <th>Unnamed: 5_level_2</th>\n", " <th>Unnamed: 6_level_2</th>\n", " <th>Unnamed: 7_level_2</th>\n", " <th>Unnamed: 8_level_2</th>\n", " <th>Unnamed: 9_level_2</th>\n", " <th>Unnamed: 10_level_2</th>\n", " <th>Unnamed: 11_level_2</th>\n", " <th>Unnamed: 12_level_2</th>\n", " <th>Unnamed: 13_level_2</th>\n", " <th>Unnamed: 14_level_2</th>\n", " </tr>\n", " <tr>\n", " <th></th>\n", " <th>Countries and areas</th>\n", " <th>2019</th>\n", " <th>2019</th>\n", " <th>2019</th>\n", " <th>2019</th>\n", " <th>2013–2021</th>\n", " <th>2013–2021</th>\n", " <th>2013–2021</th>\n", " <th>Unnamed: 8_level_3</th>\n", " <th>2021</th>\n", " <th>2013–2021</th>\n", " <th>2013–2021</th>\n", " <th>2019</th>\n", " <th>2019</th>\n", " <th>2019</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>35.3</td>\n", " <td>4.1</td>\n", " <td><0.1</td>\n", " <td>15.9</td>\n", " <td>42.1</td>\n", " <td>am</td>\n", " <td>62.1</td>\n", " <td>0.2</td>\n", " <td>41</td>\n", " <td>26.1</td>\n", " <td>8.0</td>\n", " <td>265.7</td>\n", " <td>16.6</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>11.4</td>\n", " <td>4.3</td>\n", " <td>5.1</td>\n", " <td>11.7</td>\n", " <td>6.3</td>\n", " <td>NaN</td>\n", " <td>13.2</td>\n", " <td>0.3</td>\n", " <td>64</td>\n", " <td>8.9</td>\n", " <td>1.4</td>\n", " <td>92.5</td>\n", " <td>3.2</td>\n", " <td>0.3</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>13.9</td>\n", " <td>2.5</td>\n", " <td>0.6</td>\n", " <td>20.9</td>\n", " <td>66.3</td>\n", " <td>am</td>\n", " <td>12.0</td>\n", " <td>0.0</td>\n", " <td>74</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>49.7</td>\n", " <td>4.1</td>\n", " <td>0.7</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>11.1</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>NaN</td>\n", " <td>0.5</td>\n", " <td>0.0</td>\n", " <td>79</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>22.2</td>\n", " <td>6.1</td>\n", " <td>6.2</td>\n", " <td>26.1</td>\n", " <td>29.8</td>\n", " <td>NaN</td>\n", " <td>162.7</td>\n", " <td>10.7</td>\n", " <td>37</td>\n", " <td>35.5</td>\n", " <td>12.5</td>\n", " <td>142.8</td>\n", " <td>48.9</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>203</th>\n", " <td>European Region</td>\n", " <td>16.3</td>\n", " <td>12.8</td>\n", " <td>9.2</td>\n", " <td>7.4</td>\n", " <td>76.8</td>\n", " <td>NaN</td>\n", " <td>13.1</td>\n", " <td>0.1</td>\n", " <td>81</td>\n", " <td>7.94</td>\n", " <td>1.3</td>\n", " <td>44.5</td>\n", " <td>3.6</td>\n", " <td>1.1</td>\n", " </tr>\n", " <tr>\n", " <th>204</th>\n", " <td>Eastern Mediterranean Region</td>\n", " <td>24.5</td>\n", " <td>5.8</td>\n", " <td>0.3</td>\n", " <td>17.8</td>\n", " <td>62.3</td>\n", " <td>NaN</td>\n", " <td>45.8</td>\n", " <td>1.1</td>\n", " <td>57</td>\n", " <td>12.11</td>\n", " <td>2.23</td>\n", " <td>136.0</td>\n", " <td>18.4</td>\n", " <td>1.1</td>\n", " </tr>\n", " <tr>\n", " <th>205</th>\n", " <td>Western Pacific Region</td>\n", " <td>15.6</td>\n", " <td>8.7</td>\n", " <td>6.1</td>\n", " <td>16.4</td>\n", " <td>87.3</td>\n", " <td>NaN</td>\n", " <td>16.9</td>\n", " <td>0.3</td>\n", " <td>79</td>\n", " <td>19.8</td>\n", " <td>5.26</td>\n", " <td>94.0</td>\n", " <td>4.3</td>\n", " <td>1.4</td>\n", " </tr>\n", " <tr>\n", " <th>206</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>207</th>\n", " <td>Global</td>\n", " <td>17.8</td>\n", " <td>9.2</td>\n", " <td>5.5</td>\n", " <td>16.7</td>\n", " <td>77.5</td>\n", " <td>NaN</td>\n", " <td>41.9</td>\n", " <td>1.5</td>\n", " <td>68</td>\n", " <td>13.46</td>\n", " <td>3.77</td>\n", " <td>103.6</td>\n", " <td>18.3</td>\n", " <td>1.1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>208 rows × 15 columns</p>\n", "</div>" ], "text/plain": [ " Unnamed: 0_level_0 \\\n", " Data type \n", " Unnamed: 0_level_2 \n", " Countries and areas \n", "0 Afghanistan \n", "1 Albania \n", "2 Algeria \n", "3 Andorra \n", "4 Angola \n", ".. ... \n", "203 European Region \n", "204 Eastern Mediterranean Region \n", "205 Western Pacific Region \n", "206 NaN \n", "207 Global \n", "\n", " Probability of dying from any of CVD, cancer, diabetes, CRD between age 30 and exact age 70k (%) \\\n", " Comparable estimates \n", " Unnamed: 1_level_2 \n", " 2019 \n", "0 35.3 \n", "1 11.4 \n", "2 13.9 \n", "3 - \n", "4 22.2 \n", ".. ... \n", "203 16.3 \n", "204 24.5 \n", "205 15.6 \n", "206 NaN \n", "207 17.8 \n", "\n", " Suicide mortality ratek (per 100 000 population) \\\n", " Comparable estimates \n", " Unnamed: 2_level_2 \n", " 2019 \n", "0 4.1 \n", "1 4.3 \n", "2 2.5 \n", "3 - \n", "4 6.1 \n", ".. ... \n", "203 12.8 \n", "204 5.8 \n", "205 8.7 \n", "206 NaN \n", "207 9.2 \n", "\n", " Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol) \\\n", " Comparable estimates \n", " Unnamed: 3_level_2 \n", " 2019 \n", "0 <0.1 \n", "1 5.1 \n", "2 0.6 \n", "3 11.1 \n", "4 6.2 \n", ".. ... \n", "203 9.2 \n", "204 0.3 \n", "205 6.1 \n", "206 NaN \n", "207 5.5 \n", "\n", " Road traffic mortality ratek,ay (per 100 000 population) \\\n", " Comparable estimates \n", " Unnamed: 4_level_2 \n", " 2019 \n", "0 15.9 \n", "1 11.7 \n", "2 20.9 \n", "3 - \n", "4 26.1 \n", ".. ... \n", "203 7.4 \n", "204 17.8 \n", "205 16.4 \n", "206 NaN \n", "207 16.7 \n", "\n", " Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%) \\\n", " Primary data \n", " Unnamed: 5_level_2 \n", " 2013–2021 \n", "0 42.1 \n", "1 6.3 \n", "2 66.3 \n", "3 - \n", "4 29.8 \n", ".. ... \n", "203 76.8 \n", "204 62.3 \n", "205 87.3 \n", "206 NaN \n", "207 77.5 \n", "\n", " \\\n", " \n", " Unnamed: 6_level_2 \n", " 2013–2021 \n", "0 am \n", "1 NaN \n", "2 am \n", "3 NaN \n", "4 NaN \n", ".. ... \n", "203 NaN \n", "204 NaN \n", "205 NaN \n", "206 NaN \n", "207 NaN \n", "\n", " Adolescent birth raten (per 1000 women aged 15–19 years) \\\n", " Primary data \n", " Unnamed: 7_level_2 \n", " 2013–2021 \n", "0 62.1 \n", "1 13.2 \n", "2 12.0 \n", "3 0.5 \n", "4 162.7 \n", ".. ... \n", "203 13.1 \n", "204 45.8 \n", "205 16.9 \n", "206 NaN \n", "207 41.9 \n", "\n", " Adolescent birth raten (per 1000 women aged 10–14 years) \\\n", " Primary data \n", " Unnamed: 8_level_2 \n", " Unnamed: 8_level_3 \n", "0 0.2 \n", "1 0.3 \n", "2 0.0 \n", "3 0.0 \n", "4 10.7 \n", ".. ... \n", "203 0.1 \n", "204 1.1 \n", "205 0.3 \n", "206 NaN \n", "207 1.5 \n", "\n", " UHC: Service coverage indexo \\\n", " Comparable estimates \n", " Unnamed: 9_level_2 \n", " 2021 \n", "0 41 \n", "1 64 \n", "2 74 \n", "3 79 \n", "4 37 \n", ".. ... \n", "203 81 \n", "204 57 \n", "205 79 \n", "206 NaN \n", "207 68 \n", "\n", " Population with household expenditures on health > 10% of total household expenditure or incomep (%) \\\n", " Primary data \n", " Unnamed: 10_level_2 \n", " 2013–2021 \n", "0 26.1 \n", "1 8.9 \n", "2 - \n", "3 - \n", "4 35.5 \n", ".. ... \n", "203 7.94 \n", "204 12.11 \n", "205 19.8 \n", "206 NaN \n", "207 13.46 \n", "\n", " Population with household expenditures on health > 25% of total household expenditure or incomep (%) \\\n", " Primary data \n", " Unnamed: 11_level_2 \n", " 2013–2021 \n", "0 8.0 \n", "1 1.4 \n", "2 - \n", "3 - \n", "4 12.5 \n", ".. ... \n", "203 1.3 \n", "204 2.23 \n", "205 5.26 \n", "206 NaN \n", "207 3.77 \n", "\n", " Age-standardized mortality rate attributed to household and ambient air pollutionq (per 100 000 population) \\\n", " Comparable estimates \n", " Unnamed: 12_level_2 \n", " 2019 \n", "0 265.7 \n", "1 92.5 \n", "2 49.7 \n", "3 - \n", "4 142.8 \n", ".. ... \n", "203 44.5 \n", "204 136.0 \n", "205 94.0 \n", "206 NaN \n", "207 103.6 \n", "\n", " Mortality rate attributed to exposure to unsafe WASH servicesr (per 100 000 population) \\\n", " Comparable estimates \n", " Unnamed: 13_level_2 \n", " 2019 \n", "0 16.6 \n", "1 3.2 \n", "2 4.1 \n", "3 - \n", "4 48.9 \n", ".. ... \n", "203 3.6 \n", "204 18.4 \n", "205 4.3 \n", "206 NaN \n", "207 18.3 \n", "\n", " Mortality rate from unintentional poisoningk (per 100 000 population) \n", " Comparable estimates \n", " Unnamed: 14_level_2 \n", " 2019 \n", "0 1 \n", "1 0.3 \n", "2 0.7 \n", "3 - \n", "4 2 \n", ".. ... \n", "203 1.1 \n", "204 1.1 \n", "205 1.4 \n", "206 NaN \n", "207 1.1 \n", "\n", "[208 rows x 15 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "# Read the Excel file with a multi-level header\n", "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", \n", " sheet_name=\"Annex 1-2\", header=[1, 2, 3, 4])\n", "display(df)" ] }, { "cell_type": "code", "execution_count": 19, "id": "9a91c485", "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "['country_and_areas']", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[19], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Drop the rows that have NaN values in the country_and_areas column\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdropna\u001b[49m\u001b[43m(\u001b[49m\u001b[43msubset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcountry_and_areas\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;66;03m# Drop the columns that have more than 50% missing values\u001b[39;00m\n\u001b[0;32m 5\u001b[0m df \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39mdropna(thresh\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(df)\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m0.5\u001b[39m, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\util\\_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[0;32m 326\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 327\u001b[0m msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m 328\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[0;32m 329\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[0;32m 330\u001b[0m )\n\u001b[1;32m--> 331\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:6561\u001b[0m, in \u001b[0;36mDataFrame.dropna\u001b[1;34m(self, axis, how, thresh, subset, inplace)\u001b[0m\n\u001b[0;32m 6559\u001b[0m check \u001b[38;5;241m=\u001b[39m indices \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 6560\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check\u001b[38;5;241m.\u001b[39many():\n\u001b[1;32m-> 6561\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(np\u001b[38;5;241m.\u001b[39marray(subset)[check]\u001b[38;5;241m.\u001b[39mtolist())\n\u001b[0;32m 6562\u001b[0m agg_obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(indices, axis\u001b[38;5;241m=\u001b[39magg_axis)\n\u001b[0;32m 6564\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m thresh \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m no_default:\n", "\u001b[1;31mKeyError\u001b[0m: ['country_and_areas']" ] } ], "source": [ "# Drop the rows that have NaN values in the country_and_areas column\n", "df = df.dropna(subset=[\"country_and_areas\"])\n", "\n", "# Drop the columns that have more than 50% missing values\n", "df = df.dropna(thresh=len(df)*0.5, axis=1)\n", "\n", "# Replace the remaining missing values with the mean or median of the column\n", "df = df.fillna(df.mean())\n", "\n", "# Rename the columns to make them shorter and more descriptive\n", "df = df.rename(columns={\"Probability of dying from any of CVD, cancer, diabetes, CRD between age 30 and exact age 70k (%)\": \"cvd_cancer_death_rate\",\n", " \"Suicide mortality ratek (per 100 000 population)\": \"suicide_rate\",\n", " \"Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol)\": \"alcohol_consumption\",\n", " \"Road traffic mortality ratek,ay (per 100 000 population)\": \"road_traffic_death_rate\",\n", " \"Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%)\": \"family_planning_satisfaction\",\n", " \"Adolescent birth raten (per 1000 women aged 15–19 years)\": \"adolescent_birth_rate_15_19\",\n", " \"Adolescent birth raten (per 1000 women aged 10–14 years)\": \"adolescent_birth_rate_10_14\",\n", " \"UHC: Service coverage indexo\": \"uhc_service_coverage_index\",\n", " \"Population with household expenditures on health > 10% of total household expenditure or incomep (%)\": \"health_expenditure_over_10_percent\",\n", " \"Population with household expenditures on health > 25% of total household expenditure or incomep (%)\": \"health_expenditure_over_25_percent\",\n", " \"Age-standardized mortality rate attributed to household and ambient air pollutionq (per 100 000 population)\": \"air_pollution_death_rate\",\n", " \"Mortality rate attributed to exposure to unsafe WASH servicesr (per 100 000 population)\": \"wash_services_death_rate\",\n", " \"Mortality rate from unintentional poisoningk (per 100 000 population)\": \"poisoning_death_rate\"})\n", "\n", "# Check for outliers and remove or transform them if necessary\n", "# You can use boxplots, histograms, or z-scores to identify outliers\n", "# You can use methods such as winsorization, log transformation, or standardization to deal with outliers\n", "display(df)" ] }, { "cell_type": "code", "execution_count": 4, "id": "3556f039", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country_and_areas</th>\n", " <th>cvd_cancer_death_rate</th>\n", " <th>suicide_rate</th>\n", " <th>Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol)</th>\n", " <th>road_traffic_death_rate</th>\n", " <th>family_planning_satisfaction</th>\n", " <th>adolescent_birth_rate_15_19</th>\n", " <th>adolescent_birth_rate_10_14</th>\n", " <th>uhc_service_coverage_index</th>\n", " <th>health_expenditure_over_10_percent</th>\n", " <th>health_expenditure_over_25_percent</th>\n", " <th>Age-standardized mortality rate attributed to household and ambient air pollutionq (per 100 000 population)</th>\n", " <th>wash_services_death_rate</th>\n", " <th>poisoning_death_rate</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>206</td>\n", " <td>206</td>\n", " <td>206</td>\n", " <td>206</td>\n", " <td>206</td>\n", " <td>206</td>\n", " <td>206</td>\n", " <td>206</td>\n", " <td>206</td>\n", " <td>206</td>\n", " <td>206</td>\n", " <td>206</td>\n", " <td>206</td>\n", " <td>206.0</td>\n", " </tr>\n", " <tr>\n", " <th>unique</th>\n", " <td>206</td>\n", " <td>130</td>\n", " <td>118</td>\n", " <td>103</td>\n", " <td>146</td>\n", " <td>110</td>\n", " <td>164</td>\n", " <td>41</td>\n", " <td>60</td>\n", " <td>105</td>\n", " <td>61</td>\n", " <td>182</td>\n", " <td>129</td>\n", " <td>39.0</td>\n", " </tr>\n", " <tr>\n", " <th>top</th>\n", " <td>Afghanistan</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>75</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>0.4</td>\n", " </tr>\n", " <tr>\n", " <th>freq</th>\n", " <td>1</td>\n", " <td>15</td>\n", " <td>15</td>\n", " <td>9</td>\n", " <td>15</td>\n", " <td>90</td>\n", " <td>23</td>\n", " <td>32</td>\n", " <td>8</td>\n", " <td>66</td>\n", " <td>66</td>\n", " <td>15</td>\n", " <td>15</td>\n", " <td>21.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " country_and_areas cvd_cancer_death_rate suicide_rate \\\n", "count 206 206 206 \n", "unique 206 130 118 \n", "top Afghanistan - - \n", "freq 1 15 15 \n", "\n", " Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol) \\\n", "count 206 \n", "unique 103 \n", "top - \n", "freq 9 \n", "\n", " road_traffic_death_rate family_planning_satisfaction \\\n", "count 206 206 \n", "unique 146 110 \n", "top - - \n", "freq 15 90 \n", "\n", " adolescent_birth_rate_15_19 adolescent_birth_rate_10_14 \\\n", "count 206 206 \n", "unique 164 41 \n", "top - - \n", "freq 23 32 \n", "\n", " uhc_service_coverage_index health_expenditure_over_10_percent \\\n", "count 206 206 \n", "unique 60 105 \n", "top 75 - \n", "freq 8 66 \n", "\n", " health_expenditure_over_25_percent \\\n", "count 206 \n", "unique 61 \n", "top - \n", "freq 66 \n", "\n", " Age-standardized mortality rate attributed to household and ambient air pollutionq (per 100 000 population) \\\n", "count 206 \n", "unique 182 \n", "top - \n", "freq 15 \n", "\n", " wash_services_death_rate poisoning_death_rate \n", "count 206 206.0 \n", "unique 129 39.0 \n", "top - 0.4 \n", "freq 15 21.0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 10, "id": "60920e90", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country_and_areas</th>\n", " <th>Probability of dying from any of CVD, cancer, diabetes, CRD between age 30 and exact age 70k (%)</th>\n", " <th>Suicide mortality ratek (per 100 000 population)</th>\n", " <th>Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol)</th>\n", " <th>Road traffic mortality ratek,ay (per 100 000 population)</th>\n", " <th>Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%)</th>\n", " <th>Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%)</th>\n", " <th>Adolescent birth raten (per 1000 women aged 15–19 years)</th>\n", " <th>Adolescent birth raten (per 1000 women aged 10–14 years)</th>\n", " <th>UHC: Service coverage indexo</th>\n", " <th>Population with household expenditures on health > 10% of total household expenditure or incomep (%)</th>\n", " <th>Population with household expenditures on health > 25% of total household expenditure or incomep (%)</th>\n", " <th>Age-standardized mortality rate attributed to household and ambient air pollutionq (per 100 000 population)</th>\n", " <th>Mortality rate attributed to exposure to unsafe WASH servicesr (per 100 000 population)</th>\n", " <th>Mortality rate from unintentional poisoningk (per 100 000 population)</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>35.3</td>\n", " <td>4.1</td>\n", " <td><0.1</td>\n", " <td>15.9</td>\n", " <td>42.1</td>\n", " <td>am</td>\n", " <td>62.1</td>\n", " <td>0.2</td>\n", " <td>41</td>\n", " <td>26.1</td>\n", " <td>8.0</td>\n", " <td>265.7</td>\n", " <td>16.6</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>11.4</td>\n", " <td>4.3</td>\n", " <td>5.1</td>\n", " <td>11.7</td>\n", " <td>6.3</td>\n", " <td>NaN</td>\n", " <td>13.2</td>\n", " <td>0.3</td>\n", " <td>64</td>\n", " <td>8.9</td>\n", " <td>1.4</td>\n", " <td>92.5</td>\n", " <td>3.2</td>\n", " <td>0.3</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>13.9</td>\n", " <td>2.5</td>\n", " <td>0.6</td>\n", " <td>20.9</td>\n", " <td>66.3</td>\n", " <td>am</td>\n", " <td>12.0</td>\n", " <td>0.0</td>\n", " <td>74</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>49.7</td>\n", " <td>4.1</td>\n", " <td>0.7</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>11.1</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>NaN</td>\n", " <td>0.5</td>\n", " <td>0.0</td>\n", " <td>79</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>22.2</td>\n", " <td>6.1</td>\n", " <td>6.2</td>\n", " <td>26.1</td>\n", " <td>29.8</td>\n", " <td>NaN</td>\n", " <td>162.7</td>\n", " <td>10.7</td>\n", " <td>37</td>\n", " <td>35.5</td>\n", " <td>12.5</td>\n", " <td>142.8</td>\n", " <td>48.9</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>203</th>\n", " <td>European Region</td>\n", " <td>16.3</td>\n", " <td>12.8</td>\n", " <td>9.2</td>\n", " <td>7.4</td>\n", " <td>76.8</td>\n", " <td>NaN</td>\n", " <td>13.1</td>\n", " <td>0.1</td>\n", " <td>81</td>\n", " <td>7.94</td>\n", " <td>1.3</td>\n", " <td>44.5</td>\n", " <td>3.6</td>\n", " <td>1.1</td>\n", " </tr>\n", " <tr>\n", " <th>204</th>\n", " <td>Eastern Mediterranean Region</td>\n", " <td>24.5</td>\n", " <td>5.8</td>\n", " <td>0.3</td>\n", " <td>17.8</td>\n", " <td>62.3</td>\n", " <td>NaN</td>\n", " <td>45.8</td>\n", " <td>1.1</td>\n", " <td>57</td>\n", " <td>12.11</td>\n", " <td>2.23</td>\n", " <td>136.0</td>\n", " <td>18.4</td>\n", " <td>1.1</td>\n", " </tr>\n", " <tr>\n", " <th>205</th>\n", " <td>Western Pacific Region</td>\n", " <td>15.6</td>\n", " <td>8.7</td>\n", " <td>6.1</td>\n", " <td>16.4</td>\n", " <td>87.3</td>\n", " <td>NaN</td>\n", " <td>16.9</td>\n", " <td>0.3</td>\n", " <td>79</td>\n", " <td>19.8</td>\n", " <td>5.26</td>\n", " <td>94.0</td>\n", " <td>4.3</td>\n", " <td>1.4</td>\n", " </tr>\n", " <tr>\n", " <th>206</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>207</th>\n", " <td>Global</td>\n", " <td>17.8</td>\n", " <td>9.2</td>\n", " <td>5.5</td>\n", " <td>16.7</td>\n", " <td>77.5</td>\n", " <td>NaN</td>\n", " <td>41.9</td>\n", " <td>1.5</td>\n", " <td>68</td>\n", " <td>13.46</td>\n", " <td>3.77</td>\n", " <td>103.6</td>\n", " <td>18.3</td>\n", " <td>1.1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>208 rows × 15 columns</p>\n", "</div>" ], "text/plain": [ " country_and_areas \\\n", "0 Afghanistan \n", "1 Albania \n", "2 Algeria \n", "3 Andorra \n", "4 Angola \n", ".. ... \n", "203 European Region \n", "204 Eastern Mediterranean Region \n", "205 Western Pacific Region \n", "206 NaN \n", "207 Global \n", "\n", " Probability of dying from any of CVD, cancer, diabetes, CRD between age 30 and exact age 70k (%) \\\n", "0 35.3 \n", "1 11.4 \n", "2 13.9 \n", "3 - \n", "4 22.2 \n", ".. ... \n", "203 16.3 \n", "204 24.5 \n", "205 15.6 \n", "206 NaN \n", "207 17.8 \n", "\n", " Suicide mortality ratek (per 100 000 population) \\\n", "0 4.1 \n", "1 4.3 \n", "2 2.5 \n", "3 - \n", "4 6.1 \n", ".. ... \n", "203 12.8 \n", "204 5.8 \n", "205 8.7 \n", "206 NaN \n", "207 9.2 \n", "\n", " Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol) \\\n", "0 <0.1 \n", "1 5.1 \n", "2 0.6 \n", "3 11.1 \n", "4 6.2 \n", ".. ... \n", "203 9.2 \n", "204 0.3 \n", "205 6.1 \n", "206 NaN \n", "207 5.5 \n", "\n", " Road traffic mortality ratek,ay (per 100 000 population) \\\n", "0 15.9 \n", "1 11.7 \n", "2 20.9 \n", "3 - \n", "4 26.1 \n", ".. ... \n", "203 7.4 \n", "204 17.8 \n", "205 16.4 \n", "206 NaN \n", "207 16.7 \n", "\n", " Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%) \\\n", "0 42.1 \n", "1 6.3 \n", "2 66.3 \n", "3 - \n", "4 29.8 \n", ".. ... \n", "203 76.8 \n", "204 62.3 \n", "205 87.3 \n", "206 NaN \n", "207 77.5 \n", "\n", " Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%) \\\n", "0 am \n", "1 NaN \n", "2 am \n", "3 NaN \n", "4 NaN \n", ".. ... \n", "203 NaN \n", "204 NaN \n", "205 NaN \n", "206 NaN \n", "207 NaN \n", "\n", " Adolescent birth raten (per 1000 women aged 15–19 years) \\\n", "0 62.1 \n", "1 13.2 \n", "2 12.0 \n", "3 0.5 \n", "4 162.7 \n", ".. ... \n", "203 13.1 \n", "204 45.8 \n", "205 16.9 \n", "206 NaN \n", "207 41.9 \n", "\n", " Adolescent birth raten (per 1000 women aged 10–14 years) \\\n", "0 0.2 \n", "1 0.3 \n", "2 0.0 \n", "3 0.0 \n", "4 10.7 \n", ".. ... \n", "203 0.1 \n", "204 1.1 \n", "205 0.3 \n", "206 NaN \n", "207 1.5 \n", "\n", " UHC: Service coverage indexo \\\n", "0 41 \n", "1 64 \n", "2 74 \n", "3 79 \n", "4 37 \n", ".. ... \n", "203 81 \n", "204 57 \n", "205 79 \n", "206 NaN \n", "207 68 \n", "\n", " Population with household expenditures on health > 10% of total household expenditure or incomep (%) \\\n", "0 26.1 \n", "1 8.9 \n", "2 - \n", "3 - \n", "4 35.5 \n", ".. ... \n", "203 7.94 \n", "204 12.11 \n", "205 19.8 \n", "206 NaN \n", "207 13.46 \n", "\n", " Population with household expenditures on health > 25% of total household expenditure or incomep (%) \\\n", "0 8.0 \n", "1 1.4 \n", "2 - \n", "3 - \n", "4 12.5 \n", ".. ... \n", "203 1.3 \n", "204 2.23 \n", "205 5.26 \n", "206 NaN \n", "207 3.77 \n", "\n", " Age-standardized mortality rate attributed to household and ambient air pollutionq (per 100 000 population) \\\n", "0 265.7 \n", "1 92.5 \n", "2 49.7 \n", "3 - \n", "4 142.8 \n", ".. ... \n", "203 44.5 \n", "204 136.0 \n", "205 94.0 \n", "206 NaN \n", "207 103.6 \n", "\n", " Mortality rate attributed to exposure to unsafe WASH servicesr (per 100 000 population) \\\n", "0 16.6 \n", "1 3.2 \n", "2 4.1 \n", "3 - \n", "4 48.9 \n", ".. ... \n", "203 3.6 \n", "204 18.4 \n", "205 4.3 \n", "206 NaN \n", "207 18.3 \n", "\n", " Mortality rate from unintentional poisoningk (per 100 000 population) \n", "0 1 \n", "1 0.3 \n", "2 0.7 \n", "3 - \n", "4 2 \n", ".. ... \n", "203 1.1 \n", "204 1.1 \n", "205 1.4 \n", "206 NaN \n", "207 1.1 \n", "\n", "[208 rows x 15 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "# Read the Excel file with a multi-level header\n", "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", \n", " sheet_name=\"Annex 1-2\", header=[1, 2, 3, 4])\n", "\n", "df = df.rename(columns={\"Unnamed: 0_level_0\": \"country_and_areas\"})\n", "df.columns = df.columns.droplevel([1, 2, 3])\n", "display(df)" ] }, { "cell_type": "code", "execution_count": 2, "id": "612714a4", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\ziyan\\AppData\\Local\\Temp\\ipykernel_9072\\1376035441.py:18: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.\n", " df = df.fillna(df.mean())\n" ] }, { "data": { "text/plain": [ "country_and_areas object\n", "cvd_cancer_death_rate object\n", "suicide_rate object\n", "alcohol_consumption object\n", "road_traffic_death_rate object\n", "family_planning_satisfaction object\n", "adolescent_birth_rate_15_19 object\n", "adolescent_birth_rate_10_14 object\n", "uhc_service_coverage_index object\n", "health_expenditure_over_10_percent object\n", "health_expenditure_over_25_percent object\n", "air_pollution_death_rate object\n", "wash_services_death_rate object\n", "poisoning_death_rate object\n", "dtype: object" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "# Read the Excel file with a multi-level header\n", "df = pd.read_excel(r\"C:\\My\\Top-up Degree\\Data Science\\Data Science - Assignment\\Data set\\WHO\\whs2023_annex1.xlsx\", \n", " sheet_name=\"Annex 1-2\", header=[1, 2, 3, 4])\n", "\n", "df.dtypes\n", "\n", "df = df.rename(columns={\"Unnamed: 0_level_0\": \"country_and_areas\"})\n", "df.columns = df.columns.droplevel([1, 2, 3])\n", "\n", "# Drop the rows that have NaN values in the country_and_areas column\n", "df = df.dropna(subset=[\"country_and_areas\"])\n", "\n", "# Drop the columns that have more than 50% missing values\n", "df = df.dropna(thresh=len(df)*0.5, axis=1)\n", "\n", "# Replace the remaining missing values with the mean or median of the column\n", "df = df.fillna(df.mean())\n", "\n", "# Rename the columns to make them shorter and more descriptive\n", "df = df.rename(columns={\"Probability of dying from any of CVD, cancer, diabetes, CRD between age 30 and exact age 70k (%)\": \"cvd_cancer_death_rate\",\n", " \"Suicide mortality ratek (per 100 000 population)\": \"suicide_rate\",\n", " \"Total alcohol per capita (≥ 15 years of age) consumptionl (litres of pure alcohol)\": \"alcohol_consumption\",\n", " \"Road traffic mortality ratek,ay (per 100 000 population)\": \"road_traffic_death_rate\",\n", " \"Proportion of women of reproductive age who have their need for family planning satisfied with modern methodsm (%)\": \"family_planning_satisfaction\",\n", " \"Adolescent birth raten (per 1000 women aged 15–19 years)\": \"adolescent_birth_rate_15_19\",\n", " \"Adolescent birth raten (per 1000 women aged 10–14 years)\": \"adolescent_birth_rate_10_14\",\n", " \"UHC: Service coverage indexo\": \"uhc_service_coverage_index\",\n", " \"Population with household expenditures on health > 10% of total household expenditure or incomep (%)\": \"health_expenditure_over_10_percent\",\n", " \"Population with household expenditures on health > 25% of total household expenditure or incomep (%)\": \"health_expenditure_over_25_percent\",\n", " \"Age-standardized mortality rate attributed to household and ambient air pollutionq (per 100 000 population) \": \"air_pollution_death_rate\",\n", " \"Mortality rate attributed to exposure to unsafe WASH servicesr (per 100 000 population)\": \"wash_services_death_rate\",\n", " \"Mortality rate from unintentional poisoningk (per 100 000 population)\": \"poisoning_death_rate\"})\n", "\n", "# Check for outliers and remove or transform them if necessary\n", "# You can use boxplots, histograms, or z-scores to identify outliers\n", "# You can use methods such as winsorization, log transformation, or standardization to deal with outliers\n", "# display(df)\n", "df.dtypes" ] }, { "cell_type": "code", "execution_count": 30, "id": "09c9a6cd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country_and_areas</th>\n", " <th>cvd_cancer_death_rate</th>\n", " <th>suicide_rate</th>\n", " <th>alcohol_consumption</th>\n", " <th>road_traffic_death_rate</th>\n", " <th>family_planning_satisfaction</th>\n", " <th>adolescent_birth_rate_15_19</th>\n", " <th>adolescent_birth_rate_10_14</th>\n", " <th>uhc_service_coverage_index</th>\n", " <th>health_expenditure_over_10_percent</th>\n", " <th>health_expenditure_over_25_percent</th>\n", " <th>air_pollution_death_rate</th>\n", " <th>wash_services_death_rate</th>\n", " <th>poisoning_death_rate</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>35.3</td>\n", " <td>4.1</td>\n", " <td><0.1</td>\n", " <td>15.9</td>\n", " <td>42.1</td>\n", " <td>62.1</td>\n", " <td>0.2</td>\n", " <td>41</td>\n", " <td>26.1</td>\n", " <td>8.0</td>\n", " <td>265.7</td>\n", " <td>16.6</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>11.4</td>\n", " <td>4.3</td>\n", " <td>5.1</td>\n", " <td>11.7</td>\n", " <td>6.3</td>\n", " <td>13.2</td>\n", " <td>0.3</td>\n", " <td>64</td>\n", " <td>8.9</td>\n", " <td>1.4</td>\n", " <td>92.5</td>\n", " <td>3.2</td>\n", " <td>0.3</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>13.9</td>\n", " <td>2.5</td>\n", " <td>0.6</td>\n", " <td>20.9</td>\n", " <td>66.3</td>\n", " <td>12.0</td>\n", " <td>0.0</td>\n", " <td>74</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>49.7</td>\n", " <td>4.1</td>\n", " <td>0.7</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>11.1</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>0.5</td>\n", " <td>0.0</td>\n", " <td>79</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " <td>-</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>22.2</td>\n", " <td>6.1</td>\n", " <td>6.2</td>\n", " <td>26.1</td>\n", " <td>29.8</td>\n", " <td>162.7</td>\n", " <td>10.7</td>\n", " <td>37</td>\n", " <td>35.5</td>\n", " <td>12.5</td>\n", " <td>142.8</td>\n", " <td>48.9</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>202</th>\n", " <td>South-East Asia Region</td>\n", " <td>21.6</td>\n", " <td>10.1</td>\n", " <td>3.8</td>\n", " <td>15.8</td>\n", " <td>77.7</td>\n", " <td>25.2</td>\n", " <td>0.5</td>\n", " <td>62</td>\n", " <td>16.14</td>\n", " <td>5.89</td>\n", " <td>132.8</td>\n", " <td>29.6</td>\n", " <td>0.3</td>\n", " </tr>\n", " <tr>\n", " <th>203</th>\n", " <td>European Region</td>\n", " <td>16.3</td>\n", " <td>12.8</td>\n", " <td>9.2</td>\n", " <td>7.4</td>\n", " <td>76.8</td>\n", " <td>13.1</td>\n", " <td>0.1</td>\n", " <td>81</td>\n", " <td>7.94</td>\n", " <td>1.3</td>\n", " <td>44.5</td>\n", " <td>3.6</td>\n", " <td>1.1</td>\n", " </tr>\n", " <tr>\n", " <th>204</th>\n", " <td>Eastern Mediterranean Region</td>\n", " <td>24.5</td>\n", " <td>5.8</td>\n", " <td>0.3</td>\n", " <td>17.8</td>\n", " <td>62.3</td>\n", " <td>45.8</td>\n", " <td>1.1</td>\n", " <td>57</td>\n", " <td>12.11</td>\n", " <td>2.23</td>\n", " <td>136.0</td>\n", " <td>18.4</td>\n", " <td>1.1</td>\n", " </tr>\n", " <tr>\n", " <th>205</th>\n", " <td>Western Pacific Region</td>\n", " <td>15.6</td>\n", " <td>8.7</td>\n", " <td>6.1</td>\n", " <td>16.4</td>\n", " <td>87.3</td>\n", " <td>16.9</td>\n", " <td>0.3</td>\n", " <td>79</td>\n", " <td>19.8</td>\n", " <td>5.26</td>\n", " <td>94.0</td>\n", " <td>4.3</td>\n", " <td>1.4</td>\n", " </tr>\n", " <tr>\n", " <th>207</th>\n", " <td>Global</td>\n", " <td>17.8</td>\n", " <td>9.2</td>\n", " <td>5.5</td>\n", " <td>16.7</td>\n", " <td>77.5</td>\n", " <td>41.9</td>\n", " <td>1.5</td>\n", " <td>68</td>\n", " <td>13.46</td>\n", " <td>3.77</td>\n", " <td>103.6</td>\n", " <td>18.3</td>\n", " <td>1.1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>206 rows × 14 columns</p>\n", "</div>" ], "text/plain": [ " country_and_areas cvd_cancer_death_rate suicide_rate \\\n", "0 Afghanistan 35.3 4.1 \n", "1 Albania 11.4 4.3 \n", "2 Algeria 13.9 2.5 \n", "3 Andorra - - \n", "4 Angola 22.2 6.1 \n", ".. ... ... ... \n", "202 South-East Asia Region 21.6 10.1 \n", "203 European Region 16.3 12.8 \n", "204 Eastern Mediterranean Region 24.5 5.8 \n", "205 Western Pacific Region 15.6 8.7 \n", "207 Global 17.8 9.2 \n", "\n", " alcohol_consumption road_traffic_death_rate family_planning_satisfaction \\\n", "0 <0.1 15.9 42.1 \n", "1 5.1 11.7 6.3 \n", "2 0.6 20.9 66.3 \n", "3 11.1 - - \n", "4 6.2 26.1 29.8 \n", ".. ... ... ... \n", "202 3.8 15.8 77.7 \n", "203 9.2 7.4 76.8 \n", "204 0.3 17.8 62.3 \n", "205 6.1 16.4 87.3 \n", "207 5.5 16.7 77.5 \n", "\n", " adolescent_birth_rate_15_19 adolescent_birth_rate_10_14 \\\n", "0 62.1 0.2 \n", "1 13.2 0.3 \n", "2 12.0 0.0 \n", "3 0.5 0.0 \n", "4 162.7 10.7 \n", ".. ... ... \n", "202 25.2 0.5 \n", "203 13.1 0.1 \n", "204 45.8 1.1 \n", "205 16.9 0.3 \n", "207 41.9 1.5 \n", "\n", " uhc_service_coverage_index health_expenditure_over_10_percent \\\n", "0 41 26.1 \n", "1 64 8.9 \n", "2 74 - \n", "3 79 - \n", "4 37 35.5 \n", ".. ... ... \n", "202 62 16.14 \n", "203 81 7.94 \n", "204 57 12.11 \n", "205 79 19.8 \n", "207 68 13.46 \n", "\n", " health_expenditure_over_25_percent air_pollution_death_rate \\\n", "0 8.0 265.7 \n", "1 1.4 92.5 \n", "2 - 49.7 \n", "3 - - \n", "4 12.5 142.8 \n", ".. ... ... \n", "202 5.89 132.8 \n", "203 1.3 44.5 \n", "204 2.23 136.0 \n", "205 5.26 94.0 \n", "207 3.77 103.6 \n", "\n", " wash_services_death_rate poisoning_death_rate \n", "0 16.6 1 \n", "1 3.2 0.3 \n", "2 4.1 0.7 \n", "3 - - \n", "4 48.9 2 \n", ".. ... ... \n", "202 29.6 0.3 \n", "203 3.6 1.1 \n", "204 18.4 1.1 \n", "205 4.3 1.4 \n", "207 18.3 1.1 \n", "\n", "[206 rows x 14 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df)" ] }, { "cell_type": "code", "execution_count": 3, "id": "227fa343", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country_and_areas</th>\n", " <th>cvd_cancer_death_rate</th>\n", " <th>suicide_rate</th>\n", " <th>alcohol_consumption</th>\n", " <th>road_traffic_death_rate</th>\n", " <th>family_planning_satisfaction</th>\n", " <th>adolescent_birth_rate_15_19</th>\n", " <th>adolescent_birth_rate_10_14</th>\n", " <th>uhc_service_coverage_index</th>\n", " <th>health_expenditure_over_10_percent</th>\n", " <th>health_expenditure_over_25_percent</th>\n", " <th>air_pollution_death_rate</th>\n", " <th>wash_services_death_rate</th>\n", " <th>poisoning_death_rate</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>35.3</td>\n", " <td>4.1</td>\n", " <td>0.1</td>\n", " <td>15.9</td>\n", " <td>42.1</td>\n", " <td>62.1</td>\n", " <td>0.2</td>\n", " <td>41.0</td>\n", " <td>26.10</td>\n", " <td>8.00</td>\n", " <td>265.7</td>\n", " <td>16.6</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>11.4</td>\n", " <td>4.3</td>\n", " <td>5.1</td>\n", " <td>11.7</td>\n", " <td>6.3</td>\n", " <td>13.2</td>\n", " <td>0.3</td>\n", " <td>64.0</td>\n", " <td>8.90</td>\n", " <td>1.40</td>\n", " <td>92.5</td>\n", " <td>3.2</td>\n", " <td>0.3</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>13.9</td>\n", " <td>2.5</td>\n", " <td>0.6</td>\n", " <td>20.9</td>\n", " <td>66.3</td>\n", " <td>12.0</td>\n", " <td>0.0</td>\n", " <td>74.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>49.7</td>\n", " <td>4.1</td>\n", " <td>0.7</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>11.1</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0.5</td>\n", " <td>0.0</td>\n", " <td>79.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>22.2</td>\n", " <td>6.1</td>\n", " <td>6.2</td>\n", " <td>26.1</td>\n", " <td>29.8</td>\n", " <td>162.7</td>\n", " <td>10.7</td>\n", " <td>37.0</td>\n", " <td>35.50</td>\n", " <td>12.50</td>\n", " <td>142.8</td>\n", " <td>48.9</td>\n", " <td>2.0</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>202</th>\n", " <td>South-East Asia Region</td>\n", " <td>21.6</td>\n", " <td>10.1</td>\n", " <td>3.8</td>\n", " <td>15.8</td>\n", " <td>77.7</td>\n", " <td>25.2</td>\n", " <td>0.5</td>\n", " <td>62.0</td>\n", " <td>16.14</td>\n", " <td>5.89</td>\n", " <td>132.8</td>\n", " <td>29.6</td>\n", " <td>0.3</td>\n", " </tr>\n", " <tr>\n", " <th>203</th>\n", " <td>European Region</td>\n", " <td>16.3</td>\n", " <td>12.8</td>\n", " <td>9.2</td>\n", " <td>7.4</td>\n", " <td>76.8</td>\n", " <td>13.1</td>\n", " <td>0.1</td>\n", " <td>81.0</td>\n", " <td>7.94</td>\n", " <td>1.30</td>\n", " <td>44.5</td>\n", " <td>3.6</td>\n", " <td>1.1</td>\n", " </tr>\n", " <tr>\n", " <th>204</th>\n", " <td>Eastern Mediterranean Region</td>\n", " <td>24.5</td>\n", " <td>5.8</td>\n", " <td>0.3</td>\n", " <td>17.8</td>\n", " <td>62.3</td>\n", " <td>45.8</td>\n", " <td>1.1</td>\n", " <td>57.0</td>\n", " <td>12.11</td>\n", " <td>2.23</td>\n", " <td>136.0</td>\n", " <td>18.4</td>\n", " <td>1.1</td>\n", " </tr>\n", " <tr>\n", " <th>205</th>\n", " <td>Western Pacific Region</td>\n", " <td>15.6</td>\n", " <td>8.7</td>\n", " <td>6.1</td>\n", " <td>16.4</td>\n", " <td>87.3</td>\n", " <td>16.9</td>\n", " <td>0.3</td>\n", " <td>79.0</td>\n", " <td>19.80</td>\n", " <td>5.26</td>\n", " <td>94.0</td>\n", " <td>4.3</td>\n", " <td>1.4</td>\n", " </tr>\n", " <tr>\n", " <th>207</th>\n", " <td>Global</td>\n", " <td>17.8</td>\n", " <td>9.2</td>\n", " <td>5.5</td>\n", " <td>16.7</td>\n", " <td>77.5</td>\n", " <td>41.9</td>\n", " <td>1.5</td>\n", " <td>68.0</td>\n", " <td>13.46</td>\n", " <td>3.77</td>\n", " <td>103.6</td>\n", " <td>18.3</td>\n", " <td>1.1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>206 rows × 14 columns</p>\n", "</div>" ], "text/plain": [ " country_and_areas cvd_cancer_death_rate suicide_rate \\\n", "0 Afghanistan 35.3 4.1 \n", "1 Albania 11.4 4.3 \n", "2 Algeria 13.9 2.5 \n", "3 Andorra NaN NaN \n", "4 Angola 22.2 6.1 \n", ".. ... ... ... \n", "202 South-East Asia Region 21.6 10.1 \n", "203 European Region 16.3 12.8 \n", "204 Eastern Mediterranean Region 24.5 5.8 \n", "205 Western Pacific Region 15.6 8.7 \n", "207 Global 17.8 9.2 \n", "\n", " alcohol_consumption road_traffic_death_rate \\\n", "0 0.1 15.9 \n", "1 5.1 11.7 \n", "2 0.6 20.9 \n", "3 11.1 NaN \n", "4 6.2 26.1 \n", ".. ... ... \n", "202 3.8 15.8 \n", "203 9.2 7.4 \n", "204 0.3 17.8 \n", "205 6.1 16.4 \n", "207 5.5 16.7 \n", "\n", " family_planning_satisfaction adolescent_birth_rate_15_19 \\\n", "0 42.1 62.1 \n", "1 6.3 13.2 \n", "2 66.3 12.0 \n", "3 NaN 0.5 \n", "4 29.8 162.7 \n", ".. ... ... \n", "202 77.7 25.2 \n", "203 76.8 13.1 \n", "204 62.3 45.8 \n", "205 87.3 16.9 \n", "207 77.5 41.9 \n", "\n", " adolescent_birth_rate_10_14 uhc_service_coverage_index \\\n", "0 0.2 41.0 \n", "1 0.3 64.0 \n", "2 0.0 74.0 \n", "3 0.0 79.0 \n", "4 10.7 37.0 \n", ".. ... ... \n", "202 0.5 62.0 \n", "203 0.1 81.0 \n", "204 1.1 57.0 \n", "205 0.3 79.0 \n", "207 1.5 68.0 \n", "\n", " health_expenditure_over_10_percent health_expenditure_over_25_percent \\\n", "0 26.10 8.00 \n", "1 8.90 1.40 \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 35.50 12.50 \n", ".. ... ... \n", "202 16.14 5.89 \n", "203 7.94 1.30 \n", "204 12.11 2.23 \n", "205 19.80 5.26 \n", "207 13.46 3.77 \n", "\n", " air_pollution_death_rate wash_services_death_rate poisoning_death_rate \n", "0 265.7 16.6 1.0 \n", "1 92.5 3.2 0.3 \n", "2 49.7 4.1 0.7 \n", "3 NaN NaN NaN \n", "4 142.8 48.9 2.0 \n", ".. ... ... ... \n", "202 132.8 29.6 0.3 \n", "203 44.5 3.6 1.1 \n", "204 136.0 18.4 1.1 \n", "205 94.0 4.3 1.4 \n", "207 103.6 18.3 1.1 \n", "\n", "[206 rows x 14 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Function to convert columns to numeric and handle special values\n", "def convert_to_numeric(value):\n", " if isinstance(value, str):\n", " # Remove special characters and convert to numeric\n", " value = pd.to_numeric(value.replace('<', '').replace('>', '').replace('%', ''), errors='coerce')\n", " return value\n", "\n", "# Convert all columns (except 'country_and_areas') to numeric\n", "columns_to_convert = df.columns.drop('country_and_areas')\n", "df[columns_to_convert] = df[columns_to_convert].applymap(convert_to_numeric)\n", "\n", "# Display the preprocessed DataFrame\n", "display(df)" ] }, { "cell_type": "code", "execution_count": 33, "id": "e9656da6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "country_and_areas object\n", "cvd_cancer_death_rate float64\n", "suicide_rate float64\n", "alcohol_consumption float64\n", "road_traffic_death_rate float64\n", "family_planning_satisfaction float64\n", "adolescent_birth_rate_15_19 float64\n", "adolescent_birth_rate_10_14 float64\n", "uhc_service_coverage_index float64\n", "health_expenditure_over_10_percent float64\n", "health_expenditure_over_25_percent float64\n", "air_pollution_death_rate float64\n", "wash_services_death_rate float64\n", "poisoning_death_rate float64\n", "dtype: object" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": 36, "id": "b8a184cf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country_and_areas</th>\n", " <th>cvd_cancer_death_rate</th>\n", " <th>suicide_rate</th>\n", " <th>alcohol_consumption</th>\n", " <th>road_traffic_death_rate</th>\n", " <th>family_planning_satisfaction</th>\n", " <th>adolescent_birth_rate_15_19</th>\n", " <th>adolescent_birth_rate_10_14</th>\n", " <th>uhc_service_coverage_index</th>\n", " <th>health_expenditure_over_10_percent</th>\n", " <th>health_expenditure_over_25_percent</th>\n", " <th>air_pollution_death_rate</th>\n", " <th>wash_services_death_rate</th>\n", " <th>poisoning_death_rate</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>35.3</td>\n", " <td>4.1</td>\n", " <td>0.1</td>\n", " <td>15.9</td>\n", " <td>42.1</td>\n", " <td>62.1</td>\n", " <td>0.2</td>\n", " <td>41.0</td>\n", " <td>26.10</td>\n", " <td>8.00</td>\n", " <td>265.7</td>\n", " <td>16.6</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>11.4</td>\n", " <td>4.3</td>\n", " <td>5.1</td>\n", " <td>11.7</td>\n", " <td>6.3</td>\n", " <td>13.2</td>\n", " <td>0.3</td>\n", " <td>64.0</td>\n", " <td>8.90</td>\n", " <td>1.40</td>\n", " <td>92.5</td>\n", " <td>3.2</td>\n", " <td>0.3</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>13.9</td>\n", " <td>2.5</td>\n", " <td>0.6</td>\n", " <td>20.9</td>\n", " <td>66.3</td>\n", " <td>12.0</td>\n", " <td>0.0</td>\n", " <td>74.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>49.7</td>\n", " <td>4.1</td>\n", " <td>0.7</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>11.1</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0.5</td>\n", " <td>0.0</td>\n", " <td>79.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>22.2</td>\n", " <td>6.1</td>\n", " <td>6.2</td>\n", " <td>26.1</td>\n", " <td>29.8</td>\n", " <td>162.7</td>\n", " <td>10.7</td>\n", " <td>37.0</td>\n", " <td>35.50</td>\n", " <td>12.50</td>\n", " <td>142.8</td>\n", " <td>48.9</td>\n", " <td>2.0</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>202</th>\n", " <td>South-East Asia Region</td>\n", " <td>21.6</td>\n", " <td>10.1</td>\n", " <td>3.8</td>\n", " <td>15.8</td>\n", " <td>77.7</td>\n", " <td>25.2</td>\n", " <td>0.5</td>\n", " <td>62.0</td>\n", " <td>16.14</td>\n", " <td>5.89</td>\n", " <td>132.8</td>\n", " <td>29.6</td>\n", " <td>0.3</td>\n", " </tr>\n", " <tr>\n", " <th>203</th>\n", " <td>European Region</td>\n", " <td>16.3</td>\n", " <td>12.8</td>\n", " <td>9.2</td>\n", " <td>7.4</td>\n", " <td>76.8</td>\n", " <td>13.1</td>\n", " <td>0.1</td>\n", " <td>81.0</td>\n", " <td>7.94</td>\n", " <td>1.30</td>\n", " <td>44.5</td>\n", " <td>3.6</td>\n", " <td>1.1</td>\n", " </tr>\n", " <tr>\n", " <th>204</th>\n", " <td>Eastern Mediterranean Region</td>\n", " <td>24.5</td>\n", " <td>5.8</td>\n", " <td>0.3</td>\n", " <td>17.8</td>\n", " <td>62.3</td>\n", " <td>45.8</td>\n", " <td>1.1</td>\n", " <td>57.0</td>\n", " <td>12.11</td>\n", " <td>2.23</td>\n", " <td>136.0</td>\n", " <td>18.4</td>\n", " <td>1.1</td>\n", " </tr>\n", " <tr>\n", " <th>205</th>\n", " <td>Western Pacific Region</td>\n", " <td>15.6</td>\n", " <td>8.7</td>\n", " <td>6.1</td>\n", " <td>16.4</td>\n", " <td>87.3</td>\n", " <td>16.9</td>\n", " <td>0.3</td>\n", " <td>79.0</td>\n", " <td>19.80</td>\n", " <td>5.26</td>\n", " <td>94.0</td>\n", " <td>4.3</td>\n", " <td>1.4</td>\n", " </tr>\n", " <tr>\n", " <th>207</th>\n", " <td>Global</td>\n", " <td>17.8</td>\n", " <td>9.2</td>\n", " <td>5.5</td>\n", " <td>16.7</td>\n", " <td>77.5</td>\n", " <td>41.9</td>\n", " <td>1.5</td>\n", " <td>68.0</td>\n", " <td>13.46</td>\n", " <td>3.77</td>\n", " <td>103.6</td>\n", " <td>18.3</td>\n", " <td>1.1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>206 rows × 14 columns</p>\n", "</div>" ], "text/plain": [ " country_and_areas cvd_cancer_death_rate suicide_rate \\\n", "0 Afghanistan 35.3 4.1 \n", "1 Albania 11.4 4.3 \n", "2 Algeria 13.9 2.5 \n", "3 Andorra NaN NaN \n", "4 Angola 22.2 6.1 \n", ".. ... ... ... \n", "202 South-East Asia Region 21.6 10.1 \n", "203 European Region 16.3 12.8 \n", "204 Eastern Mediterranean Region 24.5 5.8 \n", "205 Western Pacific Region 15.6 8.7 \n", "207 Global 17.8 9.2 \n", "\n", " alcohol_consumption road_traffic_death_rate \\\n", "0 0.1 15.9 \n", "1 5.1 11.7 \n", "2 0.6 20.9 \n", "3 11.1 NaN \n", "4 6.2 26.1 \n", ".. ... ... \n", "202 3.8 15.8 \n", "203 9.2 7.4 \n", "204 0.3 17.8 \n", "205 6.1 16.4 \n", "207 5.5 16.7 \n", "\n", " family_planning_satisfaction adolescent_birth_rate_15_19 \\\n", "0 42.1 62.1 \n", "1 6.3 13.2 \n", "2 66.3 12.0 \n", "3 NaN 0.5 \n", "4 29.8 162.7 \n", ".. ... ... \n", "202 77.7 25.2 \n", "203 76.8 13.1 \n", "204 62.3 45.8 \n", "205 87.3 16.9 \n", "207 77.5 41.9 \n", "\n", " adolescent_birth_rate_10_14 uhc_service_coverage_index \\\n", "0 0.2 41.0 \n", "1 0.3 64.0 \n", "2 0.0 74.0 \n", "3 0.0 79.0 \n", "4 10.7 37.0 \n", ".. ... ... \n", "202 0.5 62.0 \n", "203 0.1 81.0 \n", "204 1.1 57.0 \n", "205 0.3 79.0 \n", "207 1.5 68.0 \n", "\n", " health_expenditure_over_10_percent health_expenditure_over_25_percent \\\n", "0 26.10 8.00 \n", "1 8.90 1.40 \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 35.50 12.50 \n", ".. ... ... \n", "202 16.14 5.89 \n", "203 7.94 1.30 \n", "204 12.11 2.23 \n", "205 19.80 5.26 \n", "207 13.46 3.77 \n", "\n", " air_pollution_death_rate wash_services_death_rate poisoning_death_rate \n", "0 265.7 16.6 1.0 \n", "1 92.5 3.2 0.3 \n", "2 49.7 4.1 0.7 \n", "3 NaN NaN NaN \n", "4 142.8 48.9 2.0 \n", ".. ... ... ... \n", "202 132.8 29.6 0.3 \n", "203 44.5 3.6 1.1 \n", "204 136.0 18.4 1.1 \n", "205 94.0 4.3 1.4 \n", "207 103.6 18.3 1.1 \n", "\n", "[206 rows x 14 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df)" ] }, { "cell_type": "code", "execution_count": 4, "id": "5ca0201e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "country_and_areas 0\n", "cvd_cancer_death_rate 15\n", "suicide_rate 15\n", "alcohol_consumption 9\n", "road_traffic_death_rate 15\n", "family_planning_satisfaction 90\n", "adolescent_birth_rate_15_19 23\n", "adolescent_birth_rate_10_14 32\n", "uhc_service_coverage_index 4\n", "health_expenditure_over_10_percent 66\n", "health_expenditure_over_25_percent 66\n", "air_pollution_death_rate 15\n", "wash_services_death_rate 15\n", "poisoning_death_rate 15\n", "dtype: int64\n" ] } ], "source": [ "# Check for missing values\n", "print(df.isnull().sum())" ] }, { "cell_type": "code", "execution_count": null, "id": "c4dd8fb3", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 5 }