{ "cells": [ { "cell_type": "markdown", "id": "74012e00", "metadata": {}, "source": [ "# Install and import libraries" ] }, { "cell_type": "code", "execution_count": 22, "id": "cd7ae131", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Install libraries. %pip (not !pip) installs into the environment of the\n", "# running kernel, so the imports below resolve against the same interpreter.\n", "# TODO: pin package versions for reproducibility once known-good versions are confirmed.\n", "%pip install -q missingno\n", "%pip install -q pyod\n", "\n", "# Import libraries\n", "import missingno as msno\n", "import pandas as pd\n", "from pyod.models.knn import KNN" ] }, { "cell_type": "markdown", "id": "acc6fb0f", "metadata": {}, "source": [ "# Importing the dataset" ] }, { "cell_type": "code", "execution_count": 8, "id": "99953ed1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape before dropping columns: (330782, 67)\n" ] }, { "data": { "text/html": [ "
\n", " | iso_code | \n", "continent | \n", "location | \n", "date | \n", "total_cases | \n", "new_cases | \n", "new_cases_smoothed | \n", "total_deaths | \n", "new_deaths | \n", "new_deaths_smoothed | \n", "... | \n", "male_smokers | \n", "handwashing_facilities | \n", "hospital_beds_per_thousand | \n", "life_expectancy | \n", "human_development_index | \n", "population | \n", "excess_mortality_cumulative_absolute | \n", "excess_mortality_cumulative | \n", "excess_mortality | \n", "excess_mortality_cumulative_per_million | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "AFG | \n", "Asia | \n", "Afghanistan | \n", "2020-01-03 | \n", "NaN | \n", "0.00 | \n", "NaN | \n", "NaN | \n", "0.00 | \n", "NaN | \n", "... | \n", "NaN | \n", "37.75 | \n", "0.50 | \n", "64.83 | \n", "0.51 | \n", "41128772.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
1 | \n", "AFG | \n", "Asia | \n", "Afghanistan | \n", "2020-01-04 | \n", "NaN | \n", "0.00 | \n", "NaN | \n", "NaN | \n", "0.00 | \n", "NaN | \n", "... | \n", "NaN | \n", "37.75 | \n", "0.50 | \n", "64.83 | \n", "0.51 | \n", "41128772.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
2 | \n", "AFG | \n", "Asia | \n", "Afghanistan | \n", "2020-01-05 | \n", "NaN | \n", "0.00 | \n", "NaN | \n", "NaN | \n", "0.00 | \n", "NaN | \n", "... | \n", "NaN | \n", "37.75 | \n", "0.50 | \n", "64.83 | \n", "0.51 | \n", "41128772.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
3 | \n", "AFG | \n", "Asia | \n", "Afghanistan | \n", "2020-01-06 | \n", "NaN | \n", "0.00 | \n", "NaN | \n", "NaN | \n", "0.00 | \n", "NaN | \n", "... | \n", "NaN | \n", "37.75 | \n", "0.50 | \n", "64.83 | \n", "0.51 | \n", "41128772.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
4 | \n", "AFG | \n", "Asia | \n", "Afghanistan | \n", "2020-01-07 | \n", "NaN | \n", "0.00 | \n", "NaN | \n", "NaN | \n", "0.00 | \n", "NaN | \n", "... | \n", "NaN | \n", "37.75 | \n", "0.50 | \n", "64.83 | \n", "0.51 | \n", "41128772.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5 rows × 67 columns
\n", "\n", " | total_cases | \n", "new_cases | \n", "new_cases_smoothed | \n", "total_deaths | \n", "new_deaths | \n", "new_deaths_smoothed | \n", "total_cases_per_million | \n", "new_cases_per_million | \n", "new_cases_smoothed_per_million | \n", "total_deaths_per_million | \n", "... | \n", "male_smokers | \n", "handwashing_facilities | \n", "hospital_beds_per_thousand | \n", "life_expectancy | \n", "human_development_index | \n", "population | \n", "excess_mortality_cumulative_absolute | \n", "excess_mortality_cumulative | \n", "excess_mortality | \n", "excess_mortality_cumulative_per_million | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | \n", "2.933000e+05 | \n", "3.216330e+05 | \n", "3.203740e+05 | \n", "2.721740e+05 | \n", "321702.000000 | \n", "320472.000000 | \n", "293300.000000 | \n", "321633.000000 | \n", "320374.000000 | \n", "272174.000000 | \n", "... | \n", "189702.000000 | \n", "125583.000000 | \n", "226326.000000 | \n", "304234.000000 | \n", "248529.000000 | \n", "3.307820e+05 | \n", "1.153500e+04 | \n", "11535.000000 | \n", "11535.000000 | \n", "11535.000000 | \n", "
mean | \n", "6.253904e+06 | \n", "1.013110e+04 | \n", "1.016799e+04 | \n", "8.357613e+04 | \n", "91.896227 | \n", "92.240686 | \n", "95300.478343 | \n", "153.109948 | \n", "153.681658 | \n", "843.644772 | \n", "... | \n", "32.910416 | \n", "50.790677 | \n", "3.097145 | \n", "73.716225 | \n", "0.722483 | \n", "1.282799e+08 | \n", "4.933200e+04 | \n", "9.635586 | \n", "11.815644 | \n", "1591.662698 | \n", "
std | \n", "3.854149e+07 | \n", "1.134175e+05 | \n", "9.718695e+04 | \n", "4.293215e+05 | \n", "761.899876 | \n", "597.590018 | \n", "145463.270075 | \n", "1196.206835 | \n", "616.176235 | \n", "1079.976631 | \n", "... | \n", "13.574225 | \n", "31.956510 | \n", "2.548339 | \n", "7.396354 | \n", "0.148987 | \n", "6.602098e+08 | \n", "1.412464e+05 | \n", "12.479665 | \n", "25.623475 | \n", "1894.037851 | \n", "
min | \n", "1.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "1.000000e+00 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "7.700000 | \n", "1.188000 | \n", "0.100000 | \n", "53.280000 | \n", "0.394000 | \n", "4.700000e+01 | \n", "-3.772610e+04 | \n", "-44.230000 | \n", "-95.920000 | \n", "-2142.340300 | \n", "
25% | \n", "7.474000e+03 | \n", "0.000000e+00 | \n", "5.710000e-01 | \n", "1.250000e+02 | \n", "0.000000 | \n", "0.000000 | \n", "2273.895000 | \n", "0.000000 | \n", "0.120000 | \n", "55.469500 | \n", "... | \n", "22.600000 | \n", "20.859000 | \n", "1.300000 | \n", "69.590000 | \n", "0.602000 | \n", "4.490020e+05 | \n", "7.479999e+01 | \n", "1.070000 | \n", "-1.495000 | \n", "46.896362 | \n", "
50% | \n", "6.538100e+04 | \n", "4.000000e+00 | \n", "3.007150e+01 | \n", "1.250000e+03 | \n", "0.000000 | \n", "0.143000 | \n", "24535.964500 | \n", "0.366000 | \n", "8.249000 | \n", "358.698000 | \n", "... | \n", "33.100000 | \n", "49.839000 | \n", "2.500000 | \n", "75.050000 | \n", "0.740000 | \n", "5.882259e+06 | \n", "5.352899e+03 | \n", "7.960000 | \n", "6.020000 | \n", "1021.922500 | \n", "
75% | \n", "6.940385e+05 | \n", "3.160000e+02 | \n", "5.622860e+02 | \n", "1.116475e+04 | \n", "3.000000 | \n", "5.857000 | \n", "122228.212250 | \n", "41.737000 | \n", "92.141750 | \n", "1298.258000 | \n", "... | \n", "41.300000 | \n", "83.241000 | \n", "4.200000 | \n", "79.460000 | \n", "0.829000 | \n", "2.830170e+07 | \n", "3.510710e+04 | \n", "15.370000 | \n", "16.920000 | \n", "2615.629150 | \n", "
max | \n", "7.689823e+08 | \n", "8.401763e+06 | \n", "6.402721e+06 | \n", "6.953730e+06 | \n", "121590.000000 | \n", "18214.143000 | \n", "737554.506000 | \n", "228872.025000 | \n", "37241.781000 | \n", "6501.224000 | \n", "... | \n", "78.100000 | \n", "100.000000 | \n", "13.800000 | \n", "86.750000 | \n", "0.957000 | \n", "7.975105e+09 | \n", "1.281224e+06 | \n", "76.550000 | \n", "377.430000 | \n", "10292.468000 | \n", "
8 rows × 62 columns
\n", "\n", " | total_cases | \n", "new_cases | \n", "new_cases_smoothed | \n", "total_deaths | \n", "new_deaths | \n", "new_deaths_smoothed | \n", "total_cases_per_million | \n", "new_cases_per_million | \n", "new_cases_smoothed_per_million | \n", "total_deaths_per_million | \n", "... | \n", "male_smokers | \n", "handwashing_facilities | \n", "hospital_beds_per_thousand | \n", "life_expectancy | \n", "human_development_index | \n", "population | \n", "excess_mortality_cumulative_absolute | \n", "excess_mortality_cumulative | \n", "excess_mortality | \n", "excess_mortality_cumulative_per_million | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | \n", "293300.00 | \n", "321633.00 | \n", "320374.00 | \n", "272174.00 | \n", "321702.00 | \n", "320472.00 | \n", "293300.00 | \n", "321633.00 | \n", "320374.00 | \n", "272174.00 | \n", "... | \n", "189702.00 | \n", "125583.00 | \n", "226326.00 | \n", "304234.00 | \n", "248529.00 | \n", "330782.00 | \n", "11535.00 | \n", "11535.00 | \n", "11535.00 | \n", "11535.00 | \n", "
mean | \n", "6253904.17 | \n", "10131.10 | \n", "10167.99 | \n", "83576.13 | \n", "91.90 | \n", "92.24 | \n", "95300.48 | \n", "153.11 | \n", "153.68 | \n", "843.64 | \n", "... | \n", "32.91 | \n", "50.79 | \n", "3.10 | \n", "73.72 | \n", "0.72 | \n", "128279905.10 | \n", "49332.00 | \n", "9.64 | \n", "11.82 | \n", "1591.66 | \n", "
std | \n", "38541489.37 | \n", "113417.45 | \n", "97186.95 | \n", "429321.51 | \n", "761.90 | \n", "597.59 | \n", "145463.27 | \n", "1196.21 | \n", "616.18 | \n", "1079.98 | \n", "... | \n", "13.57 | \n", "31.96 | \n", "2.55 | \n", "7.40 | \n", "0.15 | \n", "660209762.20 | \n", "141246.42 | \n", "12.48 | \n", "25.62 | \n", "1894.04 | \n", "
min | \n", "1.00 | \n", "0.00 | \n", "0.00 | \n", "1.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "... | \n", "7.70 | \n", "1.19 | \n", "0.10 | \n", "53.28 | \n", "0.39 | \n", "47.00 | \n", "-37726.10 | \n", "-44.23 | \n", "-95.92 | \n", "-2142.34 | \n", "
25% | \n", "7474.00 | \n", "0.00 | \n", "0.57 | \n", "125.00 | \n", "0.00 | \n", "0.00 | \n", "2273.89 | \n", "0.00 | \n", "0.12 | \n", "55.47 | \n", "... | \n", "22.60 | \n", "20.86 | \n", "1.30 | \n", "69.59 | \n", "0.60 | \n", "449002.00 | \n", "74.80 | \n", "1.07 | \n", "-1.50 | \n", "46.90 | \n", "
50% | \n", "65381.00 | \n", "4.00 | \n", "30.07 | \n", "1250.00 | \n", "0.00 | \n", "0.14 | \n", "24535.96 | \n", "0.37 | \n", "8.25 | \n", "358.70 | \n", "... | \n", "33.10 | \n", "49.84 | \n", "2.50 | \n", "75.05 | \n", "0.74 | \n", "5882259.00 | \n", "5352.90 | \n", "7.96 | \n", "6.02 | \n", "1021.92 | \n", "
75% | \n", "694038.50 | \n", "316.00 | \n", "562.29 | \n", "11164.75 | \n", "3.00 | \n", "5.86 | \n", "122228.21 | \n", "41.74 | \n", "92.14 | \n", "1298.26 | \n", "... | \n", "41.30 | \n", "83.24 | \n", "4.20 | \n", "79.46 | \n", "0.83 | \n", "28301700.00 | \n", "35107.10 | \n", "15.37 | \n", "16.92 | \n", "2615.63 | \n", "
max | \n", "768982331.00 | \n", "8401763.00 | \n", "6402720.57 | \n", "6953730.00 | \n", "121590.00 | \n", "18214.14 | \n", "737554.51 | \n", "228872.02 | \n", "37241.78 | \n", "6501.22 | \n", "... | \n", "78.10 | \n", "100.00 | \n", "13.80 | \n", "86.75 | \n", "0.96 | \n", "7975105024.00 | \n", "1281224.50 | \n", "76.55 | \n", "377.43 | \n", "10292.47 | \n", "
8 rows × 62 columns
\n", "\n", " | Missing Data Ratio (%) | \n", "
---|---|
iso_code | \n", "0.00 | \n", "
continent | \n", "4.75 | \n", "
location | \n", "0.00 | \n", "
date | \n", "0.00 | \n", "
total_cases | \n", "11.33 | \n", "
new_cases | \n", "2.77 | \n", "
new_cases_smoothed | \n", "3.15 | \n", "
total_deaths | \n", "17.72 | \n", "
new_deaths | \n", "2.75 | \n", "
new_deaths_smoothed | \n", "3.12 | \n", "
total_cases_per_million | \n", "11.33 | \n", "
new_cases_per_million | \n", "2.77 | \n", "
new_cases_smoothed_per_million | \n", "3.15 | \n", "
total_deaths_per_million | \n", "17.72 | \n", "
new_deaths_per_million | \n", "2.75 | \n", "
new_deaths_smoothed_per_million | \n", "3.12 | \n", "
reproduction_rate | \n", "44.13 | \n", "
icu_patients | \n", "88.80 | \n", "
icu_patients_per_million | \n", "88.80 | \n", "
hosp_patients | \n", "88.50 | \n", "
hosp_patients_per_million | \n", "88.50 | \n", "
weekly_icu_admissions | \n", "97.01 | \n", "
weekly_icu_admissions_per_million | \n", "97.01 | \n", "
weekly_hosp_admissions | \n", "93.15 | \n", "
weekly_hosp_admissions_per_million | \n", "93.15 | \n", "
total_tests | \n", "76.00 | \n", "
new_tests | \n", "77.20 | \n", "
total_tests_per_thousand | \n", "76.00 | \n", "
new_tests_per_thousand | \n", "77.20 | \n", "
new_tests_smoothed | \n", "68.57 | \n", "
new_tests_smoothed_per_thousand | \n", "68.57 | \n", "
positive_rate | \n", "71.00 | \n", "
tests_per_case | \n", "71.48 | \n", "
tests_units | \n", "67.72 | \n", "
total_vaccinations | \n", "76.59 | \n", "
people_vaccinated | \n", "77.59 | \n", "
people_fully_vaccinated | \n", "78.63 | \n", "
total_boosters | \n", "86.15 | \n", "
new_vaccinations | \n", "80.73 | \n", "
new_vaccinations_smoothed | \n", "47.11 | \n", "
total_vaccinations_per_hundred | \n", "76.59 | \n", "
people_vaccinated_per_hundred | \n", "77.59 | \n", "
people_fully_vaccinated_per_hundred | \n", "78.63 | \n", "
total_boosters_per_hundred | \n", "86.15 | \n", "
new_vaccinations_smoothed_per_million | \n", "47.11 | \n", "
new_people_vaccinated_smoothed | \n", "47.10 | \n", "
new_people_vaccinated_smoothed_per_hundred | \n", "47.10 | \n", "
stringency_index | \n", "40.25 | \n", "
population_density | \n", "15.13 | \n", "
median_age | \n", "21.07 | \n", "
aged_65_older | \n", "23.83 | \n", "
aged_70_older | \n", "21.87 | \n", "
gdp_per_capita | \n", "22.65 | \n", "
extreme_poverty | \n", "50.17 | \n", "
cardiovasc_death_rate | \n", "22.49 | \n", "
diabetes_prevalence | \n", "18.54 | \n", "
female_smokers | \n", "41.86 | \n", "
male_smokers | \n", "42.65 | \n", "
handwashing_facilities | \n", "62.03 | \n", "
hospital_beds_per_thousand | \n", "31.58 | \n", "
life_expectancy | \n", "8.03 | \n", "
human_development_index | \n", "24.87 | \n", "
population | \n", "0.00 | \n", "
excess_mortality_cumulative_absolute | \n", "96.51 | \n", "
excess_mortality_cumulative | \n", "96.51 | \n", "
excess_mortality | \n", "96.51 | \n", "
excess_mortality_cumulative_per_million | \n", "96.51 | \n", "
\n", " | iso_code | \n", "continent | \n", "location | \n", "date | \n", "total_cases | \n", "new_cases | \n", "new_cases_smoothed | \n", "total_deaths | \n", "new_deaths | \n", "new_deaths_smoothed | \n", "... | \n", "male_smokers | \n", "handwashing_facilities | \n", "hospital_beds_per_thousand | \n", "life_expectancy | \n", "human_development_index | \n", "population | \n", "excess_mortality_cumulative_absolute | \n", "excess_mortality_cumulative | \n", "excess_mortality | \n", "excess_mortality_cumulative_per_million | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
16498 | \n", "OWID_ASI | \n", "NaN | \n", "Asia | \n", "2022-03-08 | \n", "122507617.00 | \n", "791293.00 | \n", "706920.57 | \n", "1365917.00 | \n", "1917.00 | \n", "1951.86 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "4721383370.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
16499 | \n", "OWID_ASI | \n", "NaN | \n", "Asia | \n", "2022-03-09 | \n", "123354783.00 | \n", "847166.00 | \n", "727234.14 | \n", "1367885.00 | \n", "1968.00 | \n", "1926.00 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "4721383370.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
16500 | \n", "OWID_ASI | \n", "NaN | \n", "Asia | \n", "2022-03-10 | \n", "124101007.00 | \n", "746224.00 | \n", "721573.14 | \n", "1369710.00 | \n", "1825.00 | \n", "1916.57 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "4721383370.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
16501 | \n", "OWID_ASI | \n", "NaN | \n", "Asia | \n", "2022-03-11 | \n", "124898209.00 | \n", "797202.00 | \n", "730206.43 | \n", "1371566.00 | \n", "1856.00 | \n", "1864.14 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "4721383370.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
16502 | \n", "OWID_ASI | \n", "NaN | \n", "Asia | \n", "2022-03-12 | \n", "125930442.00 | \n", "1032233.00 | \n", "775388.00 | \n", "1373481.00 | \n", "1915.00 | \n", "1848.57 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "4721383370.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
326850 | \n", "OWID_WRL | \n", "NaN | \n", "World | \n", "2023-07-29 | \n", "768632837.00 | \n", "694.00 | \n", "46823.57 | \n", "6953094.00 | \n", "11.00 | \n", "75.00 | \n", "... | \n", "34.63 | \n", "60.13 | \n", "2.71 | \n", "72.58 | \n", "0.74 | \n", "7975105024.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
326851 | \n", "OWID_WRL | \n", "NaN | \n", "World | \n", "2023-07-30 | \n", "768654204.00 | \n", "21367.00 | \n", "45918.43 | \n", "6953470.00 | \n", "376.00 | \n", "87.00 | \n", "... | \n", "34.63 | \n", "60.13 | \n", "2.71 | \n", "72.58 | \n", "0.74 | \n", "7975105024.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
326852 | \n", "OWID_WRL | \n", "NaN | \n", "World | \n", "2023-07-31 | \n", "768978837.00 | \n", "324633.00 | \n", "50862.86 | \n", "6953676.00 | \n", "206.00 | \n", "97.29 | \n", "... | \n", "34.63 | \n", "60.13 | \n", "2.71 | \n", "72.58 | \n", "0.74 | \n", "7975105024.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
326853 | \n", "OWID_WRL | \n", "NaN | \n", "World | \n", "2023-08-01 | \n", "768982331.00 | \n", "3494.00 | \n", "50703.29 | \n", "6953730.00 | \n", "54.00 | \n", "97.00 | \n", "... | \n", "34.63 | \n", "60.13 | \n", "2.71 | \n", "72.58 | \n", "0.74 | \n", "7975105024.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
326854 | \n", "OWID_WRL | \n", "NaN | \n", "World | \n", "2023-08-02 | \n", "768982331.00 | \n", "0.00 | \n", "50420.57 | \n", "6953730.00 | \n", "0.00 | \n", "96.29 | \n", "... | \n", "34.63 | \n", "60.13 | \n", "2.71 | \n", "72.58 | \n", "0.74 | \n", "7975105024.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
3675 rows × 67 columns
\n", "