{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "l0M4HuawOF_h" }, "source": [ "## Data Wrangling for a polynomial regression model\n", "Data wrangled from Excel file `BrazilCOVIDData.xlsx`\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": { "id": "4GKRT-8TebAe" }, "source": [ "# Model based on:\n", "\n", "David N. Prata, Waldecy Rodrigues, Paulo H. Bermejo. Temperature significantly\n", "changes COVID-19 transmission in (sub)tropical cities of Brazil. Science of The Total\n", "Environment. Volume 729. 2020. 138862. ISSN 0048-9697.\n", "https://doi.org/10.1016/j.scitotenv.2020.138862." ] }, { "cell_type": "markdown", "metadata": { "id": "Rvjw5z_Ogw6K" }, "source": [ "# Model Parameters:" ] }, { "cell_type": "markdown", "metadata": { "id": "IyMe-xDxgoeF" }, "source": [ "" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "id": "pVHGRY2NOltD" }, "outputs": [], "source": [ "# ignoring warning to ensure proper execution\n", "import warnings\n", "warnings.filterwarnings('ignore') " ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "id": "-julks0ELUEp" }, "outputs": [], "source": [ "# Common Imports\n", "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "id": "SrDIcMtMbIRi" }, "outputs": [], "source": [ "#creating an Excel File object\n", "xls = pd.ExcelFile('BrazilCOVIDData.xlsx')" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "id": "jXshi-DsOF_k" }, "outputs": [], "source": [ "#Creating Data Frames for each sheet of Brazil Covid Data\n", "#this is the largest excel file, so it's seperated from the others\n", "brazil_covid = pd.read_excel(xls, 'Brazil Covid-19 data')\n", "brazil_covid = brazil_covid.applymap(lambda s: s.upper() if type(s) == str else s)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "id": "HnUW4xhtQgyd" }, "outputs": [], "source": [ "#reading the rest of the relevant data sheets from the file\n", "temp_by_state = pd.read_excel(xls, \"Temperature by State\")\n", "brazil_states = pd.read_excel(xls, \"Brazil State Stats\",header =4)\n", "city_area = pd.read_excel(xls, \"City area\")" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "id": "bVY0kcUKmV6e" }, "outputs": [], "source": [ "#setting all string data to uppercase to allow for easier data wrangling\n", "temp_by_state = temp_by_state.applymap(lambda s: s.upper() if type(s) == str else s)\n", "brazil_states = brazil_states.applymap(lambda s: s.upper() if type(s) == str else s)\n", "city_area = city_area.applymap(lambda s: s.upper() if type(s) == str else s)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "id": "QK1UmpUZNgxG" }, "outputs": [], "source": [ "# Dropping unnecessary header data and resting the index to 0 through n-1\n", "#renaming abbreviation column for easier merging with temperature data\n", "#renaming Population to pop\n", "brazil_states = brazil_states.drop([0,1]).reset_index(drop=True).rename(columns = {\"Common Two Letter\":\"STATE_ABBR\"})\n" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "id": "zvm1B5LlQwCZ" }, "outputs": [], "source": [ "# removing non-capitol cities from Temperature data and renaming the annual temperature to temp\n", "temp_by_state = temp_by_state.loc[temp_by_state.IS_CAPITOL == 'Y'].reset_index(drop=True).rename(columns={\"ANNUAL\":\"temp\"})" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "id": "2bV43QJfl2D-" }, "outputs": [], "source": [ "#creating a list of the Capitol Cities from the filtered Temp Data\n", "capitol_cities = temp_by_state.CITY" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "id": "ZTE4l4-tCnQL" }, "outputs": [], "source": [ "#creating a dataframe of Capitol Cities and their respective state\n", "#renaming columns for easier merging with brazil covid data\n", "capitol_state_combo = temp_by_state[[\"CITY\",\"STATE_ABBR\"]].rename(columns ={\"CITY\":\"Municipality\",\"STATE_ABBR\":\"State\"})" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "id": "t-TUkCBUfIgI" }, "outputs": [], "source": [ "#inner joining the brazil_states data with the temperature data\n", "capitol_data = brazil_states.merge(temp_by_state, on = 'STATE_ABBR', how = 'inner')" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "id": "xraCGucIm97z" }, "outputs": [], "source": [ "#reducing the Brazil Covid Data to only the Capital Cities\n", "#merging with capitol state combo to ensure only capitol cities with the correct state label are included\n", "#data is sorted first by State in alphabetical order then dates sequentially\n", "brazil_covid_reduced = brazil_covid.copy()\n", "brazil_covid_reduced = brazil_covid_reduced.merge(capitol_state_combo).sort_values(by=['State','Date'])" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "8BGeRifhB5xT", "outputId": "51128509-95e1-4644-f954-38ae4fe5dd88" }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Region</th>\n", " <th>State</th>\n", " <th>State-code</th>\n", " <th>Municipality-code</th>\n", " <th>Health-region-code</th>\n", " <th>Health-region-name</th>\n", " <th>Date</th>\n", " <th>Week #</th>\n", " <th>Population as of 2019</th>\n", " <th>Accumulated cases</th>\n", " <th>New cases</th>\n", " <th>Accumulated deaths</th>\n", " <th>New deaths</th>\n", " <th>New Recoveries</th>\n", " <th>New followups (?)</th>\n", " <th>Interior/Metropolitan</th>\n", " </tr>\n", " <tr>\n", " <th>Municipality</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>ARACAJU</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>BELO HORIZONTE</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>BELÉM</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>BOA VISTA</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>BRASÍLIA</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>CAMPO GRANDE</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>CUIABÁ</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>CURITIBA</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>FLORIANÓPOLIS</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>FORTALEZA</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>GOIÂNIA</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>JOÃO PESSOA</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>MACAPÁ</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>MACEIÓ</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>MANAUS</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>NATAL</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>PALMAS</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>PORTO ALEGRE</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>PORTO VELHO</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>RECIFE</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>RIO BRANCO</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>RIO DE JANEIRO</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>SALVADOR</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>SÃO LUÍS</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>SÃO PAULO</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>TERESINA</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " <tr>\n", " <th>VITÓRIA</th>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>151</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>151</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Region State State-code Municipality-code \\\n", "Municipality \n", "ARACAJU 151 151 151 151 \n", "BELO HORIZONTE 151 151 151 151 \n", "BELÉM 151 151 151 151 \n", "BOA VISTA 151 151 151 151 \n", "BRASÍLIA 151 151 151 151 \n", "CAMPO GRANDE 151 151 151 151 \n", "CUIABÁ 151 151 151 151 \n", "CURITIBA 151 151 151 151 \n", "FLORIANÓPOLIS 151 151 151 151 \n", "FORTALEZA 151 151 151 151 \n", "GOIÂNIA 151 151 151 151 \n", "JOÃO PESSOA 151 151 151 151 \n", "MACAPÁ 151 151 151 151 \n", "MACEIÓ 151 151 151 151 \n", "MANAUS 151 151 151 151 \n", "NATAL 151 151 151 151 \n", "PALMAS 151 151 151 151 \n", "PORTO ALEGRE 151 151 151 151 \n", "PORTO VELHO 151 151 151 151 \n", "RECIFE 151 151 151 151 \n", "RIO BRANCO 151 151 151 151 \n", "RIO DE JANEIRO 151 151 151 151 \n", "SALVADOR 151 151 151 151 \n", "SÃO LUÍS 151 151 151 151 \n", "SÃO PAULO 151 151 151 151 \n", "TERESINA 151 151 151 151 \n", "VITÓRIA 151 151 151 151 \n", "\n", " Health-region-code Health-region-name Date Week # \\\n", "Municipality \n", "ARACAJU 151 151 151 151 \n", "BELO HORIZONTE 151 151 151 151 \n", "BELÉM 151 151 151 151 \n", "BOA VISTA 151 151 151 151 \n", "BRASÍLIA 151 151 151 151 \n", "CAMPO GRANDE 151 151 151 151 \n", "CUIABÁ 151 151 151 151 \n", "CURITIBA 151 151 151 151 \n", "FLORIANÓPOLIS 151 151 151 151 \n", "FORTALEZA 151 151 151 151 \n", "GOIÂNIA 151 151 151 151 \n", "JOÃO PESSOA 151 151 151 151 \n", "MACAPÁ 151 151 151 151 \n", "MACEIÓ 151 151 151 151 \n", "MANAUS 151 151 151 151 \n", "NATAL 151 151 151 151 \n", "PALMAS 151 151 151 151 \n", "PORTO ALEGRE 151 151 151 151 \n", "PORTO VELHO 151 151 151 151 \n", "RECIFE 151 151 151 151 \n", "RIO BRANCO 151 151 151 151 \n", "RIO DE JANEIRO 151 151 151 151 \n", "SALVADOR 151 151 151 151 \n", "SÃO LUÍS 151 151 151 151 \n", "SÃO PAULO 151 151 151 151 \n", "TERESINA 151 151 151 151 \n", "VITÓRIA 151 151 151 151 \n", "\n", " Population as of 2019 Accumulated cases New cases \\\n", "Municipality \n", "ARACAJU 151 151 151 \n", "BELO HORIZONTE 151 151 151 \n", "BELÉM 151 151 151 \n", "BOA VISTA 151 151 151 \n", "BRASÍLIA 151 151 151 \n", "CAMPO GRANDE 151 151 151 \n", "CUIABÁ 151 151 151 \n", "CURITIBA 151 151 151 \n", "FLORIANÓPOLIS 151 151 151 \n", "FORTALEZA 151 151 151 \n", "GOIÂNIA 151 151 151 \n", "JOÃO PESSOA 151 151 151 \n", "MACAPÁ 151 151 151 \n", "MACEIÓ 151 151 151 \n", "MANAUS 151 151 151 \n", "NATAL 151 151 151 \n", "PALMAS 151 151 151 \n", "PORTO ALEGRE 151 151 151 \n", "PORTO VELHO 151 151 151 \n", "RECIFE 151 151 151 \n", "RIO BRANCO 151 151 151 \n", "RIO DE JANEIRO 151 151 151 \n", "SALVADOR 151 151 151 \n", "SÃO LUÍS 151 151 151 \n", "SÃO PAULO 151 151 151 \n", "TERESINA 151 151 151 \n", "VITÓRIA 151 151 151 \n", "\n", " Accumulated deaths New deaths New Recoveries \\\n", "Municipality \n", "ARACAJU 151 151 0 \n", "BELO HORIZONTE 151 151 0 \n", "BELÉM 151 151 0 \n", "BOA VISTA 151 151 0 \n", "BRASÍLIA 151 151 0 \n", "CAMPO GRANDE 151 151 0 \n", "CUIABÁ 151 151 0 \n", "CURITIBA 151 151 0 \n", "FLORIANÓPOLIS 151 151 0 \n", "FORTALEZA 151 151 0 \n", "GOIÂNIA 151 151 0 \n", "JOÃO PESSOA 151 151 0 \n", "MACAPÁ 151 151 0 \n", "MACEIÓ 151 151 0 \n", "MANAUS 151 151 0 \n", "NATAL 151 151 0 \n", "PALMAS 151 151 0 \n", "PORTO ALEGRE 151 151 0 \n", "PORTO VELHO 151 151 0 \n", "RECIFE 151 151 0 \n", "RIO BRANCO 151 151 0 \n", "RIO DE JANEIRO 151 151 0 \n", "SALVADOR 151 151 0 \n", "SÃO LUÍS 151 151 0 \n", "SÃO PAULO 151 151 0 \n", "TERESINA 151 151 0 \n", "VITÓRIA 151 151 0 \n", "\n", " New followups (?) Interior/Metropolitan \n", "Municipality \n", "ARACAJU 0 151 \n", "BELO HORIZONTE 0 151 \n", "BELÉM 0 151 \n", "BOA VISTA 0 151 \n", "BRASÍLIA 0 151 \n", "CAMPO GRANDE 0 151 \n", "CUIABÁ 0 151 \n", "CURITIBA 0 151 \n", "FLORIANÓPOLIS 0 151 \n", "FORTALEZA 0 151 \n", "GOIÂNIA 0 151 \n", "JOÃO PESSOA 0 151 \n", "MACAPÁ 0 151 \n", "MACEIÓ 0 151 \n", "MANAUS 0 151 \n", "NATAL 0 151 \n", "PALMAS 0 151 \n", "PORTO ALEGRE 0 151 \n", "PORTO VELHO 0 151 \n", "RECIFE 0 151 \n", "RIO BRANCO 0 151 \n", "RIO DE JANEIRO 0 151 \n", "SALVADOR 0 151 \n", "SÃO LUÍS 0 151 \n", "SÃO PAULO 0 151 \n", "TERESINA 0 151 \n", "VITÓRIA 0 151 " ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#ensuring only 151 days of data for each city\n", "brazil_covid_reduced.groupby(\"Municipality\").count()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "id": "e3RS29Umu0Jj" }, "outputs": [], "source": [ "#creating a days since outbreak column\n", "brazil_covid_reduced[\"days\"] = np.nan" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "id": "T7QUaL1mn0l4" }, "outputs": [], "source": [ "# for each city, adding values to the days column\n", "data_grouped = brazil_covid_reduced.groupby(\"Municipality\")\n", "for name, group in data_grouped:\n", " brazil_covid_reduced[\"days\"].loc[group.index[:151]] = range(151)\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "id": "yisUEpb-xvUP" }, "outputs": [], "source": [ "# filtering out the irrelevant columns and renaming for an easier merge\n", "brazil_covid_reduced = brazil_covid_reduced[[\"State\",'Population as of 2019','Accumulated cases','days']].rename(columns={\"State\":\"STATE_ABBR\", \"Population as of 2019\":\"pop\"})" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "id": "0YgDTFdc035y" }, "outputs": [], "source": [ "#merging all the data\n", "full_merge = brazil_covid_reduced.merge(capitol_data,how='inner', on =\"STATE_ABBR\")" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "id": "_LudGI2A6uym" }, "outputs": [], "source": [ "#changing population column from string to float\n", "full_merge[\"pop\"] = pd.to_numeric(full_merge[\"pop\"], downcast = \"float\")" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "id": "a_1g9ZpB1TVl" }, "outputs": [], "source": [ "# adding the days cubes, days square, population density and population density squared feature columns\n", "full_merge[\"days_cube\"] = full_merge[\"days\"]**3.0\n", "full_merge[\"days_sq\"] = full_merge[\"days\"]**2.0\n", "full_merge[\"pop_dense\"]=full_merge['pop']/full_merge['Size']\n", "full_merge[\"pop_dense_sq\"] = full_merge.pop_dense**2.0" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "id": "gzD1kunjOF_l" }, "outputs": [], "source": [ "#taking the relevant features columns from full_merge\n", "features = full_merge[[\"days_cube\",\"days_sq\",\"days\",\"temp\",\"pop_dense_sq\",\"pop_dense\",\"pop\"]]\n", "\n", "# taking the Accumulated cases column from full merge as the training response\n", "response = full_merge[[\"Accumulated cases\"]]" ] }, { "cell_type": "markdown", "metadata": { "id": "S86pqaveVrwA" }, "source": [ "## Featrues for the model" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "jP5FUR9zOF_m", "outputId": "4951ed22-7bfd-4cfb-b828-b6eb9e2eb74e" }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>days_cube</th>\n", " <th>days_sq</th>\n", " <th>days</th>\n", " <th>temp</th>\n", " <th>pop_dense_sq</th>\n", " <th>pop_dense</th>\n", " <th>pop</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>76.6</td>\n", " <td>7.126371</td>\n", " <td>2.669526</td>\n", " <td>407319.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>76.6</td>\n", " <td>7.126371</td>\n", " <td>2.669526</td>\n", " <td>407319.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>8.0</td>\n", " <td>4.0</td>\n", " <td>2.0</td>\n", " <td>76.6</td>\n", " <td>7.126371</td>\n", " <td>2.669526</td>\n", " <td>407319.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>27.0</td>\n", " <td>9.0</td>\n", " <td>3.0</td>\n", " <td>76.6</td>\n", " <td>7.126371</td>\n", " <td>2.669526</td>\n", " <td>407319.0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>64.0</td>\n", " <td>16.0</td>\n", " <td>4.0</td>\n", " <td>76.6</td>\n", " <td>7.126371</td>\n", " <td>2.669526</td>\n", " <td>407319.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " days_cube days_sq days temp pop_dense_sq pop_dense pop\n", "0 0.0 0.0 0.0 76.6 7.126371 2.669526 407319.0\n", "1 1.0 1.0 1.0 76.6 7.126371 2.669526 407319.0\n", "2 8.0 4.0 2.0 76.6 7.126371 2.669526 407319.0\n", "3 27.0 9.0 3.0 76.6 7.126371 2.669526 407319.0\n", "4 64.0 16.0 4.0 76.6 7.126371 2.669526 407319.0" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "features.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "6gqpkrjpV5FA" }, "source": [ "## Responses for the training set" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "M3CH9awgOF_m", "outputId": "5268df43-fcc0-4271-f139-8e52e218e88e" }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Accumulated cases</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>25</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>31</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>35</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>34</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Accumulated cases\n", "0 0\n", "1 25\n", "2 31\n", "3 35\n", "4 34" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "response.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "qXuF8srTCqXx" }, "source": [ "## Sample Implementation" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "id": "UehWZEwhAezA" }, "outputs": [], "source": [ "#creating a test and training set\n", "from sklearn.model_selection import train_test_split\n", "features_train, features_test, response_train, response_test = train_test_split(features, response, test_size = 0.2)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lg5A0P_1_OqZ", "outputId": "89ff873a-c661-4f10-d47e-83c5814a03da" }, "outputs": [ { "data": { "text/plain": [ "LogisticRegression()" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Logistic Regression Model Fitting\n", "from sklearn.linear_model import LogisticRegression\n", "log_reg = LogisticRegression()\n", "log_reg.fit(features_train,response_train)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "id": "CygFoFFiB95H" }, "outputs": [], "source": [ "# Finding the model predictions for the test set\n", "cases_predictions = log_reg.predict(features_test)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1CAOSweWBr2j", "outputId": "62127d26-3d70-459e-a05f-3bc38042c4f4" }, "outputs": [ { "data": { "text/plain": [ "53495.134803921566" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Evaluating model with mean_absolute_error\n", "from sklearn.metrics import mean_absolute_error\n", "\n", "log_mae = mean_absolute_error(response_test, cases_predictions)\n", "log_mae" ] }, { "cell_type": "markdown", "metadata": { "id": "cr_ce7yXDruv" }, "source": [ "Quite a large error. The model is underfitted. Features may not be representative enough and/or the model is not powerful enough among other potential reasons." ] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "Polynomial_data_wrangling.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 1 }