# CSC108-Fall-2022-A3 / a3.py
"""CSC108: Fall 2022 -- Assignment 3: Hypertension and Low Income

This code is provided solely for the personal and private use of
students taking the CSC108/CSCA08 course at the University of
Toronto. Copying for purposes other than this use is expressly
prohibited. All forms of distribution of this code, whether as given
or with any changes, are expressly prohibited.

All of the files in this directory and all subdirectories are:
Copyright (c) 2022 Jacqueline Smith and David Liu
"""
from typing import TextIO
import statistics  # Note that this requires Python 3.10

# Keys of the inner dictionary stored for each neighbourhood in CityData
# (see SAMPLE_DATA below for a complete example).
ID = "id"
HT_KEY = "hypertension"
TOTAL = "total"
LOW_INCOME = "low_income"

# Indexes in the inner lists of hypertension data in CityData
# HT is an abbreviation of hypertension, NBH is an abbreviation of neighbourhood
# For each age group the HT_* slot is divided by the matching NBH_* slot to
# obtain that age group's hypertension rate.
HT_20_44 = 0
NBH_20_44 = 1
HT_45_64 = 2
NBH_45_64 = 3
HT_65_UP = 4
NBH_65_UP = 5

# columns in input files
# Both input files start each row with the id and the neighbourhood name.
# In the hypertension file every column from POP_COL onward is one of the
# six hypertension values; in the low income file POP_COL holds the total
# population and LI_POP_COL the low income population.
ID_COL = 0
NBH_NAME_COL = 1
POP_COL = 2
LI_POP_COL = 3

# Example CityData used by the doctests in this file: neighbourhood name ->
# {"id", "hypertension", "total", "low_income"}. The six "hypertension"
# values are laid out according to the HT_* / NBH_* index constants above.
SAMPLE_DATA = {
    "West Humber-Clairville": {
        "id": 1,
        "hypertension": [703, 13291, 3741, 9663, 3959, 5176],
        "total": 33230,
        "low_income": 5950,
    },
    "Mount Olive-Silverstone-Jamestown": {
        "id": 2,
        "hypertension": [789, 12906, 3578, 8815, 2927, 3902],
        "total": 32940,
        "low_income": 9690,
    },
    "Thistletown-Beaumond Heights": {
        "id": 3,
        "hypertension": [220, 3631, 1047, 2829, 1349, 1767],
        "total": 10365,
        "low_income": 2005,
    },
    "Rexdale-Kipling": {
        "id": 4,
        "hypertension": [201, 3669, 1134, 3229, 1393, 1854],
        "total": 10540,
        "low_income": 2140,
    },
    "Elms-Old Rexdale": {
        "id": 5,
        "hypertension": [176, 3353, 1040, 2842, 948, 1322],
        "total": 9460,
        "low_income": 2315,
    },
}

# Constructed SAMPLE_DATA_2 for testing purposes.
# Same as SAMPLE_DATA except for the last entry, which is named
# "Rexdale-Kipling TWO" and has the same "total" (10540) as "Rexdale-Kipling".
# NOTE(review): presumably built to exercise equal-population ties in
# get_bigger_neighbourhood -- confirm with the author.
SAMPLE_DATA_2 = {
    "West Humber-Clairville": {
        "id": 1,
        "hypertension": [703, 13291, 3741, 9663, 3959, 5176],
        "total": 33230,
        "low_income": 5950,
    },
    "Mount Olive-Silverstone-Jamestown": {
        "id": 2,
        "hypertension": [789, 12906, 3578, 8815, 2927, 3902],
        "total": 32940,
        "low_income": 9690,
    },
    "Thistletown-Beaumond Heights": {
        "id": 3,
        "hypertension": [220, 3631, 1047, 2829, 1349, 1767],
        "total": 10365,
        "low_income": 2005,
    },
    "Rexdale-Kipling": {
        "id": 4,
        "hypertension": [201, 3669, 1134, 3229, 1393, 1854],
        "total": 10540,
        "low_income": 2140,
    },
    "Rexdale-Kipling TWO": {
        "id": 5,
        "hypertension": [176, 3353, 1040, 2842, 948, 1322],
        "total": 10540,
        "low_income": 2315,
    },
}


def get_hypertension_data(input_dict: dict, file: TextIO) -> None:
    """Modify input_dict such that it will contain the hypertension data from
    file.

    file is open for reading. Its first line is a header and is skipped.
    Every other non-blank line is a comma-separated row holding the
    neighbourhood id, the neighbourhood name, and then the hypertension
    values. A neighbourhood that is not yet in input_dict is added together
    with its id; for a neighbourhood that is already present only its
    hypertension list is written (replacing any previous one). Blank lines
    are ignored rather than treated as the end of the data.

    >>> input_file = open('hypertension_data_small.csv')
    >>> input_dict = {}
    >>> get_hypertension_data(input_dict, input_file)
    >>> input_dict
    {'West Humber-Clairville': {'id': 1, 'hypertension': [703, 13291, 3741, \
9663, 3959, 5176]}, 'Mount Olive-Silverstone-Jamestown': {'id': 2, \
'hypertension': [789, 12906, 3578, 8815, 2927, 3902]}, \
'Thistletown-Beaumond Heights': {'id': 3, \
'hypertension': [220, 3631, 1047, 2829, 1349, 1767]}, \
'Rexdale-Kipling': {'id': 4, \
'hypertension': [201, 3669, 1134, 3229, 1393, 1854]}, \
'Elms-Old Rexdale': {'id': 5, \
'hypertension': [176, 3353, 1040, 2842, 948, 1322]}}

    >>> input_file = open('hypertension_data_small.csv')
    >>> input_dict = {'West Humber-Clairville': {'id': 1, 'total': 33230, \
'low_income': 5950}, 'Mount Olive-Silverstone-Jamestown': {'id': 2, \
'total': 32940, 'low_income': 9690}, 'Thistletown-Beaumond Heights': {'id': 3, \
'total': 10365, 'low_income': 2005}, 'Rexdale-Kipling': {'id': 4, \
'total': 10540, 'low_income': 2140}, 'Elms-Old Rexdale': {'id': 5, \
'total': 9460, 'low_income': 2315}}
    >>> get_hypertension_data(input_dict, input_file)
    >>> input_dict == SAMPLE_DATA
    True
    """
    file.readline()  # discard the header row

    for raw_line in file:
        row = raw_line.strip()
        if not row:
            # A blank line is not a record; skipping it (instead of stopping)
            # keeps the rows that follow it from being silently dropped.
            continue

        fields = row.split(',')
        name = fields[NBH_NAME_COL]
        if name not in input_dict:
            # First time this neighbourhood is seen: record its id as well.
            input_dict[name] = {ID: int(fields[ID_COL])}
        input_dict[name][HT_KEY] = [int(value) for value in fields[POP_COL:]]
        
        
# When this file is run as a script, run the module's doctests.
# NOTE(review): the same guard is repeated after most functions below, so
# the doctests defined above are executed again by each later guard.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
    
    
def get_low_income_data(input_dict: dict, file: TextIO) -> None:
    """Modify input_dict such that it will contain the low income data from
    file.

    file is open for reading. Its first line is a header and is skipped.
    Every other non-blank line is a comma-separated row holding the
    neighbourhood id, the neighbourhood name, the total population and the
    low income population. A neighbourhood that is not yet in input_dict is
    added together with its id; for a neighbourhood that is already present
    only its total and low income values are written. Blank lines are ignored
    rather than treated as the end of the data.

    >>> input_file = open('low_income_small.csv')
    >>> input_dict = {}
    >>> get_low_income_data(input_dict, input_file)
    >>> input_dict
    {'West Humber-Clairville': {'id': 1, 'total': 33230, 'low_income': 5950}, \
'Mount Olive-Silverstone-Jamestown': {'id': 2, 'total': 32940, \
'low_income': 9690}, 'Thistletown-Beaumond Heights': {'id': 3, 'total': 10365, \
'low_income': 2005}, 'Rexdale-Kipling': {'id': 4, 'total': 10540, \
'low_income': 2140}, 'Elms-Old Rexdale': {'id': 5, 'total': 9460, \
'low_income': 2315}}

    >>> input_file = open('low_income_small.csv')
    >>> input_dict = {"West Humber-Clairville": {"id": 1, \
"hypertension": [703, 13291, 3741, 9663, 3959, 5176]}, \
"Mount Olive-Silverstone-Jamestown": {"id": 2, \
"hypertension": [789, 12906, 3578, 8815, 2927, 3902]}, \
"Thistletown-Beaumond Heights": {"id": 3, \
"hypertension": [220, 3631, 1047, 2829, 1349, 1767]}, \
"Rexdale-Kipling": {"id": 4, \
"hypertension": [201, 3669, 1134, 3229, 1393, 1854]}, \
"Elms-Old Rexdale": {"id": 5, \
"hypertension": [176, 3353, 1040, 2842, 948, 1322]}}
    >>> get_low_income_data(input_dict, input_file)
    >>> input_dict == SAMPLE_DATA
    True
    """
    file.readline()  # discard the header row

    for raw_line in file:
        row = raw_line.strip()
        if not row:
            # A blank line is not a record; skipping it (instead of stopping)
            # keeps the rows that follow it from being silently dropped.
            continue

        fields = row.split(',')
        name = fields[NBH_NAME_COL]
        if name not in input_dict:
            # First time this neighbourhood is seen: record its id as well.
            input_dict[name] = {ID: int(fields[ID_COL])}

        entry = input_dict[name]
        entry[TOTAL] = int(fields[POP_COL])
        entry[LOW_INCOME] = int(fields[LI_POP_COL])
    
        
# When this file is run as a script, run the module's doctests.
# NOTE(review): redundant with the identical guards elsewhere in this file;
# each one re-runs the doctests of everything defined before it.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
    
    
def get_bigger_neighbourhood(data: 'CityData', neighbourhood_1: str,
                             neighbourhood_2: str) -> str:
    """Return the name of the neighbourhood, between neighbourhood_1 and
    neighbourhood_2, that has a higher population according to the low income
    data. If either neighbourhood_1 or neighbourhood_2 is not in data,
    assume the population of that neighbourhood is 0. If the two
    neighbourhoods are the same size, return neighbourhood_1.

    Precondition: The two neighbourhood names are different

    >>> get_bigger_neighbourhood(SAMPLE_DATA, 'Elms-Old Rexdale',
    ...                          'Rexdale-Kipling')
    'Rexdale-Kipling'

    >>> get_bigger_neighbourhood(SAMPLE_DATA,
    ...                          'Mount Olive-Silverstone-Jamestown',
    ...                          'Thistletown-Beaumond Heights')
    'Mount Olive-Silverstone-Jamestown'

    >>> get_bigger_neighbourhood(SAMPLE_DATA, 'Nowhere', 'Rexdale-Kipling')
    'Rexdale-Kipling'

    >>> get_bigger_neighbourhood(SAMPLE_DATA_2, 'Rexdale-Kipling TWO',
    ...                          'Rexdale-Kipling')
    'Rexdale-Kipling TWO'
    """
    # Look each neighbourhood up on its own. (Testing
    # "(name_1 and name_2) not in data" only ever checks one of the names.)
    population_1 = 0
    if neighbourhood_1 in data:
        population_1 = data[neighbourhood_1][TOTAL]

    population_2 = 0
    if neighbourhood_2 in data:
        population_2 = data[neighbourhood_2][TOTAL]

    # Strict comparison so that a tie (including 0 vs 0) goes to
    # neighbourhood_1.
    if population_2 > population_1:
        return neighbourhood_2
    return neighbourhood_1


# When this file is run as a script, run the module's doctests.
# NOTE(review): redundant with the identical guards elsewhere in this file;
# each one re-runs the doctests of everything defined before it.
if __name__ == '__main__':
    import doctest
    doctest.testmod()


def build_neighbourhood_to_hypertension_rate(data: 'CityData') -> dict:
    """Return a new dictionary whose keys are the neighbourhood names in data
    and whose values are the hypertension rates of those neighbourhoods.

    A neighbourhood's rate is the sum of its three hypertension values
    divided by the sum of its three neighbourhood values.

    >>> build_neighbourhood_to_hypertension_rate(SAMPLE_DATA)
    {'West Humber-Clairville': 0.2987202275151084,\
 'Mount Olive-Silverstone-Jamestown': 0.28466612028255867,\
 'Thistletown-Beaumond Heights': 0.31797739151574084,\
 'Rexdale-Kipling': 0.3117001828153565,\
 'Elms-Old Rexdale': 0.2878808035120394}
    """
    rate_by_name = {}

    for name, record in data.items():
        counts = record[HT_KEY]
        with_hypertension = (counts[HT_20_44]
                             + counts[HT_45_64]
                             + counts[HT_65_UP])
        residents = (counts[NBH_20_44]
                     + counts[NBH_45_64]
                     + counts[NBH_65_UP])
        rate_by_name[name] = with_hypertension / residents

    return rate_by_name


# When this file is run as a script, run the module's doctests.
# NOTE(review): redundant with the identical guards elsewhere in this file;
# each one re-runs the doctests of everything defined before it.
if __name__ == '__main__':
    import doctest
    doctest.testmod()

       
def get_high_hypertension_rate(data: 'CityData',
                               threshold: float) -> list[tuple[str, float]]:
    """Return a list of (neighbourhood_name, hypertension_rate) tuples, one
    for every neighbourhood in data whose hypertension rate is greater than
    or equal to threshold. The tuples appear in the same order as the
    neighbourhoods in data.

    Precondition: 0.0 <= threshold <= 1.0

    >>> get_high_hypertension_rate(SAMPLE_DATA, 0.3)
    [('Thistletown-Beaumond Heights', 0.31797739151574084),\
 ('Rexdale-Kipling', 0.3117001828153565)]

    >>> get_high_hypertension_rate(SAMPLE_DATA, 0.2)
    [('West Humber-Clairville', 0.2987202275151084),\
 ('Mount Olive-Silverstone-Jamestown', 0.28466612028255867),\
 ('Thistletown-Beaumond Heights', 0.31797739151574084),\
 ('Rexdale-Kipling', 0.3117001828153565),\
 ('Elms-Old Rexdale', 0.2878808035120394)]
    """
    rate_by_name = build_neighbourhood_to_hypertension_rate(data)

    return [(name, rate)
            for name, rate in rate_by_name.items()
            if rate >= threshold]


# When this file is run as a script, run the module's doctests.
# NOTE(review): redundant with the identical guards elsewhere in this file;
# each one re-runs the doctests of everything defined before it.
if __name__ == '__main__':
    import doctest
    doctest.testmod()


def build_neighbourhood_to_low_income_rate(data: 'CityData') -> dict:
    """Return a new dictionary whose keys are the neighbourhood names in data
    and whose values are the low income rates of those neighbourhoods, i.e.
    the low income population divided by the total population.

    >>> build_neighbourhood_to_low_income_rate(SAMPLE_DATA)
    {'West Humber-Clairville': 0.1790550707192296,\
 'Mount Olive-Silverstone-Jamestown': 0.2941712204007286,\
 'Thistletown-Beaumond Heights': 0.19343945972021226,\
 'Rexdale-Kipling': 0.2030360531309298,\
 'Elms-Old Rexdale': 0.24471458773784355}
    """
    return {name: record[LOW_INCOME] / record[TOTAL]
            for name, record in data.items()}


# When this file is run as a script, run the module's doctests.
# NOTE(review): redundant with the identical guards elsewhere in this file;
# each one re-runs the doctests of everything defined before it.
if __name__ == '__main__':
    import doctest
    doctest.testmod()


def get_ht_to_low_income_ratios(data: 'CityData') -> dict[str, float]:
    """Return a new dictionary whose keys are the neighbourhood names in data
    and whose values are, for each neighbourhood, its hypertension rate
    divided by its low income rate.

    >>> get_ht_to_low_income_ratios(SAMPLE_DATA)
    {'West Humber-Clairville': 1.6683148168616895,\
 'Mount Olive-Silverstone-Jamestown': 0.9676885451091314,\
 'Thistletown-Beaumond Heights': 1.6438083107534431,\
 'Rexdale-Kipling': 1.5351962275111484,\
 'Elms-Old Rexdale': 1.1763941257986577}
    """
    ht_rate_of = build_neighbourhood_to_hypertension_rate(data)
    li_rate_of = build_neighbourhood_to_low_income_rate(data)

    return {name: ht_rate_of[name] / li_rate_of[name] for name in data}


# When this file is run as a script, run the module's doctests.
# NOTE(review): redundant with the identical guards elsewhere in this file;
# each one re-runs the doctests of everything defined before it.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
    
    
def calculate_ht_rates_by_age_group(data: 'CityData',
                                    nbh_name: str) -> tuple[float,
                                                            float, float]:
    """Return the hypertension rates of neighbourhood nbh_name in data as a
    tuple of three percentages, one per age group, in the order 20-44,
    45-64, 65 and up.

    >>> calculate_ht_rates_by_age_group(SAMPLE_DATA, 'Elms-Old Rexdale')
    (5.24903071875932, 36.593947923997185, 71.70953101361573)
    """
    counts = data[nbh_name][HT_KEY]

    # (index of the hypertension value, index of the neighbourhood value)
    # for each age group, youngest first.
    age_groups = ((HT_20_44, NBH_20_44),
                  (HT_45_64, NBH_45_64),
                  (HT_65_UP, NBH_65_UP))

    return tuple((counts[ht_index] / counts[nbh_index]) * 100
                 for ht_index, nbh_index in age_groups)


# When this file is run as a script, run the module's doctests.
# NOTE(review): redundant with the identical guards elsewhere in this file;
# each one re-runs the doctests of everything defined before it.
if __name__ == '__main__':
    import doctest
    doctest.testmod()


def get_age_standardized_ht_rate(ndata: 'CityData', name: str) -> float:
    """Return the age standardized hypertension rate of the neighbourhood
    called name in ndata: its three per-age-group rates, each weighted by
    that age group's share of the reference population and added together.

    Precondition: name is in ndata

    >>> get_age_standardized_ht_rate(SAMPLE_DATA, 'Elms-Old Rexdale')
    24.44627521389894
    >>> get_age_standardized_ht_rate(SAMPLE_DATA, 'Rexdale-Kipling')
    24.72562462246556
    """
    rate_20_44, rate_45_64, rate_65_up = \
        calculate_ht_rates_by_age_group(ndata, name)

    # Weights are normalized for only 20+ ages, using the census data
    # that our datasets are based on: (number in age group) / (number of 20+).
    weight_20_44 = 11_199_830 / 19_735_665
    weight_45_64 = 5_365_865 / 19_735_665
    weight_65_up = 3_169_970 / 19_735_665

    # Added left to right, youngest group first.
    return (rate_20_44 * weight_20_44
            + rate_45_64 * weight_45_64
            + rate_65_up * weight_65_up)


# When this file is run as a script: run the module's doctests, then rebuild
# the sample data from the two small CSV files and print whether the result
# matches SAMPLE_DATA.
if __name__ == "__main__":
    import doctest
    doctest.testmod()

    # Using the small data files
    small_data = {}

    # Add hypertension data ("with" closes the file even if loading fails)
    with open("hypertension_data_small.csv") as ht_file:
        get_hypertension_data(small_data, ht_file)

    # Add low income data
    with open("low_income_small.csv") as li_file:
        get_low_income_data(small_data, li_file)

    # Created dictionary should be the same as SAMPLE_DATA
    print(small_data == SAMPLE_DATA)


def get_stats_summary(data: 'CityData') -> float:
    """Return the correlation, computed over all neighbourhoods in data,
    between the age standardized hypertension rate and the low income rate.

    >>> get_stats_summary(SAMPLE_DATA)
    0.28509539188554994
    """
    low_income_rate_of = build_neighbourhood_to_low_income_rate(data)

    # Build the two series in the same neighbourhood order so that the
    # values at each position belong to the same neighbourhood.
    names = list(data)
    hypertension_series = [get_age_standardized_ht_rate(data, name)
                           for name in names]
    low_income_series = [low_income_rate_of[name] for name in names]

    return statistics.correlation(hypertension_series, low_income_series)


# When this file is run as a script, run the module's doctests.
# NOTE(review): redundant with the identical guards elsewhere in this file;
# each one re-runs the doctests of everything defined before it.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
    
    
def order_by_ht_rate(data: 'CityData') -> list[str]:
    """Return a list of the names of the neighbourhoods in data, ordered from
    lowest to highest age-standardized hypertension rate.

    Each name appears exactly once. Should two neighbourhoods share a rate,
    they keep the relative order they have in data.

    Precondition: Every neighbourhood has a unique hypertension rate.

    >>> order_by_ht_rate(SAMPLE_DATA)
    ['Elms-Old Rexdale', 'Rexdale-Kipling', 'Thistletown-Beaumond Heights', \
'West Humber-Clairville', 'Mount Olive-Silverstone-Jamestown']
    """
    # Compute every rate once, then let a single stable sort order the names;
    # this avoids rescanning all neighbourhoods for each sorted rate.
    rate_of = {nbh: get_age_standardized_ht_rate(data, nbh) for nbh in data}
    return sorted(rate_of, key=rate_of.get)


# When this file is run as a script, run the module's doctests.
# NOTE(review): this is the last of several identical guards in this file;
# by this point every function above has been defined.
if __name__ == "__main__":
    import doctest
    doctest.testmod()