# Source path: CSC110 / assignments / a3 / a3_part1.py
"""CSC110 Fall 2022 Assignment 3, Part 1: Data Analysis with Toronto Health

Instructions (READ THIS FIRST!)
===============================

This Python module contains the functions you will complete for Part 1 of this assignment.

Copyright and Usage Information
===============================

This file is provided solely for the personal and private use of students
taking CSC110 at the University of Toronto St. George campus. All forms of
distribution of this code, whether as given or with any changes, are
expressly prohibited. For more information on copyright for CSC110 materials,
please consult our Course Syllabus.

This file is Copyright (c) 2022 David Liu, Tom Fairgrieve, and Angela Zavaleta Bernuy
"""
import csv
from dataclasses import dataclass

from plotly.express import scatter


###############################################################################
# Part 1(a)
###############################################################################
@dataclass
class HypertensionData:
    """A data class representing neighbourhood hypertension data.

    Each age group is described by a pair of counts: how many people in that group
    have hypertension, and how many people are in that group in total. The "all"
    pair is the one the functions in this module use for the '20+' age group.

    (You'll note that this data class has a lot of attributes! Another representation
    we could have used was to store the counts in a dictionary with appropriate keys.)

    Instance Attributes:
        - name: the name of the neighbourhood
        - num_hypertension_all: the number of people (aged 20+) in the neighbourhood with hypertension
        - num_all: the number of people (aged 20+) in the neighbourhood
        - num_hypertension_20_44: the number of people aged 20-44 in the neighbourhood with hypertension
        - num_20_44: the number of people aged 20-44 in the neighbourhood
        - num_hypertension_45_64: the number of people aged 45-64 in the neighbourhood with hypertension
        - num_45_64: the number of people aged 45-64 in the neighbourhood
        - num_hypertension_65_plus: the number of people aged 65 and older in the neighbourhood with hypertension
        - num_65_plus: the number of people aged 65 and older in the neighbourhood
    """
    name: str
    num_hypertension_all: int
    num_all: int
    num_hypertension_20_44: int
    num_20_44: int
    num_hypertension_45_64: int
    num_45_64: int
    num_hypertension_65_plus: int
    num_65_plus: int


@dataclass
class LowIncomeData:
    """A data class representing neighbourhood low income data.

    The low income rate of a neighbourhood is computed elsewhere in this module as
    num_low_income / population_total.

    Instance Attributes:
        - name: the name of the neighbourhood
        - num_low_income: the number of people in the neighbourhood with low income status
        - population_total: the total number of people in the neighbourhood
    """
    name: str
    num_low_income: int
    population_total: int


def load_hypertension_data(filename: str) -> list[HypertensionData]:
    """Return a list of HypertensionData based on the data in filename.

    The first row of the file is treated as a header and skipped. In every other row,
    column 0 is read as the neighbourhood name and columns 1-8 are converted to int,
    in the same order as the count attributes of HypertensionData.

    The returned list must match the same order the rows appear in the given file.
    A file that contains no rows at all produces an empty list.

    Preconditions:
    - filename refers to a csv file whose format matches the hypertension dataset description
      on the assignment handout.

    >>> data = load_hypertension_data('datasets/part1/hypertension_data_small.csv')
    >>> len(data)
    5
    """
    data_so_far = []

    # The csv module asks for files to be opened with newline='' so that the reader,
    # not the file object, decides how line endings (including ones inside quoted
    # fields) are interpreted.
    with open(filename, newline='') as f:
        reader = csv.reader(f, delimiter=',')

        # Skip the header row. The default value keeps an empty file from raising StopIteration.
        next(reader, None)

        for row in reader:
            # row is a list of strings: the name stays a str, every count is converted to int.
            list.append(data_so_far, HypertensionData(
                name=row[0],
                num_hypertension_all=int(row[1]),
                num_all=int(row[2]),
                num_hypertension_20_44=int(row[3]),
                num_20_44=int(row[4]),
                num_hypertension_45_64=int(row[5]),
                num_45_64=int(row[6]),
                num_hypertension_65_plus=int(row[7]),
                num_65_plus=int(row[8]),
            ))

    return data_so_far


def load_low_income_data(filename: str) -> list[LowIncomeData]:
    """Return a list of LowIncomeData values representing the data in filename.

    The first row of the file is treated as a header and skipped. In every other row,
    column 0 is read as the neighbourhood name, column 1 as the population total and
    column 2 as the number of people with low income status.

    The returned list must match the same order the rows appear in the given file.
    A file that contains no rows at all produces an empty list.

    Preconditions:
    - filename refers to a csv file whose format matches the low income dataset description
      on the assignment handout.

    >>> data = load_low_income_data('datasets/part1/low_income_data_small.csv')
    >>> len(data)
    5
    """
    data_so_far = []

    # The csv module asks for files to be opened with newline='' so that the reader,
    # not the file object, decides how line endings are interpreted.
    with open(filename, newline='') as f:
        reader = csv.reader(f, delimiter=',')

        # Skip the header row. The default value keeps an empty file from raising StopIteration.
        next(reader, None)

        for row in reader:
            # row is a list of strings. Keyword arguments are used because the file's column
            # order (total, then low income count) is the reverse of the attribute order.
            # NOTE(review): column positions are taken from the existing code; confirm
            # against the dataset description on the handout.
            list.append(data_so_far, LowIncomeData(
                name=row[0],
                num_low_income=int(row[2]),
                population_total=int(row[1]),
            ))

    return data_so_far


###############################################################################
# Part 1(b)
###############################################################################
def total_num_hypertension(data: list[HypertensionData]) -> int:
    """Return the total number of people aged 20+ with hypertension in the given data.

    The total is the sum of num_hypertension_all over every neighbourhood in data,
    so an empty list gives 0.

    Preconditions:
    - data does not contain any duplicated neighbourhood names

    >>> data = load_hypertension_data('datasets/part1/hypertension_data_small.csv')
    >>> total_num_hypertension(data)
    23205

    """
    return sum(neighbourhood.num_hypertension_all for neighbourhood in data)


def high_hypertension_rate(data: list[HypertensionData], threshold: float) -> set[str]:
    """Return the set of names of the neighbourhoods in the given data whose hypertension rate is >= threshold.

    The *hypertension rate* of a neighbourhood is defined as:

        (# people aged 20+ with hypertension) / (# people aged 20+)

    which is computed here as num_hypertension_all / num_all. If no neighbourhood
    meets the threshold (or data is empty), the empty set is returned.

    Preconditions:
    - data does not contain any duplicated neighbourhood names
    - 0.0 <= threshold <= 1.0

    >>> data = load_hypertension_data('datasets/part1/hypertension_data_small.csv')
    >>> result = high_hypertension_rate(data, 0.24)
    >>> result == {'Thistletown-Beaumond Heights', 'Rexdale-Kipling'}
    True
    """
    # A set comprehension is used so that the returned value matches the declared
    # return type set[str] (the order of the names carries no meaning).
    return {
        neighbourhood.name
        for neighbourhood in data
        if neighbourhood.num_hypertension_all / neighbourhood.num_all >= threshold
    }


def get_hypertension_rates(data: list[HypertensionData], age_group: str) -> dict[str, float]:
    """Return a dictionary mapping each given neighbourhood's name to the neighbourhood's hypertension rate
    for the given age group.

    The rate for an age group is (# people in the group with hypertension) / (# people in the group).

    age_group specifies which group to calculate the hypertension rates for, and can be one of:
        - '20+': all people aged 20+, i.e., the whole dataset
        - '20-44': only people aged 20-44
        - '45-64': only people aged 45-64
        - '65+': only people aged 65+

    Preconditions:
    - data does not contain any duplicated neighbourhood names
    - age_group in {'20+', '20-44', '45-64', '65+'}

    >>> data = load_hypertension_data('datasets/part1/hypertension_data_small.csv')
    >>> result = get_hypertension_rates(data, '65+')
    >>> len(result)
    5

    >>> data = load_hypertension_data('datasets/part1/hypertension_data_small.csv')
    >>> result = get_hypertension_rates(data, '20-44')
    >>> round(result['Thistletown-Beaumond Heights'], 4)  # For testing purposes, round to 4 decimal places
    0.0606
    """
    # For each age group: (attribute holding the hypertension count, attribute holding the group size).
    attributes_by_group = {
        '20+': ('num_hypertension_all', 'num_all'),
        '20-44': ('num_hypertension_20_44', 'num_20_44'),
        '45-64': ('num_hypertension_45_64', 'num_45_64'),
        '65+': ('num_hypertension_65_plus', 'num_65_plus'),
    }

    # An age group outside the precondition matches nothing, so no rates are produced.
    if age_group not in attributes_by_group:
        return {}

    with_hypertension, group_size = attributes_by_group[age_group]
    return {
        neighbourhood.name: getattr(neighbourhood, with_hypertension) / getattr(neighbourhood, group_size)
        for neighbourhood in data
    }


###############################################################################
# Part 1(c)
###############################################################################
@dataclass
class CombinedRateData:
    """A data class representing neighbourhood hypertension and low income rate data.

    Both rates are proportions (a count divided by a population), not percentages.

    Instance Attributes:
        - name: the name of the neighbourhood
        - hypertension_rate: the hypertension rate for a particular age group in the neighbourhood
            NOTE: this attribute will be used to store rates for different age groups,
                  e.g. "people aged 20+" or "people aged 45-64". The instance itself does
                  not record which age group the rate was computed for.
        - low_income_rate: the proportion of neighbourhood residents with low income status

    Representation Invariants:
    - 0.0 <= self.hypertension_rate <= 1.0
    - 0.0 <= self.low_income_rate <= 1.0
    """
    name: str
    hypertension_rate: float
    low_income_rate: float


def combine_rates(hypertension_data: list[HypertensionData],
                  low_income_data: list[LowIncomeData],
                  age_group: str) -> list[CombinedRateData]:
    """Return a list of CombinedRateData values for the neighbourhoods in both hypertension_data and low_income_data.

    The age_group parameter determines what age group to calculate hypertension rates for. It has the same
    meaning as the age_group parameter for get_hypertension_rates from Part 1(b).

    Review the above data class definition for CombinedRateData to understand what pieces of information each
    instance of CombinedRateData should store. To compute the low income rate, use the population total in
    the LowIncomeData instance.

    Note that you should NOT assume that hypertension_data and low_income_data store the same neighbourhoods,
    or have a particular order. If a neighbourhood appears in one of the input lists but not the other,
    that neighbourhood should NOT be included in the returned list.

    Neighbourhoods are matched on their exact name, and the returned list follows the order in which
    the matched neighbourhoods appear in hypertension_data.

    Preconditions:
    - neighbourhood names in hypertension_data are unique
    - neighbourhood names in low_income_data are unique
    - age_group in {'20+', '20-44', '45-64', '65+'}

    >>> example_hypertension_data = load_hypertension_data('datasets/part1/hypertension_data_small.csv')
    >>> example_low_income_data = load_low_income_data('datasets/part1/low_income_data_small.csv')
    >>> example_combined_data = combine_rates(example_hypertension_data, example_low_income_data, '20+')
    >>> len(example_combined_data)
    5

    HINTS:
    1. You may find the get_hypertension_rates function from above useful, and may wish to define a similar
       "get_low_income_rates" function.
    2. Remember that you can check whether a given value k is a key in a dictionary using the "in" operator.
    """
    hypertension_rates = get_hypertension_rates(hypertension_data, age_group)

    # Index the low income records by name so that each neighbourhood is found with one
    # exact-match lookup. (Using "in" between two strings would be a substring test, which
    # wrongly pairs a name with any longer name that happens to contain it.)
    low_income_by_name = {record.name: record for record in low_income_data}

    combined_so_far = []
    for name, hypertension_rate in hypertension_rates.items():
        if name in low_income_by_name:
            record = low_income_by_name[name]
            # The low income rate is only computed for neighbourhoods that are actually kept.
            list.append(combined_so_far, CombinedRateData(
                name=name,
                hypertension_rate=hypertension_rate,
                low_income_rate=record.num_low_income / record.population_total,
            ))
    return combined_so_far


def plot_combined_rates(neighbourhood_data: list[CombinedRateData], age_group: str) -> None:
    """Display a scatterplot of the neighbourhood low income rates vs. hypertension rates, using plotly.

    Low income rates are plotted on the x-axis and hypertension rates on the y-axis.
    Also label each point in the scatterplot using the name of the neighbourhood.
    age_group is used to label the y-axis with the correct age group for the hypertension rates;
    it only appears in that label and does not change which values are plotted.

    This function returns nothing: its only effect is to show the figure.

    Preconditions:
    - neighbourhood_data does not contain any duplicated neighbourhood names
    - age_group in {'20+', '20-44', '45-64', '65+'}

    NOTE: You should NOT modify this function, but should be able to (roughly) understand what
    it is doing.
    """
    figure = scatter(
        data_frame=neighbourhood_data,  # The data to plot (in our case, a list of data class instances)
        x='low_income_rate',            # The instance attribute name to use for x values
        y='hypertension_rate',          # The instance attribute name to use for y values
        hover_name='name',              # The instance attribute name to use for point labels
        title='Low Income vs. Hypertension Rates by Toronto Neighbourhood',  # The graph title
        labels={
            'low_income_rate': 'Proportion of Residents with Low Income Status',                  # Label for the x-axis
            'hypertension_rate': f'Proportion of Residents (aged {age_group}) with Hypertension'  # Label for the y-axis
        }
    )

    # Show the figure in the browser
    figure.show()
    # Is the above not working for you? Comment out that line of code, and uncomment the following line:
    # figure.write_html('my_figure.html')
    # This will create a new file called 'my_figure.html', which you can manually open in your web browser.


def part1_example(hypertension_file: str, low_income_file: str, age_group: str) -> None:
    """Display a scatterplot comparing the low income and hypertension rates in the given datasets.

    The two files are loaded (hypertension data first), combined into one record per neighbourhood
    found in both, and then plotted. age_group is used to specify which age group to compute
    hypertension rates for.

    Preconditions:
    - age_group in {'20+', '20-44', '45-64', '65+'}
    - hypertension_file refers to a csv file whose format matches the hypertension dataset description
      on the assignment handout.
    - low_income_file refers to a csv file whose format matches the low income dataset description
      on the assignment handout.

    HINTS:
    - This is a "putting it all together" function, so the actual code here should be pretty
      simple, and mainly call functions you've already implemented above!

    Example calls (left commented out so that they are not run as doctests):
    # >>> part1_example('datasets/part1/hypertension_data_small.csv', 'datasets/part1/low_income_data_small.csv', '20+')
    # >>> part1_example('datasets/part1/hypertension_data_small.csv', 'datasets/part1/low_income_data_small.csv', '65+')
    """
    hypertension_records = load_hypertension_data(hypertension_file)
    low_income_records = load_low_income_data(low_income_file)
    plot_combined_rates(combine_rates(hypertension_records, low_income_records, age_group), age_group)


if __name__ == '__main__':
    # Run the doctest examples in this module. verbose=True reports every example,
    # not only the ones that fail.
    import doctest
    doctest.testmod(verbose=True)

    # Check this file with python_ta, using the configuration below.
    # (To skip this check, comment these lines out: in PyCharm, select the lines below
    # and press Ctrl/Cmd + / to toggle comments.)
    import python_ta
    python_ta.check_all(config={
        'max-line-length': 120,
        'disable': ['too-many-instance-attributes'],
        'allowed-io': ['load_hypertension_data', 'load_low_income_data'],
        'extra-imports': ['csv', 'plotly.express'],
    })