# Location of the coded food-choices survey (CSV).
food_url <- "https://wilkelab.org/DSC385/datasets/food_coded.csv"

# Download and parse the survey into a tibble.
food <- readr::read_csv(file = food_url)

# Auto-print the tibble for a first look at the rows and column types.
food
## # A tibble: 125 × 61
##    GPA   Gender breakfast calories_chicken calories_day calories_scone coffee
##    <chr>  <dbl>     <dbl>            <dbl>        <dbl>          <dbl>  <dbl>
##  1 2.4        2         1              430          NaN            315      1
##  2 3.654      1         1              610            3            420      2
##  3 3.3        1         1              720            4            420      2
##  4 3.2        1         1              430            3            420      2
##  5 3.5        1         1              720            2            420      2
##  6 2.25       1         1              610            3            980      2
##  7 3.8        2         1              610            3            420      2
##  8 3.3        1         1              720            3            420      1
##  9 3.3        1         1              430          NaN            420      1
## 10 3.3        1         1              430            3            315      2
## # ℹ 115 more rows
## # ℹ 54 more variables: comfort_food <chr>, comfort_food_reasons <chr>,
## #   comfort_food_reasons_coded...10 <dbl>, cook <dbl>,
## #   comfort_food_reasons_coded...12 <dbl>, cuisine <dbl>, diet_current <chr>,
## #   diet_current_coded <dbl>, drink <dbl>, eating_changes <chr>,
## #   eating_changes_coded <dbl>, eating_changes_coded1 <dbl>, eating_out <dbl>,
## #   employment <dbl>, ethnic_food <dbl>, exercise <dbl>, …

A detailed data dictionary for this dataset is available here. The dataset was originally downloaded from Kaggle, and you can find additional information about the dataset here.

Question: Is GPA related to student income, the father’s educational level, or the student’s perception of what an ideal diet is?

Introduction:

We will explore the food dataset (named “Food choices” on Kaggle) for this project. As loaded here, the dataset contains 125 college student responses regarding food choices, income, GPA and other personal details. To answer our project’s question (Is GPA related to student income, the father’s educational level, or the student’s perception of what an ideal diet is?), we will consider the following variables from the dataset: GPA, income, father_education and ideal_diet_coded.

Approach:

For our analysis, we will be following the steps below:

  1. Wrangle our dataset to ready it for analysis as follows:
  1. Create violin plots for income, father_education and ideal_diet_coded versus GPA

We use violin plots because each of the three explanatory variables is categorical: a violin shows the full distribution of the dependent variable (GPA) within each category, which makes it easy to compare the categories against one another.

Analysis:

# Data Wrangling
#
# Build the analysis table `food_wrangled` from `food`:
#   1. keep GPA and the three explanatory variables,
#   2. drop rows with a missing value in any of them,
#   3. turn GPA (read as text) into a number and drop rows where that fails,
#   4. replace the numeric codes with readable labels,
#   5. turn the labels into factors ordered by mean GPA.

food_wrangled <- food %>%
  select(GPA, income, father_education, ideal_diet_coded) %>%
  na.omit() %>%
  # GPA was parsed as <chr>, presumably because some responses contain
  # non-numeric text. Strip every non-digit character from both ends of each
  # entry, then convert what is left to a number.
  mutate(GPA = as.numeric(trimws(GPA, whitespace = "[^0-9]"))) %>%
  # Entries without any digit are now "" and convert to NA; entries that are
  # still not a number after trimming convert to NA as well (with a warning).
  # Drop both kinds so that no missing GPA reaches the group means below.
  filter(!is.na(GPA)) %>%
  # Codes without a label below become NA.
  mutate(
    income = case_when(
      income == 1 ~ "less than $15,000",
      income == 2 ~ "$15,001 to $30,000",
      income == 3 ~ "$30,001 to $50,000",
      income == 4 ~ "$50,001 to $70,000",
      income == 5 ~ "$70,001 to $100,000",
      income == 6 ~ "higher than $100,000"
    ),
    father_education = case_when(
      father_education == 1 ~ "less than high school",
      father_education == 2 ~ "high school degree",
      father_education == 3 ~ "some college degree",
      father_education == 4 ~ "college degree",
      father_education == 5 ~ "graduate degree"
    ),
    ideal_diet_coded = case_when(
      ideal_diet_coded == 1 ~ "portion control",
      ideal_diet_coded == 2 ~ "add fruit/veggies or eat healthier",
      ideal_diet_coded == 3 ~ "balance",
      ideal_diet_coded == 4 ~ "less sugar",
      ideal_diet_coded == 5 ~ "home cooked/organic",
      ideal_diet_coded == 6 ~ "current diet",
      ideal_diet_coded == 7 ~ "more protein",
      ideal_diet_coded == 8 ~ "unclear"
    )
  ) %>%
  # Order the levels of each factor by the mean GPA of its group, so the
  # violins are drawn from lowest to highest mean GPA.
  # NOTE(review): this discards the natural order of the income and education
  # brackets; confirm that ordering by mean GPA is what the figures should show.
  mutate(
    income = fct_reorder(as.factor(income), GPA, mean),
    father_education = fct_reorder(as.factor(father_education), GPA, mean),
    ideal_diet_coded = fct_reorder(as.factor(ideal_diet_coded), GPA, mean)
  )
# Violin Plots

# Draw the distribution of GPA within each level of one categorical column.
#
# data:     data frame holding a `GPA` column and the grouping column
# category: unquoted name of the grouping column (drawn on the x-axis)
# x_label:  title of the x-axis
# title:    title of the plot
# fill:     fill colour of the violins
#
# Returns the ggplot object, so a top-level call auto-prints the figure.
plot_gpa_violin <- function(data, category, x_label, title, fill) {
  ggplot(data, aes({{ category }}, GPA)) +
    geom_violin(fill = fill) +
    scale_x_discrete(name = x_label) +
    scale_y_continuous(name = "GPA") +
    ggtitle(title) +
    theme_bw(12)
}

plot_gpa_violin(
  food_wrangled, income,
  x_label = "Income Bracket",
  title = "GPA vs. Income Bracket",
  fill = "#E69F00"
)

plot_gpa_violin(
  food_wrangled, father_education,
  x_label = "Father's Education Category",
  title = "GPA vs. Father's Education",
  fill = "#56B4E9"
)

plot_gpa_violin(
  food_wrangled, ideal_diet_coded,
  x_label = "Student's Idea of Ideal Diet",
  title = "GPA vs. Ideal Diet",
  fill = "#009E73"
)

Discussion:

As apparent in the results, our charts show the following:

To obtain even more accurate results, we can consider implementing the following: