Gaza-Mortality-Survey / Combined_various_calculations.R
Combined_various_calculations.R
Raw
library(tidyverse)
library(haven)
library(gt)

# Load the main data files
fates <- read_sav("GMS Household roster.sav")

# Recode governorates: in the raw file HR00=3 is Khan Younis and HR00=4 is
# Deir al-Balah (questionnaire order). Swap to standard order.
fates <- fates %>% mutate(HR00=case_when(HR00==1~1,
                                          HR00==2~2,
                                          HR00==3~4,
                                          HR00==4~3,
                                          HR00==5~5))

births <- read_sav("GMS Births.sav")
births <- births %>% mutate(HR00=case_when(HR00==1~1,
                                            HR00==2~2,
                                            HR00==3~4,
                                            HR00==4~3,
                                            HR00==5~5))

#Fates - 8,740 still present in the household, 281 left Gaza, 137 moved
#within Gaza, 457 dead, 62 missing, 52 imprisoned

nrow(filter(fates, HR05 == 1))
nrow(filter(fates, HR05 == 2))
nrow(filter(fates, HR05 == 3))
nrow(filter(fates, HR05 == 4))
nrow(filter(fates, HR05 == 5))
nrow(filter(fates, HR05 == 6))

#Types of deaths - 14 nonviolent with medical aid, 45 nonviolent 
#without medical aid, 5 accidental, 393 violent

nrow(filter(fates, HR06 == 1))
nrow(filter(fates, HR06 == 2))
nrow(filter(fates, HR06 == 3))
nrow(filter(fates, HR06 == 4))
nrow(filter(fates, HR06 == 5))

#Sample demographics (male == 1) - Note that small children with ages
#less than 1 were coded as having ages of 1

nrow(filter(fates, HR04 == 1, HR03 < 6))
nrow(filter(fates, HR04 == 2, HR03 < 6))
nrow(filter(fates, HR04 == 1, HR03 > 5, HR03 < 15))
nrow(filter(fates, HR04 == 2, HR03 > 5, HR03 < 15))
nrow(filter(fates, HR04 == 1, HR03 > 14, HR03 < 18))
nrow(filter(fates, HR04 == 2, HR03 > 14, HR03 < 18))
nrow(filter(fates, HR04 == 1, HR03 > 17, HR03 < 40))
nrow(filter(fates, HR04 == 2, HR03 > 17, HR03 < 40))
nrow(filter(fates, HR04 == 1, HR03 > 39, HR03 < 65))
nrow(filter(fates, HR04 == 2, HR03 > 39, HR03 < 65))
nrow(filter(fates, HR04 == 1, HR03 > 64))
nrow(filter(fates, HR04 == 2, HR03 > 64))

#Make a table of fates broken down by demographics

fates_dem <- c()
for (i in c(1:6)) {
  x1 <- nrow(filter(fates, HR04 == 1, HR03 < 5, HR05 == i))
  x2 <- nrow(filter(fates, HR04 == 2, HR03 < 5, HR05 == i))
  x3 <-nrow(filter(fates, HR04 == 1, HR03 > 4, HR03 < 12, HR05 == i))
  x4 <- nrow(filter(fates, HR04 == 2, HR03 > 4, HR03 < 12, HR05 == i))
  x5 <- nrow(filter(fates, HR04 == 1, HR03 > 11, HR03 < 18, HR05 == i))
  x6 <- nrow(filter(fates, HR04 == 2, HR03 > 11, HR03 < 18, HR05 == i))
  x7 <- nrow(filter(fates, HR04 == 1, HR03 > 17, HR03 < 30, HR05 == i))
  x8 <- nrow(filter(fates, HR04 == 2, HR03 > 17, HR03 < 30, HR05 == i))
  x9 <- nrow(filter(fates, HR04 == 1, HR03 > 29, HR03 < 40, HR05 == i))
  x10 <- nrow(filter(fates, HR04 == 2, HR03 > 29, HR03 < 40, HR05 == i))
  x11 <- nrow(filter(fates, HR04 == 1, HR03 > 39, HR03 < 65, HR05 == i))
  x12 <- nrow(filter(fates, HR04 == 2, HR03 > 39, HR03 < 65, HR05 == i))
  x13 <-nrow(filter(fates, HR04 == 1, HR03 > 64, HR05 == i))
  x14 <- nrow(filter(fates, HR04 == 2, HR03 > 64, HR05 == i))
  fates_dem <- c(fates_dem, x1, x2, x3, x4, x5, x6, x7, x8,
                 x9, x10, x11, x12, x13, x14)
}

matrix_fates <- matrix(fates_dem, nrow = 14, ncol = 6)

df_fates <- as.data.frame(matrix_fates)
demo_categories <- c("Male 0-4", "Female 0-4", "Male 5-11", 
                     "Female 5-11", "Male 12-17", "Female 12-17",
                     "Male 18-29", "Female 18-29", "Male 30-39", 
                     "Female 30-39", "Male 40-64",
                     "Female 40-64", "Male 65+", "Female 65+")
df_fates <- mutate(df_fates, demo_categories)
colnames(df_fates) <- c("Present", "Left Gaza", "Moved Out of Household",
                        "Dead", "Missing", "In Prison", "Gender/Age")

df_fates <- relocate(df_fates, "Gender/Age")

df_fates <- df_fates %>%
  bind_rows(summarise(., across(where(is.numeric), sum),
                      across(where(is.character), ~'Total')))
Total <- c()
for (i in c(1:15)) {
  Total <- c(Total, sum(df_fates[i,2:7]))
}

df_fates <- mutate(df_fates, Total)

fates_table <- gt(df_fates)%>%
  tab_source_note(source_note = "Unweighted GMS Data")

gtsave(fates_table, "table1.png")

#Now let's do a similar table for deaths

deaths_dem <- c()
for (i in c(1:4)) {
  x1 <- nrow(filter(fates, HR04 == 1, HR03 < 5, HR06 == i))
  x2 <- nrow(filter(fates, HR04 == 2, HR03 < 5, HR06 == i))
  x3 <-nrow(filter(fates, HR04 == 1, HR03 > 4, HR03 < 12, HR06 == i))
  x4 <- nrow(filter(fates, HR04 == 2, HR03 > 4, HR03 < 12, HR06 == i))
  x5 <- nrow(filter(fates, HR04 == 1, HR03 > 11, HR03 < 18, HR06 == i))
  x6 <- nrow(filter(fates, HR04 == 2, HR03 > 11, HR03 < 18, HR06 == i))
  x7 <- nrow(filter(fates, HR04 == 1, HR03 > 17, HR03 < 30, HR06 == i))
  x8 <- nrow(filter(fates, HR04 == 2, HR03 > 17, HR03 < 30, HR06 == i))
  x9 <- nrow(filter(fates, HR04 == 1, HR03 > 29, HR03 < 40, HR06 == i))
  x10 <- nrow(filter(fates, HR04 == 2, HR03 > 29, HR03 < 40, HR06 == i))
  x11 <- nrow(filter(fates, HR04 == 1, HR03 > 39, HR03 < 65, HR06 == i))
  x12 <- nrow(filter(fates, HR04 == 2, HR03 > 39, HR03 < 65, HR06 == i))
  x13 <-nrow(filter(fates, HR04 == 1, HR03 > 64, HR06 == i))
  x14 <- nrow(filter(fates, HR04 == 2, HR03 > 64, HR06 == i))
  deaths_dem <- c(deaths_dem, x1, x2, x3, x4, x5, x6, x7, x8,
                  x9, x10, x11, x12, x13, x14)
}

matrix_deaths <- matrix(deaths_dem, nrow = 14, ncol = 4)

df_deaths <- as.data.frame(matrix_deaths)
df_deaths <- mutate(df_deaths, df_deaths$V1 + df_deaths$V2)
df_deaths <- df_deaths[,-(1:2)]  
df_deaths <- relocate(df_deaths, "df_deaths$V1 + df_deaths$V2")
demo_categories <- c("Male 0-4", "Female 0-4", "Male 5-11", 
                     "Female 5-11", "Male 12-17", "Female 12-17",
                     "Male 18-29", "Female 18-29", "Male 30-39", 
                     "Female 30-39", "Male 40-64",
                     "Female 40-64", "Male 65+", "Female 65+")
df_deaths <- mutate(df_deaths, demo_categories)
colnames(df_deaths) <- c("Nonviolent", 
                         "Accident", "Violent", "Gender/Age")

df_deaths <- relocate(df_deaths, "Gender/Age")

df_deaths <- df_deaths %>%
  bind_rows(summarise(., across(where(is.numeric), sum),
                      across(where(is.character), ~'Total')))
Total <- c()
for (i in c(1:15)) {
  Total <- c(Total, sum(df_deaths[i,2:4]))
}

df_deaths <- mutate(df_deaths, Total)

deaths_table <- gt(df_deaths)%>%
  tab_source_note(source_note = "Unweighted GMS Data")

gtsave(deaths_table, "table2.png")

#Here is a breakdown of violent deaths classified by governorate of origin
#Note: HR00 has been recoded above so 3=Deir al-Balah, 4=Khan Younis

table(fates$HR00)

north <- nrow(filter(fates, HR06 == 4, HR00 == 1))
city <- nrow(filter(fates, HR06 == 4, HR00 == 2))
central <- nrow(filter(fates, HR06 == 4, HR00 == 3))
khan <- nrow(filter(fates, HR06 == 4, HR00 == 4))
rafah <- nrow(filter(fates, HR06 == 4, HR00 == 5))

north+city+central+khan+rafah

north_pop <- nrow(filter(fates, HR00 == 1))
city_pop <- nrow(filter(fates, HR00 == 2))
central_pop <- nrow(filter(fates, HR00 == 3))
khan_pop <- nrow(filter(fates, HR00 == 4))
rafah_pop <- nrow(filter(fates, HR00 == 5))

north_rate <- round(100*north/north_pop, digits = 1)
city_rate <- round(100*city/city_pop, digits = 1)
central_rate <- round(100*central/central_pop, digits = 1)
khan_rate <- round(100*khan/khan_pop, digits = 1)
rafah_rate <- round(100*rafah/rafah_pop, digits = 1)

goverornate_data <- c(north, city, central, khan, rafah,
                      north_rate, city_rate, central_rate,
                      khan_rate, rafah_rate)

matrix_governorate <- matrix(goverornate_data, nrow = 5, ncol = 2)

df_governorate <- as.data.frame(matrix_governorate)

Governorates <- c("Northern Gaza", "Gaza City",
                  "Deir al-Balah", "Khan Younis",
                  "Rafah")

df_governorate <- mutate(df_governorate, Governorates)

colnames(df_governorate) <-  c("Violent Deaths", 
                               "Violent Deaths as Percent of Sample",
                               "Governorates")

df_governorate <- relocate(df_governorate, "Governorates")

governorates_table <- gt(df_governorate)%>%
  tab_source_note(source_note = "Unweighted GMS Data")

gtsave(governorates_table, "table3.png")


#Now pull some basic information on births
#Note: in GMS Births.sav, HR04=sex, HR05=status, HR06=cause of death, HR03=0 (age)
#Birth dates are not available in the public data file.

nrow(births)

#179 boys and 178 girls
nrow(filter(births, HR04 == 1))
nrow(filter(births, HR04 == 2))

boys <- filter(births, HR04 == 1)
girls <- filter(births, HR04 == 2)

#4 infant deaths
dead <- filter(births, HR05 == 4)
nrow(dead)

#2 killed violently, 1 in an accident, 1 nonviolent
nrow(filter(births, HR06 == 4))   # violent
nrow(filter(births, HR06 == 3))   # accident
nrow(filter(births, HR06 < 3))    # nonviolent

infant_data <- c(nrow(boys), nrow(girls), nrow(dead))

matrix_infant <- matrix(infant_data, nrow = 1)

df_infant <- as.data.frame(matrix_infant)

colnames(df_infant) <- c("Boys", "Girls", "Dead")

infant_table <- gt(df_infant)%>%
  tab_header(
    title = "Born After October 6, 2023"
  ) 

gtsave(infant_table, "infant_table.png")

#An unweighted estimate for the annual birth rate during the war is 62,832
(nrow(boys)+nrow(girls))*220/1.25

#calculate the percentage of our lower and upper violent-death estimates
#captured by the MoH (MoH figure of 49,090 for the survey period)

moh <- 49090
lower <- moh/63600
upper <- moh/86800
central <- moh/75200

1 - central   # 34.7% below central estimate
1 - lower     # below lower bound
1 - upper     # below upper bound

#Calculate percentage of population killed
#According to the US census there were 1,064,348 males and 1,034,041 females
#in Gaza at the beginning of the war

population <- 1064348 + 1034041

violent_lower <- 63600
violent_upper <- 86800
violent_central <- 75200

100*violent_lower/population
100*violent_upper/population
100*violent_central/population


#The rest of this code evaluates the quality of our raking by creating a table
#that compares population distributions with weighted (after raking) sample 
#distributions. It requires running Gaza_Strip_Mortality_RMD_fixed.rmd first
#to generate the trimmed raked design, then extracting the data and weights.
#
# After running the RMD, run the following in the same R session
# (or simply run these lines if trimmedrakedesign is already in your environment):
jon <- model.frame(trimmedrakedesign)
jon$weights <- weights(trimmedrakedesign)

# Population demographic marginals (from IDB US Census Bureau)
# Note: these include births added to age 0 row, so total slightly exceeds
# the pre-war population of 2,098,389
pop_dem <- c(181716, 173794, 200191, 188991,
             165430, 157849, 248133, 240116,
             130574, 133015, 146967, 149998,
             29944, 28670)

pop_total <- sum(pop_dem)  # 2,175,388 including births during survey period

pop_dem_percentages <- 100*pop_dem/pop_total
sum(pop_dem_percentages)  # should be 100

sample_dem <- c(nrow(filter(jon, HR04 == 1, HR03 < 5)),
                nrow(filter(jon, HR04 == 2, HR03 < 5)),
                nrow(filter(jon, HR04 == 1, HR03 > 4, HR03 < 12)),
                nrow(filter(jon, HR04 == 2, HR03 > 4, HR03 < 12)),
                nrow(filter(jon, HR04 == 1, HR03 > 11, HR03 < 18)),
                nrow(filter(jon, HR04 == 2, HR03 > 11, HR03 < 18)),
                nrow(filter(jon, HR04 == 1, HR03 > 17, HR03 < 30)),
                nrow(filter(jon, HR04 == 2, HR03 > 17, HR03 < 30)),
                nrow(filter(jon, HR04 == 1, HR03 > 29, HR03 < 40)),
                nrow(filter(jon, HR04 == 2, HR03 > 29, HR03 < 40)),
                nrow(filter(jon, HR04 == 1, HR03 > 39, HR03 < 65)),
                nrow(filter(jon, HR04 == 2, HR03 > 39, HR03 < 65)),
                nrow(filter(jon, HR04 == 1, HR03 > 64)),
                nrow(filter(jon, HR04 == 2, HR03 > 64))
)

sample_total <- nrow(jon)
sample_dem_percentages <- 100*sample_dem/sample_total
sum(jon$weights)

sample_dem_weighted <- c(sum(filter(jon, HR04 == 1, HR03 < 5)$weights),
                         sum(filter(jon, HR04 == 2, HR03 < 5)$weights),
                         sum(filter(jon, HR04 == 1, HR03 > 4, HR03 < 12)$weights),
                         sum(filter(jon, HR04 == 2, HR03 > 4, HR03 < 12)$weights),
                         sum(filter(jon, HR04 == 1, HR03 > 11, HR03 < 18)$weights),
                         sum(filter(jon, HR04 == 2, HR03 > 11, HR03 < 18)$weights),
                         sum(filter(jon, HR04 == 1, HR03 > 17, HR03 < 30)$weights),
                         sum(filter(jon, HR04 == 2, HR03 > 17, HR03 < 30)$weights),
                         sum(filter(jon, HR04 == 1, HR03 > 29, HR03 < 40)$weights),
                         sum(filter(jon, HR04 == 2, HR03 > 29, HR03 < 40)$weights),
                         sum(filter(jon, HR04 == 1, HR03 > 39, HR03 < 65)$weights),
                         sum(filter(jon, HR04 == 2, HR03 > 39, HR03 < 65)$weights),
                         sum(filter(jon, HR04 == 1, HR03 > 64)$weights),
                         sum(filter(jon, HR04 == 2, HR03 > 64)$weights)
)

sample_weighted_total <- sum(sample_dem_weighted)
sample_dem_weighted_percentages <- 100*sample_dem_weighted/sample_weighted_total
sample_dem_weighted_percentages-pop_dem_percentages

#Here are governorate populations for, in order, North Gaza, Gaza City,
#Deir al Balah, Khan Younis and Rafah

pop_gov <- c(418833, 705983, 300835, 413315, 259423)
sum(pop_gov)
pop_gov_percentages <- 100*pop_gov/sum(pop_gov)

sample_gov <- c(nrow(filter(jon, HR00 == 1)),
                nrow(filter(jon, HR00 == 2)),
                nrow(filter(jon, HR00 == 3)),
                nrow(filter(jon, HR00 == 4)),
                nrow(filter(jon, HR00 == 5)))

sample_gov_percentages <- 100*sample_gov/nrow(jon)

sample_gov_weighted <- c(sum(filter(jon, HR00 == 1)$weights),
                         sum(filter(jon, HR00 == 2)$weights),
                         sum(filter(jon, HR00 == 3)$weights),
                         sum(filter(jon, HR00 == 4)$weights),
                         sum(filter(jon, HR00 == 5)$weights))

sample_gov_weighted_percentages <- 100*sample_gov_weighted/sum(sample_gov_weighted)
sample_gov_weighted_percentages - pop_gov_percentages

#Household size distributions
#Jon used MICS household sizes in the raking; census information included
#here for comparison

census_hh <- c(11920.57, 70385.14, 114332.80, 194472.65, 275651.26,
               337047.61, 334535.13, 274554.36, 191615.78, 293873.71)
census_household_percentages <- 100*census_hh/sum(census_hh)

mics_hh <- c(110, 501, 1069, 1841, 2458,
             3218, 3289, 2255, 1816, 2502)
mics_household_percentages <- 100*mics_hh/sum(mics_hh)
sum(mics_household_percentages)

households_df <- jon %>%
  group_by(prikey)%>%
  summarize(weights = sum(weights),
            sizes = mean(hsize10))

sample_hh_percentages <- sapply(1:10, function(s) 
  100*nrow(filter(households_df, sizes == s))/nrow(households_df))

sample_hh_weighted_percentages <- sapply(1:10, function(s)
  100*sum(filter(households_df, sizes == s)$weights)/sum(households_df$weights))

sum(sample_hh_percentages)
sum(sample_hh_weighted_percentages)
census_household_percentages - sample_hh_weighted_percentages
sum(mics_household_percentages - sample_hh_weighted_percentages)

raking_data <- c(sample_dem_percentages, sample_gov_percentages, 
                 sample_hh_percentages,
                 pop_dem_percentages, pop_gov_percentages, 
                 mics_household_percentages,
                 sample_dem_weighted_percentages, sample_gov_weighted_percentages,
                 sample_hh_weighted_percentages)

raking_matrix <- matrix(raking_data, ncol = 3)
df_raking <- as.data.frame(raking_matrix)
df_raking <- mutate(df_raking, df_raking[,2] - df_raking[,3])

categories <- c("Male 0-4", "Female 0-4", "Male 5-11", 
                "Female 5-11", "Male 12-17", "Female 12-17",
                "Male 18-29", "Female 18-29", "Male 30-39", 
                "Female 30-39", "Male 40-64",
                "Female 40-64", "Male 65+", "Female 65+",
                "North Gaza", "Gaza City", "Deir al-Balah", 
                "Khan Younis", "Rafah",
                "1", "2", "3", "4", "5", "6", "7", "8", "9", "10+"
)

df_raking <- mutate(df_raking, categories)
colnames(df_raking) <- c("Sample Percentages", "Population Percentages", 
                         "Raked Sample Percentages", "Population - Raked Sample",
                         "Categories")
df_raking <- relocate(df_raking, "Categories")

raking_table <- gt(df_raking)%>%
  tab_header(
    title = "Table S7: Comparison of Population Values with Sample and Raked Sample Values",
    subtitle = "By Demographics, Governorate and Household Size"
  ) %>%
  fmt_number(n_sigfig = 3)%>%
  tab_row_group(label = "Household Sizes", rows = c(20:29)) %>%
  tab_row_group(label = "Governorates", rows = c(15:19)) %>%
  tab_row_group(label = "Ages/Sexes", rows = c(1:14))

gtsave(raking_table, "raking_table.png")

summary(df_raking$`Population - Raked Sample`)

#Check whether 10+ households have substantially different death rates
100*nrow(filter(jon, hsize > 9 & Violent == 1))/nrow(filter(jon, hsize > 9))
100*nrow(filter(jon, hsize < 10 & Violent == 1))/nrow(filter(jon, hsize < 10))