# power_calculations_vary_deaths_by_household.R
#We've raised a budget for a sample of 2,000 households, so these calculations
#are somewhat moot. Still, it would be nice to know that
#the confidence interval on violent deaths won't be a lot wider than
#plus or minus 10,000.

# Let's assume a population of 2.1 million, an average household size of
#5.5, and 60,000 violent deaths.

households_number <- 2100000/5.5
violent_deaths <- 60000
violent_deaths_per_household <- violent_deaths/households_number
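#(This works out to about 0.157 violent deaths per household.)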

#We will assume that households can have 0, 1, 2, 3, 4 or 5 deaths. In terms
#of confidence interval width, it is pretty pessimistic to assume such a
#heavy tail on this distribution.

possibilities <- c(0, 1, 2, 3, 4, 5)

#The probabilities on the number of deaths within a household are set so
#that the expected number of deaths is equal to the mean number of deaths
#per household.

#First I assign relative (heavy-tailed) probabilities to all outcomes with at
#least 1 death

probabilities_relative_1plus <- c(0.035, 0.02, 0.0075, 0.0050, 0.0025)

#Then I normalize these relative probabilities so that they will produce the
#right expected number of deaths

base_expectation <- sum(probabilities_relative_1plus*c(1,2,3,4,5))
multiplier <- violent_deaths_per_household/base_expectation

#Then I set all the probabilities so that they add up to 1 and check that 
#these give the right expected number of deaths.

probabilities <- c(1 - sum(probabilities_relative_1plus)*multiplier, 
                   probabilities_relative_1plus*multiplier) 

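#These two numbers should match: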
sum(probabilities*possibilities)
violent_deaths_per_household

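#And this product should recover the assumed 60,000 total deaths: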
violent_deaths_per_household*households_number
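
#A quick sanity check that the probabilities form a valid distribution,
#i.e. non-negative and summing to 1:
stopifnot(all(probabilities >= 0), abs(sum(probabilities) - 1) < 1e-12)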

#To summarize, we've created probabilities that a household will have
#0, 1, 2, ..., 5 deaths that will generate 60,000 expected deaths for
#the population.


#Draw 10,000 simple random samples of 2,000 households and estimate the number of 
#violent deaths

sample_size <- 2000
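
#Fix the RNG seed so the simulations are reproducible (the seed value is arbitrary)
set.seed(1)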

results <- replicate(10000, mean(sample(possibilities, sample_size, 
                                        replace = TRUE, prob = probabilities)))
estimates <- results*households_number

summary(estimates)

quantile(estimates, probs = c(0.025, 0.05, 0.5, 0.975))

#The 95% CI on violent deaths will be roughly plus or minus 10,000 deaths under
#this scenario, which is comforting.
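
#As a cross-check on the simulation, the half-width of the 95% CI can be
#computed directly: the standard error of the estimated total is
#households_number * sd(per-household deaths) / sqrt(sample_size), and the
#CI is roughly plus or minus 1.96 standard errors. This should come out
#near 10,000.

per_household_var <- sum(probabilities*(possibilities - violent_deaths_per_household)^2)
1.96*households_number*sqrt(per_household_var/sample_size)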


#Now assume that because of clustering, the effective sample size is just 400

sample_size <- 400

results <- replicate(10000, mean(sample(possibilities, sample_size, 
                                        replace = TRUE, prob = probabilities)))
estimates <- results*households_number

summary(estimates)

quantile(estimates, probs = c(0.025, 0.05, 0.5, 0.975))

#Now the CI is more like plus or minus 20,000, which we could live with,
#although it would be annoyingly wide. But an effective sample size of 400 is
#really pessimistic.
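
#The widening relative to the first scenario is just the square root of the
#ratio of sample sizes, so the CI half-width roughly doubles from about
#10,000 to a bit over 20,000:

sqrt(2000/400)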

#We have to live with the possibility that, e.g., the central estimate comes
#out at 53,000 (even though the true number is 60,000). That estimate would be
#higher than the comparable MoH figure, yet we could not reject the hypothesis
#that the MoH is exaggerating deaths. We definitely do not have 80% power to
#reject that exaggeration hypothesis if the true number is 60,000.
#A larger sample would help here but we cannot afford it.
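
#A rough sketch of that power claim. The comparable MoH figure is not pinned
#down here, so moh_figure below is a hypothetical placeholder. To reject the
#exaggeration hypothesis we would need the one-sided 95% lower bound of our
#estimate to clear the MoH figure; the power is the share of simulated
#samples where that happens, and it comes out far below 80%.

moh_figure <- 50000  #hypothetical placeholder, for illustration only
se_total <- households_number*sqrt(per_household_var/sample_size)
mean(estimates - 1.645*se_total > moh_figure)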

#Still, we have a good shot at a confidence interval not much wider than
#plus or minus 10,000, which would be good.