Malaria-network-exposure / 01_prepare_data.R
01_prepare_data.R
Raw

### LOAD REQUIRED PACKAGES AND AUXILIARY FUNCTIONS

library(openxlsx)

setwd(workdir)
source('00_functions.R')

###


### 1. LOAD DATA

setwd(datadir)

village.codes <- c('v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7', 'v8', 'v9', 'v10')
village.names <- c('WK1', 'WK2', 'WK3', 'WJ1', 'WJ2', 'WJ3', 'SG1', 'SG2', 'SG3', 'SG4')
sheet.names <- c('edgelist', 'attributes')
data.file <- 'Raw data v2 anonymised.xlsx'

data.sheets <- matrix(c(rep(village.codes, each=2),
                        paste(rep(village.names, each=2), sheet.names),
                        rep(sheet.names, 10)), ncol=3)

for (i in 1:nrow(data.sheets)) {
  
  sheet <- read.xlsx(data.file, sheet=data.sheets[i,2], rowNames=F, sep.names=' ')
  object.name <- paste0(data.sheets[i,1], '.', data.sheets[i,3])
  assign(object.name, eval(sheet))
  
}

rm(i, sheet, object.name, data.sheets)

###


### 2. TRANSFORM EDGELISTS TO ADJACENCY MATRICES

for (i in 1:length(village.names)) {
  
  edge.obj <- paste0('v', i, '.', sheet.names[1])
  attr.obj <- paste0('v', i, '.', sheet.names[2])
  edge <- eval(parse(text=edge.obj))
  attr <- eval(parse(text=attr.obj))
  
  matr.obj <- paste0('v', i, '.adjmatrix')
  matr <- matrix(0, nrow(attr), nrow(attr))
  rownames(matr) <- attr$ID
  colnames(matr) <- attr$ID
  
  for (j in 1:nrow(edge)) {
    matr[rownames(matr)==edge[j,1], colnames(matr)==edge[j,2]] <- 1
  }
  
  assign(matr.obj, eval(matr))
  
}
rm(i, j, edge.obj, attr.obj, edge, attr, matr.obj, matr)

###


### 3. REMOVE REDUNDANT EDGELIST OBJECTS

rm(list=ls(pattern='edgelist'))

sheet.names[1] <- 'adjmatrix'

###


### 4. DEFINE THREE TYPES OF OUTDEGREES

for (i in 1:length(village.names)) {
  
  matr.obj <- paste0('v', i, '.', sheet.names[1])
  attr.obj <- paste0('v', i, '.', sheet.names[2])
  matr <- eval(parse(text=matr.obj))
  attr <- eval(parse(text=attr.obj))
  
  # IDs of respondents, non-respondents, and those from external villages
  id.resp <- attr$ID[attr$Interviewed=='Yes']
  id.nonresp <- attr$ID[attr$Interviewed=='No' & attr$Village==village.names[i]]
  id.ext <- attr$ID[attr$Interviewed=='No' & attr$Village!=village.names[i]]
  
  # calculate different outdegrees
  odeg.to.resp <- rowSums(matr[,colnames(matr) %in% id.resp])
  odeg.to.nonresp <- rowSums(matr[,colnames(matr) %in% id.nonresp])
  odeg.to.ext <- rowSums(matr[,colnames(matr) %in% id.ext])
  
  # add outdegree columns to attributes object
  attr$'Outdegree to respondents' <- odeg.to.resp
  attr$'Outdegree to non-respondents' <- odeg.to.nonresp
  attr$'Outdegree out of village' <- odeg.to.ext
  
  assign(attr.obj, eval(attr))
  
}
rm(i, attr.obj, attr, matr.obj, matr, id.resp, id.nonresp, id.ext,
   odeg.to.resp, odeg.to.nonresp, odeg.to.ext)

###


### 5. DROP NON-RESPONDENTS AND PEOPLE FROM OTHER VILLAGES

# exception: people in WJ1 talk to healer of WJ2
#            make sure WJ2 healer kept in WJ1
#            copy their measure use data from WJ2 to WJ1 for later steps
which.healer <- v4.attributes$ID[v4.attributes$Village %in% 'WJ2' & 
                                 v4.attributes$`Role in the village` %in% 'Traditional Healer']
v4.attributes[v4.attributes$ID==which.healer,] <- v5.attributes[v5.attributes$ID==
                                                                  which.healer,]
rm(which.healer)

# drop unnecessary people
for (i in 1:length(village.names)) {

  matr.obj <- paste0('v', i, '.', sheet.names[1])
  attr.obj <- paste0('v', i, '.', sheet.names[2])
  matr <- eval(parse(text=matr.obj))
  attr <- eval(parse(text=attr.obj))

  ids.keep <- attr$ID[attr$Interviewed=='Yes']
  matr <- matr[rownames(matr) %in% ids.keep, colnames(matr) %in% ids.keep]
  attr <- attr[attr$ID %in% ids.keep,]

  assign(matr.obj, eval(matr))
  assign(attr.obj, eval(attr))

}
rm(i, matr.obj, attr.obj, ids.keep, matr, attr)

###


### 6. DEFINE INDEGREE VARIABLE

for (i in 1:length(village.names)) {
  
  matr.obj <- paste0('v', i, '.', sheet.names[1])
  attr.obj <- paste0('v', i, '.', sheet.names[2])
  matr <- eval(parse(text=matr.obj))
  attr <- eval(parse(text=attr.obj))
  
  indeg <- colSums(matr)
  attr$Indegree <- indeg
  
  assign(attr.obj, eval(attr))
  
}
rm(i, matr.obj, attr.obj, indeg, matr, attr)

###


### 7. DEFINE ASHA- AND HEALER-RELATED VARIABLES

for (i in 1:length(village.names)) {
  
  matr.obj <- paste0('v', i, '.', sheet.names[1])
  attr.obj <- paste0('v', i, '.', sheet.names[2])
  matr <- eval(parse(text=matr.obj))
  attr <- eval(parse(text=attr.obj))
  
  asha <- ifelse(attr$`Role in the village` %in% 'ASHA', 1, 0)
  healer <- ifelse(attr$`Role in the village` %in% 'Traditional Healer', 1, 0)
  # if there are more than one interviewed Healer, keep the one with highest indegree
  # (two villages have two - WK1: 5-4 indegree, SG2: 27-3 indegree)
  if (sum(healer)>1) {
    healer[healer==1 & attr$Indegree<max(attr$Indegree[healer==1])] <- 0
  }
  
  asha.talk <- matr[, asha==1]
  if (sum(healer)==1) {
    healer.talk <- matr[, healer==1]
  } else { 
    healer.talk <- rep(0, nrow(matr))
  }
  
  attr$ASHA <- asha
  attr$'Traditional Healer' <- healer
  attr$'Talks to ASHA' <- asha.talk
  attr$'Talks to Healer' <- healer.talk
  
  assign(attr.obj, eval(attr))
  
}
rm(i, matr.obj, attr.obj, asha, healer, asha.talk, healer.talk, matr, attr)

###


### 8. RECODE VARIABLES

for (i in 1:length(village.names)) {
  
  attr.obj <- paste0('v', i, '.', sheet.names[2])
  attr <- eval(parse(text=attr.obj))
  
  # measure use frequencies to yes-no
  attr[attr=='Always'] <- 'Yes'
  attr[attr=='Sometimes'] <- 'Yes'
  attr[attr=='Rarely'] <- 'No'
  attr[attr=='Never'] <- 'No'
  
  # all yes-no to 1-0
  attr[attr=='Yes'] <- 1
  attr[attr=='No'] <- 0
  
  # female - 1, male - 0
  colnames(attr)[colnames(attr)=='Gender'] <- 'Female'
  attr[attr=='Female'] <- 1
  attr[attr=='Male'] <- 0
  
  # education to levels, 0 - no schooling
  colnames(attr)[colnames(attr)=='Level of education'] <- 'Education'
  attr[attr=='No Schooling'] <- 0
  attr[attr=='Below Primary'] <- 1
  attr[attr=='Primary'] <- 2
  attr[attr=='Middle'] <- 3
  attr[attr=='Secondary'] <- 4
  attr[attr=='Higher Secondary'] <- 5
  attr[attr=='Graduate'] <- 6
  
  # occupation to works in fields, 1- agricultural labor or cultivator
  colnames(attr)[colnames(attr)=='Usual occupation'] <- 'Works in fields'
  attr$`Works in fields` <- ifelse(attr$`Works in fields`=='Agricultural labor' |
                                     attr$`Works in fields`=='Cultivator', 1, 0)
  
  # remove unnecessary variables
  attr <- attr[,!(colnames(attr) %in% c('Interviewed', 'Notes from the interview', 
                                        'Role in the village', 'Use of nets at night', 
                                        'Use mosquito mats'))]
  
  # rename a few variables
  colnames(attr)[colnames(attr)=='Head of Household'] <- 'Head of household'
  colnames(attr)[colnames(attr)=='Indegree'] <- 'Network size in village (indegree)'
  colnames(attr)[colnames(attr)=='Outdegree to respondents'] <- 'Network size in village'
  colnames(attr)[colnames(attr)=='Outdegree to non-respondents'] <- 
                                                          'Network size among non-respondents'
  colnames(attr)[colnames(attr)=='Outdegree out of village'] <- 'Network size out of village'
  colnames(attr)[colnames(attr)=='In charge of caring the sick'] <- 'Carer for a sick person'
  colnames(attr)[colnames(attr)=='Nets with insecticide'] <- 'Use: LLINs'
  colnames(attr)[colnames(attr)=='Covers up'] <- 'Use: Covering clothes'
  colnames(attr)[colnames(attr)=='Use boots'] <- 'Use: Boots'
  colnames(attr)[colnames(attr)=='Use gloves'] <- 'Use: Gloves'
  colnames(attr)[colnames(attr)=='Use insecticide cream'] <- 'Use: Insecticide cream'
  colnames(attr)[colnames(attr)=='Use coils'] <- 'Use: Coils'
  colnames(attr)[colnames(attr)=='Use vaporizers'] <- 'Use: Vaporizers'
  colnames(attr)[colnames(attr)=='Burn materials'] <- 'Use: Burning materials'
  
  # make most variables numeric
  attr[,!(colnames(attr) %in% c('ID', 'Village', 'Household'))] <-
    as.numeric(unlist(attr[,!(colnames(attr) %in% c('ID', 'Village', 'Household'))]))

  # save new data object              
  assign(attr.obj, eval(attr))
  
}
rm(i, attr.obj, attr)

###


### 9. DEFINE HOUSEHOLD VARIABLES

for (i in 1:length(village.names)) {
  
  matr.obj <- paste0('v', i, '.', sheet.names[1])
  attr.obj <- paste0('v', i, '.', sheet.names[2])
  matr <- eval(parse(text=matr.obj))
  attr <- eval(parse(text=attr.obj))
  
  household <- substr(attr$ID, 1, 9)
  attr$Household <- household
  
  hh.asha.talk <- apply(attr, 1, FUN=function(x) 
    household.aggregate(attr, x['Household'], x['ID'], 'Talks to ASHA', agg_fun='max'))
  hh.healer.talk  <- apply(attr, 1, FUN=function(x) 
    household.aggregate(attr, x['Household'], x['ID'], 'Talks to Healer', agg_fun='max'))
  hh.exp.llins <- apply(attr, 1, FUN=function(x) 
    household.aggregate(attr, x['Household'], x['ID'], 'Use: LLINs', agg_fun='max'))
  hh.exp.cover <- apply(attr, 1, FUN=function(x) 
    household.aggregate(attr, x['Household'], x['ID'], 'Use: Covering clothes', agg_fun='max'))
  hh.exp.boots <- apply(attr, 1, FUN=function(x) 
    household.aggregate(attr, x['Household'], x['ID'], 'Use: Boots', agg_fun='max'))
  hh.exp.gloves <- apply(attr, 1, FUN=function(x) 
    household.aggregate(attr, x['Household'], x['ID'], 'Use: Gloves', agg_fun='max'))
  hh.exp.cream <- apply(attr, 1, FUN=function(x) 
    household.aggregate(attr, x['Household'], x['ID'], 'Use: Insecticide cream', 
                        agg_fun='max'))
  hh.exp.coils <- apply(attr, 1, FUN=function(x) 
    household.aggregate(attr, x['Household'], x['ID'], 'Use: Coils', agg_fun='max'))
  hh.exp.evap <- apply(attr, 1, FUN=function(x) 
    household.aggregate(attr, x['Household'], x['ID'], 'Use: Vaporizers', agg_fun='max'))
  hh.exp.burn <- apply(attr, 1, FUN=function(x) 
    household.aggregate(attr, x['Household'], x['ID'], 'Use: Burning materials', 
                        agg_fun='max'))
  
  attr$'Household talks to ASHA' <- hh.asha.talk
  attr$'Household talks to Healer' <- hh.healer.talk
  attr$'Household exposure to LLINs' <- hh.exp.llins
  attr$'Household exposure to Covering clothes' <- hh.exp.cover
  attr$'Household exposure to Boots' <- hh.exp.boots
  attr$'Household exposure to Gloves' <- hh.exp.gloves
  attr$'Household exposure to Insecticide cream' <- hh.exp.cream
  attr$'Household exposure to Coils' <- hh.exp.coils
  attr$'Household exposure to Vaporizers' <- hh.exp.evap
  attr$'Household exposure to Burning materials' <- hh.exp.burn
  
  assign(attr.obj, eval(attr))
  
}
rm(i, matr.obj, attr.obj, household, hh.asha.talk, hh.healer.talk, matr, attr)
rm(list=ls(pattern='hh.exp'))

###


### 10. DEFINE NETWORK EXPOSURE VARIABLES

for (i in 1:length(village.names)) {
  
  matr.obj <- paste0('v', i, '.', sheet.names[1])
  attr.obj <- paste0('v', i, '.', sheet.names[2])
  matr <- eval(parse(text=matr.obj))
  attr <- eval(parse(text=attr.obj))
  
  net.exp.llins <- colSums(attr[,'Use: LLINs']*t(matr), na.rm=T) / rowSums(matr)
  net.exp.cover <- colSums(attr[,'Use: Covering clothes']*t(matr), na.rm=T) / rowSums(matr)
  net.exp.boots <- colSums(attr[,'Use: Boots']*t(matr), na.rm=T) / rowSums(matr)
  net.exp.gloves <- colSums(attr[,'Use: Gloves']*t(matr), na.rm=T) / rowSums(matr)
  net.exp.cream <- colSums(attr[,'Use: Insecticide cream']*t(matr), na.rm=T) / rowSums(matr)
  net.exp.coils <- colSums(attr[,'Use: Coils']*t(matr), na.rm=T) / rowSums(matr)
  net.exp.evap <- colSums(attr[,'Use: Vaporizers']*t(matr), na.rm=T) / rowSums(matr)
  net.exp.burn <- colSums(attr[,'Use: Burning materials']*t(matr), na.rm=T) / rowSums(matr)
  
  # those with 0 outdegree have 0 exposure
  net.exp.llins[is.nan(net.exp.llins)] <- 0
  net.exp.cover[is.nan(net.exp.cover)] <- 0
  net.exp.boots[is.nan(net.exp.boots)] <- 0
  net.exp.gloves[is.nan(net.exp.gloves)] <- 0
  net.exp.cream[is.nan(net.exp.cream)] <- 0
  net.exp.coils[is.nan(net.exp.coils)] <- 0
  net.exp.evap[is.nan(net.exp.evap)] <- 0
  net.exp.burn[is.nan(net.exp.burn)] <- 0
  
  attr$'Network exposure to LLINs' <- net.exp.llins
  attr$'Network exposure to Covering clothes' <- net.exp.cover
  attr$'Network exposure to Boots' <- net.exp.boots
  attr$'Network exposure to Gloves' <- net.exp.gloves
  attr$'Network exposure to Insecticide cream' <- net.exp.cream
  attr$'Network exposure to Coils' <- net.exp.coils
  attr$'Network exposure to Vaporizers' <- net.exp.evap
  attr$'Network exposure to Burning materials' <- net.exp.burn
  
  assign(attr.obj, eval(attr))
  
}
rm(i, matr.obj, attr.obj, matr, attr)
rm(list=ls(pattern='net.exp'))

###


### 11. CREATE OBJECTS USEFUL INFORMATION FOR LATER ANALYSES

# variable names by groups
vars.dep <- c('Use: LLINs', 'Use: Covering clothes',
              'Use: Boots', 'Use: Gloves', 'Use: Insecticide cream',
              'Use: Coils', 'Use: Vaporizers', 'Use: Burning materials')
vars.indiv <- c('Female', 'Head of household', 'Carer for a sick person',
                'Works in fields', 'Age', 'Education')
vars.oplead <- c('Talks to ASHA', 'Talks to Healer', 
                 'Household talks to ASHA', 'Household talks to Healer')
vars.netsize <- c('Network size in village', 'Network size among non-respondents',
                  'Network size out of village', 'Network size in village (indegree)')
vars.netexp <- c('Network exposure to LLINs', 'Network exposure to Covering clothes', 
                 'Network exposure to Boots', 'Network exposure to Gloves', 
                 'Network exposure to Insecticide cream', 'Network exposure to Coils',
                 'Network exposure to Vaporizers', 'Network exposure to Burning materials')
vars.hhexp <- c('Household exposure to LLINs', 'Household exposure to Covering clothes', 
                'Household exposure to Boots', 'Household exposure to Gloves', 
                'Household exposure to Insecticide cream', 'Household exposure to Coils',
                'Household exposure to Vaporizers', 'Household exposure to Burning materials')

# which villages have a healer?
has.healer <- rep(NA, length(village.names))
for (i in 1:length(village.names)) {
  attr.obj <- paste0('v', i, '.', sheet.names[2])
  attr <- eval(parse(text=attr.obj))
  has.healer[i] <- ifelse(sum(attr$`Traditional Healer`)>0, T, F)
}
rm(i, attr.obj, attr)
  

###


### 12. CREATE SINGLE ATTRIBUTES DATASET WITH ALL OF THE VILLAGES

# create single attribute dataset with ashas and healers
v.attributes.with.ashas.healers <- rbind(v1.attributes, v2.attributes, v3.attributes,
                                         v4.attributes, v5.attributes, v6.attributes,
                                         v7.attributes, v8.attributes, v9.attributes,
                                         v10.attributes)
# remove duplicate row of WJ2 healer in WJ1
v.attributes.with.ashas.healers <- 
  v.attributes.with.ashas.healers[!duplicated(v.attributes.with.ashas.healers$ID, fromLast=T),]

# create attribute dataset without ashas and healers (to be analysed)
v.attributes <- v.attributes.with.ashas.healers
v.attributes <- v.attributes[v.attributes$ASHA!=1 & v.attributes$`Traditional Healer`!=1,]

###


### 13. SAVE DATA AS CSVs (IF REQUESTED) AND CLEAN UP WORKSPACE

if (save.data) {
  
  # save single dataset without ashas and healers (used in logit analysis)
  write.csv(v.attributes, file = 'attributes_analytical_sample.csv', row.names=F)
  
  # save village datasets with ashas and healers (used in SAOM analysis)
  s.names <- paste(rep(village.names, each=2), sheet.names[2:1], sep='_')
  s.objects <- paste0(rep(village.codes, each=2), '.', sheet.names[2:1])
  for (i in 1:length(s.objects)) {
    towrite <- eval(parse(text=s.objects[i]))
    write.csv(towrite, 
              file=paste0('v', sprintf("%02d", ceiling(i/2)), '_', s.names[i], '.csv'))
  }
  rm(i, s.names, s.objects, towrite)
  
}

# delete unnecessary objects
rm(household.aggregate, predictive.acc, reg.backward, siena07ToConvergence,
   GeodesicDistribution, CliqueCensus, mixedTriadCensus.agree,
   save.data)

# change back to working directory
setwd(workdir)