UKB-project / 14_Incident_disease_summary_table.R
14_Incident_disease_summary_table.R
Raw
############################################################################################

### Incident disease summary table 

############################################################################################

# srun -p interactive --pty bash

library(tidyverse)

# Set disease and read in disease codes for the iteration (array)
diseases <- c('AL', 'ALS', 'BRAIN', 'Breast', 'Colorectal', 'COPD', 'CYS', 'Dep', 'Diab', 'ENDO', 'GYN',
              'IBD', 'IHD', 'LIV', 'LUNG', 'LUP', 'MS', 'PD', 'Prostate', 'RA', 'SCZ', 'ST', 'VD')

files <- list.files('/path.../Tables/', '.csv')
location <- 'path.../Tables/'

# Read in and extract relevant info (i.e. smaller files minus protein data)
for(i in 1:length(files)){
  name <- as.character(diseases[i])
  print(name)
  cox <- read.csv(paste0(location, name, '.csv'))
  data <- cox[,which(colnames(cox) %in% c('SampleID', 'pseudo_ind_id', "Age_assessment", "Sex", "dead", "aged", "Event", "age_death", "age_at_event", "tte", 'BMI', 'Dep', 'Alc', 'Smo', 'Edu'))]
  data$Outcome <- name
  write.csv(data, paste0('path.../', name, '.csv'), row.names = F)
  print(dim(cox))
  cox = NULL
  data = NULL
}

# Read subset files back in as list and check max tte

location_new <- 'path...'
results <- list()
files <- list.files(location_new, '.csv')
for(i in 1:length(files)){
  name <- files[i]
  name <- gsub("\\..*", "", name)  
  results[[i]] <- read.csv(paste0(location_new, name, '.csv'))
  print(i)
}
bind <- do.call(rbind, results)
max <- max(bind$tte, na.rm = T)
# 15.25 max years of follow up over cases and controls

### Get the cases, controls and mean tte for each trait - basic model

output <- matrix(nrow = 1*length(files), ncol = 4)
output <- as.data.frame(output)
j=c(1:length(files))

for(i in 1:length(files)){
  df <- results[[i]]
  df <- as.data.frame(df)
  join <- df
  # remove missing covariates 
  # join <- join[!is.na(join$Alc), ]
  # join <- join[!is.na(join$Smo), ]
  # join <- join[!is.na(join$Dep), ]
  # join <- join[!is.na(join$Edu), ]
  # join <- join[!is.na(join$BMI), ]
  df <- join
  # generate metrics required
  cases <- df[which(df$Event == "1"),]
  case <- cases %>% filter(!tte == 'NA')
  case_count <- nrow(case)
  control <- df[which(df$Event == "0"),]
  control_count <- nrow(control)
  mean_tte <- mean(case$tte, na.rm = T) 
  mean_tte <- round(mean_tte, digits = 1)
  sd_tte <- sd(case$tte, na.rm = T)
  sd_tte <- round(sd_tte, digits = 1) 
  mean_sd <- paste0(mean_tte, " ", "(", sd_tte, ")") 
  name_dis <- unique(df$Outcome)
  # output metrics 
  output[j[i],1] <- name_dis
  output[j[i],2] <- case_count
  output[j[i],3] <- control_count
  output[j[i],4] <- mean_sd
  names(output) <- c("Disease", "Cases", "Controls", "Mean time to event")
}

# order by cases low to high
sort <- output[order(output$Cases),]  
write.csv(sort, file = 'path.../summary_N_basic_table.csv', row.names =F)

### Get the cases, controls and mean tte for each trait - basic model
output <- matrix(nrow = 1*length(files), ncol = 4)
output <- as.data.frame(output)
j=c(1:length(files))

for(i in 1:length(files)){
  df <- results[[i]]
  df <- as.data.frame(df)
  join <- df
  # remove missing covariates 
  join <- join[!is.na(join$Alc), ]
  join <- join[!is.na(join$Smo), ]
  join <- join[!is.na(join$Dep), ]
  join <- join[!is.na(join$Edu), ]
  join <- join[!is.na(join$BMI), ]
  df <- join
  # generate metrics required
  cases <- df[which(df$Event == "1"),]
  case <- cases %>% filter(!tte == 'NA')
  case_count <- nrow(case)
  control <- df[which(df$Event == "0"),]
  control_count <- nrow(control)
  mean_tte <- mean(case$tte, na.rm = T) 
  mean_tte <- round(mean_tte, digits = 1)
  sd_tte <- sd(case$tte, na.rm = T)
  sd_tte <- round(sd_tte, digits = 1) 
  mean_sd <- paste0(mean_tte, " ", "(", sd_tte, ")") 
  name_dis <- unique(df$Outcome)
  # output metrics 
  output[j[i],1] <- name_dis
  output[j[i],2] <- case_count
  output[j[i],3] <- control_count
  output[j[i],4] <- mean_sd
  names(output) <- c("Disease", "Cases", "Controls", "Mean time to event")
}

# order by cases low to high
sort <- output[order(output$Cases),]  
write.csv(sort, file = 'path.../summary_N_full_table.csv', row.names =F)

### JOIN THEM TOGETHER

basic <- read.csv('path.../summary_N_basic_table.csv')
full <- read.csv('path.../summary_N_full_table.csv')
names(basic) <- c("Disease", "Cases basic", "Controls basic", "Mean time to event basic")
names(full) <- c("Disease", "Cases full", "Controls full", "Mean time to event full")
join <- left_join(basic, full, by = "Disease")
write.csv(join, "path.../joint.csv", row.names = F)

### ADD DEATH

d <- read.csv("path.../DEATH.csv")
cox <- d
data <- cox[,which(colnames(cox) %in% c('SampleID', 'pseudo_ind_id', "Age_assessment", "Sex", "dead", "aged", "Event", "age_death", "age_at_event", "tte", 'BMI', 'Dep', 'Alc', 'Smo', 'Edu'))]
data$Outcome <- name
write.csv(data, 'path.../DEATH.csv', row.names = F)
output <- matrix(nrow = 1, ncol = 4)
output <- as.data.frame(output)

join <- data
# remove missing covariates 
join <- join[!is.na(join$Alc), ]
join <- join[!is.na(join$Smo), ]
join <- join[!is.na(join$Dep), ]
join <- join[!is.na(join$Edu), ]
join <- join[!is.na(join$BMI), ]
df <- join
# generate metrics required
cases <- df[which(df$Event == "1"),]
case <- cases %>% filter(!tte == 'NA')
case_count <- nrow(case)
control <- df[which(df$Event == "0"),]
control_count <- nrow(control)
mean_tte <- mean(case$tte, na.rm = T) 
mean_tte <- round(mean_tte, digits = 1)
sd_tte <- sd(case$tte, na.rm = T)
sd_tte <- round(sd_tte, digits = 1) 
mean_sd <- paste0(mean_tte, " ", "(", sd_tte, ")") 
name_dis <- unique(df$Outcome)
# output metrics 
output[1,1] <- 'DEATH'
output[1,2] <- case_count
output[1,3] <- control_count
output[1,4] <- mean_sd
names(output) <- c("Disease", "Cases", "Controls", "Mean time to event")