UKB-project / 11_Process_cox_results.R
11_Process_cox_results.R
Raw
###############################################################################################

##### Process models - incident disease and mortality together 

###############################################################################################

# srun -p interactive --pty bash

library(tidyverse)
library(readxl)
naming <- read_excel("...path/naming_index.xlsx")
naming <- as.data.frame(naming)
names(naming)[1] <- 'Outcome'

# COLLATE RESULTS 
# Set disease and read in disease codes for the iteration (array)
diseases <- c('AL', 'ALS', 'BRAIN', 'Breast', 'Colorectal', 'COPD', 'CYS', 'Dep', 'Diab', 'ENDO', 'GYN',
              'IBD', 'IHD', 'LIV', 'LUNG', 'LUP', 'MS', 'PD', 'Prostate', 'RA', 'SCZ', 'ST', 'VD', 'DEATH')

location <- '...path/Results/'
files <- list.files(location, '.csv')
files_full <- files[grep('FULL', files)]
files_basic <- files[-grep('FULL', files)]

# Basic results collation

results <- list()
for(i in 1:length(files_basic)){
  name <- as.character(diseases[i])
  print(name)
  data <- read.csv(paste0(location, name, '.csv'))
  results[[i]] <- data
}

res <- do.call(rbind, results)
res <- res[order(res$P.Value),]
res <- left_join(res, naming, by = 'Outcome')
write.csv(res, '...path/basic_Bon.csv', row.names = F)
top <- res[which(res$P.Value < 0.00000541125),] # 4916 passing

# Full results collation 
for(i in 1:length(files_full)){
  name <- as.character(diseases[i])
  print(name)
  data <- read.csv(paste0(location, name, '_FULL.csv'))
  results[[i]] <- data
}

res2 <- do.call(rbind, results)
res2 <- res2[order(res2$P.Value),]
res2 <- left_join(res2, naming, by = 'Outcome')
write.csv(res2, '...path/full_Bon.csv', row.names = F)


# Associations passing adjustment
top$retain <- paste0(top$Predictor, top$Outcome)
res2$retain <- paste0(res2$Predictor, res2$Outcome)
keep <- res2[which(res2$retain %in% top$retain),]
keep <- keep[which(keep$P.Value < 0.00000541125),] # 3123
keep <- left_join(keep, naming, by = 'Outcome')
write.csv(keep, '...path/Associations_retained_Bon.csv', row.names = F)
length(unique(keep$Predictor)) # 1052 proteins

# Failure rate across top models
assocs <- as.data.frame(table(keep$Outcome))
assocs <- assocs[order(assocs$Freq),]
names(assocs) <- c('Incident disease', 'Associations')
fail <- keep[which(keep$cox.zph_protein < 0.05),]
fails <- as.data.frame(table(fail$Outcome))
names(fails) <- c('Incident disease', 'Protein_fail')
assocs <- left_join(assocs, fails, by = 'Incident disease')
write.csv(assocs, '...path/Assocs_summary_table_Bon.csv', row.names = F)

###############################################################################################

### Create a summary table showing proteins associated with each disease and direction of effect 

# srun -p interactive --pty bash

library(tidyverse)
keep <- read.csv('...path/Associations_retained_Bon.csv')
diseases <- c('DEATH', 'AL', 'ALS', 'BRAIN', 'Breast', 'Colorectal', 'COPD', 'CYS', 'Dep', 'Diab', 'ENDO', 'GYN',
              'IBD', 'IHD', 'LIV', 'LUNG', 'LUP', 'MS', 'PD', 'Prostate', 'RA', 'SCZ', 'ST', 'VD')
res <- data.frame(Outcome = 1:24, Associations = 1:24, Negative = 1:24, Positive = 1:24, Names_neg = 1:24, Names_pos = 1:24)

for(i in 1:length(diseases)){
  dis <- as.character(diseases[i])
  table <- keep[which(keep$Outcome %in% dis),]
  num <- dim(table)[1]
  
  neg <- table[which(table$Hazard.Ratio < 1),]
  pos <- table[which(table$Hazard.Ratio >= 1),]
 
  neg_list <- neg[,1]
  pos_list <- pos[,1]
  
  neg_list <- gsub("\\..*", "", neg_list)
  pos_list <- gsub("\\..*", "", pos_list)
  
  neg_list <- str_c(neg_list, collapse = ", ")
  pos_list <- str_c(pos_list, collapse = ", ")
  
  res[i,1] <- dis
  res[i,2] <- num
  res[i,3] <- dim(neg)[1] 
  res[i,4] <- dim(pos)[1]
  res[i,5] <- neg_list
  res[i,6] <- pos_list
}

res <- res[order(res[,2]),]
write.csv(res, '...path/summary_table_associations_Bon.csv', row.names = F)

###############################################################################