UKB-project / 19_MethylPipeR_processing2.R
19_MethylPipeR_processing2.R
Raw
###############################################################################################

##### MethylPipeR chosen models - processing Cox PH AUC PRAUC and saving results together for each score

###############################################################################################

# Train ProteinScores for traits using desired case: control ratio
# srun -p interactive --pty bash
library(readxl)
library(glmnet)
library(survival)
library(gbm)
library(data.table)
library(pacman)
library(tidyverse)


###############################################################################################

### 10yr: Extract chosen models for each outcome and save files in location easily accessible

###############################################################################################

chosen <- read.csv("path.../combined_chosen.csv")

chosen <- chosen %>% mutate(Batch = case_when(
  iteration %in% c(1:10) ~ 'Batch1',
  iteration %in% c(11:20) ~ 'Batch2',
  iteration %in% c(21:30) ~ 'Batch3',
  iteration %in% c(31:40) ~ 'Batch4',
  iteration %in% c(41:50) ~ 'Batch5'
))

# For each 10yr disease, pull chosen model, load key files and examine fully adjusted cox PH
batches_location <- 'path.../Diseases_140223/Batches/'
chosen_model_files <- 'path.../Chosen_model_files/Files_10y/'

traits <- c('AL', 'Breast', 'Colorectal', 'COPD', 'Diab', 'GYN', 'IBD', 'IHD', 'LIV', 'LUNG', 'PD',
            'Prostate', 'RA', 'ST', 'VD')
# traits <- c('ALS', 'Dep', 'LUP', 'ENDO')

batches <- c('Batch1', 'Batch2', 'Batch3', 'Batch4', 'Batch5')
index <- c('batch1', 'batch2', 'batch3', 'batch4', 'batch5')

models_res <- list()

for(i in 1:length(traits)){
  trait <- as.character(traits[i])
  print(trait)

  # Identify median model iteration and batch
  chosen_sub <- chosen[which(chosen$name %in% trait),]
  it <- chosen_sub$iteration
  bat <- chosen_sub$Batch
  see <- chosen_sub$seed
  set.seed(see)

  # Load in model projection file with covariates for Cox PH analyses
  testTarget <- readRDS(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, "testTarget.rds"))
  testTarget <- na.omit(testTarget)
  # Run all metrics required

  # Load weights and other models files for chosen model
  weights <- read.csv(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, '_', it, "_weights.csv"))
  mod <- readRDS(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, "models.rds"))

  # Compare null and scores added cox models
  if(trait %in% c('Prostate', 'CYS', 'GYN', 'Breast', 'ENDO')){
    riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment, testTarget)
    fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + dScore, testTarget)
    ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget)
    allrisk <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc), testTarget)
    allriskprotein <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + dScore, testTarget)
    print('Model has been sex stratified in cox.')
  } else {
    riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex), testTarget)
    fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + dScore, testTarget)
    ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget)
    allrisk <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc), testTarget)
    allriskprotein <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + dScore, testTarget)
    print('Model has not been sex stratified in cox.')
  }
  # + Dep + as.factor(Smo) + as.factor(Alc) + BMI +

  # List models
  models <- list(r = riskFactorsOnlyCoxPH, f = fullCoxPH, t = ProteinOnly, s = allrisk, b = allriskprotein)


  # Generate performance statistics - for 10 years of follow up, comparing base HR to survival HR at 10 years
  predictCoxPHOnset <- function(dataDF, coxPHModel, threshold = 10) {
    uniqueTimes <- uniqueTimes <- sort(unique(c(dataDF$time_to_event, threshold)))
    thresholdIndex <- match(threshold, uniqueTimes)
    cumulativeBaseHaz <- gbm::basehaz.gbm(dataDF$time_to_event, dataDF$Event, predict(coxPHModel), uniqueTimes)
    survivalPredictions <- exp(-cumulativeBaseHaz[[thresholdIndex]]) ^ exp(predict(coxPHModel))
    onsetPredictions <- 1 - survivalPredictions

    print('Cumulative baseline hazard presence NA - print onset predictions:')
    print(table(is.na(onsetPredictions)))

    # Event should be 0 if tte is > 10
    dataDF$Event <- sapply(1:nrow(dataDF), function(i) {
      if (dataDF$time_to_event[[i]] > threshold) {
        0
      } else {
        dataDF$Event[[i]]
      }
    })

    auc <- MLmetrics::AUC(y_pred = onsetPredictions, y_true = dataDF$Event)
    prauc <- MLmetrics::PRAUC(y_pred = onsetPredictions, y_true = dataDF$Event)
    roc <- pROC::roc(response = dataDF$Event, predictor = onsetPredictions)
    list(cumulativeBaseHaz = cumulativeBaseHaz, onsetPredictions = onsetPredictions, auc = auc, prauc = prauc, roc = roc)
  }

  # Extract test results
  testResults <- lapply(models, function(m) {predictCoxPHOnset(testTarget, m)})

  # Extract onset predictions
  predictions <- lapply(testResults, function(r) {r$onsetPredictions})

  # Calculate p values for AUC comparisons of interest (age/sex + protein)
  p_null <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$r, predictor2 = predictions$f)$p.value

  p_second <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$s, predictor2 = predictions$b)$p.value

  # Extract metrics table
  aucs <- sapply(testResults, function(r) {r$auc})
  praucs <- sapply(testResults, function(r) {r$prauc})
  metricsTable <- data.frame(AUC = aucs, PRAUC = praucs)
  metricsTable[6,1] <- metricsTable[2,1] - metricsTable[1,1]
  metricsTable[6,2] <- metricsTable[2,2] - metricsTable[1,2]
  metricsTable[7,1] <- metricsTable[5,1] - metricsTable[4,1]
  metricsTable[7,2] <- metricsTable[5,2] - metricsTable[4,2]
  row.names(metricsTable) <- c('AgeSex', 'AgeSex_ProteinScore', 'ProteinScore Only', 'AgeSexCovs', 'AgeSexCovs_ProteinScore',
                               'Diff_AgeSex', 'Diff_AgeSexCovs')
  metricsTable$Outcome <- trait
  metricsTable$seed <- see
  metricsTable$iteration <- it

  # Add P values for AUCs to table
  metricsTable$P_ROCs <- 'NA'
  metricsTable[6,6] <- p_null
  metricsTable[7,6] <- p_second


  write.csv(weights, paste0(chosen_model_files, trait, '_weights.csv'))
  saveRDS(testResults, paste0(chosen_model_files, trait, '_testResults.rds'))
  saveRDS(testTarget, paste0(chosen_model_files, trait, '_testTarget.rds'))
  saveRDS(metricsTable, paste0(chosen_model_files, trait, '_metricsTable.rds'))
  saveRDS(mod, paste0(chosen_model_files, trait, '_models.rds'))

  models_res[[i]] <- metricsTable
}

bind <- do.call(rbind, models_res)

library(readxl)
naming <- read_excel("path.../naming_index.xlsx")
naming <- as.data.frame(naming)
names(naming)[1] <- 'Outcome'

bind <- left_join(bind, naming, by = 'Outcome')


write.csv(bind, 'path.../disease_results_full_10yr.csv')

###############################################################################################

### 5yr: Extract chosen models for each outcome and save files in location easily accessible 

###############################################################################################

chosen <- read.csv("path.../combined_chosen.csv")

chosen <- chosen %>% mutate(Batch = case_when(
  iteration %in% c(1:10) ~ 'Batch1',
  iteration %in% c(11:20) ~ 'Batch2',
  iteration %in% c(21:30) ~ 'Batch3',
  iteration %in% c(31:40) ~ 'Batch4',
  iteration %in% c(41:50) ~ 'Batch5'
))

# For each 5yr disease, pull chosen model, load key files and examine fully adjusted cox PH
batches_location <- 'path.../Batches/'
chosen_model_files <- 'path.../Chosen_model_files/Files_5y/'

traits <- c('ALS', 'Dep', 'LUP', 'ENDO')

batches <- c('Batch1', 'Batch2', 'Batch3', 'Batch4', 'Batch5')
index <- c('batch1', 'batch2', 'batch3', 'batch4', 'batch5')

models_res <- list()

for(i in 1:length(traits)){
  trait <- as.character(traits[i])
  print(trait)
  
  # Identify median model iteration and batch 
  chosen_sub <- chosen[which(chosen$name %in% trait),]
  it <- chosen_sub$iteration
  bat <- chosen_sub$Batch
  see <- chosen_sub$seed
  set.seed(see)
  
  # Load in model projection file with covariates for Cox PH analyses
  testTarget <- readRDS(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, "testTarget.rds"))
  testTarget <- na.omit(testTarget)
  # Run all metrics required
  
  # Load weights and other models files for chosen model
  weights <- read.csv(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, '_', it, "_weights.csv"))
  mod <- readRDS(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, "models.rds"))
  
  # Compare null and scores added cox models
  if(trait %in% c('Prostate', 'CYS', 'GYN', 'Breast', 'ENDO')){
    riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment, testTarget)
    fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + dScore, testTarget)
    ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget)
    allrisk <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc), testTarget)
    allriskprotein <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + dScore, testTarget)
    print('Model has been sex stratified in cox.')
  } else {
    riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex), testTarget)
    fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + dScore, testTarget)
    ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget)
    allrisk <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc), testTarget)
    allriskprotein <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + dScore, testTarget)
    print('Model has not been sex stratified in cox.')
  }
  
  # List models
  models <- list(r = riskFactorsOnlyCoxPH, f = fullCoxPH, t = ProteinOnly, s = allrisk, b = allriskprotein)
  
  
  # Generate performance statistics - for 10 years of follow up, comparing base HR to survival HR at 10 years 
  predictCoxPHOnset <- function(dataDF, coxPHModel, threshold = 5) {
    uniqueTimes <- uniqueTimes <- sort(unique(c(dataDF$time_to_event, threshold)))
    thresholdIndex <- match(threshold, uniqueTimes)
    cumulativeBaseHaz <- gbm::basehaz.gbm(dataDF$time_to_event, dataDF$Event, predict(coxPHModel), uniqueTimes)
    survivalPredictions <- exp(-cumulativeBaseHaz[[thresholdIndex]]) ^ exp(predict(coxPHModel))
    onsetPredictions <- 1 - survivalPredictions
    
    print('Cumulative baseline hazard presence NA - print onset predictions:')
    print(table(is.na(onsetPredictions)))
    
    # Event should be 0 if tte is > 10
    dataDF$Event <- sapply(1:nrow(dataDF), function(i) {
      if (dataDF$time_to_event[[i]] > threshold) {
        0
      } else {
        dataDF$Event[[i]]
      }
    })
    
    auc <- MLmetrics::AUC(y_pred = onsetPredictions, y_true = dataDF$Event)
    prauc <- MLmetrics::PRAUC(y_pred = onsetPredictions, y_true = dataDF$Event)
    roc <- pROC::roc(response = dataDF$Event, predictor = onsetPredictions)
    list(cumulativeBaseHaz = cumulativeBaseHaz, onsetPredictions = onsetPredictions, auc = auc, prauc = prauc, roc = roc)
  }
  
  # Extract test results
  testResults <- lapply(models, function(m) {predictCoxPHOnset(testTarget, m)})
  
  # Extract onset predictions 
  predictions <- lapply(testResults, function(r) {r$onsetPredictions})
  
  # Calculate p values for AUC comparisons of interest (age/sex + protein)
  p_null <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$r, predictor2 = predictions$f)$p.value
  
  p_second <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$s, predictor2 = predictions$b)$p.value
  
  # Extract metrics table
  aucs <- sapply(testResults, function(r) {r$auc})
  praucs <- sapply(testResults, function(r) {r$prauc})
  metricsTable <- data.frame(AUC = aucs, PRAUC = praucs)
  metricsTable[6,1] <- metricsTable[2,1] - metricsTable[1,1]
  metricsTable[6,2] <- metricsTable[2,2] - metricsTable[1,2]
  metricsTable[7,1] <- metricsTable[5,1] - metricsTable[4,1]
  metricsTable[7,2] <- metricsTable[5,2] - metricsTable[4,2]
  row.names(metricsTable) <- c('AgeSex', 'AgeSex_ProteinScore', 'ProteinScore Only', 'AgeSexCovs', 'AgeSexCovs_ProteinScore',
                               'Diff_AgeSex', 'Diff_AgeSexCovs')
  metricsTable$Outcome <- trait
  metricsTable$seed <- see
  metricsTable$iteration <- it
  
  # Add P values for AUCs to table
  metricsTable$P_ROCs <- 'NA'
  metricsTable[6,6] <- p_null
  metricsTable[7,6] <- p_second
  
  
  write.csv(weights, paste0(chosen_model_files, trait, '_weights.csv'))
  saveRDS(testResults, paste0(chosen_model_files, trait, '_testResults.rds'))
  saveRDS(testTarget, paste0(chosen_model_files, trait, '_testTarget.rds'))
  saveRDS(metricsTable, paste0(chosen_model_files, trait, '_metricsTable.rds'))
  saveRDS(mod, paste0(chosen_model_files, trait, '_models.rds'))
  
  models_res[[i]] <- metricsTable
}

bind <- do.call(rbind, models_res)

library(readxl)
naming <- read_excel("path.../naming_index.xlsx")
naming <- as.data.frame(naming)
names(naming)[1] <- 'Outcome'

bind <- left_join(bind, naming, by = 'Outcome')

write.csv(bind, 'path.../disease_results_full_5yr.csv')


###############################################################################################

### 10yr: Mortality

###############################################################################################

chosen <- read.csv("path.../combined_chosen.csv")

chosen <- chosen %>% mutate(Batch = case_when(
  iteration %in% c(1:10) ~ 'Batch1',
  iteration %in% c(11:20) ~ 'Batch2',
  iteration %in% c(21:30) ~ 'Batch3',
  iteration %in% c(31:40) ~ 'Batch4',
  iteration %in% c(41:50) ~ 'Batch5'
))


# For each 10yr disease, pull chosen model, load key files and examine fully adjusted cox PH

batches_location <- 'path.../Death_140223/Batches/'
chosen_model_files <- 'path.../Chosen_model_files/Files_10y/'

traits <- 'DEATH'

batches <- c('Batch1', 'Batch2', 'Batch3', 'Batch4', 'Batch5')
index <- c('batch1', 'batch2', 'batch3', 'batch4', 'batch5')

models_res <- list()

for(i in 1:length(traits)){
  trait <- as.character(traits[i])
  print(trait)
  
  # Identify median model iteration and batch
  chosen_sub <- chosen[which(chosen$name %in% trait),]
  it <- chosen_sub$iteration
  bat <- chosen_sub$Batch
  see <- chosen_sub$seed
  set.seed(see)
  
  # Load in model projection file with covariates for Cox PH analyses
  testTarget <- readRDS(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, "testTarget.rds"))
  testTarget <- na.omit(testTarget)
  # Run all metrics required
  
  # Load weights and other models files for chosen model
  weights <- read.csv(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, '_', it, "_weights.csv"))
  mod <- readRDS(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, "models.rds"))
  
  # Compare null and scores added cox models
  if(trait %in% c('Prostate', 'CYS', 'GYN', 'Breast', 'ENDO')){
    riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment, testTarget)
    fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + dScore, testTarget)
    ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget)
    allrisk <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc), testTarget)
    allriskprotein <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + dScore, testTarget)
    print('Model has been sex stratified in cox.')
  } else {
    riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex), testTarget)
    fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + dScore, testTarget)
    ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget)
    allrisk <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc), testTarget)
    allriskprotein <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + dScore, testTarget)
    print('Model has not been sex stratified in cox.')
  }
  
  # List models
  models <- list(r = riskFactorsOnlyCoxPH, f = fullCoxPH, t = ProteinOnly, s = allrisk, b = allriskprotein)
  
  
  # Generate performance statistics - for 10 years of follow up, comparing base HR to survival HR at 10 years
  predictCoxPHOnset <- function(dataDF, coxPHModel, threshold = 10) {
    uniqueTimes <- uniqueTimes <- sort(unique(c(dataDF$time_to_event, threshold)))
    thresholdIndex <- match(threshold, uniqueTimes)
    cumulativeBaseHaz <- gbm::basehaz.gbm(dataDF$time_to_event, dataDF$Event, predict(coxPHModel), uniqueTimes)
    survivalPredictions <- exp(-cumulativeBaseHaz[[thresholdIndex]]) ^ exp(predict(coxPHModel))
    onsetPredictions <- 1 - survivalPredictions
    
    print('Cumulative baseline hazard presence NA - print onset predictions:')
    print(table(is.na(onsetPredictions)))
    
    # Event should be 0 if tte is > 10
    dataDF$Event <- sapply(1:nrow(dataDF), function(i) {
      if (dataDF$time_to_event[[i]] > threshold) {
        0
      } else {
        dataDF$Event[[i]]
      }
    })
    
    auc <- MLmetrics::AUC(y_pred = onsetPredictions, y_true = dataDF$Event)
    prauc <- MLmetrics::PRAUC(y_pred = onsetPredictions, y_true = dataDF$Event)
    roc <- pROC::roc(response = dataDF$Event, predictor = onsetPredictions)
    list(cumulativeBaseHaz = cumulativeBaseHaz, onsetPredictions = onsetPredictions, auc = auc, prauc = prauc, roc = roc)
  }
  
  # Extract test results
  testResults <- lapply(models, function(m) {predictCoxPHOnset(testTarget, m)})
  
  # Extract onset predictions
  predictions <- lapply(testResults, function(r) {r$onsetPredictions})
  
  # Calculate p values for AUC comparisons of interest (age/sex + protein)
  p_null <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$r, predictor2 = predictions$f)$p.value
  
  p_second <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$s, predictor2 = predictions$b)$p.value
  
  # Extract metrics table
  aucs <- sapply(testResults, function(r) {r$auc})
  praucs <- sapply(testResults, function(r) {r$prauc})
  metricsTable <- data.frame(AUC = aucs, PRAUC = praucs)
  metricsTable[6,1] <- metricsTable[2,1] - metricsTable[1,1]
  metricsTable[6,2] <- metricsTable[2,2] - metricsTable[1,2]
  metricsTable[7,1] <- metricsTable[5,1] - metricsTable[4,1]
  metricsTable[7,2] <- metricsTable[5,2] - metricsTable[4,2]
  row.names(metricsTable) <- c('AgeSex', 'AgeSex_ProteinScore', 'ProteinScore Only', 'AgeSexCovs', 'AgeSexCovs_ProteinScore',
                               'Diff_AgeSex', 'Diff_AgeSexCovs')
  metricsTable$Outcome <- trait
  metricsTable$seed <- see
  metricsTable$iteration <- it
  
  # Add P values for AUCs to table
  metricsTable$P_ROCs <- 'NA'
  metricsTable[6,6] <- p_null
  metricsTable[7,6] <- p_second
  
  
  write.csv(weights, paste0(chosen_model_files, trait, '_weights.csv'))
  saveRDS(testResults, paste0(chosen_model_files, trait, '_testResults.rds'))
  saveRDS(testTarget, paste0(chosen_model_files, trait, '_testTarget.rds'))
  saveRDS(metricsTable, paste0(chosen_model_files, trait, '_metricsTable.rds'))
  saveRDS(mod, paste0(chosen_model_files, trait, '_models.rds'))
  
  models_res[[i]] <- metricsTable
}

bind <- do.call(rbind, models_res)

library(readxl)
naming <- read_excel("path.../naming_index.xlsx")
naming <- as.data.frame(naming)
names(naming)[1] <- 'Outcome'

bind <- left_join(bind, naming, by = 'Outcome')

write.csv(bind, 'path.../death_results_full_10yr.csv')