############################################################################################### ##### MethylPipeR chosen models - processing Cox PH AUC PRAUC and saving results together for each score ############################################################################################### # Train ProteinScores for traits using desired case: control ratio # srun -p interactive --pty bash library(readxl) library(glmnet) library(survival) library(gbm) library(data.table) library(pacman) library(tidyverse) ############################################################################################### ### 10yr: Extract chosen models for each outcome and save files in location easily accessible ############################################################################################### chosen <- read.csv("path.../combined_chosen.csv") chosen <- chosen %>% mutate(Batch = case_when( iteration %in% c(1:10) ~ 'Batch1', iteration %in% c(11:20) ~ 'Batch2', iteration %in% c(21:30) ~ 'Batch3', iteration %in% c(31:40) ~ 'Batch4', iteration %in% c(41:50) ~ 'Batch5' )) # For each 10yr disease, pull chosen model, load key files and examine fully adjusted cox PH batches_location <- 'path.../Diseases_140223/Batches/' chosen_model_files <- 'path.../Chosen_model_files/Files_10y/' traits <- c('AL', 'Breast', 'Colorectal', 'COPD', 'Diab', 'GYN', 'IBD', 'IHD', 'LIV', 'LUNG', 'PD', 'Prostate', 'RA', 'ST', 'VD') # traits <- c('ALS', 'Dep', 'LUP', 'ENDO') batches <- c('Batch1', 'Batch2', 'Batch3', 'Batch4', 'Batch5') index <- c('batch1', 'batch2', 'batch3', 'batch4', 'batch5') models_res <- list() for(i in 1:length(traits)){ trait <- as.character(traits[i]) print(trait) # Identify median model iteration and batch chosen_sub <- chosen[which(chosen$name %in% trait),] it <- chosen_sub$iteration bat <- chosen_sub$Batch see <- chosen_sub$seed set.seed(see) # Load in model projection file with covariates for Cox PH analyses testTarget <- readRDS(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, "testTarget.rds")) testTarget <- na.omit(testTarget) # Run all metrics required # Load weights and other models files for chosen model weights <- read.csv(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, '_', it, "_weights.csv")) mod <- readRDS(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, "models.rds")) # Compare null and scores added cox models if(trait %in% c('Prostate', 'CYS', 'GYN', 'Breast', 'ENDO')){ riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment, testTarget) fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + dScore, testTarget) ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget) allrisk <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc), testTarget) allriskprotein <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + dScore, testTarget) print('Model has been sex stratified in cox.') } else { riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex), testTarget) fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + dScore, testTarget) ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget) allrisk <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc), testTarget) allriskprotein <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + dScore, testTarget) print('Model has not been sex stratified in cox.') } # + Dep + as.factor(Smo) + as.factor(Alc) + BMI + # List models models <- list(r = riskFactorsOnlyCoxPH, f = fullCoxPH, t = ProteinOnly, s = allrisk, b = allriskprotein) # Generate performance statistics - for 10 years of follow up, comparing base HR to survival HR at 10 years predictCoxPHOnset <- function(dataDF, coxPHModel, threshold = 10) { uniqueTimes <- uniqueTimes <- sort(unique(c(dataDF$time_to_event, threshold))) thresholdIndex <- match(threshold, uniqueTimes) cumulativeBaseHaz <- gbm::basehaz.gbm(dataDF$time_to_event, dataDF$Event, predict(coxPHModel), uniqueTimes) survivalPredictions <- exp(-cumulativeBaseHaz[[thresholdIndex]]) ^ exp(predict(coxPHModel)) onsetPredictions <- 1 - survivalPredictions print('Cumulative baseline hazard presence NA - print onset predictions:') print(table(is.na(onsetPredictions))) # Event should be 0 if tte is > 10 dataDF$Event <- sapply(1:nrow(dataDF), function(i) { if (dataDF$time_to_event[[i]] > threshold) { 0 } else { dataDF$Event[[i]] } }) auc <- MLmetrics::AUC(y_pred = onsetPredictions, y_true = dataDF$Event) prauc <- MLmetrics::PRAUC(y_pred = onsetPredictions, y_true = dataDF$Event) roc <- pROC::roc(response = dataDF$Event, predictor = onsetPredictions) list(cumulativeBaseHaz = cumulativeBaseHaz, onsetPredictions = onsetPredictions, auc = auc, prauc = prauc, roc = roc) } # Extract test results testResults <- lapply(models, function(m) {predictCoxPHOnset(testTarget, m)}) # Extract onset predictions predictions <- lapply(testResults, function(r) {r$onsetPredictions}) # Calculate p values for AUC comparisons of interest (age/sex + protein) p_null <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$r, predictor2 = predictions$f)$p.value p_second <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$s, predictor2 = predictions$b)$p.value # Extract metrics table aucs <- sapply(testResults, function(r) {r$auc}) praucs <- sapply(testResults, function(r) {r$prauc}) metricsTable <- data.frame(AUC = aucs, PRAUC = praucs) metricsTable[6,1] <- metricsTable[2,1] - metricsTable[1,1] metricsTable[6,2] <- metricsTable[2,2] - metricsTable[1,2] metricsTable[7,1] <- metricsTable[5,1] - metricsTable[4,1] metricsTable[7,2] <- metricsTable[5,2] - metricsTable[4,2] row.names(metricsTable) <- c('AgeSex', 'AgeSex_ProteinScore', 'ProteinScore Only', 'AgeSexCovs', 'AgeSexCovs_ProteinScore', 'Diff_AgeSex', 'Diff_AgeSexCovs') metricsTable$Outcome <- trait metricsTable$seed <- see metricsTable$iteration <- it # Add P values for AUCs to table metricsTable$P_ROCs <- 'NA' metricsTable[6,6] <- p_null metricsTable[7,6] <- p_second write.csv(weights, paste0(chosen_model_files, trait, '_weights.csv')) saveRDS(testResults, paste0(chosen_model_files, trait, '_testResults.rds')) saveRDS(testTarget, paste0(chosen_model_files, trait, '_testTarget.rds')) saveRDS(metricsTable, paste0(chosen_model_files, trait, '_metricsTable.rds')) saveRDS(mod, paste0(chosen_model_files, trait, '_models.rds')) models_res[[i]] <- metricsTable } bind <- do.call(rbind, models_res) library(readxl) naming <- read_excel("path.../naming_index.xlsx") naming <- as.data.frame(naming) names(naming)[1] <- 'Outcome' bind <- left_join(bind, naming, by = 'Outcome') write.csv(bind, 'path.../disease_results_full_10yr.csv') ############################################################################################### ### 5yr: Extract chosen models for each outcome and save files in location easily accessible ############################################################################################### chosen <- read.csv("path.../combined_chosen.csv") chosen <- chosen %>% mutate(Batch = case_when( iteration %in% c(1:10) ~ 'Batch1', iteration %in% c(11:20) ~ 'Batch2', iteration %in% c(21:30) ~ 'Batch3', iteration %in% c(31:40) ~ 'Batch4', iteration %in% c(41:50) ~ 'Batch5' )) # For each 5yr disease, pull chosen model, load key files and examine fully adjusted cox PH batches_location <- 'path.../Batches/' chosen_model_files <- 'path.../Chosen_model_files/Files_5y/' traits <- c('ALS', 'Dep', 'LUP', 'ENDO') batches <- c('Batch1', 'Batch2', 'Batch3', 'Batch4', 'Batch5') index <- c('batch1', 'batch2', 'batch3', 'batch4', 'batch5') models_res <- list() for(i in 1:length(traits)){ trait <- as.character(traits[i]) print(trait) # Identify median model iteration and batch chosen_sub <- chosen[which(chosen$name %in% trait),] it <- chosen_sub$iteration bat <- chosen_sub$Batch see <- chosen_sub$seed set.seed(see) # Load in model projection file with covariates for Cox PH analyses testTarget <- readRDS(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, "testTarget.rds")) testTarget <- na.omit(testTarget) # Run all metrics required # Load weights and other models files for chosen model weights <- read.csv(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, '_', it, "_weights.csv")) mod <- readRDS(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, "models.rds")) # Compare null and scores added cox models if(trait %in% c('Prostate', 'CYS', 'GYN', 'Breast', 'ENDO')){ riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment, testTarget) fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + dScore, testTarget) ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget) allrisk <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc), testTarget) allriskprotein <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + dScore, testTarget) print('Model has been sex stratified in cox.') } else { riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex), testTarget) fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + dScore, testTarget) ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget) allrisk <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc), testTarget) allriskprotein <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + dScore, testTarget) print('Model has not been sex stratified in cox.') } # List models models <- list(r = riskFactorsOnlyCoxPH, f = fullCoxPH, t = ProteinOnly, s = allrisk, b = allriskprotein) # Generate performance statistics - for 10 years of follow up, comparing base HR to survival HR at 10 years predictCoxPHOnset <- function(dataDF, coxPHModel, threshold = 5) { uniqueTimes <- uniqueTimes <- sort(unique(c(dataDF$time_to_event, threshold))) thresholdIndex <- match(threshold, uniqueTimes) cumulativeBaseHaz <- gbm::basehaz.gbm(dataDF$time_to_event, dataDF$Event, predict(coxPHModel), uniqueTimes) survivalPredictions <- exp(-cumulativeBaseHaz[[thresholdIndex]]) ^ exp(predict(coxPHModel)) onsetPredictions <- 1 - survivalPredictions print('Cumulative baseline hazard presence NA - print onset predictions:') print(table(is.na(onsetPredictions))) # Event should be 0 if tte is > 10 dataDF$Event <- sapply(1:nrow(dataDF), function(i) { if (dataDF$time_to_event[[i]] > threshold) { 0 } else { dataDF$Event[[i]] } }) auc <- MLmetrics::AUC(y_pred = onsetPredictions, y_true = dataDF$Event) prauc <- MLmetrics::PRAUC(y_pred = onsetPredictions, y_true = dataDF$Event) roc <- pROC::roc(response = dataDF$Event, predictor = onsetPredictions) list(cumulativeBaseHaz = cumulativeBaseHaz, onsetPredictions = onsetPredictions, auc = auc, prauc = prauc, roc = roc) } # Extract test results testResults <- lapply(models, function(m) {predictCoxPHOnset(testTarget, m)}) # Extract onset predictions predictions <- lapply(testResults, function(r) {r$onsetPredictions}) # Calculate p values for AUC comparisons of interest (age/sex + protein) p_null <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$r, predictor2 = predictions$f)$p.value p_second <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$s, predictor2 = predictions$b)$p.value # Extract metrics table aucs <- sapply(testResults, function(r) {r$auc}) praucs <- sapply(testResults, function(r) {r$prauc}) metricsTable <- data.frame(AUC = aucs, PRAUC = praucs) metricsTable[6,1] <- metricsTable[2,1] - metricsTable[1,1] metricsTable[6,2] <- metricsTable[2,2] - metricsTable[1,2] metricsTable[7,1] <- metricsTable[5,1] - metricsTable[4,1] metricsTable[7,2] <- metricsTable[5,2] - metricsTable[4,2] row.names(metricsTable) <- c('AgeSex', 'AgeSex_ProteinScore', 'ProteinScore Only', 'AgeSexCovs', 'AgeSexCovs_ProteinScore', 'Diff_AgeSex', 'Diff_AgeSexCovs') metricsTable$Outcome <- trait metricsTable$seed <- see metricsTable$iteration <- it # Add P values for AUCs to table metricsTable$P_ROCs <- 'NA' metricsTable[6,6] <- p_null metricsTable[7,6] <- p_second write.csv(weights, paste0(chosen_model_files, trait, '_weights.csv')) saveRDS(testResults, paste0(chosen_model_files, trait, '_testResults.rds')) saveRDS(testTarget, paste0(chosen_model_files, trait, '_testTarget.rds')) saveRDS(metricsTable, paste0(chosen_model_files, trait, '_metricsTable.rds')) saveRDS(mod, paste0(chosen_model_files, trait, '_models.rds')) models_res[[i]] <- metricsTable } bind <- do.call(rbind, models_res) library(readxl) naming <- read_excel("path.../naming_index.xlsx") naming <- as.data.frame(naming) names(naming)[1] <- 'Outcome' bind <- left_join(bind, naming, by = 'Outcome') write.csv(bind, 'path.../disease_results_full_5yr.csv') ############################################################################################### ### 10yr: Mortality ############################################################################################### chosen <- read.csv("path.../combined_chosen.csv") chosen <- chosen %>% mutate(Batch = case_when( iteration %in% c(1:10) ~ 'Batch1', iteration %in% c(11:20) ~ 'Batch2', iteration %in% c(21:30) ~ 'Batch3', iteration %in% c(31:40) ~ 'Batch4', iteration %in% c(41:50) ~ 'Batch5' )) # For each 10yr disease, pull chosen model, load key files and examine fully adjusted cox PH batches_location <- 'path.../Death_140223/Batches/' chosen_model_files <- 'path.../Chosen_model_files/Files_10y/' traits <- 'DEATH' batches <- c('Batch1', 'Batch2', 'Batch3', 'Batch4', 'Batch5') index <- c('batch1', 'batch2', 'batch3', 'batch4', 'batch5') models_res <- list() for(i in 1:length(traits)){ trait <- as.character(traits[i]) print(trait) # Identify median model iteration and batch chosen_sub <- chosen[which(chosen$name %in% trait),] it <- chosen_sub$iteration bat <- chosen_sub$Batch see <- chosen_sub$seed set.seed(see) # Load in model projection file with covariates for Cox PH analyses testTarget <- readRDS(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, "testTarget.rds")) testTarget <- na.omit(testTarget) # Run all metrics required # Load weights and other models files for chosen model weights <- read.csv(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, '_', it, "_weights.csv")) mod <- readRDS(paste0(batches_location, bat, "/", trait, "/model_logs_", it, "/output/", trait, "models.rds")) # Compare null and scores added cox models if(trait %in% c('Prostate', 'CYS', 'GYN', 'Breast', 'ENDO')){ riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment, testTarget) fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + dScore, testTarget) ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget) allrisk <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc), testTarget) allriskprotein <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + dScore, testTarget) print('Model has been sex stratified in cox.') } else { riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex), testTarget) fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + dScore, testTarget) ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget) allrisk <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc), testTarget) allriskprotein <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + dScore, testTarget) print('Model has not been sex stratified in cox.') } # List models models <- list(r = riskFactorsOnlyCoxPH, f = fullCoxPH, t = ProteinOnly, s = allrisk, b = allriskprotein) # Generate performance statistics - for 10 years of follow up, comparing base HR to survival HR at 10 years predictCoxPHOnset <- function(dataDF, coxPHModel, threshold = 10) { uniqueTimes <- uniqueTimes <- sort(unique(c(dataDF$time_to_event, threshold))) thresholdIndex <- match(threshold, uniqueTimes) cumulativeBaseHaz <- gbm::basehaz.gbm(dataDF$time_to_event, dataDF$Event, predict(coxPHModel), uniqueTimes) survivalPredictions <- exp(-cumulativeBaseHaz[[thresholdIndex]]) ^ exp(predict(coxPHModel)) onsetPredictions <- 1 - survivalPredictions print('Cumulative baseline hazard presence NA - print onset predictions:') print(table(is.na(onsetPredictions))) # Event should be 0 if tte is > 10 dataDF$Event <- sapply(1:nrow(dataDF), function(i) { if (dataDF$time_to_event[[i]] > threshold) { 0 } else { dataDF$Event[[i]] } }) auc <- MLmetrics::AUC(y_pred = onsetPredictions, y_true = dataDF$Event) prauc <- MLmetrics::PRAUC(y_pred = onsetPredictions, y_true = dataDF$Event) roc <- pROC::roc(response = dataDF$Event, predictor = onsetPredictions) list(cumulativeBaseHaz = cumulativeBaseHaz, onsetPredictions = onsetPredictions, auc = auc, prauc = prauc, roc = roc) } # Extract test results testResults <- lapply(models, function(m) {predictCoxPHOnset(testTarget, m)}) # Extract onset predictions predictions <- lapply(testResults, function(r) {r$onsetPredictions}) # Calculate p values for AUC comparisons of interest (age/sex + protein) p_null <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$r, predictor2 = predictions$f)$p.value p_second <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$s, predictor2 = predictions$b)$p.value # Extract metrics table aucs <- sapply(testResults, function(r) {r$auc}) praucs <- sapply(testResults, function(r) {r$prauc}) metricsTable <- data.frame(AUC = aucs, PRAUC = praucs) metricsTable[6,1] <- metricsTable[2,1] - metricsTable[1,1] metricsTable[6,2] <- metricsTable[2,2] - metricsTable[1,2] metricsTable[7,1] <- metricsTable[5,1] - metricsTable[4,1] metricsTable[7,2] <- metricsTable[5,2] - metricsTable[4,2] row.names(metricsTable) <- c('AgeSex', 'AgeSex_ProteinScore', 'ProteinScore Only', 'AgeSexCovs', 'AgeSexCovs_ProteinScore', 'Diff_AgeSex', 'Diff_AgeSexCovs') metricsTable$Outcome <- trait metricsTable$seed <- see metricsTable$iteration <- it # Add P values for AUCs to table metricsTable$P_ROCs <- 'NA' metricsTable[6,6] <- p_null metricsTable[7,6] <- p_second write.csv(weights, paste0(chosen_model_files, trait, '_weights.csv')) saveRDS(testResults, paste0(chosen_model_files, trait, '_testResults.rds')) saveRDS(testTarget, paste0(chosen_model_files, trait, '_testTarget.rds')) saveRDS(metricsTable, paste0(chosen_model_files, trait, '_metricsTable.rds')) saveRDS(mod, paste0(chosen_model_files, trait, '_models.rds')) models_res[[i]] <- metricsTable } bind <- do.call(rbind, models_res) library(readxl) naming <- read_excel("path.../naming_index.xlsx") naming <- as.data.frame(naming) names(naming)[1] <- 'Outcome' bind <- left_join(bind, naming, by = 'Outcome') write.csv(bind, 'path.../death_results_full_10yr.csv')