############################################################################################### ##### MethylPipeR chosen models - processing - collating weights and plotting results ############################################################################################### library(readxl) library(glmnet) library(survival) library(gbm) library(data.table) library(pacman) library(tidyverse) ############################################################################################### ### Process metrics table and weights selected as features ############################################################################################### ### Load the 10yr weights files path <- 'path.../Chosen_model_files/Files_10y/' files <- list.files(paste0(path), '.csv') traits <- gsub("_weights.csv*","", files) ### Collate weights table, along with features summary res_weights <- list() table <- data.frame(Trait = 1:length(traits), Features = 1:length(traits), Proteins = 1:length(traits)) for(i in 1:length(traits)){ tryCatch({ print(traits[i]) loc <- files[i] file <- read.csv(paste0(path, loc)) file$Trait <- traits[i] res_weights[[i]] <- file dim <- dim(file)[1] table[i,1] <- traits[i] table[i,2] <- dim file$Protein <- gsub("\\..*","",file$Protein) prot <- file$Protein table[i,3] <- paste(prot, sep = ",", collapse = ", ") }, error = function(e) cat("skipped")) } bind1 <- do.call(rbind, res_weights) table1 <- table[order(table$Features),] ### Load the 5yr weights files path <- 'path.../Chosen_model_files/Files_5y/' files <- list.files(paste0(path), '.csv') traits <- gsub("_weights.csv*","", files) ### Collate weights table, along with features summary res_weights <- list() table <- data.frame(Trait = 1:length(traits), Features = 1:length(traits), Proteins = 1:length(traits)) for(i in 1:length(traits)){ tryCatch({ print(traits[i]) loc <- files[i] file <- read.csv(paste0(path, loc)) file$Trait <- traits[i] res_weights[[i]] <- file dim <- dim(file)[1] table[i,1] <- traits[i] table[i,2] <- dim file$Protein <- gsub("\\..*","",file$Protein) prot <- file$Protein table[i,3] <- paste(prot, sep = ",", collapse = ", ") }, error = function(e) cat("skipped")) } bind2 <- do.call(rbind, res_weights) table2 <- table[order(table$Features),] bind <- rbind(bind1,bind2) library(readxl) naming <- read_excel("path.../naming_index.xlsx") naming <- as.data.frame(naming) names(naming)[1] <- 'Outcome' names(bind)[4] <- 'Outcome' names(table)[1] <- 'Outcome' bind <- left_join(bind, naming, by = 'Outcome') table <- left_join(table, naming, by = 'Outcome') write.csv(bind, 'path.../Chosen_model_files/weights_saved.csv') table <- rbind(table1,table2) table <- table[order(table$Features),] write.csv(table, 'path.../Chosen_model_files/features_summary.csv', row.names = F) ### Collate metrics results -/ res_metrics <- list() for(i in 1:length(traits)){ tryCatch({ print(traits[i]) output_metrics <- paste0('path...//files_with_P_ROC_metrics/', traits[i], '_metricsTable.rds') print(output_metrics) file <- readRDS(output_metrics) file$Trait <- traits[i] res_metrics[[i]] <- file }, error = function(e) cat("skipped")) } bind <- do.call(rbind, res_metrics) write.csv(bind, 'path.../joint_metrics_table_with_P_ROCs/metrics_joint.csv') ############################################################################################### ### 10yr: Process and plot model results - ROC and PR curves ############################################################################################### library(survival) library(gbm) library(precrec) library(ggplot2) list <- list() location <- 'path...//Chosen_model_files/Files_10y/' outputs <- 'path.../Chosen_model_files/ROC_suppl_plots/' ### Load the traits used to input files <- list.files(location, 'testResults.rds') traits <- gsub("_testResults.rds*","", files) plot_list <- list() # Loop over and plot the results ROC curves for(i in 1:length(traits)){ tryCatch({ output_path <- paste0(location) # Read in files name <- traits[i] testResults <- readRDS(paste0(output_path, traits[i], "_testResults.rds")) testTarget <- readRDS(paste0(output_path, traits[i], "_testTarget.rds")) # Assign target test file to variable and subset such that no event greater than 10 years is present (event = 0, if tte > 10) w1Target <- testTarget w1Target$Event <- sapply(1:nrow(w1Target), function(i) { if (w1Target$time_to_event[[i]] > 10) { 0 } else { w1Target$Event[[i]] } }) w1Target$Event <- as.factor(w1Target$Event) # Assign cox test results from calculation above coxTestResults <- testResults # Calculate roc for model with score added nullResponse <- coxTestResults$r$onsetPredictions nullandProteinResponse <- coxTestResults$f$onsetPredictions coxProteinOnlyResponse <- coxTestResults$t$onsetPredictions fullResponse <- coxTestResults$s$onsetPredictions fullandProteinResponse <- coxTestResults$b$onsetPredictions # Plot ROC curves for models calibrationDFLasso <- data.frame(list('FullwithProteinScore' = fullandProteinResponse, 'Full' = fullResponse, 'NullwithProteinScore' = nullandProteinResponse, 'Null' = nullResponse, 'ProteinScore' = coxProteinOnlyResponse)) calibrationDFLasso$actual <- as.factor(w1Target$Event) # factor(w1Target$Event, levels = c(1, 0)) calibrationResultLasso <- caret::calibration(actual ~ FullwithProteinScore + Full + NullwithProteinScore + Null + ProteinScore, data = calibrationDFLasso, class = '1') scores <- join_scores(nullResponse, nullandProteinResponse, coxProteinOnlyResponse, fullResponse, fullandProteinResponse ) curves <- evalmod(scores = scores, labels = w1Target$Event, calc_avg = FALSE) # Save plot pdf(paste0(outputs, name, '.pdf'), width = 5, height = 3) plot(curves) dev.off() # Save plot png(paste0(outputs, name, '.png'), width = 9, height = 4.5, units = 'in', res = 300) plot(curves) dev.off() ### Calculate performance metrics breakdown table # Set thresholds for occurrence proportion of the population for cases thresholds <- seq(from = 0, to = 1, by = 0.1) # Full confusion matrix fullBinaryPredictions <- lapply(thresholds, function(threshold) { as.factor(as.numeric(fullandProteinResponse >= threshold)) }) names(fullBinaryPredictions) <- thresholds fullConfusionMatrices <- lapply(fullBinaryPredictions, function(x) { caret::confusionMatrix(x, w1Target$Event, positive = '1') }) # Null confusion matrix nullBinaryPredictions <- lapply(thresholds, function(threshold) { as.factor(as.numeric(fullResponse >= threshold)) }) names(nullBinaryPredictions) <- thresholds nullConfusionMatrices <- lapply(nullBinaryPredictions, function(x) { caret::confusionMatrix(x, w1Target$Event, positive = '1') }) fullMetrics <- do.call(rbind, lapply(fullConfusionMatrices, function(confusionMatrix) {confusionMatrix[[4]][1:4]})) fm <- fullMetrics colnames(fullMetrics) <- c('Full Protein Sensitivity', 'Full Protein Specificity', 'Full Protein PPV', 'Full Protein NPV') nullMetrics <- do.call(rbind, lapply(nullConfusionMatrices, function(confusionMatrix) {confusionMatrix[[4]][1:4]})) colnames(nullMetrics) <- c('Full Sensitivity', 'Full Specificity', 'Full PPV', 'Full NPV') metrics <- as.data.frame(cbind(fullMetrics, nullMetrics)) #metrics <- as.data.frame(cbind(metrics, proteinonlyMetrics)) # Save table of metrics across increments write.csv(metrics, paste0(outputs, name, '_.csv'), row.names = F) }, error = function(e) cat("skipped")) } ############################################################################################### ### 5yr: Process and plot model results - ROC and PR curves ############################################################################################### library(survival) library(gbm) library(precrec) library(ggplot2) list <- list() location <- 'path.../Chosen_model_files/Files_5y/' outputs <- 'path.../Chosen_model_files/ROC_suppl_plots/' ### Load the traits used to input files <- list.files(location, 'testResults.rds') traits <- gsub("_testResults.rds*","", files) plot_list <- list() # Loop over and plot the results ROC curves for(i in 1:length(traits)){ tryCatch({ output_path <- paste0(location) # Read in files name <- traits[i] testResults <- readRDS(paste0(output_path, traits[i], "_testResults.rds")) testTarget <- readRDS(paste0(output_path, traits[i], "_testTarget.rds")) # Assign target test file to variable and subset such that no event greater than 10 years is present (event = 0, if tte > 10) w1Target <- testTarget w1Target$Event <- sapply(1:nrow(w1Target), function(i) { if (w1Target$time_to_event[[i]] > 5) { 0 } else { w1Target$Event[[i]] } }) w1Target$Event <- as.factor(w1Target$Event) # Assign cox test results from calculation above coxTestResults <- testResults # Calculate roc for model with score added nullResponse <- coxTestResults$r$onsetPredictions nullandProteinResponse <- coxTestResults$f$onsetPredictions coxProteinOnlyResponse <- coxTestResults$t$onsetPredictions fullResponse <- coxTestResults$s$onsetPredictions fullandProteinResponse <- coxTestResults$b$onsetPredictions # Plot ROC curves for models calibrationDFLasso <- data.frame(list('FullwithProteinScore' = fullandProteinResponse, 'Full' = fullResponse, 'NullwithProteinScore' = nullandProteinResponse, 'Null' = nullResponse, 'ProteinScore' = coxProteinOnlyResponse)) calibrationDFLasso$actual <- as.factor(w1Target$Event) # factor(w1Target$Event, levels = c(1, 0)) calibrationResultLasso <- caret::calibration(actual ~ FullwithProteinScore + Full + NullwithProteinScore + Null + ProteinScore, data = calibrationDFLasso, class = '1') scores <- join_scores(nullResponse, nullandProteinResponse, coxProteinOnlyResponse, fullResponse, fullandProteinResponse ) curves <- evalmod(scores = scores, labels = w1Target$Event, calc_avg = FALSE) # Save plot pdf(paste0(outputs, name, '.pdf'), width = 5, height = 3) plot(curves) dev.off() # Save plot png(paste0(outputs, name, '.png'), width = 9, height = 4.5, units = 'in', res = 300) plot(curves) dev.off() ### Calculate performance metrics breakdown table # Set thresholds for occurrence proportion of the population for cases thresholds <- seq(from = 0, to = 1, by = 0.1) # Full confusion matrix fullBinaryPredictions <- lapply(thresholds, function(threshold) { as.factor(as.numeric(fullandProteinResponse >= threshold)) }) names(fullBinaryPredictions) <- thresholds fullConfusionMatrices <- lapply(fullBinaryPredictions, function(x) { caret::confusionMatrix(x, w1Target$Event, positive = '1') }) # Null confusion matrix nullBinaryPredictions <- lapply(thresholds, function(threshold) { as.factor(as.numeric(fullResponse >= threshold)) }) names(nullBinaryPredictions) <- thresholds nullConfusionMatrices <- lapply(nullBinaryPredictions, function(x) { caret::confusionMatrix(x, w1Target$Event, positive = '1') }) fullMetrics <- do.call(rbind, lapply(fullConfusionMatrices, function(confusionMatrix) {confusionMatrix[[4]][1:4]})) fm <- fullMetrics colnames(fullMetrics) <- c('Full Protein Sensitivity', 'Full Protein Specificity', 'Full Protein PPV', 'Full Protein NPV') nullMetrics <- do.call(rbind, lapply(nullConfusionMatrices, function(confusionMatrix) {confusionMatrix[[4]][1:4]})) colnames(nullMetrics) <- c('Full Sensitivity', 'Full Specificity', 'Full PPV', 'Full NPV') metrics <- as.data.frame(cbind(fullMetrics, nullMetrics)) #metrics <- as.data.frame(cbind(metrics, proteinonlyMetrics)) # Save table of metrics across increments write.csv(metrics, paste0(outputs, name, '_.csv'), row.names = F) }, error = function(e) cat("skipped")) }