############################################################################################### ##### MethylPipeR chosen models - processing - extracting the chosen models ############################################################################################### # Train ProteinScores for traits using desired case: control ratio # srun -p interactive --pty bash library(readxl) library(MethylPipeR) library(glmnet) library(survival) library(gbm) library(data.table) library(pacman) library(tidyverse) ############################################################################################### ### Mortality: Process 50 iterations in 1:3 ratio per batch x5 (updated)- 10 year ############################################################################################### # Combine batches batches_location <- 'path.../Batches/' outputs <- 'path.../Diseases_10yr_140223/' traits <- 'DEATH' index <- c('Batch1', 'Batch2', 'Batch3', 'Batch4', 'Batch5') ind <- c('batch1', 'batch2', 'batch3', 'batch4', 'batch5') iter <- c(1:10) iter2 <- c(11:20) iter3 <- c(21:30) iter4 <- c(31:40) iter5 <- c(41:50) iter_list <- list(iter, iter2, iter3, iter4, iter5) for(i in 1:length(traits)){ # loop over each trait directory in batches location tryCatch({ trait <- as.character(traits[i]) res <- list() res2 <- list() res3 <- list() res4 <- list() res5 <- list() res_list <- list(res, res2, res3, res4, res5) batches_results <- list() for(n in 1:5){ # loop over each batch of files run tryCatch({ file_index <- as.character(index[n]) batch_index <- as.character(ind[n]) iteration_set <- iter_list[[n]] results <- res_list[[n]] for(j in 1:10){ iteration <- iteration_set[j] b <- readRDS(paste0(batches_location, file_index, '/', trait, '/model_logs_', iteration, '/output/', trait, 'metricsTable.rds')) results[[j]] <- b } result <- do.call(rbind, results) batches_results[[n]] <- result }, error = function(e) cat("skipped")) } final <- do.call(rbind, batches_results) result$Outcome <- trait write.csv(final, paste0(outputs, trait, '.csv')) print(trait) print(i) }, error = function(e) cat("skipped")) } ############################################################################################### ### Process diseases 50 iterations in 1:3 ratio per batch x5 (updated) - 10 year ############################################################################################### batches_location <- 'path.../Diseases_140223/Batches/' outputs <- 'path.../Diseases_10yr_140223/' inputs <- 'path.../Tables_cut/' traits <- list.files(inputs, '.csv') traits <- unique(sub("\\..*", "", traits)) index <- c('Batch1', 'Batch2', 'Batch3', 'Batch4', 'Batch5_new') ind <- c('batch1', 'batch2', 'batch3', 'batch4', 'batch5') iter <- c(1:10) iter2 <- c(11:20) iter3 <- c(21:30) iter4 <- c(31:40) iter5 <- c(41:50) iter_list <- list(iter, iter2, iter3, iter4, iter5) for(i in 1:length(traits)){ # loop over each trait directory in batches location trait <- as.character(traits[i]) res <- list() res2 <- list() res3 <- list() res4 <- list() res5 <- list() res_list <- list(res, res2, res3, res4, res5) batches_results <- list() for(n in 1:5){ # loop over each batch of files run file_index <- as.character(index[n]) batch_index <- as.character(ind[n]) iteration_set <- iter_list[[n]] results <- res_list[[n]] for(j in 1:10){ tryCatch({ iteration <- iteration_set[j] b <- readRDS(paste0(batches_location, file_index, '/', trait, '/model_logs_', iteration, '/output/', trait, 'metricsTable.rds')) results[[j]] <- b }, error = function(e) cat("skipped")) } result <- do.call(rbind, results) batches_results[[n]] <- result } final <- do.call(rbind, batches_results) result$Outcome <- trait write.csv(final, paste0(outputs, trait, '.csv')) print(trait) print(i) print(length(unique(final$iteration))) } ############################################################################################### ### Process diseases 50 iterations in 1:3 ratio per batch x5 (updated) - 5 year ############################################################################################### batches_location <- 'path.../Diseases_140223_5yr/Batches/' outputs <- 'path.../Diseases_5yr_140223/' inputs <- 'path.../Tables_cut/' traits <- list.files(inputs, '.csv') traits <- unique(sub("\\..*", "", traits)) index <- c('Batch1', 'Batch2', 'Batch3', 'Batch4', 'Batch5_new') ind <- c('batch1', 'batch2', 'batch3', 'batch4', 'batch5') iter <- c(1:10) iter2 <- c(11:20) iter3 <- c(21:30) iter4 <- c(31:40) iter5 <- c(41:50) iter_list <- list(iter, iter2, iter3, iter4, iter5) for(i in 1:length(traits)){ # loop over each trait directory in batches location trait <- as.character(traits[i]) res <- list() res2 <- list() res3 <- list() res4 <- list() res5 <- list() res_list <- list(res, res2, res3, res4, res5) batches_results <- list() for(n in 1:5){ # loop over each batch of files run file_index <- as.character(index[n]) batch_index <- as.character(ind[n]) iteration_set <- iter_list[[n]] results <- res_list[[n]] for(j in 1:10){ tryCatch({ iteration <- iteration_set[j] b <- readRDS(paste0(batches_location, file_index, '/', trait, '/model_logs_', iteration, '/output/', trait, 'metricsTable.rds')) results[[j]] <- b }, error = function(e) cat("skipped")) } result <- do.call(rbind, results) batches_results[[n]] <- result } final <- do.call(rbind, batches_results) result$Outcome <- trait write.csv(final, paste0(outputs, trait, '.csv')) print(trait) print(i) print(length(unique(final$iteration))) } ############################################################################################### ## Index models availale and use this stage to check whether features were selected ############################################################################################### # Exclude those with few models or not suited to 10yr prediction files <- list.files('path.../Diseases_10yr_140223/', 'csv') path <- 'path.../Diseases_10yr_140223/' for(i in 1:length(files)){ name <- as.character(files[i]) name <- unique(sub("\\..*", "", name)) res <- read.csv(paste0(path, name, '.csv')) diff <- res[grep('Diff', res$X),] diff <- diff[order(diff$AUC),] length <- dim(diff)[1] print(name) print(length) } files <- list.files('path.../Diseases_5yr_140223/', 'csv') path <- 'path.../Diseases_5yr_140223/' for(i in 1:length(files)){ name <- as.character(files[i]) name <- unique(sub("\\..*", "", name)) res <- read.csv(paste0(path, name, '.csv')) diff <- res[grep('Diff', res$X),] diff <- diff[order(diff$AUC),] length <- dim(diff)[1] print(name) print(length) } ############################################################################################### ## Index median models, adjusting for where no features were selected ############################################################################################### files <- 'path.../Diseases_10yr_140223/' path <- 'path.../Diseases_10yr_140223/' # 10-year traits traits <- c('AL', 'Breast', 'Colorectal', 'COPD', 'Diab', 'GYN', 'IBD', 'IHD', 'LIV', 'LUNG', 'PD', 'Prostate', 'RA', 'ST', 'VD', 'DEATH') ### Index of no features selected for diseases - GYN and VD - median should be weighted for these extra <- c(0,0,0,0,0,9,0,0,0,0,0,0,0,0,15,0) chosen <- list() for(i in 1:length(traits)){ name <- as.character(traits[i]) res <- read.csv(paste0(path, name, '.csv')) # Get differences for models diff <- res[grep('Diff', res$X),] diff <- diff[order(diff$AUC),] lengthd <- dim(diff)[1] print(name) print(lengthd) # Add models with no features selected if applicable to weight median calc ex <- extra[[i]] blank <- diff[1,] blank[1,1:12] <- 0 if(ex > 0){ for(j in 1:ex){ diff <- rbind(diff, blank) } } # Select median model as proteinscore length <- dim(diff)[1] print(name) print(length) cut <- as.integer(length/2) median <- diff[cut,] median$name <- name median$model_completed <- lengthd median$models_no_features <- ex median$models_used_weighted <- length # Get range of differences t1 <- res[which(res$iteration %in% median$iteration),] min <- min(diff$AUC) max <- max(diff$AUC) median$min_diff <- min median$max_diff <- max # median <- median[c(1,2,8,9,3:7)] chosen[[i]] <- median } chosen1 <- do.call(rbind, chosen) chosen1 <- chosen1[order(-chosen1$AUC),] chosen1$Onset <- 10 write.csv(chosen1, 'path.../10yr_chosen.csv', row.names = F) ### 5yr model processing files <- list.files('path.../Diseases_5yr_140223/', 'csv') path <- 'path.../Diseases_5yr_140223/' # Run median model selection for 5-year traits traits <- c('ALS', 'Dep', 'LUP', 'ENDO') ### Index of no features selected for diseases - median should be weighted for these extra <- c(1,4,0,4) chosen <- list() for(i in 1:length(traits)){ name <- as.character(traits[i]) res <- read.csv(paste0(path, name, '.csv')) # Get differences for models diff <- res[grep('Diff', res$X),] diff <- diff[order(diff$AUC),] lengthd <- dim(diff)[1] print(name) print(lengthd) # Add models with no features selected if applicable to weight median calc ex <- extra[[i]] blank <- diff[1,] blank[1,1:12] <- 0 if(ex > 0){ for(j in 1:ex){ diff <- rbind(diff, blank) } } # Select median model as proteinscore length <- dim(diff)[1] print(name) print(length) cut <- as.integer(length/2) median <- diff[cut,] median$name <- name median$model_completed <- lengthd median$models_no_features <- ex median$models_used_weighted <- length # Get range of differences t1 <- res[which(res$iteration %in% median$iteration),] min <- min(diff$AUC) max <- max(diff$AUC) median$min_diff <- min median$max_diff <- max # median <- median[c(1,2,8,9,3:7)] chosen[[i]] <- median } chosen2 <- do.call(rbind, chosen) chosen2 <- chosen2[order(-chosen2$AUC),] chosen2$Onset <- 5 write.csv(chosen2, 'path.../5yr_chosen.csv', row.names = F) ### Combine proteinscores selected for 10yr 5yr chosen3 <- rbind(chosen1, chosen2) chosen3 <- chosen3[order(-chosen3$AUC),] write.csv(chosen3, 'path.../combined_chosen.csv', row.names = F)