############################################################################################### ## Batches - MethylPipeR run 50 iterations with 1:3 ratio - select median model as ProteinScores ############################################################################################### args <- commandArgs(trailingOnly=TRUE) taskid <- as.numeric(args[1]) print(paste0('taskid for this iteration is ', taskid)) library(readxl) library(tidyverse) library(MethylPipeR) library(glmnet) library(survival) library(gbm) library(data.table) library(pacman) ### Use the preprepped traits cox tables from the incident disease summary table script: # Read subset files back in results <- list() traits_list <- list() location <- 'path.../Tables_cut/' files <- list.files(location, '.csv') for(i in 1:length(files)){ name <- files[i] name <- gsub("\\..*", "", name) file <- read.csv(paste0(location, name, '.csv')) traits_list[[i]] <- name results[[i]] <- file print(name) print(dim(file)) } i <- taskid ### Create 50 seeds randomly sampled between 1 and 5000 - ensure seed is set before random sampling and to save seeds down # seed <- 4237 # set.seed(seed) # seed_list <- floor(runif(50, min=1, max=5000)) # write.csv(seed_list, 'path.../seed_list_50_sampled_with_seed_4237.csv') seed_list <- read.csv('path.../seed_list_50_sampled_with_seed_4237.csv') seed_list <- seed_list$x ### Set the disease for this array name <- as.character(traits_list[[i]]) cox <- results[[i]] print(paste0('The disease trait for this iteration is ', name)) ### Isolate cases and controls available in the full sample for this disease cox <- cox[which(cox$tte > 0),] # ensure no negative tte between 0 and -1 for predictor models names(cox)[15] <- 'time_to_event' cases <- cox[which(cox$Event == '1'),] controls <- cox[which(cox$Event == '0'),] print('Printing case dimensions:') print(dim(cases)) print('Printing control dimensions:') print(dim(controls)) ### Load imputed proteins from imputation prep script - (i.e. that have not been transformed or scaled with related excluded - transformations and scaling are done per train/test set below) olink <- read.csv('path.../knn_imputed_processed_olink_internal_proteins.csv') print('Protein data loaded. Looping analyses through seeds now.') ### Load analysis function - called once x and y file preps are complete x 50 iterations analysis <- function(trainingData, testData, trainingTarget, testTarget, res_list, seed_iteration, cv, iter, name_dis, output){ print('Iteration') print(iter) # Create iteration specific output folder with results subfolder dir.create(paste0(output, name_dis, '/model_logs_', iter, '/')) model_logs <- paste0(output, name_dis, '/model_logs_', iter, '/') dir.create(paste0(model_logs, '/output')) startTimestamp <- format(Sys.time(), '%Y_%m_%d_%H_%M_%S') initLogs(model_logs, note = paste0('Cox glmnet with 1:3 case:controls for ', name, ' and iteration ', iter, '.')) tryCatch({ set.seed(seed_iteration) training_total <- dim(trainingTarget)[1] training_cases <- length(which(trainingTarget$Event == 1)) # Fit the training cox glmnet mprModel <- fitMPRModelCV(type = 'survival', method = 'glmnet', trainXs = trainingData, trainY = trainingTarget, seed = seed_iteration, nFolds = cv, save = TRUE) # Extract and save weights for selected proteins model_summary <- mprModel$model res <- coef(mprModel$model, s= 'lambda.min') res <- res[res[,1]!=0,] res <- as.data.frame(res) names(res)[1] <- 'Coefficient' res$Protein <- rownames(res) res <- res[c(2,1)] feature_count <- dim(res)[1] print('Feature count:') print(feature_count) # save weights write.csv(res, paste0(model_logs, '/output/', name_dis, '_', iter, '_weights.csv'), row.names = F) # Generate model predictions in the test data mprModelTestPredictions <- predictMPRModel(mprModel, data = testData, s = 'lambda.min') # Add test scores to the target y file in the test sample testTarget$dScore <- mprModelTestPredictions testTarget <- na.omit(testTarget) test_count <- dim(testTarget)[1] cases_total <- length(which(testTarget$Event == 1)) cases_recoded <- length(which(testTarget$time_to_event >= 10 & testTarget$Event == 1)) # Compare null and scores added cox models if(name_dis %in% c('Prostate', 'CYS', 'GYN', 'Breast', 'ENDO')){ riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment, testTarget) fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + dScore, testTarget) ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget) print('Model has been sex stratified in cox.') print('Cases and controls available:') print(table(testTarget$Event)) } else { riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex), testTarget) fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + dScore, testTarget) ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget) print('Model has not been sex stratified in cox.') print('Cases and controls available:') print(table(testTarget$Event)) } # List models models <- list(r = riskFactorsOnlyCoxPH, f = fullCoxPH, t = ProteinOnly) # Generate performance statistics - for 10 years of follow up, comparing base HR to survival HR at 10 years predictCoxPHOnset <- function(dataDF, coxPHModel, threshold = 10) { uniqueTimes <- uniqueTimes <- sort(unique(c(dataDF$time_to_event, threshold))) thresholdIndex <- match(threshold, uniqueTimes) cumulativeBaseHaz <- gbm::basehaz.gbm(dataDF$time_to_event, dataDF$Event, predict(coxPHModel), uniqueTimes) survivalPredictions <- exp(-cumulativeBaseHaz[[thresholdIndex]]) ^ exp(predict(coxPHModel)) onsetPredictions <- 1 - survivalPredictions print('Cumulative baseline hazard presence NA - print onset predictions:') print(table(is.na(onsetPredictions))) # Event should be 0 if tte is > 10 dataDF$Event <- sapply(1:nrow(dataDF), function(i) { if (dataDF$time_to_event[[i]] > threshold) { 0 } else { dataDF$Event[[i]] } }) auc <- MLmetrics::AUC(y_pred = onsetPredictions, y_true = dataDF$Event) prauc <- MLmetrics::PRAUC(y_pred = onsetPredictions, y_true = dataDF$Event) roc <- pROC::roc(response = dataDF$Event, predictor = onsetPredictions) list(cumulativeBaseHaz = cumulativeBaseHaz, onsetPredictions = onsetPredictions, auc = auc, prauc = prauc, roc = roc) } # Extract test results testResults <- lapply(models, function(m) {predictCoxPHOnset(testTarget, m)}) # Extract metrics table aucs <- sapply(testResults, function(r) {r$auc}) praucs <- sapply(testResults, function(r) {r$prauc}) metricsTable <- data.frame(AUC = aucs, PRAUC = praucs) metricsTable[4,1] <- metricsTable[2,1] - metricsTable[1,1] metricsTable[4,2] <- metricsTable[2,2] - metricsTable[1,2] row.names(metricsTable) <- c('Null', 'Null + ProteinScore', 'ProteinScore Only', 'Difference') metricsTable$folds <- cv metricsTable$seed <- seed_iteration metricsTable$iteration <- iter metricsTable$training_total <- training_total metricsTable$training_cases <- training_cases metricsTable$feature_count <- feature_count metricsTable$test_total <- test_count metricsTable$test_cases <- cases_total metricsTable$test_cases_recoded <- cases_recoded sessionStartTimestamp <- getOption('mprSessionStartTimestamp') # save test results from cox models saveRDS(testResults, paste0(model_logs, '/output/', name_dis, 'testResults.rds')) # save metrics table performances saveRDS(metricsTable, paste0(model_logs, '/output/', name_dis, 'metricsTable.rds')) # save training file y saveRDS(trainingData, paste0(model_logs, '/output/', name_dis, 'trainingData.rds')) # save target file y saveRDS(testTarget, paste0(model_logs, '/output/', name_dis, 'testTarget.rds')) # save models cox tests saveRDS(models, paste0(model_logs, '/output/', name_dis, 'models.rds')) }, error = function(e) cat("skipped")) return(metricsTable) } ### Loop entire pipeline through 50 seeds output_location <- 'path.../Batches/Batch2/' location_out <- 'path.../Batches/Batch2/' # Set ratio for case:controls ratio <- 3 # Create list to store results list <- list() for(i in 1:50){ tryCatch({ # Extract seed for this iteration seed <- seed_list[i] iteration <- i print(seed) set.seed(seed) # Randomly sample 50% cases case_IDs <- cases$SampleID set.seed(as.integer(seed)) train_IDs <- sample(case_IDs, size=length(case_IDs)/2, replace=FALSE) cases_tr <- cases[which(cases$SampleID %in% train_IDs),] cases_te <- cases[-which(cases$SampleID %in% train_IDs),] print('Cases train:') print(dim(cases_tr)) print('Cases test:') print(dim(cases_te)) # Subset test cases to those within 10 year onset cases_te_under <- cases_te[which(cases_te$time_to_event <= 10),] # Place cases with tte >10 years back into control population for 1:3 sampling cases_te_over <- cases_te[which(cases_te$time_to_event > 10),] print('Cases test under 10:') print(dim(cases_te_under)) print('Cases test equal or over 10:') print(dim(cases_te_over)) # Randomly sample 1:3 ratio of controls control_IDs <- controls$SampleID set.seed(as.integer(seed)) control_train_IDs <- sample(control_IDs, size=length(control_IDs)/2, replace=FALSE) controls_tr <- controls[which(controls$SampleID %in% control_train_IDs),] controls_te <- controls[-which(controls$SampleID %in% control_train_IDs),] # Exclude covariates with missing values from training data (will cause errors in MethypipeR if not) cases_tr <- cases_tr[-c(2:6)] controls_tr <- controls_tr[-c(2:6)] # Randomly sample 1:3 case:control ratio in train and test sets once Ncases1 <- dim(cases_tr)[1] Ncases2 <- dim(cases_te_under)[1] # Join cases beyond 10 year limit back into test control population controls_te <- rbind(cases_te_over, controls_te) print('Controls train:') print(dim(controls_tr)) print('Controls test') print(dim(controls_te)) length1 <- dim(controls_tr)[1] length2 <- dim(controls_te)[1] Ncontrols1 <- ratio * Ncases1 Ncontrols2 <- ratio * Ncases2 set.seed(as.integer(seed)) sample1 <- sample(length1, size=Ncontrols1, replace=FALSE) set.seed(as.integer(seed)) sample2 <- sample(length2, size=Ncontrols2, replace=FALSE) controls_train <- controls_tr[sample1,] controls_test <- controls_te[sample2,] # Check how many cases > 10 years got selected as controls in test set print('Number of cases in control test set') print(length(which(controls_test$Event == '1'))) y_train <- rbind(cases_tr, controls_train) y_test <- rbind(cases_te_under, controls_test) print('Yrain:') print(dim(y_train)) print('Ytest:') print(dim(y_test)) # Subset x files to train and test as inputs, ensuring order of IDs are matched to y files x_train <- olink[which(olink$pseudo_ind_id %in% y_train$pseudo_ind_id),] x_test <- olink[which(olink$pseudo_ind_id %in% y_test$pseudo_ind_id),] x_train <- x_train[match(y_train$pseudo_ind_id, x_train$pseudo_ind_id),] x_test <- x_test[match(y_test$pseudo_ind_id, x_test$pseudo_ind_id),] # Sanity check on matching - printed to terminal identical(as.character(x_train$pseudo_ind_id), as.character(y_train$pseudo_ind_id)) identical(as.character(x_test$pseudo_ind_id), as.character(y_test$pseudo_ind_id)) x_train <- as.matrix(x_train[3:1470]) x_test <- as.matrix(x_test[3:1470]) # Transform and scale protein inputs in train and test populations for(k in colnames(x_train)){ x_train[,i] <- qnorm((rank(x_train[,k], na.last='keep')-0.5)/sum(!is.na(x_train[,k]))) } x_train <- apply(x_train, 2, scale) for(k in colnames(x_test)){ x_test[,k] <- qnorm((rank(x_test[,k], na.last='keep')-0.5)/sum(!is.na(x_test[,k]))) } x_test <- apply(x_test, 2, scale) print(paste0('Allocations prepped - trying model runs.')) # Assign fold number based on availability of cases in training sample if(Ncases1 >= 1000){ cv = 10 } else { if(Ncases1 < 1000 & Ncases1 >= 500){ cv = 5 } else { cv = 3 } } cross <- cv print(paste0('Using ', cross, ' cross-fold validation.')) # Ensure seed is still set from 50 randomly sampled seeds set.seed(seed) # Create output folder for disease - will be populated with subfolders in analyses function for each iteration dir.create(paste0(location_out, name)) # Run analysis function models <- analysis(x_train, x_test, y_train, y_test, list, seed, cross, iteration, name, location_out) list[[i]] <- models }, error = function(e) cat("skipped")) } result <- do.call(rbind, list) # Bind summary table together path <- paste0(output_location, name, '/metrics_tables_50_split_1_3_batch2.csv') write.csv(result, path)