UKB-project / 16_Methylpiper_10year.R
16_Methylpiper_10year.R
Raw
###############################################################################################

## Batches - MethylPipeR run 50 iterations with 1:3 ratio - select median model as ProteinScores

###############################################################################################

args <- commandArgs(trailingOnly=TRUE)
taskid <- as.numeric(args[1])
print(paste0('taskid for this iteration is ', taskid))

library(readxl)
library(tidyverse)
library(MethylPipeR)
library(glmnet)
library(survival)
library(gbm)
library(data.table)
library(pacman)


### Use the preprepped traits cox tables from the incident disease summary table script:
# Read subset files back in 
results <- list()
traits_list <- list()
location <- 'path.../Tables_cut/'
files <- list.files(location, '.csv')

for(i in 1:length(files)){
  name <- files[i]
  name <- gsub("\\..*", "", name)  
  file <- read.csv(paste0(location, name, '.csv'))
  traits_list[[i]] <- name
  results[[i]] <- file
  print(name)
  print(dim(file))
}

i <- taskid


### Create 50 seeds randomly sampled between 1 and 5000 - ensure seed is set before random sampling and to save seeds down
# seed <- 4237
# set.seed(seed)
# seed_list <- floor(runif(50, min=1, max=5000))
# write.csv(seed_list, 'path.../seed_list_50_sampled_with_seed_4237.csv')
seed_list <- read.csv('path.../seed_list_50_sampled_with_seed_4237.csv')
seed_list <- seed_list$x

### Set the disease for this array 
name <- as.character(traits_list[[i]])
cox <- results[[i]]
print(paste0('The disease trait for this iteration is ', name))

### Isolate cases and controls available in the full sample for this disease
cox <- cox[which(cox$tte > 0),] # ensure no negative tte between 0 and -1 for predictor models 
names(cox)[15] <- 'time_to_event'
cases <- cox[which(cox$Event == '1'),]
controls <- cox[which(cox$Event == '0'),]

print('Printing case dimensions:')
print(dim(cases))
print('Printing control dimensions:')
print(dim(controls))

### Load imputed proteins from imputation prep script - (i.e. that have not been transformed or scaled with related excluded - transformations and scaling are done per train/test set below)
olink <- read.csv('path.../knn_imputed_processed_olink_internal_proteins.csv')
print('Protein data loaded. Looping analyses through seeds now.')

### Load analysis function - called once x and y file preps are complete x 50 iterations
analysis <- function(trainingData, testData, trainingTarget, testTarget, res_list, seed_iteration, cv, iter, name_dis, output){
  
  print('Iteration')
  print(iter)
  
  # Create iteration specific output folder with results subfolder
  dir.create(paste0(output, name_dis, '/model_logs_', iter, '/'))
  model_logs <- paste0(output, name_dis, '/model_logs_', iter, '/')
  dir.create(paste0(model_logs, '/output'))
  
  startTimestamp <- format(Sys.time(), '%Y_%m_%d_%H_%M_%S')
  initLogs(model_logs, note = paste0('Cox glmnet with 1:3 case:controls for ', name, ' and iteration ', iter, '.'))
  
  tryCatch({
    set.seed(seed_iteration)
    
    training_total <- dim(trainingTarget)[1]
    training_cases <- length(which(trainingTarget$Event == 1))
    
    # Fit the training cox glmnet 
    mprModel <- fitMPRModelCV(type = 'survival',
                              method = 'glmnet',
                              trainXs = trainingData,
                              trainY = trainingTarget,
                              seed = seed_iteration,
                              nFolds = cv,
                              save = TRUE)
    
    # Extract and save weights for selected proteins
    model_summary <-  mprModel$model
    res <- coef(mprModel$model, s= 'lambda.min')
    res <- res[res[,1]!=0,]
    res <- as.data.frame(res)
    names(res)[1] <- 'Coefficient'
    res$Protein <- rownames(res)
    res <- res[c(2,1)]
    
    feature_count <- dim(res)[1]
    print('Feature count:')
    print(feature_count)
    
    # save weights
    write.csv(res, paste0(model_logs, '/output/', name_dis, '_', iter, '_weights.csv'), row.names = F)

    # Generate model predictions in the test data
    mprModelTestPredictions <- predictMPRModel(mprModel,
                                               data = testData,
                                               s = 'lambda.min')

    # Add test scores to the target y file in the test sample
    testTarget$dScore <- mprModelTestPredictions
    testTarget <- na.omit(testTarget)
    
    test_count <- dim(testTarget)[1]
    
    cases_total <- length(which(testTarget$Event == 1))
    
    cases_recoded <- length(which(testTarget$time_to_event >= 10 & testTarget$Event == 1))
  
        
    # Compare null and scores added cox models
    if(name_dis %in% c('Prostate', 'CYS', 'GYN', 'Breast', 'ENDO')){
      riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment, testTarget)
      fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + dScore, testTarget)
      ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget)
      print('Model has been sex stratified in cox.')
      print('Cases and controls available:')
      print(table(testTarget$Event))
    } else {
      riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex), testTarget)
      fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + dScore, testTarget)
      ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget)
      print('Model has not been sex stratified in cox.')
      print('Cases and controls available:')
      print(table(testTarget$Event))
    }
    
    # List models
    models <- list(r = riskFactorsOnlyCoxPH, f = fullCoxPH, t = ProteinOnly)
    
    # Generate performance statistics - for 10 years of follow up, comparing base HR to survival HR at 10 years 
      predictCoxPHOnset <- function(dataDF, coxPHModel, threshold = 10) {
      uniqueTimes <- uniqueTimes <- sort(unique(c(dataDF$time_to_event, threshold)))
      thresholdIndex <- match(threshold, uniqueTimes)
      cumulativeBaseHaz <- gbm::basehaz.gbm(dataDF$time_to_event, dataDF$Event, predict(coxPHModel), uniqueTimes)

      survivalPredictions <- exp(-cumulativeBaseHaz[[thresholdIndex]]) ^ exp(predict(coxPHModel))
      onsetPredictions <- 1 - survivalPredictions
      print('Cumulative baseline hazard presence NA - print onset predictions:')
      print(table(is.na(onsetPredictions)))
      
      # Event should be 0 if tte is > 10
      dataDF$Event <- sapply(1:nrow(dataDF), function(i) {
        if (dataDF$time_to_event[[i]] > threshold) {
          0
        } else {
          dataDF$Event[[i]]
        }
      })
      
      auc <- MLmetrics::AUC(y_pred = onsetPredictions, y_true = dataDF$Event)
      prauc <- MLmetrics::PRAUC(y_pred = onsetPredictions, y_true = dataDF$Event)
      roc <- pROC::roc(response = dataDF$Event, predictor = onsetPredictions)
      list(cumulativeBaseHaz = cumulativeBaseHaz, onsetPredictions = onsetPredictions, auc = auc, prauc = prauc, roc = roc)
    }
    
    # Extract test results
    testResults <- lapply(models, function(m) {predictCoxPHOnset(testTarget, m)})

    # Extract metrics table
    aucs <- sapply(testResults, function(r) {r$auc})
    praucs <- sapply(testResults, function(r) {r$prauc})
    metricsTable <- data.frame(AUC = aucs, PRAUC = praucs)
    metricsTable[4,1] <- metricsTable[2,1] - metricsTable[1,1]
    metricsTable[4,2] <- metricsTable[2,2] - metricsTable[1,2]
    row.names(metricsTable) <- c('Null', 'Null + ProteinScore', 'ProteinScore Only', 'Difference')
    metricsTable$folds <- cv
    metricsTable$seed <- seed_iteration
    metricsTable$iteration <- iter
    metricsTable$training_total <- training_total
    metricsTable$training_cases <- training_cases
    metricsTable$feature_count <- feature_count
    metricsTable$test_total <- test_count
    metricsTable$test_cases <- cases_total
    metricsTable$test_cases_recoded <- cases_recoded
    
    sessionStartTimestamp <- getOption('mprSessionStartTimestamp')

    # save test results from cox models
    saveRDS(testResults, paste0(model_logs, '/output/', name_dis, 'testResults.rds'))
    # save metrics table performances
    saveRDS(metricsTable, paste0(model_logs, '/output/', name_dis, 'metricsTable.rds'))
    # save training file y
    saveRDS(trainingData, paste0(model_logs, '/output/', name_dis, 'trainingData.rds'))
    # save target file y
    saveRDS(testTarget, paste0(model_logs, '/output/', name_dis, 'testTarget.rds'))
    # save models cox tests
    saveRDS(models, paste0(model_logs, '/output/', name_dis, 'models.rds'))
}, error = function(e) cat("skipped"))
  return(metricsTable) 
}


### Loop entire pipeline through 50 seeds

output_location <- 'path.../Batches/Batch2/'
location_out <- 'path.../Batches/Batch2/'

# Set ratio for case:controls
ratio <- 3

# Create list to store results
list <- list()

for(i in 1:50){
  tryCatch({
    # Extract seed for this iteration
    seed <- seed_list[i]
    iteration <- i
    print(seed)
    set.seed(seed)
    
    # Randomly sample 50% cases
    case_IDs <- cases$SampleID
    set.seed(as.integer(seed))
    train_IDs <- sample(case_IDs, size=length(case_IDs)/2, replace=FALSE)
    cases_tr <- cases[which(cases$SampleID %in% train_IDs),]
    cases_te <- cases[-which(cases$SampleID %in% train_IDs),]
    
    print('Cases train:')
    print(dim(cases_tr))
    print('Cases test:')
    print(dim(cases_te))
    
    # Subset test cases to those within 10 year onset
    cases_te_under <- cases_te[which(cases_te$time_to_event <= 10),]
    
    # Place cases with tte >10 years back into control population for 1:3 sampling
    cases_te_over <- cases_te[which(cases_te$time_to_event > 10),]
    
    print('Cases test under 10:')
    print(dim(cases_te_under))
    print('Cases test equal or over 10:')
    print(dim(cases_te_over))
    
    # Randomly sample 1:3 ratio of controls
    control_IDs <- controls$SampleID
    set.seed(as.integer(seed))
    control_train_IDs <- sample(control_IDs, size=length(control_IDs)/2, replace=FALSE)
    controls_tr <- controls[which(controls$SampleID %in% control_train_IDs),]
    controls_te <- controls[-which(controls$SampleID %in% control_train_IDs),]
    
    # Exclude covariates with missing values from training data (will cause errors in MethypipeR if not)
    cases_tr <- cases_tr[-c(2:6)]
    controls_tr <- controls_tr[-c(2:6)]
    
    # Randomly sample 1:3 case:control ratio in train and test sets once
    Ncases1 <- dim(cases_tr)[1]
    Ncases2 <- dim(cases_te_under)[1]
    
    # Join cases beyond 10 year limit back into test control population
    controls_te <- rbind(cases_te_over, controls_te)
    
    print('Controls train:')
    print(dim(controls_tr))
    print('Controls test')
    print(dim(controls_te))
    
    length1 <- dim(controls_tr)[1]
    length2 <- dim(controls_te)[1]
    
    Ncontrols1 <- ratio * Ncases1
    Ncontrols2 <- ratio * Ncases2
    
    set.seed(as.integer(seed))
    sample1 <- sample(length1, size=Ncontrols1, replace=FALSE)
    set.seed(as.integer(seed))
    sample2 <- sample(length2, size=Ncontrols2, replace=FALSE)
    
    controls_train <- controls_tr[sample1,]
    controls_test <- controls_te[sample2,]
    
    # Check how many cases > 10 years got selected as controls in test set 
    print('Number of cases in control test set')
    print(length(which(controls_test$Event == '1')))
    
    y_train <- rbind(cases_tr, controls_train)
    y_test <- rbind(cases_te_under, controls_test)
    
    print('Yrain:')
    print(dim(y_train))
    print('Ytest:')
    print(dim(y_test))
    
    # Subset x files to train and test as inputs, ensuring order of IDs are matched to y files
    x_train <- olink[which(olink$pseudo_ind_id %in% y_train$pseudo_ind_id),]
    
    x_test <- olink[which(olink$pseudo_ind_id %in% y_test$pseudo_ind_id),]
    
    x_train <- x_train[match(y_train$pseudo_ind_id, x_train$pseudo_ind_id),]
    
    x_test <- x_test[match(y_test$pseudo_ind_id,  x_test$pseudo_ind_id),]
  
    # Sanity check on matching - printed to terminal
    identical(as.character(x_train$pseudo_ind_id), as.character(y_train$pseudo_ind_id))
    identical(as.character(x_test$pseudo_ind_id), as.character(y_test$pseudo_ind_id))
    
    x_train <- as.matrix(x_train[3:1470]) 
    x_test <- as.matrix(x_test[3:1470])
    
    # Transform and scale protein inputs in train and test populations
    for(k in colnames(x_train)){
      x_train[,i] <- qnorm((rank(x_train[,k], na.last='keep')-0.5)/sum(!is.na(x_train[,k])))
    }
    
    x_train <- apply(x_train, 2, scale)
    
    for(k in colnames(x_test)){
      x_test[,k] <- qnorm((rank(x_test[,k], na.last='keep')-0.5)/sum(!is.na(x_test[,k])))
    }
    
    x_test <- apply(x_test, 2, scale)
    
    
    print(paste0('Allocations prepped - trying model runs.'))
    
    # Assign fold number based on availability of cases in training sample
    if(Ncases1 >= 1000){
      cv = 10
    } else {
      if(Ncases1 < 1000 & Ncases1 >= 500){
        cv = 5
      } else {
        cv = 3
      }
    }
    cross <- cv
    print(paste0('Using ', cross, ' cross-fold validation.'))
    
    # Ensure seed is still set from 50 randomly sampled seeds
    set.seed(seed)
    
    # Create output folder for disease - will be populated with subfolders in analyses function for each iteration
    dir.create(paste0(location_out, name))
    
    # Run analysis function 
    models <- analysis(x_train, x_test, y_train, y_test, list, seed, cross, iteration, name, location_out)
  
    list[[i]] <- models

  }, error = function(e) cat("skipped"))
}

result <- do.call(rbind, list) # Bind summary table together  

path <- paste0(output_location, name, '/metrics_tables_50_split_1_3_batch2.csv')
write.csv(result, path)