18_MethylPipeR_processing.R · UKB-project

###############################################################################################

##### MethylPipeR chosen models - processing - extracting the chosen models

###############################################################################################

# Train ProteinScores for traits using desired case: control ratio
# srun -p interactive --pty bash

library(readxl)
library(MethylPipeR)
library(glmnet)
library(survival)
library(gbm)
library(data.table)
library(pacman)
library(tidyverse)

###############################################################################################

### Mortality: Process 50 iterations in 1:3 ratio per batch x5 (updated)- 10 year 

###############################################################################################

# Combine batches
batches_location <- 'path.../Batches/'
outputs <- 'path.../Diseases_10yr_140223/'
traits <- 'DEATH'

index <- c('Batch1', 'Batch2', 'Batch3', 'Batch4', 'Batch5')
ind <- c('batch1', 'batch2', 'batch3', 'batch4', 'batch5')
iter <- c(1:10)
iter2 <- c(11:20)
iter3 <- c(21:30)
iter4 <- c(31:40)
iter5 <- c(41:50)

iter_list <- list(iter, iter2, iter3, iter4, iter5)

for(i in 1:length(traits)){ # loop over each trait directory in batches location
  tryCatch({
    trait <- as.character(traits[i]) 
    res <- list()
    res2 <- list()
    res3 <- list()
    res4 <- list()
    res5 <- list()
    res_list <- list(res, res2, res3, res4, res5)
    batches_results <- list()

    for(n in 1:5){ # loop over each batch of files run
      tryCatch({
        file_index <- as.character(index[n])
        batch_index <- as.character(ind[n])
        iteration_set <- iter_list[[n]]
        results <- res_list[[n]]
        for(j in 1:10){
          iteration <- iteration_set[j]
          b <- readRDS(paste0(batches_location, file_index, '/', trait, '/model_logs_', iteration, '/output/', trait, 'metricsTable.rds'))
          results[[j]] <- b
        }
        result <- do.call(rbind, results)
        batches_results[[n]] <- result
      }, error = function(e) cat("skipped"))
    }
    final <- do.call(rbind, batches_results)
    result$Outcome <- trait
    write.csv(final, paste0(outputs, trait, '.csv'))
    print(trait)
    print(i)
  }, error = function(e) cat("skipped"))
}


###############################################################################################

### Process diseases 50 iterations in 1:3 ratio per batch x5 (updated) - 10 year 

###############################################################################################

batches_location <- 'path.../Diseases_140223/Batches/'
outputs <- 'path.../Diseases_10yr_140223/'

inputs <- 'path.../Tables_cut/'
traits <- list.files(inputs, '.csv')
traits <- unique(sub("\\..*", "", traits))

index <- c('Batch1', 'Batch2', 'Batch3', 'Batch4', 'Batch5_new')
ind <- c('batch1', 'batch2', 'batch3', 'batch4', 'batch5')
iter <- c(1:10)
iter2 <- c(11:20)
iter3 <- c(21:30)
iter4 <- c(31:40)
iter5 <- c(41:50)


iter_list <- list(iter, iter2, iter3, iter4, iter5)

for(i in 1:length(traits)){ # loop over each trait directory in batches location

    trait <- as.character(traits[i]) 
    res <- list()
    res2 <- list()
    res3 <- list()
    res4 <- list()
    res5 <- list()
    res_list <- list(res, res2, res3, res4, res5)
    batches_results <- list()
    
    for(n in 1:5){ # loop over each batch of files run
        file_index <- as.character(index[n])
        batch_index <- as.character(ind[n])
        iteration_set <- iter_list[[n]]
        results <- res_list[[n]]

          for(j in 1:10){
            tryCatch({
            iteration <- iteration_set[j]
            b <- readRDS(paste0(batches_location, file_index, '/', trait, '/model_logs_', iteration, '/output/', trait, 'metricsTable.rds'))
            results[[j]] <- b
            }, error = function(e) cat("skipped"))
          }
          result <- do.call(rbind, results)
          batches_results[[n]] <- result
    }
    final <- do.call(rbind, batches_results)
    result$Outcome <- trait
    write.csv(final, paste0(outputs, trait, '.csv'))
    print(trait)
    print(i)
    print(length(unique(final$iteration)))

}


###############################################################################################

### Process diseases 50 iterations in 1:3 ratio per batch x5 (updated) - 5 year 

###############################################################################################

batches_location <- 'path.../Diseases_140223_5yr/Batches/'
outputs <- 'path.../Diseases_5yr_140223/'

inputs <- 'path.../Tables_cut/'
traits <- list.files(inputs, '.csv')
traits <- unique(sub("\\..*", "", traits))

index <- c('Batch1', 'Batch2', 'Batch3', 'Batch4', 'Batch5_new')
ind <- c('batch1', 'batch2', 'batch3', 'batch4', 'batch5')
iter <- c(1:10)
iter2 <- c(11:20)
iter3 <- c(21:30)
iter4 <- c(31:40)
iter5 <- c(41:50)

iter_list <- list(iter, iter2, iter3, iter4, iter5)

for(i in 1:length(traits)){ # loop over each trait directory in batches location
  
  trait <- as.character(traits[i]) 
  res <- list()
  res2 <- list()
  res3 <- list()
  res4 <- list()
  res5 <- list()
  res_list <- list(res, res2, res3, res4, res5)
  batches_results <- list()
  
  for(n in 1:5){ # loop over each batch of files run
    file_index <- as.character(index[n])
    batch_index <- as.character(ind[n])
    iteration_set <- iter_list[[n]]
    results <- res_list[[n]]
    
    for(j in 1:10){
      tryCatch({
        iteration <- iteration_set[j]
        b <- readRDS(paste0(batches_location, file_index, '/', trait, '/model_logs_', iteration, '/output/', trait, 'metricsTable.rds'))
        results[[j]] <- b
      }, error = function(e) cat("skipped"))
    }
    result <- do.call(rbind, results)
    batches_results[[n]] <- result
  }
  final <- do.call(rbind, batches_results)
  result$Outcome <- trait
  write.csv(final, paste0(outputs, trait, '.csv'))
  print(trait)
  print(i)
  print(length(unique(final$iteration)))
  
}


###############################################################################################

## Index models availale and use this stage to check whether features were selected 

###############################################################################################

# Exclude those with few models or not suited to 10yr prediction

files <- list.files('path.../Diseases_10yr_140223/', 'csv')
path <- 'path.../Diseases_10yr_140223/'

for(i in 1:length(files)){
  name <- as.character(files[i])
  name <- unique(sub("\\..*", "", name))
  res <- read.csv(paste0(path, name, '.csv'))
  diff <- res[grep('Diff', res$X),]
  diff <- diff[order(diff$AUC),]
  length <- dim(diff)[1]
  print(name)
  print(length)
}

files <- list.files('path.../Diseases_5yr_140223/', 'csv')
path <- 'path.../Diseases_5yr_140223/'

for(i in 1:length(files)){
  name <- as.character(files[i])
  name <- unique(sub("\\..*", "", name))
  res <- read.csv(paste0(path, name, '.csv'))
  diff <- res[grep('Diff', res$X),]
  diff <- diff[order(diff$AUC),]
  length <- dim(diff)[1]
  print(name)
  print(length)
}

###############################################################################################

## Index median models, adjusting for where no features were selected

###############################################################################################

files <- 'path.../Diseases_10yr_140223/'
path <- 'path.../Diseases_10yr_140223/'

# 10-year traits 
traits <- c('AL', 'Breast', 'Colorectal', 'COPD', 'Diab', 'GYN', 'IBD', 'IHD', 'LIV', 'LUNG', 'PD',
            'Prostate', 'RA', 'ST', 'VD', 'DEATH')

### Index of no features selected for diseases - GYN and VD - median should be weighted for these
extra <- c(0,0,0,0,0,9,0,0,0,0,0,0,0,0,15,0)

chosen <- list()

for(i in 1:length(traits)){
  name <- as.character(traits[i])
  res <- read.csv(paste0(path, name, '.csv'))
  
  # Get differences for models
  diff <- res[grep('Diff', res$X),]
  diff <- diff[order(diff$AUC),]
  lengthd <- dim(diff)[1]
  print(name)
  print(lengthd)

  
  # Add models with no features selected if applicable to weight median calc
  ex <- extra[[i]]
  blank <- diff[1,]
  blank[1,1:12] <- 0
  if(ex > 0){
    for(j in 1:ex){
      diff <- rbind(diff, blank)
    }
  }
  
  # Select median model as proteinscore
  length <- dim(diff)[1]
  print(name)
  print(length)
  cut <- as.integer(length/2)
  median <- diff[cut,]
  median$name <- name
  median$model_completed <- lengthd
  median$models_no_features <- ex
  median$models_used_weighted <- length
  
  # Get range of differences
  t1 <- res[which(res$iteration %in% median$iteration),]
  min <- min(diff$AUC)
  max <- max(diff$AUC)
  
  median$min_diff <- min
  median$max_diff <- max
  # median <- median[c(1,2,8,9,3:7)]
  chosen[[i]] <- median
}

chosen1 <- do.call(rbind, chosen)
chosen1 <- chosen1[order(-chosen1$AUC),]
chosen1$Onset <- 10
write.csv(chosen1, 'path.../10yr_chosen.csv', row.names = F)


### 5yr model processing

files <- list.files('path.../Diseases_5yr_140223/', 'csv')
path <- 'path.../Diseases_5yr_140223/'

# Run median model selection for 5-year traits 
traits <- c('ALS', 'Dep', 'LUP', 'ENDO')

### Index of no features selected for diseases - median should be weighted for these
extra <- c(1,4,0,4)

chosen <- list()

for(i in 1:length(traits)){
  name <- as.character(traits[i])
  res <- read.csv(paste0(path, name, '.csv'))
  
  # Get differences for models
  diff <- res[grep('Diff', res$X),]
  diff <- diff[order(diff$AUC),]
  lengthd <- dim(diff)[1]
  print(name)
  print(lengthd)
  
  
  # Add models with no features selected if applicable to weight median calc
  ex <- extra[[i]]
  blank <- diff[1,]
  blank[1,1:12] <- 0
  if(ex > 0){
    for(j in 1:ex){
      diff <- rbind(diff, blank)
    }
  }
  
  # Select median model as proteinscore
  length <- dim(diff)[1]
  print(name)
  print(length)
  cut <- as.integer(length/2)
  median <- diff[cut,]
  median$name <- name
  median$model_completed <- lengthd
  median$models_no_features <- ex
  median$models_used_weighted <- length
  
  # Get range of differences
  t1 <- res[which(res$iteration %in% median$iteration),]
  min <- min(diff$AUC)
  max <- max(diff$AUC)
  
  median$min_diff <- min
  median$max_diff <- max
  # median <- median[c(1,2,8,9,3:7)]
  chosen[[i]] <- median
}

chosen2 <- do.call(rbind, chosen)
chosen2 <- chosen2[order(-chosen2$AUC),]
chosen2$Onset <- 5
write.csv(chosen2, 'path.../5yr_chosen.csv', row.names = F)

### Combine proteinscores selected for 10yr 5yr
chosen3 <- rbind(chosen1, chosen2)
chosen3 <- chosen3[order(-chosen3$AUC),]
write.csv(chosen3, 'path.../combined_chosen.csv', row.names = F)