UKB-project / 20_MethylPipeR_processing3.R
20_MethylPipeR_processing3.R
Raw
###############################################################################################

##### MethylPipeR chosen models - processing - collating weights and plotting results

###############################################################################################

library(readxl)
library(glmnet)
library(survival)
library(gbm)
library(data.table)
library(pacman)
library(tidyverse)

###############################################################################################

### Process metrics table and weights selected as features

###############################################################################################

### Load the 10yr weights files
path <- 'path.../Chosen_model_files/Files_10y/'
files <- list.files(paste0(path), '.csv')
traits <- gsub("_weights.csv*","", files)

### Collate weights table, along with features summary
res_weights <- list()

table <- data.frame(Trait = 1:length(traits), Features = 1:length(traits), Proteins = 1:length(traits))

for(i in 1:length(traits)){
  tryCatch({
    print(traits[i])
    loc <- files[i]
    file <- read.csv(paste0(path, loc))
    file$Trait <- traits[i]
    res_weights[[i]] <- file
    dim <- dim(file)[1]
    table[i,1] <- traits[i]
    table[i,2] <- dim
    file$Protein <- gsub("\\..*","",file$Protein)
    prot <- file$Protein
    table[i,3] <- paste(prot, sep = ",", collapse = ", ")
  }, error = function(e) cat("skipped"))
}

bind1 <- do.call(rbind, res_weights)

table1 <- table[order(table$Features),]

### Load the 5yr weights files
path <- 'path.../Chosen_model_files/Files_5y/'
files <- list.files(paste0(path), '.csv')
traits <- gsub("_weights.csv*","", files)

### Collate weights table, along with features summary
res_weights <- list()

table <- data.frame(Trait = 1:length(traits), Features = 1:length(traits), Proteins = 1:length(traits))

for(i in 1:length(traits)){
  tryCatch({
    print(traits[i])
    loc <- files[i]
    file <- read.csv(paste0(path, loc))
    file$Trait <- traits[i]
    res_weights[[i]] <- file
    dim <- dim(file)[1]
    table[i,1] <- traits[i]
    table[i,2] <- dim
    file$Protein <- gsub("\\..*","",file$Protein)
    prot <- file$Protein
    table[i,3] <- paste(prot, sep = ",", collapse = ", ")
  }, error = function(e) cat("skipped"))
}

bind2 <- do.call(rbind, res_weights)

table2 <- table[order(table$Features),]

bind <- rbind(bind1,bind2)


library(readxl)
naming <- read_excel("path.../naming_index.xlsx")
naming <- as.data.frame(naming)
names(naming)[1] <- 'Outcome'

names(bind)[4] <- 'Outcome'
names(table)[1] <- 'Outcome'

bind <- left_join(bind, naming, by = 'Outcome')
table <- left_join(table, naming, by = 'Outcome')

write.csv(bind, 'path.../Chosen_model_files/weights_saved.csv')

table <- rbind(table1,table2)
table <- table[order(table$Features),]
write.csv(table, 'path.../Chosen_model_files/features_summary.csv', row.names = F)


### Collate metrics results -/ 
res_metrics <- list()

for(i in 1:length(traits)){
  tryCatch({
    print(traits[i])
    output_metrics <- paste0('path...//files_with_P_ROC_metrics/', traits[i], '_metricsTable.rds')
    print(output_metrics)
    file <- readRDS(output_metrics)
    file$Trait <- traits[i]
    res_metrics[[i]] <- file
  }, error = function(e) cat("skipped"))
}

bind <- do.call(rbind, res_metrics)

write.csv(bind, 'path.../joint_metrics_table_with_P_ROCs/metrics_joint.csv')


###############################################################################################

### 10yr: Process and plot model results - ROC and PR curves

###############################################################################################

library(survival)
library(gbm)
library(precrec)
library(ggplot2)

list <- list()

location <- 'path...//Chosen_model_files/Files_10y/'
outputs <- 'path.../Chosen_model_files/ROC_suppl_plots/'

### Load the traits used to input

files <- list.files(location, 'testResults.rds')
traits <- gsub("_testResults.rds*","", files)

plot_list <- list()

# Loop over and plot the results ROC curves
for(i in 1:length(traits)){
  tryCatch({

    output_path <- paste0(location)

    # Read in files
    name <- traits[i]

    testResults <- readRDS(paste0(output_path, traits[i],  "_testResults.rds"))
    testTarget <- readRDS(paste0(output_path, traits[i],  "_testTarget.rds"))

    # Assign target test file to variable and subset such that no event greater than 10 years is present (event = 0, if tte > 10)
    w1Target <- testTarget

    w1Target$Event <- sapply(1:nrow(w1Target), function(i) {
      if (w1Target$time_to_event[[i]] > 10) {
        0
      } else {
        w1Target$Event[[i]]
      }
    })

    w1Target$Event <- as.factor(w1Target$Event)

    # Assign cox test results from calculation above
    coxTestResults <- testResults

    # Calculate roc for model with score added
    
    nullResponse <- coxTestResults$r$onsetPredictions
    nullandProteinResponse <- coxTestResults$f$onsetPredictions
    coxProteinOnlyResponse <- coxTestResults$t$onsetPredictions
    fullResponse <- coxTestResults$s$onsetPredictions
    fullandProteinResponse <- coxTestResults$b$onsetPredictions

    # Plot ROC curves for models
    calibrationDFLasso <- data.frame(list('FullwithProteinScore' = fullandProteinResponse, 'Full' = fullResponse,
                                          'NullwithProteinScore' = nullandProteinResponse, 'Null' = nullResponse,
                                          'ProteinScore' = coxProteinOnlyResponse))
    calibrationDFLasso$actual <- as.factor(w1Target$Event) # factor(w1Target$Event, levels = c(1, 0))
    calibrationResultLasso <- caret::calibration(actual ~ FullwithProteinScore + Full + NullwithProteinScore + Null + ProteinScore,
                                                 data = calibrationDFLasso, class = '1')
    
    scores <- join_scores(nullResponse,  nullandProteinResponse,  coxProteinOnlyResponse, fullResponse, fullandProteinResponse )
    curves <- evalmod(scores = scores, labels = w1Target$Event, calc_avg = FALSE)
    # Save plot
    pdf(paste0(outputs, name, '.pdf'), width = 5, height = 3)
    plot(curves)
    dev.off()
    
    # Save plot
    png(paste0(outputs, name, '.png'), width = 9, height = 4.5, units = 'in', res = 300)
    plot(curves)
    dev.off()

    ### Calculate performance metrics breakdown table

    # Set thresholds for occurrence proportion of the population for cases
    thresholds <- seq(from = 0, to = 1, by = 0.1)

    # Full confusion matrix
    fullBinaryPredictions <- lapply(thresholds, function(threshold) {
      as.factor(as.numeric(fullandProteinResponse >= threshold))
    })

    names(fullBinaryPredictions) <- thresholds

    fullConfusionMatrices <- lapply(fullBinaryPredictions, function(x) {
      caret::confusionMatrix(x, w1Target$Event, positive = '1')
    })

    # Null confusion matrix
    nullBinaryPredictions <- lapply(thresholds, function(threshold) {
      as.factor(as.numeric(fullResponse >= threshold))
    })

    names(nullBinaryPredictions) <- thresholds

    nullConfusionMatrices <- lapply(nullBinaryPredictions, function(x) {
      caret::confusionMatrix(x, w1Target$Event, positive = '1')
    })

    fullMetrics <- do.call(rbind, lapply(fullConfusionMatrices, function(confusionMatrix) {confusionMatrix[[4]][1:4]}))
    fm <- fullMetrics
    colnames(fullMetrics) <- c('Full Protein Sensitivity', 'Full Protein Specificity', 'Full Protein PPV', 'Full Protein NPV')

    nullMetrics <- do.call(rbind, lapply(nullConfusionMatrices, function(confusionMatrix) {confusionMatrix[[4]][1:4]}))
    colnames(nullMetrics) <- c('Full Sensitivity', 'Full Specificity', 'Full PPV', 'Full NPV')

    metrics <- as.data.frame(cbind(fullMetrics, nullMetrics))
    #metrics <- as.data.frame(cbind(metrics, proteinonlyMetrics))

    # Save table of metrics across increments
    write.csv(metrics, paste0(outputs, name, '_.csv'), row.names = F)



  }, error = function(e) cat("skipped"))
}



###############################################################################################

### 5yr: Process and plot model results - ROC and PR curves

###############################################################################################

library(survival)
library(gbm)
library(precrec)
library(ggplot2)

list <- list()

location <- 'path.../Chosen_model_files/Files_5y/'
outputs <- 'path.../Chosen_model_files/ROC_suppl_plots/'

### Load the traits used to input

files <- list.files(location, 'testResults.rds')
traits <- gsub("_testResults.rds*","", files)

plot_list <- list()

# Loop over and plot the results ROC curves
for(i in 1:length(traits)){
  tryCatch({
    
    output_path <- paste0(location)
    
    # Read in files
    name <- traits[i]
    
    testResults <- readRDS(paste0(output_path, traits[i],  "_testResults.rds"))
    testTarget <- readRDS(paste0(output_path, traits[i],  "_testTarget.rds"))
    
    # Assign target test file to variable and subset such that no event greater than 10 years is present (event = 0, if tte > 10)
    w1Target <- testTarget
    
    w1Target$Event <- sapply(1:nrow(w1Target), function(i) {
      if (w1Target$time_to_event[[i]] > 5) {
        0
      } else {
        w1Target$Event[[i]]
      }
    })
    
    w1Target$Event <- as.factor(w1Target$Event)
    
    # Assign cox test results from calculation above
    coxTestResults <- testResults
    
    # Calculate roc for model with score added
    
    nullResponse <- coxTestResults$r$onsetPredictions
    nullandProteinResponse <- coxTestResults$f$onsetPredictions
    coxProteinOnlyResponse <- coxTestResults$t$onsetPredictions
    fullResponse <- coxTestResults$s$onsetPredictions
    fullandProteinResponse <- coxTestResults$b$onsetPredictions
    
    # Plot ROC curves for models
    calibrationDFLasso <- data.frame(list('FullwithProteinScore' = fullandProteinResponse, 'Full' = fullResponse,
                                          'NullwithProteinScore' = nullandProteinResponse, 'Null' = nullResponse,
                                          'ProteinScore' = coxProteinOnlyResponse))
    calibrationDFLasso$actual <- as.factor(w1Target$Event) # factor(w1Target$Event, levels = c(1, 0))
    calibrationResultLasso <- caret::calibration(actual ~ FullwithProteinScore + Full + NullwithProteinScore + Null + ProteinScore,
                                                 data = calibrationDFLasso, class = '1')
    
    scores <- join_scores(nullResponse,  nullandProteinResponse,  coxProteinOnlyResponse, fullResponse, fullandProteinResponse )
    curves <- evalmod(scores = scores, labels = w1Target$Event, calc_avg = FALSE)
    # Save plot
    pdf(paste0(outputs, name, '.pdf'), width = 5, height = 3)
    plot(curves)
    dev.off()
    
    # Save plot
    png(paste0(outputs, name, '.png'), width = 9, height = 4.5, units = 'in', res = 300)
    plot(curves)
    dev.off()
    
    ### Calculate performance metrics breakdown table
    
    # Set thresholds for occurrence proportion of the population for cases
    thresholds <- seq(from = 0, to = 1, by = 0.1)
    
    # Full confusion matrix
    fullBinaryPredictions <- lapply(thresholds, function(threshold) {
      as.factor(as.numeric(fullandProteinResponse >= threshold))
    })
    
    names(fullBinaryPredictions) <- thresholds
    
    fullConfusionMatrices <- lapply(fullBinaryPredictions, function(x) {
      caret::confusionMatrix(x, w1Target$Event, positive = '1')
    })
    
    # Null confusion matrix
    nullBinaryPredictions <- lapply(thresholds, function(threshold) {
      as.factor(as.numeric(fullResponse >= threshold))
    })
    
    names(nullBinaryPredictions) <- thresholds
    
    nullConfusionMatrices <- lapply(nullBinaryPredictions, function(x) {
      caret::confusionMatrix(x, w1Target$Event, positive = '1')
    })
    
    fullMetrics <- do.call(rbind, lapply(fullConfusionMatrices, function(confusionMatrix) {confusionMatrix[[4]][1:4]}))
    fm <- fullMetrics
    colnames(fullMetrics) <- c('Full Protein Sensitivity', 'Full Protein Specificity', 'Full Protein PPV', 'Full Protein NPV')
    
    nullMetrics <- do.call(rbind, lapply(nullConfusionMatrices, function(confusionMatrix) {confusionMatrix[[4]][1:4]}))
    colnames(nullMetrics) <- c('Full Sensitivity', 'Full Specificity', 'Full PPV', 'Full NPV')
    
    metrics <- as.data.frame(cbind(fullMetrics, nullMetrics))
    #metrics <- as.data.frame(cbind(metrics, proteinonlyMetrics))
    
    # Save table of metrics across increments
    write.csv(metrics, paste0(outputs, name, '_.csv'), row.names = F)
    
    
    
  }, error = function(e) cat("skipped"))
}