UKB-project / 21_Diabetes_HbA1c_PGS.R
21_Diabetes_HbA1c_PGS.R
Raw
###############################################################################################

### Additional clinical profiling and value exploration in T2D

###############################################################################################

# srun -p interactive --pty bash

library(tidyverse)
library(survival)
library(gbm)
library(precrec)
library(ggplot2)

### T2D

# Load PGS
PGS <- read.table("path.../ukb671220.tab", header = T)
PGS <- PGS[c(1,35)]

# Load phenotypes file
phen <- read.table(gzfile("path.../all_quant_20200904.pheno.v2.tsv.gz"), header = T)

# Hba1c
hb <- phen[which(colnames(phen) %in% c('IND', 'f_30750_0_0'))]
names(hb)[1] <- 'SampleID'

# Plot against diabetes score in test sample at baseline
diab <- readRDS("path.../Diab_testTarget.rds")
diab <- left_join(diab, hb, by = 'SampleID')
diab <- na.omit(diab)
base <- diab
ev <- base[which(base$Event %in% 1),]
mean(ev$time_to_event, na.rm = T)
sd(ev$time_to_event, na.rm = T)
comp <- diab[complete.cases(diab$f_30750_0_0),]
untransformed <- diab

###############################################################################################

# Rank-based inverse normalisation and scaling of each marker

diab$f_30750_0_0 <- qnorm((rank(diab$f_30750_0_0, na.last='keep')-0.5)/sum(!is.na(diab$f_30750_0_0)))
diab$f_30750_0_0<- scale(diab$f_30750_0_0)
mean(diab$f_30750_0_0, na.rm = T)
sd(diab$f_30750_0_0, na.rm = T)
diab$dScore <- qnorm((rank(diab$dScore, na.last='keep')-0.5)/sum(!is.na(diab$dScore)))
diab$dScore <- scale(diab$dScore)
mean(diab$dScore, na.rm = T)
sd(diab$dScore, na.rm = T)
transformed <- diab

###############################################################################################

# Examine AUC in comparison in 10-year follow-up window
testTarget <- diab
names(PGS)[1] <- 'SampleID'
testTarget <- left_join(testTarget, PGS, by = 'SampleID')
names(testTarget)[19] <- 'T2D_PGS' # 18 missing PGS
testTarget <- na.omit(testTarget)

riskFactorsOnlyCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex), testTarget)
PGSonly <- coxph(Surv(time_to_event, Event) ~  T2D_PGS, testTarget)
fullCoxPH <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + dScore, testTarget)
ProteinOnly <- coxph(Surv(time_to_event, Event) ~ dScore, testTarget)
allrisk <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc), testTarget)
allriskprotein <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + dScore, testTarget)
hbOnly <- coxph(Surv(time_to_event, Event) ~ f_30750_0_0, testTarget)
hbcovs <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + f_30750_0_0, testTarget)
hbprot <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + f_30750_0_0 + dScore, testTarget)
hbprotpgs <- coxph(Surv(time_to_event, Event) ~ Age_assessment + as.factor(Sex) + as.factor(Edu) + BMI + Dep + as.factor(Smo) + as.factor(Alc) + f_30750_0_0 + dScore + T2D_PGS, testTarget)
print('Model has not been sex stratified in cox.')

# List models
models <- list(r = riskFactorsOnlyCoxPH, o = PGSonly, f = fullCoxPH, 
               t = ProteinOnly, u = allrisk, b = allriskprotein, j = hbOnly, 
               m = hbcovs, q = hbprot, z = hbprotpgs)


# Generate performance statistics - for 10 years of follow up, comparing base HR to survival HR at 10 years
predictCoxPHOnset <- function(dataDF, coxPHModel, threshold = 10) {
  uniqueTimes <- sort(unique(dataDF$time_to_event))
  thresholdIndex <- match(threshold, uniqueTimes)
  cumulativeBaseHaz <- gbm::basehaz.gbm(dataDF$time_to_event, dataDF$Event, predict(coxPHModel), uniqueTimes)
  survivalPredictions <- exp(-cumulativeBaseHaz[[thresholdIndex]]) ^ exp(predict(coxPHModel))
  onsetPredictions <- 1 - survivalPredictions
  
  # Event should be 0 if tte is > 10
  dataDF$Event <- sapply(1:nrow(dataDF), function(i) {
    if (dataDF$time_to_event[[i]] > threshold) {
      0
    } else {
      dataDF$Event[[i]]
    }
  })
  
  auc <- MLmetrics::AUC(y_pred = onsetPredictions, y_true = dataDF$Event)
  prauc <- MLmetrics::PRAUC(y_pred = onsetPredictions, y_true = dataDF$Event)
  roc <- pROC::roc(response = dataDF$Event, predictor = onsetPredictions)
  list(cumulativeBaseHaz = cumulativeBaseHaz, onsetPredictions = onsetPredictions, auc = auc, prauc = prauc, roc = roc)
}

# Extract test results
testResults <- lapply(models, function(m) {predictCoxPHOnset(testTarget, m)})


# Extract onset predictions 
predictions <- lapply(testResults, function(r) {r$onsetPredictions})

# Calculate p values for AUC comparisons of interest (age/sex + protein)
p_null <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$r, predictor2 = predictions$f)$p.value
p_second <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$u, predictor2 = predictions$b)$p.value
p_third <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$m, predictor2 = predictions$q)$p.value
p_fourth <- pROC::roc.test(response = testTarget$Event, predictor1 = predictions$q, predictor2 = predictions$z)$p.value

# Extract metrics table
aucs <- sapply(testResults, function(r) {r$auc})
praucs <- sapply(testResults, function(r) {r$prauc})
metricsTable <- data.frame(AUC = aucs, PRAUC = praucs)
metricsTable[11,1] <- metricsTable[3,1] - metricsTable[1,1]
metricsTable[11,2] <- metricsTable[3,2] - metricsTable[1,2]
metricsTable[12,1] <- metricsTable[6,1] - metricsTable[5,1]
metricsTable[12,2] <- metricsTable[6,2] - metricsTable[5,2]
metricsTable[13,1] <- metricsTable[9,1] - metricsTable[8,1]
metricsTable[13,2] <- metricsTable[9,2] - metricsTable[8,2]
metricsTable[14,1] <- metricsTable[10,1] - metricsTable[9,1]
metricsTable[14,2] <- metricsTable[10,2] - metricsTable[9,2]
row.names(metricsTable) <- c('AgeSex', 'PGS', 'AgeSex_ProteinScore', 'ProteinScore Only', 'AgeSexCovs', 'AgeSexCovs_ProteinScore',
                             'HbA1c_only', 'HbA1c+age+sex+lifestyle', 'ProteinScore_HbA1c_age_sex_lifestyle', 'ProteinScore_HbA1c_age_sex_lifestyle_PGS',
                             'Diff_AgeSex', 'Diff_AgeSexCovs', 'Diff_hba1c', 'Diff_PGS')

# Add P values for AUCs to table
metricsTable$P_ROCs <- 'NA'
metricsTable[11,3] <- p_null
metricsTable[12,3] <- p_second
metricsTable[13,3] <- p_third
metricsTable[14,3] <- p_fourth

write.csv(metricsTable, 'path.../diabetes_comparison_ROC_p_PGS.csv', row.names = T)

# Assign target test file to variable and subset such that no event greater than 10 years is present (event = 0, if tte > 10)
w1Target <- testTarget

w1Target$Event <- sapply(1:nrow(w1Target), function(i) {
  if (w1Target$time_to_event[[i]] > 10) {
    0
  } else {
    w1Target$Event[[i]]
  }
})

w1Target$Event <- as.factor(w1Target$Event)

# Assign cox test results from calculation above
coxTestResults <- testResults


# quickly convert list to a dataframe
listToDataframe = function(list) {
  models = colnames(list)
  data = data.frame("Sensitivity" =  numeric(), "Specificity" = numeric(), "Model" = character())
  
  for (model in models) {
    tmp_data = data.frame("Sensitivity" =  list[['sensitivities', model]], "Specificity" = list[['specificities', model]], "Model" = model)
    data = rbind(data, tmp_data)
  }
  
  return(data)
}

rocs <- sapply(testResults, function(r) {r$roc})
rocs_df = listToDataframe(rocs)
rocs_df[which(rocs_df$Model =='r'), "Model"] = "Age/Sex"
rocs_df[which(rocs_df$Model =='o'), "Model"] = "Age/Sex/PGS"
rocs_df[which(rocs_df$Model =='f'), "Model"] = "ProteinScore/Age/Sex"
rocs_df[which(rocs_df$Model =='t'), "Model"] = "ProteinScore"
rocs_df[which(rocs_df$Model =='u'), "Model"] = "Age/Sex/Lifestyle"
rocs_df[which(rocs_df$Model =='b'), "Model"] = "ProteinScore/Age/Sex/Lifestyle"
rocs_df[which(rocs_df$Model =='j'), "Model"] = "HbA1c"
rocs_df[which(rocs_df$Model =='m'), "Model"] = "HbA1c/Age/Sex/Lifestyle"
rocs_df[which(rocs_df$Model =='q'), "Model"] = "ProteinScore/HbA1c/Age/Sex/Lifestyle"
rocs_df[which(rocs_df$Model =='z'), "Model"] = "ProteinScore/HbA1c/Age/Sex/Lifestyle/PGS"

cbPalette <- c("red", "gold1", "darkorange3", "violetred1", "darkturquoise",
               "darkorchid2", "tan1", "blue", "gray46", "black")

My_Theme = theme(
  axis.title.x = element_text(size = 16),
  axis.text.x = element_text(size = 12),
  axis.text.y = element_text(size =12),
  axis.title.y = element_text(size = 16),
  strip.text = element_text(size = 12, face = "bold"),
  legend.text=element_text(size=16),
  legend.title=element_text(size=16, face = "bold"))
# , legend.position = "none"

rocs_df$Model <-  factor(rocs_df$Model, levels = c("ProteinScore/HbA1c/Age/Sex/Lifestyle/PGS",
                                                   "ProteinScore/HbA1c/Age/Sex/Lifestyle",
                                                   "HbA1c/Age/Sex/Lifestyle", 
                                                   "HbA1c",
                                                   "ProteinScore/Age/Sex/Lifestyle",
                                                   "ProteinScore/Age/Sex",
                                                   "ProteinScore",
                                                   "Age/Sex/Lifestyle",
                                                   "Age/Sex/PGS",
                                                   "Age/Sex"))

# draw ROC curves
pdf('path.../IAB_rocs_diabetes_hba1c_PGS.pdf', width = 12, height = 6)
rocs_df %>%
  ggplot( aes(x=1-Specificity, y=Sensitivity, group=Model, color=Model)) +
  geom_line() +
  theme_light() +
  # scale_color_viridis(discrete=TRUE) 
  scale_colour_manual(values=cbPalette)   +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  guides(color = guide_legend(nrow = 8)) +
  xlab("False Positive Rate (1 - Specificity)") +
  ylab("True Postive Rate (Sensitivity)")  + theme_classic() + My_Theme
dev.off()


# DRAW WITHOUT LEGEND
pdf('path.../IAB_rocs_diabetes_hba1c_PGS2.pdf', width = 8, height = 6)
rocs_df %>%
  ggplot( aes(x=1-Specificity, y=Sensitivity, group=Model, color=Model)) +
  geom_line() +
  theme_light() +
  # scale_color_viridis(discrete=TRUE) 
  scale_colour_manual(values=cbPalette)   +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  guides(color = guide_legend(nrow = 8)) +
  xlab("False Positive Rate (1 - Specificity)") +
  ylab("True Postive Rate (Sensitivity)")  + theme_classic() + My_Theme + theme(legend.position = 'none')
dev.off()


###############################################################################################

# Plot with scaling 
diab <- transformed
diab <- diab[order(diab$dScore),]
diab <- diab %>% mutate(decile = ntile(diab$dScore, 10))
diab$decile <- as.factor(diab$decile)

diab$Event <- as.factor(diab$Event)
library(ggpubr)

My_Theme = theme(
  axis.title.x = element_text(size = 16),
  axis.text.x = element_text(size = 12),
  axis.text.y = element_text(size =12),
  axis.title.y = element_text(size = 16),
  strip.text = element_text(size = 12, face = "bold"),
  legend.text=element_text(size=16),
  legend.title=element_text(size=16, face = "bold"), legend.position = "none")

pdf('path.../plot_scatter_hba1c_diab_no_legend_loes.pdf', width = 6, height = 5)
ggplot(diab, aes(x=dScore, y=f_30750_0_0, group = Event)) +
  geom_point(size = 0.2)+
  stat_density_2d(geom = "polygon",
                  aes(alpha = ..level.., fill = Event, bins = 3)) + theme_minimal() + theme_classic() +
  scale_fill_manual(values = c("dodgerblue", "tomato")) +
  geom_density_2d(aes(color = Event)) +
  scale_color_manual(values = c("blue4", "firebrick1")) + My_Theme + xlab('Type 2 diabetes ProteinScore') + ylab('HbA1c')
dev.off()

###############################################################################################

# Plot without scaling 
diab <- untransformed
diab <- diab[order(diab$dScore),]
diab <- diab %>% mutate(decile = ntile(diab$dScore, 10))
diab$decile <- as.factor(diab$decile)
diab$Event <- as.factor(diab$Event)
library(ggpubr)

# Do an anova one way on deciles to get a p value for inclusion
# https://www.r-bloggers.com/2022/05/one-way-anova-example-in-r-quick-guide/
res.aov <- aov(f_30750_0_0 ~ decile, data = diab)

My_Theme = theme(
  axis.title.x = element_text(size = 16),
  axis.text.x = element_text(size = 12),
  axis.text.y = element_text(size =12),
  axis.title.y = element_text(size = 16),
  strip.text = element_text(size = 12, face = "bold"),
  legend.text=element_text(size=16),
  legend.title=element_text(size=16, face = "bold"), legend.position = "none")

pdf('path.../plot_deciles_rect_hba1c_diab_no_scaling.pdf', width = 6, height = 5)
ggplot(diab, aes(x=decile, y=f_30750_0_0)) + 
  geom_violin(trim=FALSE, fill='lightcyan2', color="cadetblue4")+
  geom_boxplot(width=0.1, color="darkslategrey", fill="darkslategrey", alpha=0.2) + theme_classic() + 
  labs(x="Type 2 diabetes ProteinScore decile", y = 'HbA1c (mmol/mol)')+ 
  annotate("rect", ymin = 42, ymax = 47, xmin = 0, xmax = 10.5,
           alpha = .1,fill = "blue") +
  My_Theme  + ylim(0,130)
dev.off()