############################################################################################### ## Protein data preparation ############################################################################################### # srun -p interactive --pty bash library(tidyverse) library(data.table) library(pacman) library(tidyverse) olink_internal = read.csv('path/...') names(olink_internal)[1] <- 'SampleID' # 54,189 total ## Remove related individuals rel <- read.table('path/...', header = T) dim(rel) # 107161 individual pairs rel <- subset(rel, ID1 %in% olink_internal$SampleID & ID2 %in% olink_internal$SampleID) # 1325 pairs write.csv(rel, 'path/...') list1 <- as.data.frame(rel$ID1) list2 <- as.data.frame(rel$ID2) names(list1) <- "X" names(list2) <- "X" list <- rbind(list1, list2) list <- list$X # 2650 length(unique(list)) # 2420 unique individuals - so family structures exist with multiple ID connections t <- as.data.frame(table(list)) t <- t[order(-t$Freq),] m <- t[which(t$Freq > 1),] # 108 individuals that are part of more than 1 pairing n <- t[which(t$Freq < 2),] # 2312 for(i in 1:1325){ pair <- rel[i,1:2] pair_list <- c(pair$ID1, pair$ID2) if(pair$ID1 %in% m$list == TRUE){ keep1 <- 'ID1' } else { keep1 <- 'No' } rel[i,6] <- keep1 } for(i in 1:1325){ pair <- rel[i,1:2] pair_list <- c(pair$ID1, pair$ID2) if(pair$ID2 %in% m$list == TRUE){ keep1 <- 'ID2' } else { keep1 <- 'No' } rel[i,7] <- keep1 } rel$V8 <- ifelse(rel$V6 == 'ID1' & rel$V7 == 'ID2', 'BOTH', 'No') both <- rel[which(rel$V8 %in% 'BOTH'),] # Exclude pairs with both IDs in multiple families olink_ex <- olink_internal[-which(olink_internal$SampleID %in% both$ID1),] olink_ex <- olink_ex[-which(olink_ex$SampleID %in% both$ID2),] dim(olink_ex) # 54,110 # Find instances where individuals were part of multiple family structures and in pairs # with others that were not - then select the one in single pairing sub <- rel[-which(rel$V8 %in% 'BOTH'),] sub1 <- sub[which(sub$V6 %in% 'ID1'),] olink_ex <- olink_ex[-which(olink_ex$SampleID %in% sub1$ID1),] dim(olink_ex) # 54,092 sub2 <- sub[which(sub$V7 %in% 'ID2'),] olink_ex <- olink_ex[-which(olink_ex$SampleID %in% sub2$ID2),] dim(olink_ex) # 54,081 # So individuals in multiple family structures have been removed # We can then sample the unique pairs and remove one person at random # We can be sure that the remaining person is not part of any other pairs, as # we've filtered out those with multiple pairs. # Now look at those in single pairs and select one at random single <- rel[which(rel$V6 %in% 'No'),] single <- single[which(single$V7 %in% 'No'),] single <- single[which(single$V8 %in% 'No'),] # 1119 pairs to sample randomly for(i in 1:length(single$ID1)){ pair <- single[i,1:2] pair_list <- c(pair$ID1, pair$ID2) keep <- sample(pair_list, size=1, replace=FALSE) single[i,9] <- keep } write.csv(rel, 'path/...', row.names = F) write.csv(both, 'path/...', row.names = F) write.csv(single, 'path/...', row.names = F) # Remove the final set of 1/2 pairs in single pair matches olink_ex <- olink_ex[-which(olink_ex$SampleID %in% single$V9),] which(olink_ex$SampleID %in% m$list) # 0 dim(olink_ex) # 52,962 write.csv(olink_ex, 'path/...', row.names = F) ## Check people's missingness for the protein measurements olink_internal <- olink_ex length <- length(rownames(olink_internal)) res <- data.frame(SampleID = 1:length, Complete = 1:length, Missing = 1:length) for (i in 1:length){ variable <- as.character(olink_internal$SampleID[i]) individual <- olink_internal[which(olink_internal$SampleID %in% variable),] individual <- individual[3:1474] complete <- table(is.na(individual)) index_present <- complete[1] index_missing <- 1472 - index_present res[i,1] <- variable res[i,2] <- index_present res[i,3] <- index_missing print(i) } # Order by those that have the most missing data res <- res[order(res$Complete),] # Index how many people have >10% missingness as a proportion res$prop <- res$Missing / 1472 res$exclude <- ifelse(res$prop > 0.1, '1', '0') # Save off missingness summary write.csv(res, 'path/...', row.names = F) ## Check protein missingness in the subset of individuals that have >10% data olink_test <- read.csv('path/...') olink_exclude <- olink_test %>% filter(exclude != '1') # # Subset data to individuals to keep olink_internal_sub <- olink_internal[which(olink_internal$SampleID %in% olink_exclude$SampleID),] length <- length(colnames(olink_internal_sub)) res2 <- data.frame(Variable = 1:length, Complete = 1:length, Missing = 1:length) for (i in 1:length){ variable <- as.character(colnames(olink_internal_sub)[i]) complete <- olink_internal_sub[which(complete.cases(olink_internal_sub[,variable])),] incomplete <- olink_internal_sub[-which(complete.cases(olink_internal_sub[,variable])),] index_present <- dim(complete)[1] index_missing <- dim(incomplete)[1] res2[i,1] <- variable res2[i,2] <- index_present res2[i,3] <- index_missing print(i) } # Save off missingness summary res2 <- res2[order(res2$Complete),] # Index how many proteins have >10% missingness res2$prop <- res2$Missing / 50372 res2$exclude <- ifelse(res2$prop > 0.1, '1', '0') # Save out file with exclusion summaries for proteins res2 <- res2[-c(1473:1474),] write.csv(res2, 'path/...', row.names = F) ########################################################################### ### Subset the protein dataset to exclude proteins with >10% missing data olink_internal <- read.csv('path/...') # Load people exclusions olink_test <- read.csv('path/...') olink_exclude <- olink_test %>% filter(exclude != '1') # 49234 to keep # Subset data to individuals to keep olink_internal_sub <- olink_internal[which(olink_internal$SampleID %in% olink_exclude$SampleID),] # Load protein exclusions olink_test2 <- read.csv('path/...') olink_exclude <- olink_test2 %>% filter(exclude != '1') # Subset data to individuals to keep olink_IDs <- olink_internal_sub[c(1:2)] olink_internal_sub2 <- olink_internal_sub[,which(colnames(olink_internal_sub) %in% olink_exclude$Variable)] olink_internal_sub2 <- cbind(olink_IDs, olink_internal_sub2) # There are now 49234 people (of 54,189 - 3728 excluded) and 1,468 protein measurements (of 1,472 - 4 excluded) that are now included ##################################################################### # KNN imputation #if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager") #BiocManager::install("impute") #https://www.rdocumentation.org/packages/impute/versions/1.46.0/topics/impute.knn library(impute) IDs <- olink_internal_sub2[c(1:2)] data <- olink_internal_sub2[c(3:1470)] data <- as.matrix(data) rownames(data) <- IDs$pseudo_ind_id data <- t(data) imputed <- impute.knn(data) imputed_data <- as.data.frame(t(imputed$data)) identical(as.character(rownames(imputed_data)), as.character(IDs$pseudo_ind_id)) imputed_data <- cbind(IDs, imputed_data) write.csv(imputed_data, 'path/...', row.names = F) # These data are used to feed into ProteinScores scripts (then transformed in train/test sets selected at random) olink_internal_sub2 <- imputed_data # Rank-based inverse normalisation and scaling of each protein for(i in colnames(olink_internal_sub2)[c(3:1470)]){ olink_internal_sub2[,i] <- qnorm((rank(olink_internal_sub2[,i], na.last='keep')-0.5)/sum(!is.na(olink_internal_sub2[,i]))) } # Scale protein data olink_internal_sub2[,3:1470] <- apply(olink_internal_sub2[,3:1470], 2, scale) mean(olink_internal_sub2[,200], na.rm = T) sd(olink_internal_sub2[,200], na.rm = T) write.csv(olink_internal_sub2, 'path/...', row.names = F) # These data are used in individual cox ph models (transformed as a whole) ##################################################################### ### SENSITIVITY - checking that no major batch/genetic PC effects were present across the data (aligned to Bens checks in his GWAS work) # Subset dataset to those with genetic information available and preadjust for genetic PCs and study centre effects covs <- read.csv('path/...') covs <- covs[complete.cases(covs$Batch),] olink_internal_sub2 <- read.csv('path/...') olink_internal_sub2 <- olink_internal_sub2[which(olink_internal_sub2$SampleID %in% covs$SampleID),] library(tidyverse) ## Regress Proteins onto Covariates phenotypes_residualised <- olink_internal_sub2 for(i in colnames(phenotypes_residualised)[3:1470]){ phenotypes_residualised[,i]<- lm(phenotypes_residualised[,i] ~ factor(ukb_centre_fct) + factor(Batch) + UKBPC_1 + UKBPC_2 + UKBPC_3 + UKBPC_4 + UKBPC_5 + UKBPC_6 + UKBPC_7 + UKBPC_8 + UKBPC_9 + UKBPC_10 + UKBPC_11 + UKBPC_12 + UKBPC_13 + UKBPC_14 + UKBPC_15 + UKBPC_16 + UKBPC_17 + UKBPC_18 + UKBPC_19 + UKBPC_20, na.action = na.exclude, data = phenotypes_residualised)$residuals } write.csv(phenotypes_residualised, 'path/...', row.names = F) # Correlate residuals against protein levels prot <- read.csv('path/...') resid <- read.csv('path/...') prot <- prot[which(prot$SampleID %in% resid$SampleID),] length <- c(3:1470) names <- colnames(prot)[3:1470] res <- data.frame(Prot = 1:3, Coef = 1:3, p = 1:3) for (i in 1:1468){ protein <- as.character(names[i]) cor <- cor.test(prot[,protein], resid[,protein], na.rm = T) r <- cor$estimate p <- cor$p.value res[i,1] <- protein res[i,2] <- r res[i,3] <- p } res <- res[order(res$Coef),] write.csv(res, 'path/...', row.names = F)