UKB-project / 03_Protein_prep.R
03_Protein_prep.R
Raw
###############################################################################################

## Protein data preparation

###############################################################################################

# srun -p interactive --pty bash

library(tidyverse) 
library(data.table)
library(pacman)
library(tidyverse)

olink_internal = read.csv('path/...')
names(olink_internal)[1] <- 'SampleID' # 54,189 total 

## Remove related individuals
rel <- read.table('path/...', header = T) 
dim(rel) # 107161 individual pairs
rel <- subset(rel, ID1 %in% olink_internal$SampleID & ID2 %in% olink_internal$SampleID) # 1325 pairs
write.csv(rel, 'path/...')

list1 <- as.data.frame(rel$ID1)
list2 <- as.data.frame(rel$ID2)
names(list1) <- "X"
names(list2) <- "X"
list <- rbind(list1, list2) 
list <- list$X # 2650
length(unique(list)) # 2420 unique individuals - so family structures exist with multiple ID connections
t <- as.data.frame(table(list))
t <- t[order(-t$Freq),]
m <- t[which(t$Freq > 1),] # 108 individuals that are part of more than 1 pairing 

 n <- t[which(t$Freq < 2),] # 2312
 
 for(i in 1:1325){
   pair <- rel[i,1:2]
   pair_list <- c(pair$ID1, pair$ID2)
   if(pair$ID1 %in% m$list == TRUE){
     keep1 <- 'ID1'
   } else {
     keep1 <- 'No'
   }
   rel[i,6] <- keep1
 }

 for(i in 1:1325){
   pair <- rel[i,1:2]
   pair_list <- c(pair$ID1, pair$ID2)
   if(pair$ID2 %in% m$list == TRUE){
     keep1 <- 'ID2'
   } else { 
     keep1 <- 'No'
   }
   rel[i,7] <- keep1
 }
 
 rel$V8 <- ifelse(rel$V6 == 'ID1' & rel$V7 == 'ID2', 'BOTH', 'No')
 
both <- rel[which(rel$V8 %in% 'BOTH'),] 

# Exclude pairs with both IDs in multiple families
olink_ex <- olink_internal[-which(olink_internal$SampleID %in% both$ID1),]  
olink_ex <- olink_ex[-which(olink_ex$SampleID %in% both$ID2),] 
dim(olink_ex) # 54,110

# Find instances where individuals were part of multiple family structures and in pairs 
# with others that were not - then select the one in single pairing
sub <- rel[-which(rel$V8 %in% 'BOTH'),]
sub1 <- sub[which(sub$V6 %in% 'ID1'),]
olink_ex <- olink_ex[-which(olink_ex$SampleID %in% sub1$ID1),]
dim(olink_ex) #  54,092
sub2 <- sub[which(sub$V7 %in% 'ID2'),]
olink_ex <- olink_ex[-which(olink_ex$SampleID %in% sub2$ID2),]
dim(olink_ex) #  54,081

# So individuals in multiple family structures have been removed
# We can then sample the unique pairs and remove one person at random
# We can be sure that the remaining person is not part of any other pairs, as
# we've filtered out those with multiple pairs.

# Now look at those in single pairs and select one at random
single <- rel[which(rel$V6 %in% 'No'),]
single <- single[which(single$V7 %in% 'No'),]
single <- single[which(single$V8 %in% 'No'),] # 1119 pairs to sample randomly 

for(i in 1:length(single$ID1)){
  pair <- single[i,1:2]
  pair_list <- c(pair$ID1, pair$ID2)
  keep <- sample(pair_list, size=1, replace=FALSE)
  single[i,9] <- keep
}

write.csv(rel, 'path/...', row.names = F)
write.csv(both, 'path/...', row.names = F)
write.csv(single, 'path/...', row.names = F)

# Remove the final set of 1/2 pairs in single pair matches
olink_ex <- olink_ex[-which(olink_ex$SampleID %in% single$V9),]  
which(olink_ex$SampleID %in% m$list) # 0
dim(olink_ex) # 52,962
write.csv(olink_ex, 'path/...', row.names = F)

## Check people's missingness for the protein measurements
olink_internal <- olink_ex
length <- length(rownames(olink_internal))
res <- data.frame(SampleID = 1:length, Complete = 1:length, Missing = 1:length)

for (i in 1:length){
  variable <- as.character(olink_internal$SampleID[i])
  individual <- olink_internal[which(olink_internal$SampleID %in% variable),]
  individual <- individual[3:1474]
  complete <- table(is.na(individual))
  index_present <- complete[1]
  index_missing <- 1472 - index_present 
  res[i,1] <- variable
  res[i,2] <-  index_present
  res[i,3] <- index_missing
  print(i)
}

# Order by those that have the most missing data 
res <- res[order(res$Complete),]

# Index how many people have >10% missingness as a proportion 
res$prop <- res$Missing / 1472
res$exclude <- ifelse(res$prop > 0.1, '1', '0')

# Save off missingness summary 
write.csv(res, 'path/...', row.names = F)

## Check protein missingness in the subset of individuals that have >10% data 
olink_test <- read.csv('path/...')
olink_exclude <- olink_test %>% filter(exclude != '1') #

# Subset data to individuals to keep 
olink_internal_sub <- olink_internal[which(olink_internal$SampleID %in% olink_exclude$SampleID),]

length <- length(colnames(olink_internal_sub))
res2 <- data.frame(Variable = 1:length, Complete = 1:length, Missing = 1:length)

for (i in 1:length){
  variable <- as.character(colnames(olink_internal_sub)[i])
  complete <- olink_internal_sub[which(complete.cases(olink_internal_sub[,variable])),]
  incomplete <- olink_internal_sub[-which(complete.cases(olink_internal_sub[,variable])),]
  index_present <- dim(complete)[1]
  index_missing <- dim(incomplete)[1]
  res2[i,1] <- variable
  res2[i,2] <-  index_present
  res2[i,3] <- index_missing
  print(i)
}

# Save off missingness summary 
res2 <- res2[order(res2$Complete),]

# Index how many proteins have >10% missingness
res2$prop <- res2$Missing / 50372
res2$exclude <- ifelse(res2$prop > 0.1, '1', '0')

# Save out file with exclusion summaries for proteins 
res2 <- res2[-c(1473:1474),]
write.csv(res2, 'path/...', row.names = F)

###########################################################################

### Subset the protein dataset to exclude proteins with >10% missing data

olink_internal <- read.csv('path/...')

# Load people exclusions
olink_test <- read.csv('path/...')
olink_exclude <- olink_test %>% filter(exclude != '1') # 49234 to keep 

# Subset data to individuals to keep 
olink_internal_sub <- olink_internal[which(olink_internal$SampleID %in% olink_exclude$SampleID),]

# Load protein exclusions
olink_test2 <- read.csv('path/...')
olink_exclude <- olink_test2 %>% filter(exclude != '1') 

# Subset data to individuals to keep 
olink_IDs <- olink_internal_sub[c(1:2)]
olink_internal_sub2 <- olink_internal_sub[,which(colnames(olink_internal_sub) %in% olink_exclude$Variable)]
olink_internal_sub2 <- cbind(olink_IDs, olink_internal_sub2)
# There are now 49234 people (of 54,189 - 3728 excluded) and 1,468 protein measurements (of 1,472 - 4 excluded) that are now included 

#####################################################################

# KNN imputation

#if (!require("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("impute")
#https://www.rdocumentation.org/packages/impute/versions/1.46.0/topics/impute.knn

library(impute)

IDs <- olink_internal_sub2[c(1:2)]
data <- olink_internal_sub2[c(3:1470)]
data <- as.matrix(data)
rownames(data) <- IDs$pseudo_ind_id
data <- t(data)

imputed <- impute.knn(data)
imputed_data <- as.data.frame(t(imputed$data))

identical(as.character(rownames(imputed_data)), as.character(IDs$pseudo_ind_id))

imputed_data <- cbind(IDs, imputed_data)

write.csv(imputed_data, 'path/...', row.names = F)
# These data are used to feed into ProteinScores scripts (then transformed in train/test sets selected at random)

olink_internal_sub2 <- imputed_data

# Rank-based inverse normalisation and scaling of each protein
for(i in colnames(olink_internal_sub2)[c(3:1470)]){ 
  olink_internal_sub2[,i] <- qnorm((rank(olink_internal_sub2[,i], na.last='keep')-0.5)/sum(!is.na(olink_internal_sub2[,i])))
}

# Scale protein data 
olink_internal_sub2[,3:1470] <- apply(olink_internal_sub2[,3:1470], 2, scale)

mean(olink_internal_sub2[,200], na.rm = T)
sd(olink_internal_sub2[,200], na.rm = T)

write.csv(olink_internal_sub2, 'path/...', row.names = F)
# These data are used in individual cox ph models (transformed as a whole)

#####################################################################

### SENSITIVITY - checking that no major batch/genetic PC effects were present across the data (aligned to Bens checks in his GWAS work)

# Subset dataset to those with genetic information available and preadjust for genetic PCs and study centre effects 

covs <- read.csv('path/...')
covs <- covs[complete.cases(covs$Batch),]
olink_internal_sub2 <- read.csv('path/...')
olink_internal_sub2 <- olink_internal_sub2[which(olink_internal_sub2$SampleID %in% covs$SampleID),]
library(tidyverse)

## Regress Proteins onto Covariates 
phenotypes_residualised <- olink_internal_sub2
for(i in colnames(phenotypes_residualised)[3:1470]){ 
  phenotypes_residualised[,i]<- lm(phenotypes_residualised[,i] ~ factor(ukb_centre_fct) + factor(Batch) + 
                                     UKBPC_1 + UKBPC_2 + UKBPC_3 + UKBPC_4 + UKBPC_5 + UKBPC_6 + UKBPC_7 + UKBPC_8 + UKBPC_9 + UKBPC_10 +
                                     UKBPC_11 + UKBPC_12 + UKBPC_13 + UKBPC_14 + UKBPC_15 + UKBPC_16 + UKBPC_17 + UKBPC_18 + UKBPC_19 + UKBPC_20, 
                                   na.action = na.exclude, data = phenotypes_residualised)$residuals
}

write.csv(phenotypes_residualised, 'path/...', row.names = F)

# Correlate residuals against protein levels 

prot <- read.csv('path/...')
resid <- read.csv('path/...')
prot <- prot[which(prot$SampleID %in% resid$SampleID),]
length <- c(3:1470)
names <- colnames(prot)[3:1470]
res <- data.frame(Prot = 1:3, Coef = 1:3, p = 1:3)

for (i in 1:1468){
  protein <- as.character(names[i])
  cor <- cor.test(prot[,protein], resid[,protein], na.rm = T)
  r <- cor$estimate
  p <- cor$p.value
  res[i,1] <- protein
  res[i,2] <- r
  res[i,3] <- p
}

res <- res[order(res$Coef),]
write.csv(res, 'path/...', row.names = F)