# UKB-project / 02_Initial_protein_locations.R
###############################################################################################

## Load protein locations

###############################################################################################

# srun -p interactive --pty bash
# Load packages

library(tidyverse) 
library(data.table)
library(pacman)
p_load(tidyverse, data.table, magrittr, tools, ggpubr)

# Load batch info and list of Olink protein index files from Ben.
# Every column is read as character; the result is kept as a plain data.frame.
batchinfo <- fread('path/...', colClasses = "character")
batchinfo <- as.data.frame(as_tibble(batchinfo))

# Number of distinct values in each identifier column
# (trailing comments are the counts recorded when the script was written).
sum(!duplicated(batchinfo$SampleID)) # 60463
sum(!duplicated(batchinfo$App_26041)) # 54309
sum(!duplicated(batchinfo$pseudo_ind_id)) # 54309
sum(!duplicated(batchinfo$UKBPPP_SampleID)) # 60463

# List of Olink protein measurements available (1472 protein observations).
# Reads the index file with every column as character and keeps one row per
# distinct (UKBPPP_ProteinID, Panel) pair, as a plain data.frame.
olink_proteins <- fread('path/...', colClasses = c("character")) %>%
  distinct(UKBPPP_ProteinID, Panel) %>%
  as.data.frame()

# Replace every ':' in the protein ID (first column) with '.'.
# A single gsub() with fixed = TRUE treats ':' as a literal and covers IDs
# with any number of separators, instead of only the first three.
# NOTE(review): presumably done so the IDs line up with dot-separated column
# names such as NPPB.P16860.OID20049.v1 used later in this script -- confirm.
olink_proteins[, 1] <- gsub(":", ".", olink_proteins[, 1], fixed = TRUE)

# P = the part of the ID before the first '.'
olink_proteins$P <- gsub("\\..*", "", olink_proteins[, 1])

# Rows whose P value repeats that of an earlier row (kept for inspection).
# NOTE(review): `t` shadows base::t(); the name is left unchanged in case code
# outside this section refers to it.
t <- olink_proteins[which(duplicated(olink_proteins$P)), ]

# Export the cleaned table without row names.
write.csv(olink_proteins, 'path/...', row.names = FALSE)

# Protein measurements for 54,189 individuals and 1,474 protein levels
olink_internal <- read.csv('path/...')

# Sanity check: distribution of a single protein column.
# The call is left exactly as written because hist() builds the plot title and
# x-axis label from the argument expression.
hist(olink_internal$NPPB.P16860.OID20049.v1)

# Sanity check: count of missing (TRUE) vs present (FALSE) values in that column
table(is.na(olink_internal$NPPB.P16860.OID20049.v1))