UKB-project / 04_Covariate_prep.R
04_Covariate_prep.R
Raw
###############################################################################################

## Covariate prep

###############################################################################################

# srun -p interactive --pty bash

# module load R

R

library(ggplot2)
library(data.table)

# Prepped data
olink_internal <- read.csv('path/...')

# Extraction (v1)
dat <- read.csv('path/...')
dat <- dat[which(dat$f.eid %in% olink_internal$SampleID),]
covs <- dat[,which(colnames(dat) %in% c('f.eid','f.189.0.0', 'f.20116.0.0', 'f.1558.0.0', 'f.21001.0.0'))]
names(covs) <- c('SampleID', 'Dep', 'Alc', 'Smo', 'BMI')

# Plot distributions

pdf('path/...')
ggplot(covs, aes(x=BMI)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="white")+
  geom_density(alpha=.2, fill="#FF6666") 
dev.off() # 244 missing

pdf('path/...')
ggplot(covs, aes(x=Dep)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="white")+
  geom_density(alpha=.2, fill="#FF6666")
dev.off() # 62 missing

pdf('path/...')
ggplot(covs, aes(x=Smo)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="white")+
  geom_density(alpha=.2, fill="#FF6666")
dev.off() # 55 missing

pdf('path/...')
ggplot(covs, aes(x=Alc)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="white")+
  geom_density(alpha=.2, fill="#FF6666")
dev.off() # 55 missing


# Load in additional covariates from ben GWAS
cov <- fread('path/...')
cov <- cov[which(cov$IID %in% olink_internal$SampleID),]
table(is.na(cov$UKBPC_16)) # FALSE- 48821 with genetic data
table(cov$ukb_centre_fct)

# Remove prefer no to answer for alc and smo
length(which(covs$Smo < 0)) # 188
covs$Smo[covs$Smo < 0] <- NA
length(which(covs$Alc < 0)) # 59
covs$Alc[covs$Alc < 0] <- NA

# Join up covariates into one set
library(tidyverse)
names(cov)[2] <- 'SampleID'
joint <- left_join(covs, cov, by = 'SampleID')

# Add updated data linkage to education qualifications
t <- read.csv('path/...')
ed <- t[c(1,10:33)]
#1	College or University degree
#2	A levels/AS levels or equivalent
#3	O levels/GCSEs or equivalent
#4	CSEs or equivalent
#5	NVQ or HND or HNC or equivalent
#6	Other professional qualifications eg: nursing, teaching
#-7	None of the above
#-3	Prefer not to answer

# For each individual, search for whether they had a 1 (college or uni degree) and index their IDs
sub1 <- ed[,c(1,2)]
sub1 <- sub1[which(sub1[,2] %in% 1),]
names(sub1)[2] <- 'X'
sub2 <- ed[,c(1,8)]
sub2 <- sub2[which(sub2[,2] %in% 1),]
names(sub2)[2] <- 'X'
sub3 <- ed[,c(1,14)]
sub3 <- sub3[which(sub3[,2] %in% 1),]
names(sub3)[2] <- 'X'
sub4 <- ed[,c(1,20)]
sub4 <- sub4[which(sub4[,2] %in% 1),]
names(sub4)[2] <- 'X'

bind <- rbind(sub1, sub2)
bind <- rbind(bind, sub3)
bind <- rbind(bind, sub4)
names(bind)[1] <- 'SampleID'
bind <- bind[which(bind$SampleID %in% olink_internal$SampleID),]
length(unique(bind[,1])) 
names(ed) <- c('SampleID', 'Educ') # 16,216
names(ed)[1] <- 'SampleID'
ed <- ed[which(ed$SampleID %in% olink_internal$SampleID),]
ed$Edu <- ifelse(ed$SampleID %in% bind$SampleID, 1, 0)
table(ed$Educ)

#0     1
#33018 16216
ed <- ed[c(1,26)]
joint <- left_join(joint, ed, by = 'SampleID')
write.csv(joint, 'path/...', row.names = F)