############################################################################################### ## Covariate prep ############################################################################################### # srun -p interactive --pty bash # module load R R library(ggplot2) library(data.table) # Prepped data olink_internal <- read.csv('path/...') # Extraction (v1) dat <- read.csv('path/...') dat <- dat[which(dat$f.eid %in% olink_internal$SampleID),] covs <- dat[,which(colnames(dat) %in% c('f.eid','f.189.0.0', 'f.20116.0.0', 'f.1558.0.0', 'f.21001.0.0'))] names(covs) <- c('SampleID', 'Dep', 'Alc', 'Smo', 'BMI') # Plot distributions pdf('path/...') ggplot(covs, aes(x=BMI)) + geom_histogram(aes(y=..density..), colour="black", fill="white")+ geom_density(alpha=.2, fill="#FF6666") dev.off() # 244 missing pdf('path/...') ggplot(covs, aes(x=Dep)) + geom_histogram(aes(y=..density..), colour="black", fill="white")+ geom_density(alpha=.2, fill="#FF6666") dev.off() # 62 missing pdf('path/...') ggplot(covs, aes(x=Smo)) + geom_histogram(aes(y=..density..), colour="black", fill="white")+ geom_density(alpha=.2, fill="#FF6666") dev.off() # 55 missing pdf('path/...') ggplot(covs, aes(x=Alc)) + geom_histogram(aes(y=..density..), colour="black", fill="white")+ geom_density(alpha=.2, fill="#FF6666") dev.off() # 55 missing # Load in additional covariates from ben GWAS cov <- fread('path/...') cov <- cov[which(cov$IID %in% olink_internal$SampleID),] table(is.na(cov$UKBPC_16)) # FALSE- 48821 with genetic data table(cov$ukb_centre_fct) # Remove prefer no to answer for alc and smo length(which(covs$Smo < 0)) # 188 covs$Smo[covs$Smo < 0] <- NA length(which(covs$Alc < 0)) # 59 covs$Alc[covs$Alc < 0] <- NA # Join up covariates into one set library(tidyverse) names(cov)[2] <- 'SampleID' joint <- left_join(covs, cov, by = 'SampleID') # Add updated data linkage to education qualifications t <- read.csv('path/...') ed <- t[c(1,10:33)] #1 College or University degree #2 A levels/AS levels or equivalent #3 O levels/GCSEs or equivalent #4 CSEs or equivalent #5 NVQ or HND or HNC or equivalent #6 Other professional qualifications eg: nursing, teaching #-7 None of the above #-3 Prefer not to answer # For each individual, search for whether they had a 1 (college or uni degree) and index their IDs sub1 <- ed[,c(1,2)] sub1 <- sub1[which(sub1[,2] %in% 1),] names(sub1)[2] <- 'X' sub2 <- ed[,c(1,8)] sub2 <- sub2[which(sub2[,2] %in% 1),] names(sub2)[2] <- 'X' sub3 <- ed[,c(1,14)] sub3 <- sub3[which(sub3[,2] %in% 1),] names(sub3)[2] <- 'X' sub4 <- ed[,c(1,20)] sub4 <- sub4[which(sub4[,2] %in% 1),] names(sub4)[2] <- 'X' bind <- rbind(sub1, sub2) bind <- rbind(bind, sub3) bind <- rbind(bind, sub4) names(bind)[1] <- 'SampleID' bind <- bind[which(bind$SampleID %in% olink_internal$SampleID),] length(unique(bind[,1])) names(ed) <- c('SampleID', 'Educ') # 16,216 names(ed)[1] <- 'SampleID' ed <- ed[which(ed$SampleID %in% olink_internal$SampleID),] ed$Edu <- ifelse(ed$SampleID %in% bind$SampleID, 1, 0) table(ed$Educ) #0 1 #33018 16216 ed <- ed[c(1,26)] joint <- left_join(joint, ed, by = 'SampleID') write.csv(joint, 'path/...', row.names = F)