############################################################################################### ## Health linkage preps for cox ############################################################################################### # srun -p interactive --pty bash # module load R R ### PREP BASIS FOR COX MODELS library(survival) library(coxme) library(readxl) library(tidyverse) library(gdata) # Process phenotype (d1) and death / alive date data dat <- read.csv('path/...') d1 <- dat[,which(colnames(dat) %in% c('f.eid','f.21022.0.0', 'f.52.0.0', 'f.34.0.0', 'f.31.0.0', 'f.40000.0.0', 'f.40007.0.0'))] names(d1) <- c('SampleID', 'Sex', 'YOB', 'MOB', 'Age_recruitment', 'DOD', 'Age_death') # Add date of assessment cenre from Eric's extraction t <- read.csv('path/...') date <- t[which(colnames(t) %in% c('f.eid', 'f.53.0.0'))] names(date) <- c('SampleID', 'DOA') d1 <- left_join(d1, date, by = 'SampleID') # Subset to complete protein data individuals - prepped data olink_internal <- read.csv('path/...') d1 <- d1[which(d1$SampleID %in% olink_internal$SampleID),] # 49234 # Format DOA date info d1$DOA <- gsub('-', '', d1$DOA) d1$DOA <- substr(d1$DOA,1,6) # Format DOD date info d1$DOD <- gsub('-', '', d1$DOD) d1$DOD <- substr(d1$DOD,1,6) # Format so that mob has '01' rather than '1' for months less than 10, calculating DOB subset1 <- d1[which(d1$MOB < 10),] subset2 <- d1[which(d1$MOB > 9),] subset1$MOB <- sub("^", "0", subset1$MOB) # adds 0 in front of any single digit month d1 <- rbind(subset1, subset2) d1$DOB <- paste0(d1$YOB, d1$MOB) # Work out those who were reported as having died during UKB follow up age_dead <- d1[complete.cases(d1$Age_death),] # 4580 dead age_dead$dead <- 1 age_alive <- d1[-which(d1$SampleID %in% age_dead$SampleID),] # 44654 alive age_alive$dead <- 0 ## extract year/month of death as separate variables ## age_dead$y_dead <- as.numeric(substr(age_dead$DOD, 1, 4)) age_dead$m_dead <- as.numeric(substr(age_dead$DOD, 5, 6)) # Filter death data to earliest sampling date (i.e. check for any death exclusions) age_dead_include <- age_dead[which(age_dead$DOD <= 201606),] # 1939 died before sampling date age_dead_exclude <- age_dead[which(age_dead$DOD > 201606),] #2641 died after sampling date and have been excluded # Calculate a more exact estimate (by year and month) for age of death in the included individuals that died age_dead_include$y_diff <- age_dead_include$y_dead - as.numeric(age_dead_include$YOB) age_dead_include$m_diff <- (age_dead_include$m_dead - as.numeric(age_dead_include$MOB))/12 age_dead_include$diff <- age_dead_include$y_diff + age_dead_include$m_diff # Work out those who are alive (i.e. everytone not in the list of included dead people from above) age_alive <- d1[-which(d1$SampleID %in% age_dead_include$SampleID),] # 47295 people alive at censor date # Ensure that all individuals in the samples are coded as alive and all individuals in the dead file are coded as such age_alive$dead <- 0 age_dead_include$dead <- 1 # Find age the 'alive' people were at censor age_alive$y_diff <- 2016 - as.numeric(age_alive$YOB) age_alive$m_diff <- (06 - as.numeric(age_alive$MOB))/12 age_alive$diff <- age_alive$y_diff + age_alive$m_diff # So for those that died, we are taking forward their age at death # And for those that were alive, we are taking forward their age at censor date # Subset to just the cols needed for joining dead and alive data age_alive <- age_alive[c('SampleID', 'Sex', 'Age_recruitment', 'DOD', 'Age_death', 'DOA', 'YOB', 'MOB', 'DOB', 'dead', 'diff')] age_dead_include <- age_dead_include[c('SampleID', 'Sex', 'Age_recruitment', 'DOD', 'Age_death', 'DOA', 'YOB', 'MOB', 'DOB', 'dead', 'diff')] names(age_alive)[11] <- 'aged' names(age_dead_include)[11] <- 'aged' # Bind dead and alive individuals together d1 <- rbind(age_alive, age_dead_include) dim(d1) # 49234 11 # Get a more precise (year and month) estimate of age at recruitment (using the date of assessment) and DOB d1$DOA_year <- as.numeric(substr(d1$DOA, 1, 4)) d1$DOA_month <- as.numeric(substr(d1$DOA, 5, 6)) d1$Age_y <- d1$DOA_year - as.numeric(d1$YOB) d1$Age_m <- (d1$DOA_month - as.numeric(d1$MOB))/12 d1$Age_assessment <- d1$Age_y + d1$Age_m # Remove old age recrutiment variable d1 <- d1[-c(3)] # Merge protein data into phenotypes file d1 <- merge(d1, olink_internal, by = 'SampleID', all = TRUE) ### Add covariates covs <- read.csv('path/...') d1 <- merge(covs, d1, by = "SampleID", all = TRUE) write.csv(d1, 'path/...', row.names = F) ### Now feed into the cox models as the phenotype file ############################################################################################### ## PREP FOR MORTALITY MODELS - CENSOR DATE difference # srun -p interactive --pty bash library(survival) library(coxme) library(readxl) library(tidyverse) library(gdata) # Process phenotype (d1) and death / alive date data dat <- read.csv('path/...') d1 <- dat[,which(colnames(dat) %in% c('f.eid','f.21022.0.0', 'f.52.0.0', 'f.34.0.0', 'f.31.0.0', 'f.40000.0.0', 'f.40007.0.0'))] names(d1) <- c('SampleID', 'Sex', 'YOB', 'MOB', 'Age_recruitment', 'DOD', 'Age_death') # Add date of assessment cenre from Eric's extraction t <- read.csv('path/...') date <- t[which(colnames(t) %in% c('f.eid', 'f.53.0.0'))] names(date) <- c('SampleID', 'DOA') d1 <- left_join(d1, date, by = 'SampleID') # Subset to complete protein data individuals - prepped data olink_internal <- read.csv('path/...') d1 <- d1[which(d1$SampleID %in% olink_internal$SampleID),] # 49234 # Format DOA date info d1$DOA <- gsub('-', '', d1$DOA) d1$DOA <- substr(d1$DOA,1,6) # Format DOD date info d1$DOD <- gsub('-', '', d1$DOD) d1$DOD <- substr(d1$DOD,1,6) # Format so that mob has '01' rather than '1' for months less than 10, calculating DOB subset1 <- d1[which(d1$MOB < 10),] subset2 <- d1[which(d1$MOB > 9),] subset1$MOB <- sub("^", "0", subset1$MOB) # adds 0 in front of any single digit month d1 <- rbind(subset1, subset2) d1$DOB <- paste0(d1$YOB, d1$MOB) # Work out those who were reported as having died during UKB follow up age_dead <- d1[complete.cases(d1$Age_death),] # 4580 dead age_dead$dead <- 1 age_alive <- d1[-which(d1$SampleID %in% age_dead$SampleID),] # 44654 alive age_alive$dead <- 0 ## extract year/month of death as separate variables ## age_dead$y_dead <- as.numeric(substr(age_dead$DOD, 1, 4)) age_dead$m_dead <- as.numeric(substr(age_dead$DOD, 5, 6)) # Filter death data to earliest sampling date (i.e. check for any death exclusions) age_dead_include <- age_dead[which(age_dead$DOD <= 202112),] # 1939 died before sampling date age_dead_exclude <- age_dead[which(age_dead$DOD > 202112),] #2641 died after sampling date and have been excluded # Calculate a more exact estimate (by year and month) for age of death in the included individuals that died age_dead_include$y_diff <- age_dead_include$y_dead - as.numeric(age_dead_include$YOB) age_dead_include$m_diff <- (age_dead_include$m_dead - as.numeric(age_dead_include$MOB))/12 age_dead_include$diff <- age_dead_include$y_diff + age_dead_include$m_diff # Work out those who are alive (i.e. everytone not in the list of included dead people from above) age_alive <- d1[-which(d1$SampleID %in% age_dead_include$SampleID),] # 47295 people alive at censor date # Ensure that all individuals in the samples are coded as alive and all individuals in the dead file are coded as such age_alive$dead <- 0 age_dead_include$dead <- 1 # Find age the 'alive' people were at censor age_alive$y_diff <- 2021 - as.numeric(age_alive$YOB) age_alive$m_diff <- (12 - as.numeric(age_alive$MOB))/12 age_alive$diff <- age_alive$y_diff + age_alive$m_diff # So for those that died, we are taking forward their age at death # And for those that were alive, we are taking forward their age at censor date # Subset to just the cols needed for joining dead and alive data age_alive <- age_alive[c('SampleID', 'Sex', 'Age_recruitment', 'DOD', 'Age_death', 'DOA', 'YOB', 'MOB', 'DOB', 'dead', 'diff')] age_dead_include <- age_dead_include[c('SampleID', 'Sex', 'Age_recruitment', 'DOD', 'Age_death', 'DOA', 'YOB', 'MOB', 'DOB', 'dead', 'diff')] names(age_alive)[11] <- 'aged' names(age_dead_include)[11] <- 'aged' # Bind dead and alive individuals together d1 <- rbind(age_alive, age_dead_include) dim(d1) # 49234 11 # Get a more precise (year and month) estimate of age at recruitment (using the date of assessment) and DOB d1$DOA_year <- as.numeric(substr(d1$DOA, 1, 4)) d1$DOA_month <- as.numeric(substr(d1$DOA, 5, 6)) d1$Age_y <- d1$DOA_year - as.numeric(d1$YOB) d1$Age_m <- (d1$DOA_month - as.numeric(d1$MOB))/12 d1$Age_assessment <- d1$Age_y + d1$Age_m d1 <- d1[-c(3)] # Merge protein data into phenotypes file d1 <- merge(d1, olink_internal, by = 'SampleID', all = TRUE) ### Add covariates covs <- read.csv('path/...') d1 <- merge(covs, d1, by = "SampleID", all = TRUE) write.csv(d1, 'path/...', row.names = F) ### Now feed into the cox models as the phenotype file