############################################################################################### ## Health linkage preps for cox ############################################################################################### # srun -p interactive --pty bash # module load R R ############################################################################################### ### PREP CODE INTEGRATION olink_internal <- read.csv('path/...') clinical <- read.delim('path/...') # 123662721 names(clinical)[1] <- 'SampleID' clinical <- clinical[which(clinical$SampleID %in% olink_internal$SampleID),] # 13334281 reg <- read.delim('path/...') # TPP is data provider 3 - as its the only one ot have read 3 codes # scotland (EMIS/vision), wales (emis/vision) and england (vision) all have read 2 # Last updated TPP Jun 2016 (i.e. censor date) ### Read files for conversions a <- read.csv('path/...') b <- read.csv('path/...') c <- read.csv('path/...') d <- read.csv('path/...') e <- read.csv('path/...') f <- read.csv('path/...') ### Conversion from r3 to r2 r3 <- clinical %>% filter(data_provider == 3) d <- read.csv('path/...') d <- d %>% select(c('READV2_CODE', 'READV3_CODE')) names(d) <- c('read_2', 'read_3') r3 <- r3[c(1:3,5)] r3 <- left_join(r3, d, by = 'read_3') r3 <- r3[c(1,2,3,5,4)] clinical <- clinical[c(1:5)] clinical <- clinical %>% filter(data_provider %in% c('1','2','4')) clinical <- rbind(clinical, r3) write.csv(clinical, 'path/...', row.names = F) ######################################################################################## ### Load in and prep ICD codes sec <- read.csv('path/...') IDs <- sec[1] ICD10 <- sec[grep('41270', colnames(sec))] ICD10 <- cbind(IDs, ICD10) names(ICD10)[1] <- 'SampleID' ICD10_dat <- sec[grep('41280', colnames(sec))] ICD10_dat <- cbind(IDs, ICD10_dat) names(ICD10_dat)[1] <- 'SampleID' ICD9 <- sec[grep('41271', colnames(sec))] ICD9 <- cbind(IDs, ICD9) names(ICD9)[1] <- 'SampleID' ICD9_dat <- sec[grep('41281', colnames(sec))] ICD9_dat <- 
cbind(IDs, ICD9_dat) names(ICD9_dat)[1] <- 'SampleID' # ICD10 list <- list() for(i in 1:length(ICD10$SampleID)){ tryCatch({ person <- ICD10[i,1:243] dates <- ICD10_dat[i,1:243] person <- t(person) dates <- t(dates) person <- as.data.frame(person) dates <- as.data.frame(dates) person$SampleID <- person[1,1] dates$SampleID <- dates[1,1] names(person)[1] <- 'code' names(dates)[1] <- 'first' row.names(person) <- NULL row.names(dates) <- NULL person <- person[-which(person$code %in% NA),] dates <- dates[-which(dates$first %in% NA),] person <- person[c(2,1)] person <- person[-1,] dates <- dates[-1,] person$first <- dates$first print(i) list[[i]] <- person }, error = function(e) cat("skipped")) } ICD10 <- do.call(rbind, list) write.csv(ICD10,'path/...', row.names = F) # ICD9 list2 <- list() for(i in 1:length(ICD9$SampleID)){ tryCatch({ person <- ICD9[i,1:48] dates <- ICD9_dat[i,1:48] person <- t(person) dates <- t(dates) person <- as.data.frame(person) dates <- as.data.frame(dates) person$SampleID <- person[1,1] dates$SampleID <- dates[1,1] names(person)[1] <- 'code' names(dates)[1] <- 'first' row.names(person) <- NULL row.names(dates) <- NULL person <- person[-which(person$code %in% NA),] dates <- dates[-which(dates$first %in% NA),] person <- person[c(2,1)] person <- person[-1,] dates <- dates[-1,] person$first <- dates$first print(i) list2[[i]] <- person }, error = function(e) cat("skipped")) } ICD9 <- do.call(rbind, list2) write.csv(ICD9, 'path/...', row.names = F) ######################################################################################## ### TYPE 2 DIABETES library(readxl) clinical <- read.csv('path/...') phen <- read_excel('path/...') phen <- as.data.frame(phen) phen <- phen[-which(phen$Rob_include %in% 'N'),] phen$code <- gsub('.{2}$', '', phen$code) patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- clinical[which(clinical$read_2 %in% phen$code),] sub <- sub[c(1,4,3)] names(sub) <- 
c('SampleID', 'code', 'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen, 'path/...', row.names = F) # ICD10 for diabetes is E11 E12 # ICD9 for diabetes is 250, 25000, 25001, 25009, 2501, 25010, 2503, 2504, 2505, 2509 list1 <- c('E11.1', 'E11.2', 'E11.3') list2 <- c('2500', '25000', '25010', '2503', '2504', '2505', '2509') ICD9 <- read.csv('path/...') ICD10 <- read.csv('path/...') ICD10_1 <- ICD10[grep('E11', ICD10$code),] ICD10_2 <- ICD10[grep('E12', ICD10$code),] ICD10 <- rbind(ICD10_1, ICD10_2) ICD9 <- ICD9[which(ICD9$code %in% list2),] ICD9$first <- gsub('-', '', ICD9$first) ICD10$first <- gsub('-', '', ICD10$first) ICD9$first <- substr(ICD9$first,1,6) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings which(nchar(ICD10$first) > 6) which(nchar(ICD9$first) > 6) sub <- sub[which(nchar(sub$first) == 6),] # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD9$type <- 'ICD9' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) dat <- rbind(dat, ICD9) subset_diab <- dat[order(dat$first),] library(ggplot2) subset_diab <- subset_diab[-which(subset_diab$first < 197001),] subset_diab <-subset_diab[-which(duplicated(subset_diab$SampleID)),] Diab <- subset_diab write.csv(Diab, 'path/...', row.names = F) ######################################################################################## ### MDD library(readxl) clinical <- read.csv('path/...') phen <- read_excel('path/...') phen <- as.data.frame(phen) phen <- phen[-which(phen$Rob_include %in% 'N'),] phen$code <- gsub('.{2}$', '', phen$code) patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- sub[c(1,4,3)] names(sub) <- c('SampleID', 'code', 
'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen,'path/...', row.names = F) ICD10 <- read.csv('path/...') ICD10 <- ICD10[grep('F33', ICD10$code),] ICD10$first <- gsub('-', '', ICD10$first) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings which(nchar(ICD10$first) > 6) sb <- sub[-which(nchar(sub$first) == 6),] # These are missing dates -remove sub <- sub[which(nchar(sub$first) == 6),] # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) subset <- dat[order(dat$first),] subset$year <- substr(subset$first, 1,4) subset <- subset[-which(subset$first < 197001),] subset <-subset[-which(duplicated(subset$SampleID)),] library(ggplot2) Dep <- subset write.csv(Dep, 'path/...', row.names = F) ######################################################################################## ### LIV FIB/CIR library(readxl) clinical <- read.csv('path/...') phen <- read_excel('path/...') phen <- as.data.frame(phen) phen <- phen[-which(phen$Rob_include %in% 'N'),] phen$code <- gsub('.{2}$', '', phen$code) patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- sub[c(1,4,3)] names(sub) <- c('SampleID', 'code', 'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen, 'path/...', row.names = F) ICD9 <- read.csv('path/...') ICD10 <- read.csv('path/...') ICD10_1 <- ICD10[grep('K70', ICD10$code),] ICD10_2 <- ICD10[grep('K71', ICD10$code),] ICD10_3 <- 
ICD10[grep('K74', ICD10$code),] ICD10 <- rbind(ICD10_1, ICD10_2) ICD10 <- rbind(ICD10, ICD10_3) ICD9 <- ICD9[grep('571', ICD9$code),] ICD9 <- ICD9[-which(ICD9$code %in% 'V571'),] ICD9$first <- gsub('-', '', ICD9$first) ICD10$first <- gsub('-', '', ICD10$first) ICD9$first <- substr(ICD9$first,1,6) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings which(nchar(ICD10$first) > 6) which(nchar(ICD9$first) > 6) # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD9$type <- 'ICD9' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) dat <- rbind(dat, ICD9) subset <- dat[order(dat$first),] subset$year <- substr(subset$first, 1,4) subset <- subset[-which(nchar(subset$first) < 6),] # subset <- subset[-which(subset$first < 199001),] subset <-subset[-which(duplicated(subset$SampleID)),] LIV <- subset write.csv(LIV, 'path/...', row.names = F) ######################################################################################## ### Breast cancer library(readxl) phen <- read_excel('path/...') phen <- as.data.frame(phen) phen <- phen[-which(phen$Rob_include %in% 'N'),] phen$code <- gsub('.{2}$', '', phen$code) patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- sub[c(1,4,3)] names(sub) <- c('SampleID', 'code', 'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen, 'path/...', row.names = F) ICD9 <- read.csv('path/...') ICD10 <- read.csv('path/...') ICD10_1 <- ICD10[grep('C50', ICD10$code),] ICD10_2 <- ICD10[grep('D05', ICD10$code),] ICD10 <- rbind(ICD10_1, ICD10_2) ICD9 <- ICD9[grep('174', ICD9$code),] ICD9 <- ICD9[-which(ICD9$code %in% '7174'),] ICD9$first <- gsub('-', '', ICD9$first) ICD10$first <- 
gsub('-', '', ICD10$first) ICD9$first <- substr(ICD9$first,1,6) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings which(nchar(ICD10$first) > 6) which(nchar(ICD9$first) > 6) # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD9$type <- 'ICD9' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) dat <- rbind(dat, ICD9) subset <- dat[order(dat$first),] subset <- subset[-which(nchar(subset$first) < 6),] subset$year <- substr(subset$first, 1,4) subset <- subset[-which(subset$first < 197001),] subset <-subset[-which(duplicated(subset$SampleID)),] Breast <- subset write.csv(Breast, 'path/...', row.names = F) ######################################################################################## ### Colorectal cancer library(readxl) phen <- read_excel('path/...') phen <- as.data.frame(phen) phen$code <- gsub('.{2}$', '', phen$code) patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- sub[c(1,4,3)] names(sub) <- c('SampleID', 'code', 'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen, 'path/...', row.names = F) ICD9 <- read.csv('path/...') ICD10 <- read.csv('path/...') ICD10_1 <- ICD10[grep('C18', ICD10$code),] ICD10_2 <- ICD10[grep('C19', ICD10$code),] ICD10_3 <- ICD10[grep('C20', ICD10$code),] ICD10_4 <- ICD10[grep('C21', ICD10$code),] ICD10 <- rbind(ICD10_1, ICD10_2) ICD10 <- rbind(ICD10, ICD10_3) ICD10 <- rbind(ICD10, ICD10_4) ICD9_1 <- ICD9[grep('153', ICD9$code),] ICD9_2 <- ICD9[grep('154', ICD9$code),] ICD9 <- rbind(ICD9_1, ICD9_2) ICD9 <- ICD9[-which(ICD9$code %in% c('71536', '71535', '71537')),] ICD9$first <- gsub('-', '', ICD9$first) ICD10$first <- gsub('-', '', ICD10$first) 
ICD9$first <- substr(ICD9$first,1,6) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings which(nchar(ICD10$first) > 6) which(nchar(ICD9$first) > 6) # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD9$type <- 'ICD9' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) dat <- rbind(dat, ICD9) subset <- dat[order(dat$first),] subset <- subset[-which(nchar(subset$first) < 6),] subset$year <- substr(subset$first, 1,4) # subset <- subset[-which(subset$first < 199001),] subset <-subset[-which(duplicated(subset$SampleID)),] Colorectal <- subset write.csv(Colorectal, 'path/...', row.names = F) ######################################################################################## ### Prostate cancer library(readxl) library(tidyverse) phen <- read_excel('path/...') phen <- as.data.frame(phen) phen <- phen[-which(phen$Rob_include %in% 'N'),] phen$code <- gsub('.{2}$', '', phen$code) patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- sub[c(1,4,3)] names(sub) <- c('SampleID', 'code', 'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen, 'path/...', row.names = F) ICD9 <- read.csv('path/...') ICD10 <- read.csv('path/...') ICD10 <- ICD10[grep('C61', ICD10$code),] ICD10$first <- gsub('-', '', ICD10$first) # ICD9$first <- substr(ICD9$first,1,6) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings which(nchar(ICD10$first) > 6) which(nchar(ICD9$first) > 6) # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) subset <- dat[order(dat$first),] subset <- 
subset[-which(nchar(subset$first) < 6),] subset$year <- substr(subset$first, 1,4) # subset <- subset[-which(subset$first < 199001),] subset <-subset[-which(duplicated(subset$SampleID)),] Prostate <- subset write.csv(Prostate, 'path/...', row.names = F) ######################################################################################## ### ALS library(readxl) clinical <- read.csv('path/...') phen <- read_excel('path/...') phen <- as.data.frame(phen) phen <- phen[which(phen$Rob_include %in% 'Y'),] phen$code <- gsub('.{2}$', '', phen$code) patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- sub[c(1,4,3)] names(sub) <- c('SampleID', 'code', 'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen, 'path/...', row.names = F) ICD9 <- read.csv('path/...') ICD10 <- read.csv('path/...') ICD10 <- ICD10[grep('G122', ICD10$code),] ICD9 <- ICD9[grep('3352', ICD9$code),] ICD9$first <- gsub('-', '', ICD9$first) ICD10$first <- gsub('-', '', ICD10$first) ICD9$first <- substr(ICD9$first,1,6) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings which(nchar(ICD10$first) > 6) which(nchar(ICD9$first) > 6) # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD9$type <- 'ICD9' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) dat <- rbind(dat, ICD9) subset <- dat[order(dat$first),] subset <- subset[-which(nchar(subset$first) < 6),] subset$year <- substr(subset$first, 1,4) # subset <- subset[-which(subset$first < 199001),] subset <-subset[-which(duplicated(subset$SampleID)),] ALS <- subset write.csv(ALS, 'path/...', row.names = F) 
######################################################################################## ### AD library(readxl) clinical <- read.csv('path/...') phen <- read_excel('path/...') phen <- as.data.frame(phen) phen <- phen[which(phen$Rob_include %in% 'Y'),] phen$code <- gsub('.{2}$', '', phen$code) patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- sub[c(1,4,3)] names(sub) <- c('SampleID', 'code', 'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen, 'path/...', row.names = F) ICD9 <- read.csv('path/...') ICD10 <- read.csv('path/...') ICD10_1 <- ICD10[grep('G30', ICD10$code),] ICD10_2 <- ICD10[grep('F00', ICD10$code),] ICD10 <- rbind(ICD10_1, ICD10_2) # ICD9$first <- gsub('-', '', ICD9$first) ICD10$first <- gsub('-', '', ICD10$first) # ICD9$first <- substr(ICD9$first,1,6) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings which(nchar(ICD10$first) > 6) # which(nchar(ICD9$first) > 6) # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD9$type <- 'ICD9' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) # dat <- rbind(dat, ICD9) subset <- dat[order(dat$first),] subset <- subset[-which(nchar(subset$first) < 6),] subset$year <- substr(subset$first, 1,4) # subset <- subset[-which(subset$first < 199001),] subset <-subset[-which(duplicated(subset$SampleID)),] AL <- subset write.csv(AL, 'path/...', row.names = F) ######################################################################################## ### VD library(readxl) clinical <- read.csv('path/...') phen <- read_excel('path/...') phen <- as.data.frame(phen) phen <- phen[which(phen$Rob_include %in% 'Y'),] phen$code <- gsub('.{2}$', '', phen$code) 
patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- sub[c(1,4,3)] names(sub) <- c('SampleID', 'code', 'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen, 'path/...', row.names = F) ICD9 <- read.csv('path/...') ICD10 <- read.csv('path/...') ICD10 <- ICD10[grep('F01', ICD10$code),] # ICD9$first <- gsub('-', '', ICD9$first) ICD10$first <- gsub('-', '', ICD10$first) # ICD9$first <- substr(ICD9$first,1,6) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings which(nchar(ICD10$first) > 6) # which(nchar(ICD9$first) > 6) # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD9$type <- 'ICD9' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) # dat <- rbind(dat, ICD9) subset <- dat[order(dat$first),] subset <- subset[-which(nchar(subset$first) < 6),] subset$year <- substr(subset$first, 1,4) # subset <- subset[-which(subset$first < 199001),] subset <-subset[-which(duplicated(subset$SampleID)),] VD <- subset write.csv(VD, 'path/...', row.names = F) ######################################################################################## ### SCZ library(readxl) clinical <- read.csv('path/...') phen <- read_excel('path/...') phen <- as.data.frame(phen) phen <- phen[which(phen$Rob_include %in% 'Y'),] phen$code <- gsub('.{2}$', '', phen$code) patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- sub[c(1,4,3)] names(sub) <- c('SampleID', 'code', 'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- 
NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen, 'path/...', row.names = F) ICD9 <- read.csv('path/...') ICD10 <- read.csv('path/...') ICD10_1 <- ICD10[grep('F20', ICD10$code),] ICD10_2 <- ICD10[grep('F21', ICD10$code),] ICD10_3 <- ICD10[grep('F22', ICD10$code),] ICD10_4 <- ICD10[grep('F23', ICD10$code),] ICD10 <- rbind(ICD10_1, ICD10_2) ICD10 <- rbind(ICD10, ICD10_3) ICD10 <- rbind(ICD10, ICD10_4) # ICD9 <- ICD9[grep('295', ICD9$code),] # ICD9$first <- gsub('-', '', ICD9$first) ICD10$first <- gsub('-', '', ICD10$first) # ICD9$first <- substr(ICD9$first,1,6) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings which(nchar(ICD10$first) > 6) # which(nchar(ICD9$first) > 6) # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD9$type <- 'ICD9' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) # dat <- rbind(dat, ICD9) subset <- dat[order(dat$first),] subset <- subset[-which(nchar(subset$first) < 6),] subset$year <- substr(subset$first, 1,4) subset <- subset[-which(subset$first < 197001),] subset <-subset[-which(duplicated(subset$SampleID)),] SCZ <- subset write.csv(SCZ, 'path/...', row.names = F) ######################################################################################## ### RA library(readxl) phen <- read_excel('path/...') phen <- as.data.frame(phen) phen <- phen[which(phen$Rob_include %in% 'Y'),] phen$code <- gsub('.{2}$', '', phen$code) patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- sub[c(1,4,3)] names(sub) <- c('SampleID', 'code', 'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen, 'path/...', row.names = F) ICD9 <- read.csv('path/...') ICD10 <- 
read.csv('path/...') ICD10_1 <- ICD10[grep('J99', ICD10$code),] ICD10_2 <- ICD10[grep('M05', ICD10$code),] ICD10_3 <- ICD10[grep('M06', ICD10$code),] ICD10 <- rbind(ICD10_1, ICD10_2) ICD10 <- rbind(ICD10, ICD10_3) ICD9 <- ICD9[grep('7140', ICD9$code),] ICD9$first <- gsub('-', '', ICD9$first) ICD10$first <- gsub('-', '', ICD10$first) ICD9$first <- substr(ICD9$first,1,6) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings which(nchar(ICD10$first) > 6) which(nchar(ICD9$first) > 6) # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD9$type <- 'ICD9' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) dat <- rbind(dat, ICD9) subset <- dat[order(dat$first),] subset <- subset[-which(nchar(subset$first) < 6),] subset$year <- substr(subset$first, 1,4) subset <- subset[-which(subset$first < 197001),] subset <-subset[-which(duplicated(subset$SampleID)),] RA <- subset write.csv(RA, 'path/...', row.names = F) ######################################################################################## ### IHD library(readxl) phen <- read_excel('path/...') phen <- as.data.frame(phen) phen <- phen[which(phen$Rob_include %in% 'Y'),] phen$code <- gsub('.{2}$', '', phen$code) patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- sub[c(1,4,3)] names(sub) <- c('SampleID', 'code', 'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen, 'path/...', row.names = F) ICD9 <- read.csv('path/...') ICD10 <- read.csv('path/...') ICD10_patterns <- c('I250', 'I251', 'I253', 'I254', 'I255', 'I256', 'I258', 'I259') ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code)) ICD9_patterns <- c('440', '441', '437', 
'411', '414') ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code)) ICD9 <- ICD9[-which(ICD9$code %in% c('2411', '9411', '8411')),] ICD9$first <- gsub('-', '', ICD9$first) ICD10$first <- gsub('-', '', ICD10$first) ICD9$first <- substr(ICD9$first,1,6) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings which(nchar(ICD10$first) > 6) which(nchar(ICD9$first) > 6) # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD9$type <- 'ICD9' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) dat <- rbind(dat, ICD9) subset <- dat[order(dat$first),] subset <- subset[-which(nchar(subset$first) < 6),] subset$year <- substr(subset$first, 1,4) subset <- subset[-which(subset$first < 197001),] subset <-subset[-which(duplicated(subset$SampleID)),] IHD <- subset write.csv(IHD, 'path/...', row.names = F) ######################################################################################## ### PD library(readxl) phen <- read_excel('path/...') phen <- as.data.frame(phen) phen <- phen[which(phen$Rob_include %in% 'Y'),] phen$code <- gsub('.{2}$', '', phen$code) patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- sub[c(1,4,3)] names(sub) <- c('SampleID', 'code', 'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen, 'path/...', row.names = F) ICD9 <- read.csv('path/...') ICD10 <- read.csv('path/...') ICD10_patterns <- c('F023', 'G20') ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code)) ICD10$first <- gsub('-', '', ICD10$first) # ICD9$first <- substr(ICD9$first,1,6) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings 
which(nchar(ICD10$first) > 6) which(nchar(ICD9$first) > 6) # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD9$type <- 'ICD9' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) # dat <- rbind(dat, ICD9) subset <- dat[order(dat$first),] subset$year <- substr(subset$first, 1,4) subset <- subset[-which(subset$first < 197001),] subset <-subset[-which(duplicated(subset$SampleID)),] PD <- subset write.csv(PD, 'path/...', row.names = F) ######################################################################################## ### MS phen <- read_excel('path/...') phen <- as.data.frame(phen) phen <- phen[which(phen$Rob_include %in% 'Y'),] phen$code <- gsub('.{2}$', '', phen$code) patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- sub[c(1,4,3)] names(sub) <- c('SampleID', 'code', 'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen, 'path/...', row.names = F) ICD9 <- read.csv('path/...') ICD10 <- read.csv('path/...') ICD10_patterns <- c('G35') ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code)) ICD9_patterns <- c('3409') ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code)) # ICD9 <- ICD9[-which(ICD9$code %in% c('2411', '9411', '8411')),] ICD9$first <- gsub('-', '', ICD9$first) ICD10$first <- gsub('-', '', ICD10$first) ICD9$first <- substr(ICD9$first,1,6) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings which(nchar(ICD10$first) > 6) which(nchar(ICD9$first) > 6) # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD9$type <- 'ICD9' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) dat <- rbind(dat, ICD9) subset <- 
dat[order(dat$first),] subset <- subset[-which(nchar(subset$first) < 6),] subset$year <- substr(subset$first, 1,4) subset <- subset[-which(subset$first < 197001),] subset <-subset[-which(duplicated(subset$SampleID)),] MS <- subset write.csv(MS, 'path/...', row.names = F) ######################################################################################## ### LUP phen <- read_excel('path/...') phen <- as.data.frame(phen) phen <- phen[which(phen$Rob_include %in% 'Y'),] phen$code <- gsub('.{2}$', '', phen$code) patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- sub[c(1,4,3)] names(sub) <- c('SampleID', 'code', 'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen, 'path/...', row.names = F) ICD9 <- read.csv('path/...') ICD10 <- read.csv('path/...') ICD10_patterns <- c('L93', 'M32') ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code)) ICD9_patterns <- c('7100', '6954') ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code)) # ICD9 <- ICD9[-which(ICD9$code %in% c('2411', '9411', '8411')),] ICD9$first <- gsub('-', '', ICD9$first) ICD10$first <- gsub('-', '', ICD10$first) ICD9$first <- substr(ICD9$first,1,6) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings which(nchar(ICD10$first) > 6) which(nchar(ICD9$first) > 6) # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD9$type <- 'ICD9' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) dat <- rbind(dat, ICD9) subset <- dat[order(dat$first),] subset <- subset[-which(nchar(subset$first) < 6),] subset$year <- substr(subset$first, 1,4) subset <- subset[-which(subset$first < 197001),] subset 
<-subset[-which(duplicated(subset$SampleID)),] LUP <- subset write.csv(LUP, 'path/...', row.names = F) ######################################################################################## ### COPD phen <- read_excel('path/...') phen <- as.data.frame(phen) phen <- phen[which(phen$Rob_include %in% 'Y'),] phen$code <- gsub('.{2}$', '', phen$code) patterns <- phen$code sub <- clinical[which(clinical$read_2 %in% phen$code),] data.frame(table(sub$read_2)) sub <- sub[c(1,4,3)] names(sub) <- c('SampleID', 'code', 'first') sub$first <- sub('/', '', sub$first) sub$first <- sub('/', '', sub$first) sub$y <- substr(sub$first,5,8) sub$m <- substr(sub$first,3,4) sub$first <- paste0(sub$y, sub$m) sub$y <- NULL sub$m <- NULL save <- phen[which(phen$code %in% clinical$read_2),] write.csv(phen, 'path/...', row.names = F) ICD9 <- read.csv('path/...') ICD10 <- read.csv('path/...') ICD10_patterns <- c('J41', 'J42', 'J44', 'J43') ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code)) ICD9_patterns <- c('4929', '491') ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code)) ICD9 <- ICD9[-which(ICD9$code %in% c('74929')),] ICD9$first <- gsub('-', '', ICD9$first) ICD10$first <- gsub('-', '', ICD10$first) ICD9$first <- substr(ICD9$first,1,6) ICD10$first <- substr(ICD10$first,1,6) # check to ensure dates have correct number of characters in strings which(nchar(ICD10$first) > 6) which(nchar(ICD9$first) > 6) # Check ordering of dates to spot any odd formats sub <- sub[order(sub$first),] sub$type <- 'GP' ICD9$type <- 'ICD9' ICD10$type <- 'ICD10' dat <- rbind(sub, ICD10) dat <- rbind(dat, ICD9) subset <- dat[order(dat$first),] subset <- subset[-which(nchar(subset$first) < 6),] subset$year <- substr(subset$first, 1,4) subset <- subset[-which(subset$first < 197001),] subset <-subset[-which(duplicated(subset$SampleID)),] COPD <- subset write.csv(COPD, 'path/...', row.names = F) ######################################################################################## 
### ENDO

# Read the curated GP code list and keep only the codes marked for inclusion
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
phen <- phen[which(phen$Rob_include %in% 'Y'),]
# Strip the last two characters of every code before matching against read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code

# GP records on the list are tabulated for inspection only; the GP steps
# further down are commented out, so this outcome is built from ICD records
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')

# Keep rows whose code contains any of the (unanchored) patterns
ICD10_patterns <- c('N80')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))

ICD9_patterns <- c('617')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))

# Convert yyyy-mm-dd to yyyymm
ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats
# sub <- sub[order(sub$first),]
# sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(ICD9, ICD10)
# dat <- rbind(dat, ICD9)

# Order by date so the first row kept per person is the earliest record
subset <- dat[order(dat$first),]
# subset <- subset[-which(nchar(subset$first) < 6),]
subset$year <- substr(subset$first, 1,4)
# subset <- subset[-which(subset$first < 199001),]

# Keep the first record per participant. A logical mask is used because
# x[-which(cond), ] returns zero rows when no row is a duplicate.
subset <- subset[!duplicated(subset$SampleID),]

ENDO <- subset
write.csv(ENDO, 'path/...', row.names = FALSE)

########################################################################################

### CYS

phen <- read.csv('path/...')
phen <- as.data.frame(phen)
# phen <- phen[which(phen$Rob_include %in% 'Y'),]
# Strip the last two characters of every code before matching against read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code

# GP records whose read_2 code is on the list
sub <- clinical[which(clinical$read_2 %in% phen$code),]
# Frequency of each matched code (inspection only; the result is not stored)
data.frame(table(sub$read_2))

# Keep columns 1, 4 and 3 of the GP table and rename them
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')

# Remove the two '/' separators, then rebuild the date as yyyymm
# (assumes GP dates are dd/mm/yyyy - the substr positions depend on it)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL

# NOTE(review): `save` (list codes that occur in `clinical`) is computed but
# `phen` is the table written out - confirm which one was intended
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')

ICD10_patterns <- c('N301', 'N302')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))

ICD9_patterns <- c('5951', '5952')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))

# Convert yyyy-mm-dd to yyyymm
ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats
sub <- sub[order(sub$first),]

# Tag each source and stack GP + ICD10 + ICD9 records
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)

subset <- dat[order(dat$first),]
subset$year <- substr(subset$first, 1,4)
# subset <- subset[-which(subset$first < 197001),]

# Keep the first record per participant (logical mask: safe when there are
# no duplicates, unlike x[-which(cond), ])
subset <- subset[!duplicated(subset$SampleID),]

CYS <- subset
write.csv(CYS, 'path/...', row.names = FALSE)

########################################################################################

### IBD

# Read the curated GP code list and keep only the codes marked for inclusion
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
phen <- phen[which(phen$Rob_include %in% 'Y'),]
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code

sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))

sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')

# Rebuild the GP date as yyyymm (assumes dd/mm/yyyy input)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL

# NOTE(review): `save` is computed but `phen` is written - confirm intent
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')

ICD10_patterns <- c('K50', 'K51')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))

# Only the ICD10 dates are reformatted here; the ICD9 steps are commented out
# ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
# ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats
sub <- sub[order(sub$first),]

# Tag each source; ICD9 is tagged but not merged (its rbind is commented out)
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
# dat <- rbind(dat, ICD9)

# Order by date so the first row kept per person is the earliest record
subset <- dat[order(dat$first),]

# Drop dates shorter than six characters. Rows are removed only where the
# test is TRUE: x[-which(cond), ] would return zero rows whenever nothing
# matched, because -integer(0) selects no rows.
too_short <- nchar(subset$first) < 6
subset <- subset[!(too_short %in% TRUE),]
subset$year <- substr(subset$first, 1,4)

# Drop records dated before January 1970
too_early <- subset$first < 197001
subset <- subset[!(too_early %in% TRUE),]

# Keep the first (earliest) record per participant
subset <- subset[!duplicated(subset$SampleID),]

IBD <- subset
write.csv(IBD, 'path/...', row.names = FALSE)

########################################################################################

### Stroke

library(readxl)

phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# phen <- phen[which(phen$Rob_include %in% 'Y'),]
# Strip the last two characters of every code before matching against read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code

# GP records whose read_2 code is on the list
sub <- clinical[which(clinical$read_2 %in% phen$code),]
# Frequency of each matched code (inspection only; the result is not stored)
data.frame(table(sub$read_2))

# Keep columns 1, 4 and 3 of the GP table and rename them
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')

# Remove the two '/' separators, then rebuild the date as yyyymm
# (assumes GP dates are dd/mm/yyyy - the substr positions depend on it)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL

# NOTE(review): `save` (list codes that occur in `clinical`) is computed but
# `phen` is the table written out - confirm which one was intended
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')

library(tidyverse)

# Keep rows whose code contains any of the (unanchored) patterns
ICD10_patterns <- c('I63', 'I69')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))

ICD9_patterns <- c('433', '434', '435')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))
# Exclude codes that only matched because the patterns are unanchored.
# Logical mask: keeps all rows when none of these codes is present.
ICD9 <- ICD9[!(ICD9$code %in% c('V433', '74339', 'V434')),]

# Convert yyyy-mm-dd to yyyymm
ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats
sub <- sub[order(sub$first),]

# Tag each source and stack GP + ICD10 + ICD9 records
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)

# Order by date so the first row kept per person is the earliest record
subset <- dat[order(dat$first),]
subset$year <- substr(subset$first, 1,4)
# subset <- subset[-which(subset$first < 199001),]

# Keep the first record per participant. A logical mask is used because
# x[-which(cond), ] returns zero rows when no row is a duplicate
# (-integer(0) selects nothing).
subset <- subset[!duplicated(subset$SampleID),]

ST <- subset
write.csv(ST, 'path/...', row.names = FALSE)

########################################################################################

### Lung cancer

library(readxl)

phen <- read.csv('path/...')
phen <- as.data.frame(phen)
# phen <- phen[which(phen$Rob_include %in% 'Y'),]
# Strip the last two characters of every code before matching against read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code

# GP records whose read_2 code is on the list
sub <- clinical[which(clinical$read_2 %in% phen$code),]
# Frequency of each matched code (inspection only; the result is not stored)
data.frame(table(sub$read_2))

# Keep columns 1, 4 and 3 of the GP table and rename them
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')

# Remove the two '/' separators, then rebuild the date as yyyymm
# (assumes GP dates are dd/mm/yyyy - the substr positions depend on it)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL

# NOTE(review): `save` (list codes that occur in `clinical`) is computed but
# `phen` is the table written out - confirm which one was intended
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')

library(tidyverse)

# Keep rows whose code contains any of the (unanchored) patterns
ICD10_patterns <- c('C33', 'C34')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))

ICD9_patterns <- c('1629')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))

# Convert yyyy-mm-dd to yyyymm
ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats
sub <- sub[order(sub$first),]

sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)

subset <- dat[order(dat$first),]
# subset <- subset[-which(nchar(subset$first) < 6),]
subset$year <- substr(subset$first, 1,4)
# subset <- subset[-which(subset$first < 199001),]

# Keep the first record per participant (safe when there are no duplicates)
subset <- subset[!duplicated(subset$SampleID),]

LUNG <- subset
write.csv(LUNG, 'path/...', row.names = FALSE)

########################################################################################

### Brain/CNS cancer

library(readxl)

phen <- read.csv('path/...')
phen <- as.data.frame(phen)
# phen <- phen[which(phen$Rob_include %in% 'Y'),]
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code

sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))

sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')

# Rebuild the GP date as yyyymm (assumes dd/mm/yyyy input)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL

# NOTE(review): `save` is computed but `phen` is written - confirm intent
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')

library(tidyverse)

ICD10_patterns <- c('C70', 'C71', 'C72', 'C75')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))

# ICD9 is filtered here but every later ICD9 step in this section is
# commented out, so it is not merged into the outcome
ICD9_patterns <- c('191')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))

# ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
# ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
# which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats
sub <- sub[order(sub$first),]

sub$type <- 'GP'
# ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
# dat <- rbind(dat, ICD9)

subset <- dat[order(dat$first),]
# subset <- subset[-which(nchar(subset$first) < 6),]
subset$year <- substr(subset$first, 1,4)
# subset <- subset[-which(subset$first < 199001),]

# Keep the first record per participant (safe when there are no duplicates)
subset <- subset[!duplicated(subset$SampleID),]
BRAIN <- subset
write.csv(BRAIN, 'path/...', row.names = FALSE)

########################################################################################

### GYNC - cervical x2, ovarian and uterine

phen <- read.csv('path/...')
phen <- as.data.frame(phen)
# phen <- phen[which(phen$Rob_include %in% 'Y'),]
# Strip the last two characters of every code before matching against read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code

# GP records whose read_2 code is on the list
sub <- clinical[which(clinical$read_2 %in% phen$code),]
# Frequency of each matched code (inspection only; the result is not stored)
data.frame(table(sub$read_2))

# Keep columns 1, 4 and 3 of the GP table and rename them
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')

# Remove the two '/' separators, then rebuild the date as yyyymm
# (assumes GP dates are dd/mm/yyyy - the substr positions depend on it)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL

# NOTE(review): `save` (list codes that occur in `clinical`) is computed but
# `phen` is the table written out - confirm which one was intended
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')

# Keep rows whose code contains any of the (unanchored) patterns
ICD10_patterns <- c('C54', 'C55', 'C56', 'D06', 'N87', 'C53', 'C51', 'C52', 'C57')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))

# NOTE(review): '5951' and '5952' are the ICD9 patterns used in the CYS
# section; '5952' is excluded again just below but '5951' is not - confirm
# they belong in this list
ICD9_patterns <- c('5951', '5952', '179', '180', '182', '183', '184')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))
# Exclude unwanted codes picked up by the unanchored patterns. A logical mask
# is used because x[-which(cond), ] returns zero rows when nothing matches.
ICD9 <- ICD9[!(ICD9$code %in% c('E91799' ,'2179', '6179', '71842', 'E8179', '6180', '7179', '71828', '71831', '4179', '71839', '6184', 'E91795', 'E9179', 'E91794', 'E8182', '5952', '69180', '7183')),]

# Convert yyyy-mm-dd to yyyymm
ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats
sub <- sub[order(sub$first),]

# Tag each source and stack GP + ICD10 + ICD9 records
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)

# Order by date so the first row kept per person is the earliest record
subset <- dat[order(dat$first),]

# Drop dates shorter than six characters; rows are removed only where the
# test is TRUE, so an empty match (or an NA) no longer empties the table
too_short <- nchar(subset$first) < 6
subset <- subset[!(too_short %in% TRUE),]
subset$year <- substr(subset$first, 1,4)

# Drop records dated before January 1970
too_early <- subset$first < 197001
subset <- subset[!(too_early %in% TRUE),]

# Keep the first (earliest) record per participant
subset <- subset[!duplicated(subset$SampleID),]

GYN <- subset
write.csv(GYN,'path/...', row.names = FALSE)

# Join Urology outcomes together (i.e. prostate or testicular cancer for men)
pros <- read.csv('path/...')
test <- read.csv('path/...')

###############################################################################################

##### MORTALITY DATA

###############################################################################################

dat <- read.csv('path/...')
t <- read.csv('path/...')
proteins <- read.csv('path/...')

# Mortality
dat <- read.csv("/home/dgadd/PPP_core_input_files/parquet_54189_260722.csv")

# Select the fields by name and rename them through an explicit mapping.
# Selecting with `colnames(dat) %in% ...` keeps the file's column order, so
# assigning names by position would mislabel columns if that order differed.
field_names <- c(
  'f.eid' = 'SampleID',
  'f.31.0.0' = 'Sex',
  'f.34.0.0' = 'YOB',
  'f.52.0.0' = 'MOB',
  'f.21022.0.0' = 'Age_recruitment',
  'f.40000.0.0' = 'DOD',
  'f.40007.0.0' = 'Age_death'
)
d1 <- dat[, names(field_names)]
names(d1) <- unname(field_names)

# Restrict to participants present in the protein data
d1 <- d1[which(d1$SampleID %in% proteins$SampleID),]

# Participants with a recorded date of death, as yyyymm, ordered by date
death <- d1[c('SampleID', 'DOD')]
death <- na.omit(death)
death$DOD <- gsub('-', '', death$DOD)
names(death) <- c('SampleID', 'first')
death <- death[order(death$first),]
death$first <- substr(death$first,1,6)

DEATH <- death # 4580
write.csv(DEATH, 'path/...', row.names = FALSE)

###############################################################################################

##### SELF REPORT

###############################################################################################

# Extract instances of self-report for diseases from verbal interview
# use instancing 0 which is baseline verbal interview
# Cancer: https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=20001
# Non-cancer: https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=20002

library(tidyverse)

self <- read.csv('path/...')
proteins <- read.csv('path/...')
names(self)[1] <- 'SampleID'
self <- self[which(self$SampleID %in% proteins$SampleID),]

# Reshape a wide table (SampleID followed by code columns) into one row per
# non-missing code, ordered by participant and then by column.
# This replaces a per-row loop that removed NA codes with x[-which(...), ]:
# a participant with every slot filled had no NAs, so -integer(0) dropped all
# of their codes, and any error was hidden behind cat("skipped").
stack_codes <- function(wide) {
  code_cols <- wide[, -1, drop = FALSE]
  long <- data.frame(
    SampleID = rep(wide$SampleID, each = ncol(code_cols)),
    # transpose so the values are read participant by participant
    code = as.vector(base::t(as.matrix(code_cols))),
    stringsAsFactors = FALSE
  )
  long[!is.na(long$code), ]
}

## Cancers
# Subset linkage to get cancer specific codes at baseline
non <- self[grep("20001", colnames(self))]
non <- cbind(self[,1], non)
names(non)[1] <- 'SampleID'
# SampleID plus the first six matched columns
non <- non[c(1:7)]

non <- stack_codes(non)
write.csv(non, 'path/...', row.names = FALSE)

## Non-cancers
non <- self[grep("20002", colnames(self))]
non <- cbind(self[,1], non)
names(non)[1] <- 'SampleID'
# SampleID plus the first 34 matched columns
non <- non[c(1:35)]

non <- stack_codes(non)
write.csv(non, 'path/...', row.names = FALSE)

########################################

## Save out self-report based on the indexes created for each disease

non <- read.csv('path/...')
can <- read.csv('path/...')

# Self-report cancer codes grouped by outcome
list_BRAIN <- c('1032', '1033')
list_LUNG <- c('1001', '1027', '1028')
list_GYN <- c('1039', '1040', '1041', '1042', '1043')
list_Breast <- c('1002')
list_Colorectal <- c('1019', '1020', '1021', '1022', '1023')
list_TEST <- c('1045')
list_Prostate <- c('1044')

# list_cancer and list1 are parallel: entry i of list1 names entry i of list_cancer
list_cancer <- list(list_BRAIN, list_Breast, list_Colorectal, list_GYN, list_LUNG, list_Prostate, list_TEST)
list1 <- c('BRAIN', 'Breast', 'Colorectal', 'GYN', 'LUNG', 'Prostate', 'TEST')

# Write one file per cancer outcome holding the matching self-report rows
for(i in seq_along(list_cancer)){
  codes <- list_cancer[[i]]
  trait <- as.character(list1[i])
  data <- can[which(can$code %in% codes),]
  print(trait)
  print(dim(data))
  write.csv(data, paste0('path/...', trait, '.csv'), row.names = FALSE)
}

# Self-report non-cancer codes grouped by outcome (continued below)
list_AL <- c('1263')
list_ALS <- c('1259')
list_COPD <- c('1112', '1113', '1114')
# Remaining self-report non-cancer codes, grouped by outcome
list_CYS  <- c('1514')
list_Dep  <- c('1286', '1291', '1531')
list_Diab <- c('1220', '1221', '1222', '1223', '1521')
list_ENDO <- c('1402')
list_IBD  <- c('1463', '1462')
list_IHD  <- c('1492', '1490', '1079', '1075', '1076')
list_LIV  <- c('1604', '1158')
list_LUP  <- c('1381')
list_MS   <- c('1261')
list_PCO  <- c('1350')
list_PD   <- c('1262')
list_RA   <- c('1464')
list_SCZ  <- c('1289')
list_ST   <- c('1583')
list_VD   <- c('1263')

# list_codes and list2 are parallel: entry i of list2 names entry i of list_codes
list_codes <- list(
  list_AL, list_ALS, list_COPD, list_CYS, list_Dep, list_Diab,
  list_ENDO, list_IBD, list_IHD, list_LIV, list_LUP, list_MS,
  list_PCO, list_PD, list_RA, list_SCZ, list_ST, list_VD
)
list2 <- c(
  'AL', 'ALS', 'COPD', 'CYS', 'Dep', 'Diab',
  'ENDO', 'IBD', 'IHD', 'LIV', 'LUP', 'MS',
  'PCO', 'PD', 'RA', 'SCZ', 'ST', 'VD'
)

# Write one file per outcome holding the self-report rows whose code is in
# that outcome's list; the trait name and row/column counts are printed
for (i in seq_along(list_codes)) {
  trait <- as.character(list2[i])
  codes <- list_codes[[i]]
  data <- non[non$code %in% codes, ]
  print(trait)
  print(dim(data))
  write.csv(data, paste0('path/...', trait, '.csv'), row.names = FALSE)
}