05_Health_linkage_prep_GP_diseases.R · UKB-project

###############################################################################################

## Health linkage preps for cox

###############################################################################################

# srun -p interactive --pty bash

# module load R

# R

###############################################################################################

### PREP CODE INTEGRATION

# Restrict the GP clinical records to the Olink participants and map the
# Read v3 codes of data provider 3 onto Read v2, so that every record carries
# a read_2 code.
# dplyr supplies %>%, filter(), select() and left_join() used below; it is
# attached here because nothing earlier in this script loads it.
library(dplyr)

olink_internal <- read.csv('path/...')

clinical <- read.delim('path/...') # 123662721   
names(clinical)[1] <- 'SampleID'
# Keep only records belonging to participants listed in olink_internal
clinical <- clinical[which(clinical$SampleID %in% olink_internal$SampleID),] # 13334281 

reg <- read.delim('path/...')

# TPP is data provider 3 - it is the only one to have Read v3 codes
# Scotland (EMIS/Vision), Wales (EMIS/Vision) and England (Vision) all have Read v2
# TPP last updated Jun 2016 (i.e. censor date)

### Read files for conversions
# NOTE(review): a, b, c, e and f are not used in the visible part of this
# script, and d is read again below - confirm whether these reads are needed.

a <- read.csv('path/...')
b <- read.csv('path/...')
c <- read.csv('path/...')
d <- read.csv('path/...')
e <- read.csv('path/...')
f <- read.csv('path/...')

### Conversion from r3 to r2
r3 <- clinical %>% filter(data_provider == 3)
d <- read.csv('path/...')
d <- d %>% select(c('READV2_CODE', 'READV3_CODE'))
names(d) <- c('read_2', 'read_3')
# Drop column 4 before the join (presumably the existing read_2 column - TODO
# confirm), so that read_2 comes from the lookup table instead.
r3 <- r3[c(1:3,5)]
# NOTE(review): a read_3 code that appears on several rows of d will duplicate
# the matching clinical rows - confirm the lookup is one row per read_3.
r3 <- left_join(r3, d, by = 'read_3')
# Put the joined read_2 column back in position 4, ahead of read_3
r3 <- r3[c(1,2,3,5,4)]
clinical <- clinical[c(1:5)]
clinical <- clinical %>% filter(data_provider %in% c('1','2','4'))
clinical <- rbind(clinical, r3)
write.csv(clinical, 'path/...', row.names = FALSE)

########################################################################################

### Load in and prep ICD codes
# Each of the four tables below is the participant ID column (column 1 of sec)
# followed by every column of sec whose name contains the given field number.
sec <- read.csv('path/...')
IDs <- sec[1]

# Bind the ID column to the columns matching one field number and label the
# ID column 'SampleID'.
field_with_ids <- function(field_id) {
  field_cols <- sec[grep(field_id, colnames(sec))]
  out <- cbind(IDs, field_cols)
  names(out)[1] <- 'SampleID'
  out
}

ICD10 <- field_with_ids('41270')
ICD10_dat <- field_with_ids('41280')
ICD9 <- field_with_ids('41271')
ICD9_dat <- field_with_ids('41281')

# ICD10
# Reshape the wide ICD10 table (one row per participant) into a long table
# with one row per recorded code and the columns SampleID, code, first.
# ICD10 holds the codes and ICD10_dat the matching dates; both have SampleID
# in column 1. Only the first 243 columns of each table are read.
icd10_rows <- vector('list', nrow(ICD10))

for(i in seq_len(nrow(ICD10))){
  tryCatch({ 
    person <- ICD10[i,1:243]
    dates <- ICD10_dat[i,1:243]
    # Transpose the single row into one column; its first entry is the ID
    person <- t(person)
    dates <- t(dates)
    person <- as.data.frame(person)
    dates <- as.data.frame(dates)
    person$SampleID <- person[1,1]
    dates$SampleID <- dates[1,1]
    names(person)[1] <- 'code'
    names(dates)[1] <- 'first'
    row.names(person) <- NULL
    row.names(dates) <- NULL
    # Drop empty slots with a logical mask: x[-which(cond), ] would select
    # zero rows whenever no slot is NA.
    person <- person[!is.na(person$code),]
    dates <- dates[!is.na(dates$first),]
    person <- person[c(2,1)]
    # Remove the leading row that holds the ID rather than a code/date
    person <- person[-1,]
    dates <- dates[-1,]
    # Codes and dates are paired by position, so the counts must agree;
    # otherwise the assignment below could recycle dates onto the wrong codes.
    if (nrow(person) != nrow(dates)) {
      stop("number of codes (", nrow(person), ") and dates (", nrow(dates), ") differ")
    }
    person$first <- dates$first
    print(i)
    icd10_rows[[i]] <- person
  }, error = function(e) message("skipped row ", i, ": ", conditionMessage(e)))
}

# Rows that were skipped stay NULL and are ignored by rbind
ICD10 <- do.call(rbind, icd10_rows)
write.csv(ICD10,'path/...', row.names = FALSE)

# ICD9
# Reshape the wide ICD9 table (one row per participant) into a long table
# with one row per recorded code and the columns SampleID, code, first.
# ICD9 holds the codes and ICD9_dat the matching dates; both have SampleID
# in column 1. Only the first 48 columns of each table are read.
icd9_rows <- vector('list', nrow(ICD9))

for(i in seq_len(nrow(ICD9))){
  tryCatch({ 
    person <- ICD9[i,1:48]
    dates <- ICD9_dat[i,1:48]
    # Transpose the single row into one column; its first entry is the ID
    person <- t(person)
    dates <- t(dates)
    person <- as.data.frame(person)
    dates <- as.data.frame(dates)
    person$SampleID <- person[1,1]
    dates$SampleID <- dates[1,1]
    names(person)[1] <- 'code'
    names(dates)[1] <- 'first'
    row.names(person) <- NULL
    row.names(dates) <- NULL
    # Drop empty slots with a logical mask: x[-which(cond), ] would select
    # zero rows whenever no slot is NA.
    person <- person[!is.na(person$code),]
    dates <- dates[!is.na(dates$first),]
    person <- person[c(2,1)]
    # Remove the leading row that holds the ID rather than a code/date
    person <- person[-1,]
    dates <- dates[-1,]
    # Codes and dates are paired by position, so the counts must agree;
    # otherwise the assignment below could recycle dates onto the wrong codes.
    if (nrow(person) != nrow(dates)) {
      stop("number of codes (", nrow(person), ") and dates (", nrow(dates), ") differ")
    }
    person$first <- dates$first
    print(i)
    icd9_rows[[i]] <- person
  }, error = function(e) message("skipped row ", i, ": ", conditionMessage(e)))
}

# Rows that were skipped stay NULL and are ignored by rbind
ICD9 <- do.call(rbind, icd9_rows)
write.csv(ICD9, 'path/...', row.names = FALSE)

########################################################################################

### TYPE 2 DIABETES

# Type 2 diabetes: earliest record per participant across GP (read_2), ICD9
# and ICD10 sources, with dates reduced to a 6-character yyyymm string.
library(readxl)
clinical <- read.csv('path/...')
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Drop codes flagged 'N'. A logical mask is used because x[-which(cond), ]
# selects zero rows when nothing matches.
phen <- phen[!(phen$Rob_include %in% 'N'),]
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))

# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL

# NOTE(review): 'save' (phen codes present in the GP data) is built but 'phen'
# is what gets written - confirm which was intended.
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

# ICD10 for diabetes is E11 E12
# ICD9 for diabetes is 250, 25000, 25001, 25009, 2501, 25010, 2503, 2504, 2505, 2509
# NOTE(review): list1 is not used below, and list2 does not match the ICD9
# codes listed in the comment above exactly - confirm.
list1 <- c('E11.1', 'E11.2', 'E11.3')
list2 <- c('2500', '25000', '25010', '2503', '2504', '2505', '2509')
ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_1 <- ICD10[grep('E11', ICD10$code),]
ICD10_2 <- ICD10[grep('E12', ICD10$code),]
ICD10 <- rbind(ICD10_1, ICD10_2)
ICD9 <- ICD9[which(ICD9$code %in% list2),]
# Remove '-' and keep the first six characters
# (yyyymm if the input is yyyy-mm-dd - TODO confirm)
ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)
sub <- sub[which(nchar(sub$first) == 6),]

# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)
subset_diab <- dat[order(dat$first),]
library(ggplot2)
# Drop records dated before 197001; rows with a missing date are kept
subset_diab <- subset_diab[!((subset_diab$first < 197001) %in% TRUE),]
# Rows are ordered by date, so this keeps the earliest record per participant
subset_diab <- subset_diab[!duplicated(subset_diab$SampleID),]
Diab <- subset_diab
write.csv(Diab, 'path/...', row.names = FALSE)

########################################################################################

### MDD

# MDD: earliest record per participant across GP (read_2) and ICD10 sources,
# with dates reduced to a 6-character yyyymm string.
library(readxl)
clinical <- read.csv('path/...')
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Drop codes flagged 'N'. A logical mask is used because x[-which(cond), ]
# selects zero rows when nothing matches.
phen <- phen[!(phen$Rob_include %in% 'N'),]
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL
# NOTE(review): 'save' is built but 'phen' is what gets written - confirm
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen,'path/...', row.names = FALSE)
ICD10 <- read.csv('path/...')
ICD10 <- ICD10[grep('F33', ICD10$code),]
# Remove '-' and keep the first six characters
ICD10$first <- gsub('-', '', ICD10$first)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
# GP rows whose rebuilt date is not six characters long (kept for inspection)
sb <- sub[!((nchar(sub$first) == 6) %in% TRUE),] # These are missing dates -remove 
sub <- sub[which(nchar(sub$first) == 6),]

# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
subset <- dat[order(dat$first),]
subset$year <- substr(subset$first, 1,4)
# Drop records dated before 197001; rows with a missing date are kept
subset <- subset[!((subset$first < 197001) %in% TRUE),]
# Rows are ordered by date, so this keeps the earliest record per participant
subset <- subset[!duplicated(subset$SampleID),]
library(ggplot2)
Dep <- subset
write.csv(Dep, 'path/...', row.names = FALSE)

########################################################################################

### LIV FIB/CIR

# Liver fibrosis/cirrhosis: earliest record per participant across GP
# (read_2), ICD9 and ICD10 sources, with dates reduced to yyyymm strings.
library(readxl)
clinical <- read.csv('path/...')
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Drop codes flagged 'N'. A logical mask is used because x[-which(cond), ]
# selects zero rows when nothing matches.
phen <- phen[!(phen$Rob_include %in% 'N'),]
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))

# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL
# NOTE(review): 'save' is built but 'phen' is what gets written - confirm
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_1 <- ICD10[grep('K70', ICD10$code),]
ICD10_2 <- ICD10[grep('K71', ICD10$code),]
ICD10_3 <- ICD10[grep('K74', ICD10$code),]
ICD10 <- rbind(ICD10_1, ICD10_2)
ICD10 <- rbind(ICD10, ICD10_3)
# '571' is matched anywhere in the code, so 'V571' is excluded explicitly
ICD9 <- ICD9[grep('571', ICD9$code),]
ICD9 <- ICD9[!(ICD9$code %in% 'V571'),]
# Remove '-' and keep the first six characters
ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
subset$year <- substr(subset$first, 1,4)
# Drop dates shorter than six characters; rows with a missing date are kept
subset <- subset[!((nchar(subset$first) < 6) %in% TRUE),]

# subset <- subset[-which(subset$first < 199001),] 
# Rows are ordered by date, so this keeps the earliest record per participant
subset <- subset[!duplicated(subset$SampleID),]
LIV <- subset
write.csv(LIV, 'path/...', row.names = FALSE)

########################################################################################

### Breast cancer

# Breast cancer: earliest record per participant across GP (read_2), ICD9
# and ICD10 sources, with dates reduced to yyyymm strings.
# Uses the 'clinical' table already loaded earlier in the script.
library(readxl)
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Drop codes flagged 'N'. A logical mask is used because x[-which(cond), ]
# selects zero rows when nothing matches.
phen <- phen[!(phen$Rob_include %in% 'N'),]
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))

# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL

# NOTE(review): 'save' is built but 'phen' is what gets written - confirm
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)


ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_1 <- ICD10[grep('C50', ICD10$code),]
ICD10_2 <- ICD10[grep('D05', ICD10$code),]
ICD10 <- rbind(ICD10_1, ICD10_2)
# '174' is matched anywhere in the code, so '7174' is excluded explicitly
ICD9 <- ICD9[grep('174', ICD9$code),]
ICD9 <- ICD9[!(ICD9$code %in% '7174'),]
# Remove '-' and keep the first six characters
ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]

sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
# Drop dates shorter than six characters; rows with a missing date are kept
subset <- subset[!((nchar(subset$first) < 6) %in% TRUE),]
subset$year <- substr(subset$first, 1,4)
# Drop records dated before 197001
subset <- subset[!((subset$first < 197001) %in% TRUE),]
# Rows are ordered by date, so this keeps the earliest record per participant
subset <- subset[!duplicated(subset$SampleID),]
Breast <- subset
write.csv(Breast, 'path/...', row.names = FALSE)

########################################################################################

### Colorectal cancer

# Colorectal cancer: earliest record per participant across GP (read_2),
# ICD9 and ICD10 sources, with dates reduced to yyyymm strings.
# Uses the 'clinical' table already loaded earlier in the script.
library(readxl)
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL
# NOTE(review): 'save' is built but 'phen' is what gets written - confirm
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_1 <- ICD10[grep('C18', ICD10$code),]
ICD10_2 <- ICD10[grep('C19', ICD10$code),]
ICD10_3 <- ICD10[grep('C20', ICD10$code),]
ICD10_4 <- ICD10[grep('C21', ICD10$code),]
ICD10 <- rbind(ICD10_1, ICD10_2)
ICD10 <- rbind(ICD10, ICD10_3)
ICD10 <- rbind(ICD10, ICD10_4)
ICD9_1 <- ICD9[grep('153', ICD9$code),]
ICD9_2 <- ICD9[grep('154', ICD9$code),]
ICD9 <- rbind(ICD9_1, ICD9_2)
# The patterns match anywhere in the code, so these codes are excluded
# explicitly. A logical mask is used because x[-which(cond), ] selects zero
# rows when nothing matches.
ICD9 <- ICD9[!(ICD9$code %in% c('71536', '71535', '71537')),]
# Remove '-' and keep the first six characters
ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]

sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
# Drop dates shorter than six characters; rows with a missing date are kept
subset <- subset[!((nchar(subset$first) < 6) %in% TRUE),]
subset$year <- substr(subset$first, 1,4)

# subset <- subset[-which(subset$first < 199001),] 
# Rows are ordered by date, so this keeps the earliest record per participant
subset <- subset[!duplicated(subset$SampleID),]
Colorectal <- subset
write.csv(Colorectal, 'path/...', row.names = FALSE)

########################################################################################

### Prostate cancer

# Prostate cancer: earliest record per participant across GP (read_2) and
# ICD10 sources, with dates reduced to yyyymm strings.
# Uses the 'clinical' table already loaded earlier in the script.
library(readxl)
library(tidyverse)
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Drop codes flagged 'N'. A logical mask is used because x[-which(cond), ]
# selects zero rows when nothing matches.
phen <- phen[!(phen$Rob_include %in% 'N'),]
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL
# NOTE(review): 'save' is built but 'phen' is what gets written - confirm
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

# NOTE(review): ICD9 is loaded but never filtered or merged in this section
ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10 <- ICD10[grep('C61', ICD10$code),]
# Remove '-' and keep the first six characters
ICD10$first <- gsub('-', '', ICD10$first)
# ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
# ICD9 dates are not shortened in this section, so this check runs on the raw values
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
subset <- dat[order(dat$first),]
# Drop dates shorter than six characters; rows with a missing date are kept
subset <- subset[!((nchar(subset$first) < 6) %in% TRUE),]
subset$year <- substr(subset$first, 1,4)

# subset <- subset[-which(subset$first < 199001),] 
# Rows are ordered by date, so this keeps the earliest record per participant
subset <- subset[!duplicated(subset$SampleID),]
Prostate <- subset
write.csv(Prostate, 'path/...', row.names = FALSE)

########################################################################################

### ALS

# ALS: earliest record per participant across GP (read_2), ICD9 and ICD10
# sources, with dates reduced to yyyymm strings.
library(readxl)
clinical <- read.csv('path/...')
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Keep only codes flagged 'Y'
phen <- phen[which(phen$Rob_include %in% 'Y'),]
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL
# NOTE(review): 'save' is built but 'phen' is what gets written - confirm
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10 <- ICD10[grep('G122', ICD10$code),]
ICD9 <- ICD9[grep('3352', ICD9$code),]
# Remove '-' and keep the first six characters
ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
# Drop dates shorter than six characters; rows with a missing date are kept.
# A logical mask is used because x[-which(cond), ] selects zero rows when
# nothing matches.
subset <- subset[!((nchar(subset$first) < 6) %in% TRUE),]
subset$year <- substr(subset$first, 1,4)

# subset <- subset[-which(subset$first < 199001),] 
# Rows are ordered by date, so this keeps the earliest record per participant
subset <- subset[!duplicated(subset$SampleID),]
ALS <- subset
write.csv(ALS, 'path/...', row.names = FALSE)

########################################################################################

### AD

# AD: earliest record per participant across GP (read_2) and ICD10 sources,
# with dates reduced to yyyymm strings. ICD9 is loaded but not merged.
library(readxl)
clinical <- read.csv('path/...')
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Keep only codes flagged 'Y'
phen <- phen[which(phen$Rob_include %in% 'Y'),]
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL
# NOTE(review): 'save' is built but 'phen' is what gets written - confirm
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_1 <- ICD10[grep('G30', ICD10$code),]
ICD10_2 <- ICD10[grep('F00', ICD10$code),]
ICD10 <- rbind(ICD10_1, ICD10_2)

# ICD9$first <- gsub('-', '', ICD9$first)
# Remove '-' and keep the first six characters
ICD10$first <- gsub('-', '', ICD10$first)
# ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)
# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
# which(nchar(ICD9$first) > 6)
# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
# dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
# Drop dates shorter than six characters; rows with a missing date are kept.
# A logical mask is used because x[-which(cond), ] selects zero rows when
# nothing matches.
subset <- subset[!((nchar(subset$first) < 6) %in% TRUE),]
subset$year <- substr(subset$first, 1,4)

# subset <- subset[-which(subset$first < 199001),] 
# Rows are ordered by date, so this keeps the earliest record per participant
subset <- subset[!duplicated(subset$SampleID),]
AL <- subset
write.csv(AL, 'path/...', row.names = FALSE)

########################################################################################

### VD

# VD: earliest record per participant across GP (read_2) and ICD10 sources,
# with dates reduced to yyyymm strings. ICD9 is loaded but not merged.
library(readxl)
clinical <- read.csv('path/...')
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Keep only codes flagged 'Y'
phen <- phen[which(phen$Rob_include %in% 'Y'),]
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL
# NOTE(review): 'save' is built but 'phen' is what gets written - confirm
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10 <- ICD10[grep('F01', ICD10$code),]
# ICD9$first <- gsub('-', '', ICD9$first)
# Remove '-' and keep the first six characters
ICD10$first <- gsub('-', '', ICD10$first)
# ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)
# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
# which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
# dat <- rbind(dat, ICD9)

subset <- dat[order(dat$first),]
# Drop dates shorter than six characters; rows with a missing date are kept.
# A logical mask is used because x[-which(cond), ] selects zero rows when
# nothing matches.
subset <- subset[!((nchar(subset$first) < 6) %in% TRUE),]
subset$year <- substr(subset$first, 1,4)
# subset <- subset[-which(subset$first < 199001),] 
# Rows are ordered by date, so this keeps the earliest record per participant
subset <- subset[!duplicated(subset$SampleID),]
VD <- subset
write.csv(VD, 'path/...', row.names = FALSE)

########################################################################################

### SCZ

# SCZ: earliest record per participant across GP (read_2) and ICD10 sources,
# with dates reduced to yyyymm strings. ICD9 is loaded but not merged.
library(readxl)
clinical <- read.csv('path/...')
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Keep only codes flagged 'Y'
phen <- phen[which(phen$Rob_include %in% 'Y'),]
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL
# NOTE(review): 'save' is built but 'phen' is what gets written - confirm
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)


ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_1 <- ICD10[grep('F20', ICD10$code),]
ICD10_2 <- ICD10[grep('F21', ICD10$code),]
ICD10_3 <- ICD10[grep('F22', ICD10$code),]
ICD10_4 <- ICD10[grep('F23', ICD10$code),]
ICD10 <- rbind(ICD10_1, ICD10_2)
ICD10 <- rbind(ICD10, ICD10_3)
ICD10 <- rbind(ICD10, ICD10_4)

# ICD9 <- ICD9[grep('295', ICD9$code),]
# ICD9$first <- gsub('-', '', ICD9$first)
# Remove '-' and keep the first six characters
ICD10$first <- gsub('-', '', ICD10$first)
# ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)
# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
# which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
# dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
# Drop dates shorter than six characters; rows with a missing date are kept.
# A logical mask is used because x[-which(cond), ] selects zero rows when
# nothing matches.
subset <- subset[!((nchar(subset$first) < 6) %in% TRUE),]
subset$year <- substr(subset$first, 1,4)
# Drop records dated before 197001
subset <- subset[!((subset$first < 197001) %in% TRUE),]
# Rows are ordered by date, so this keeps the earliest record per participant
subset <- subset[!duplicated(subset$SampleID),]
SCZ <- subset
write.csv(SCZ, 'path/...', row.names = FALSE)

########################################################################################

### RA

# RA: earliest record per participant across GP (read_2), ICD9 and ICD10
# sources, with dates reduced to yyyymm strings.
# Uses the 'clinical' table already loaded earlier in the script.
library(readxl)
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Keep only codes flagged 'Y'
phen <- phen[which(phen$Rob_include %in% 'Y'),]
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL
# NOTE(review): 'save' is built but 'phen' is what gets written - confirm
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_1 <- ICD10[grep('J99', ICD10$code),]
ICD10_2 <- ICD10[grep('M05', ICD10$code),]
ICD10_3 <- ICD10[grep('M06', ICD10$code),]
ICD10 <- rbind(ICD10_1, ICD10_2)
ICD10 <- rbind(ICD10, ICD10_3)
ICD9 <- ICD9[grep('7140', ICD9$code),]
# Remove '-' and keep the first six characters
ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
# Drop dates shorter than six characters; rows with a missing date are kept.
# A logical mask is used because x[-which(cond), ] selects zero rows when
# nothing matches.
subset <- subset[!((nchar(subset$first) < 6) %in% TRUE),]
subset$year <- substr(subset$first, 1,4)
# Drop records dated before 197001
subset <- subset[!((subset$first < 197001) %in% TRUE),]
# Rows are ordered by date, so this keeps the earliest record per participant
subset <- subset[!duplicated(subset$SampleID),]
RA <- subset
write.csv(RA, 'path/...', row.names = FALSE)

########################################################################################

### IHD

# IHD: earliest record per participant across GP (read_2), ICD9 and ICD10
# sources, with dates reduced to yyyymm strings.
# Uses the 'clinical' table already loaded earlier in the script; filter()
# relies on the tidyverse attached earlier in the script.
library(readxl)
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Keep only codes flagged 'Y'
phen <- phen[which(phen$Rob_include %in% 'Y'),]
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL
# NOTE(review): 'save' is built but 'phen' is what gets written - confirm
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_patterns <- c('I250', 'I251', 'I253', 'I254', 'I255', 'I256', 'I258', 'I259')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))
ICD9_patterns <- c('440', '441', '437', '411', '414')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))
# The patterns match anywhere in the code, so these codes are excluded
# explicitly. A logical mask is used because x[-which(cond), ] selects zero
# rows when nothing matches.
ICD9 <- ICD9[!(ICD9$code %in% c('2411', '9411', '8411')),]
# Remove '-' and keep the first six characters
ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
# Drop dates shorter than six characters; rows with a missing date are kept
subset <- subset[!((nchar(subset$first) < 6) %in% TRUE),]
subset$year <- substr(subset$first, 1,4)
# Drop records dated before 197001
subset <- subset[!((subset$first < 197001) %in% TRUE),]
# Rows are ordered by date, so this keeps the earliest record per participant
subset <- subset[!duplicated(subset$SampleID),]
IHD <- subset
write.csv(IHD, 'path/...', row.names = FALSE)

########################################################################################

### PD

# PD: earliest record per participant across GP (read_2) and ICD10 sources,
# with dates reduced to yyyymm strings. ICD9 is loaded but not merged.
# Uses the 'clinical' table already loaded earlier in the script; filter()
# relies on the tidyverse attached earlier in the script.
library(readxl)
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Keep only codes flagged 'Y'
phen <- phen[which(phen$Rob_include %in% 'Y'),]
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL
# NOTE(review): 'save' is built but 'phen' is what gets written - confirm
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_patterns <- c('F023', 'G20')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))
# Remove '-' and keep the first six characters
ICD10$first <- gsub('-', '', ICD10$first)
# ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
# ICD9 dates are not shortened in this section, so this check runs on the raw values
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
# dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
subset$year <- substr(subset$first, 1,4)
# Drop records dated before 197001; rows with a missing date are kept.
# A logical mask is used because x[-which(cond), ] selects zero rows when
# nothing matches.
subset <- subset[!((subset$first < 197001) %in% TRUE),]
# Rows are ordered by date, so this keeps the earliest record per participant
subset <- subset[!duplicated(subset$SampleID),]
PD <- subset
write.csv(PD, 'path/...', row.names = FALSE)


########################################################################################

### MS

# MS: earliest record per participant across GP (read_2), ICD9 and ICD10
# sources, with dates reduced to yyyymm strings.
# Uses 'clinical', readxl and the tidyverse loaded earlier in the script.
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Keep only codes flagged 'Y'
phen <- phen[which(phen$Rob_include %in% 'Y'),]
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL
# NOTE(review): 'save' is built but 'phen' is what gets written - confirm
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_patterns <- c('G35')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))
ICD9_patterns <- c('3409')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))
# ICD9 <- ICD9[-which(ICD9$code %in% c('2411', '9411', '8411')),]
# Remove '-' and keep the first six characters
ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)
# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'

dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
# Drop dates shorter than six characters; rows with a missing date are kept.
# A logical mask is used because x[-which(cond), ] selects zero rows when
# nothing matches.
subset <- subset[!((nchar(subset$first) < 6) %in% TRUE),]
subset$year <- substr(subset$first, 1,4)
# Drop records dated before 197001
subset <- subset[!((subset$first < 197001) %in% TRUE),]
# Rows are ordered by date, so this keeps the earliest record per participant
subset <- subset[!duplicated(subset$SampleID),]
MS <- subset
write.csv(MS, 'path/...', row.names = FALSE)

########################################################################################

### LUP

# LUP: earliest record per participant across GP (read_2), ICD9 and ICD10
# sources, with dates reduced to yyyymm strings.
# Uses 'clinical', readxl and the tidyverse loaded earlier in the script.
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Keep only codes flagged 'Y'
phen <- phen[which(phen$Rob_include %in% 'Y'),]
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL
# NOTE(review): 'save' is built but 'phen' is what gets written - confirm
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_patterns <- c('L93', 'M32')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))
ICD9_patterns <- c('7100', '6954')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))
# ICD9 <- ICD9[-which(ICD9$code %in% c('2411', '9411', '8411')),]
# Remove '-' and keep the first six characters
ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)
# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
# Drop dates shorter than six characters; rows with a missing date are kept.
# A logical mask is used because x[-which(cond), ] selects zero rows when
# nothing matches.
subset <- subset[!((nchar(subset$first) < 6) %in% TRUE),]
subset$year <- substr(subset$first, 1,4)
# Drop records dated before 197001
subset <- subset[!((subset$first < 197001) %in% TRUE),]
# Rows are ordered by date, so this keeps the earliest record per participant
subset <- subset[!duplicated(subset$SampleID),]
LUP <- subset
write.csv(LUP, 'path/...', row.names = FALSE)

########################################################################################

### COPD

# COPD: earliest record per participant across GP (read_2), ICD9 and ICD10
# sources, with dates reduced to yyyymm strings.
# Uses 'clinical', readxl and the tidyverse loaded earlier in the script.
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
# Keep only codes flagged 'Y'
phen <- phen[which(phen$Rob_include %in% 'Y'),]
# Strip the last two characters of every code before matching on read_2
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))

# Columns 1, 4 and 3 of clinical become SampleID, code and first
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
# Remove the first two '/' and rebuild the date as yyyymm
# (assumes the input is dd/mm/yyyy - TODO confirm)
sub$first <- sub('/', '', sub$first)
sub$first <- sub('/', '', sub$first)
sub$y <- substr(sub$first,5,8)
sub$m <- substr(sub$first,3,4)
sub$first <- paste0(sub$y, sub$m)
sub$y <- NULL
sub$m <- NULL
# NOTE(review): 'save' is built but 'phen' is what gets written - confirm
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_patterns <- c('J41', 'J42', 'J44', 'J43')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))
ICD9_patterns <- c('4929', '491')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))
# The patterns match anywhere in the code, so '74929' is excluded explicitly.
# A logical mask is used because x[-which(cond), ] selects zero rows when
# nothing matches.
ICD9 <- ICD9[!(ICD9$code %in% c('74929')),]
# Remove '-' and keep the first six characters
ICD9$first <- gsub('-', '', ICD9$first)
ICD10$first <- gsub('-', '', ICD10$first)
ICD9$first <- substr(ICD9$first,1,6)
ICD10$first <- substr(ICD10$first,1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats 
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
# Drop dates shorter than six characters; rows with a missing date are kept
subset <- subset[!((nchar(subset$first) < 6) %in% TRUE),]
subset$year <- substr(subset$first, 1,4)
# Drop records dated before 197001
subset <- subset[!((subset$first < 197001) %in% TRUE),]
# Rows are ordered by date, so this keeps the earliest record per participant
subset <- subset[!duplicated(subset$SampleID),]
COPD <- subset
write.csv(COPD, 'path/...', row.names = FALSE)


########################################################################################

### ENDO

# ENDO: keep, for each participant, the ICD-9 / ICD-10 record with the earliest
# YYYYMM value in `first`. GP records are tabulated below for inspection but
# are not merged into the outcome in this section.
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
phen <- phen[which(phen$Rob_include %in% 'Y'),]
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))

# Hospital codes: substring (unanchored) match on the ICD patterns.
ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_patterns <- c('N80')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))
ICD9_patterns <- c('617')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))
ICD9$first <- substr(gsub('-', '', ICD9$first),1,6)
ICD10$first <- substr(gsub('-', '', ICD10$first),1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats
# (GP rows, the short-date filter and the pre-1990 filter are disabled here.)
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(ICD9, ICD10)
subset <- dat[order(dat$first),]
subset$year <- substr(subset$first, 1,4)
# `!duplicated()` instead of `-which(duplicated())`: when no participant has a
# repeat record, `-which(...)` is `integer(0)` and would drop every row.
subset <- subset[!duplicated(subset$SampleID),]
ENDO <- subset
write.csv(ENDO, 'path/...', row.names = FALSE)

########################################################################################

### CYS

# CYS: combine GP (Read v2) records with ICD-9 / ICD-10 records and keep, for
# each participant, the record with the earliest YYYYMM value in `first`.

# GP code list: every row is used (the Rob_include filter is disabled here);
# the last two characters of each code are stripped before matching.
phen <- read.csv('path/...')
phen <- as.data.frame(phen)
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Keep columns 1, 4 and 3 and rebuild the date as YYYYMM.
# Assumes the date column is formatted dd/mm/yyyy - TODO confirm.
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
sub$first <- gsub('/', '', sub$first, fixed = TRUE)
sub$first <- paste0(substr(sub$first,5,8), substr(sub$first,3,4))
# NOTE(review): `save` (the codes actually present in clinical) is computed but
# the full `phen` table is what gets written - confirm which was intended.
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

# Hospital codes: substring (unanchored) match on the ICD patterns.
ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_patterns <- c('N301', 'N302')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))
ICD9_patterns <- c('5951', '5952')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))
ICD9$first <- substr(gsub('-', '', ICD9$first),1,6)
ICD10$first <- substr(gsub('-', '', ICD10$first),1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats
# (The pre-1970 date filter is disabled for this trait.)
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
subset$year <- substr(subset$first, 1,4)

# `!duplicated()` instead of `-which(duplicated())`: when no participant has a
# repeat record, `-which(...)` is `integer(0)` and would drop every row.
subset <- subset[!duplicated(subset$SampleID),]
CYS <- subset
write.csv(CYS, 'path/...', row.names = FALSE)

########################################################################################

### IBD

# IBD: combine GP (Read v2) records with ICD-10 records and keep, for each
# participant, the record with the earliest YYYYMM value in `first`.
# ICD-9 data are read but not filtered or merged in this section.

# GP code list: keep rows flagged 'Y' and strip the last two characters of each
# code before matching against clinical$read_2.
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
phen <- phen[which(phen$Rob_include %in% 'Y'),]
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Keep columns 1, 4 and 3 and rebuild the date as YYYYMM.
# Assumes the date column is formatted dd/mm/yyyy - TODO confirm.
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
sub$first <- gsub('/', '', sub$first, fixed = TRUE)
sub$first <- paste0(substr(sub$first,5,8), substr(sub$first,3,4))
# NOTE(review): `save` (the codes actually present in clinical) is computed but
# the full `phen` table is what gets written - confirm which was intended.
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

# Hospital codes: substring (unanchored) match on the ICD-10 patterns.
ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_patterns <- c('K50', 'K51')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))
ICD10$first <- substr(gsub('-', '', ICD10$first),1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
subset <- dat[order(dat$first),]
# Rows are dropped with logical masks rather than `-which(...)`: when nothing
# matches, `-which(...)` is `integer(0)` and would remove every row instead of
# none. `%in% TRUE` keeps rows whose condition is NA, as `which()` did.
too_short <- nchar(subset$first) < 6
subset <- subset[!(too_short %in% TRUE),]
subset$year <- substr(subset$first, 1,4)
too_early <- subset$first < 197001
subset <- subset[!(too_early %in% TRUE),]
# Data are sorted by `first`, so the first row per SampleID is the earliest.
subset <- subset[!duplicated(subset$SampleID),]
IBD <- subset
write.csv(IBD, 'path/...', row.names = FALSE)

########################################################################################

### Stroke

library(readxl)
library(tidyverse)

# Stroke: combine GP (Read v2) records with ICD-9 / ICD-10 records and keep,
# for each participant, the record with the earliest YYYYMM value in `first`.

# GP code list: every row is used (the Rob_include filter is disabled here);
# the last two characters of each code are stripped before matching.
phen <- read_excel('path/...')
phen <- as.data.frame(phen)
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Keep columns 1, 4 and 3 and rebuild the date as YYYYMM.
# Assumes the date column is formatted dd/mm/yyyy - TODO confirm.
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
sub$first <- gsub('/', '', sub$first, fixed = TRUE)
sub$first <- paste0(substr(sub$first,5,8), substr(sub$first,3,4))
# NOTE(review): `save` (the codes actually present in clinical) is computed but
# the full `phen` table is what gets written - confirm which was intended.
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

# Hospital codes: substring (unanchored) match on the ICD patterns, followed by
# removal of unrelated codes that the substring match also picks up.
ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_patterns <- c('I63', 'I69')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))
ICD9_patterns <- c('433', '434', '435')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))
# Logical mask instead of `-which(...)`: an empty `which()` result would
# otherwise drop every row.
ICD9 <- ICD9[!(ICD9$code %in% c('V433', '74339', 'V434')),]
ICD9$first <- substr(gsub('-', '', ICD9$first),1,6)
ICD10$first <- substr(gsub('-', '', ICD10$first),1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats
# (The pre-1990 date filter is disabled for this trait.)
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
subset$year <- substr(subset$first, 1,4)

# `!duplicated()` instead of `-which(duplicated())`: when no participant has a
# repeat record, `-which(...)` is `integer(0)` and would drop every row.
subset <- subset[!duplicated(subset$SampleID),]
ST <- subset
write.csv(ST, 'path/...', row.names = FALSE)

########################################################################################

### Lung cancer

library(readxl)
library(tidyverse)

# Lung cancer: combine GP (Read v2) records with ICD-9 / ICD-10 records and
# keep, for each participant, the record with the earliest YYYYMM value in
# `first`.

# GP code list: every row is used (the Rob_include filter is disabled here);
# the last two characters of each code are stripped before matching.
phen <- read.csv('path/...')
phen <- as.data.frame(phen)
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Keep columns 1, 4 and 3 and rebuild the date as YYYYMM.
# Assumes the date column is formatted dd/mm/yyyy - TODO confirm.
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
sub$first <- gsub('/', '', sub$first, fixed = TRUE)
sub$first <- paste0(substr(sub$first,5,8), substr(sub$first,3,4))
# NOTE(review): `save` (the codes actually present in clinical) is computed but
# the full `phen` table is what gets written - confirm which was intended.
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

# Hospital codes: substring (unanchored) match on the ICD patterns.
ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_patterns <- c('C33', 'C34')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))
ICD9_patterns <- c('1629')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))
ICD9$first <- substr(gsub('-', '', ICD9$first),1,6)
ICD10$first <- substr(gsub('-', '', ICD10$first),1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats
# (The short-date and pre-1990 filters are disabled for this trait.)
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
subset$year <- substr(subset$first, 1,4)
# `!duplicated()` instead of `-which(duplicated())`: when no participant has a
# repeat record, `-which(...)` is `integer(0)` and would drop every row.
subset <- subset[!duplicated(subset$SampleID),]
LUNG <- subset
write.csv(LUNG, 'path/...', row.names = FALSE)


########################################################################################

### Brain/CNS cancer

library(readxl)
library(tidyverse)

# Brain/CNS cancer: combine GP (Read v2) records with ICD-10 records and keep,
# for each participant, the record with the earliest YYYYMM value in `first`.
# ICD-9 records are filtered below but not merged in this section.

# GP code list: every row is used (the Rob_include filter is disabled here);
# the last two characters of each code are stripped before matching.
phen <- read.csv('path/...')
phen <- as.data.frame(phen)
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Keep columns 1, 4 and 3 and rebuild the date as YYYYMM.
# Assumes the date column is formatted dd/mm/yyyy - TODO confirm.
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
sub$first <- gsub('/', '', sub$first, fixed = TRUE)
sub$first <- paste0(substr(sub$first,5,8), substr(sub$first,3,4))
# NOTE(review): `save` (the codes actually present in clinical) is computed but
# the full `phen` table is what gets written - confirm which was intended.
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

# Hospital codes: substring (unanchored) match on the ICD patterns.
ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_patterns <- c('C70', 'C71', 'C72', 'C75')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))
ICD9_patterns <- c('191')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))
ICD10$first <- substr(gsub('-', '', ICD10$first),1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)

# Check ordering of dates to spot any odd formats
# (The short-date and pre-1990 filters are disabled for this trait.)
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
subset <- dat[order(dat$first),]
subset$year <- substr(subset$first, 1,4)
# `!duplicated()` instead of `-which(duplicated())`: when no participant has a
# repeat record, `-which(...)` is `integer(0)` and would drop every row.
subset <- subset[!duplicated(subset$SampleID),]
BRAIN <- subset
write.csv(BRAIN, 'path/...', row.names = FALSE)

########################################################################################

### GYNC - cervical x2, ovarian and uterine

# GYN: combine GP (Read v2) records with ICD-9 / ICD-10 records and keep, for
# each participant, the record with the earliest YYYYMM value in `first`.

# GP code list: every row is used (the Rob_include filter is disabled here);
# the last two characters of each code are stripped before matching.
phen <- read.csv('path/...')
phen <- as.data.frame(phen)
phen$code <- gsub('.{2}$', '', phen$code)
patterns <- phen$code
sub <- clinical[which(clinical$read_2 %in% phen$code),]
data.frame(table(sub$read_2))
# Keep columns 1, 4 and 3 and rebuild the date as YYYYMM.
# Assumes the date column is formatted dd/mm/yyyy - TODO confirm.
sub <- sub[c(1,4,3)]
names(sub) <- c('SampleID', 'code', 'first')
sub$first <- gsub('/', '', sub$first, fixed = TRUE)
sub$first <- paste0(substr(sub$first,5,8), substr(sub$first,3,4))
# NOTE(review): `save` (the codes actually present in clinical) is computed but
# the full `phen` table is what gets written - confirm which was intended.
save <- phen[which(phen$code %in% clinical$read_2),]
write.csv(phen, 'path/...', row.names = FALSE)

# Hospital codes: substring (unanchored) match on the ICD patterns, followed by
# removal of unrelated codes that the substring match also picks up.
ICD9 <- read.csv('path/...')
ICD10 <- read.csv('path/...')
ICD10_patterns <- c('C54', 'C55', 'C56', 'D06', 'N87', 'C53', 'C51', 'C52', 'C57')
ICD10 <- filter(ICD10, grepl(paste(ICD10_patterns, collapse="|"), code))
# '5951' and '5952' were removed from this list: they are the ICD-9 patterns of
# the CYS section and are not gynaecological cancer codes, so they let
# non-GYN records into this outcome.
ICD9_patterns <- c('179', '180', '182', '183', '184')
ICD9 <- filter(ICD9, grepl(paste(ICD9_patterns, collapse="|"), code))
# Logical mask instead of `-which(...)`: an empty `which()` result would
# otherwise drop every row.
ICD9 <- ICD9[!(ICD9$code %in% c('E91799' ,'2179', '6179', '71842', 'E8179', '6180', '7179', '71828', '71831', '4179', '71839', '6184', 'E91795', 'E9179', 'E91794', 'E8182', '5952', '69180', '7183')),]
ICD9$first <- substr(gsub('-', '', ICD9$first),1,6)
ICD10$first <- substr(gsub('-', '', ICD10$first),1,6)

# check to ensure dates have correct number of characters in strings
which(nchar(ICD10$first) > 6)
which(nchar(ICD9$first) > 6)

# Check ordering of dates to spot any odd formats
sub <- sub[order(sub$first),]
sub$type <- 'GP'
ICD9$type <- 'ICD9'
ICD10$type <- 'ICD10'
dat <- rbind(sub, ICD10)
dat <- rbind(dat, ICD9)
subset <- dat[order(dat$first),]
# `%in% TRUE` keeps rows whose condition is NA, matching what `which()` did.
too_short <- nchar(subset$first) < 6
subset <- subset[!(too_short %in% TRUE),]
subset$year <- substr(subset$first, 1,4)
too_early <- subset$first < 197001
subset <- subset[!(too_early %in% TRUE),]
# Data are sorted by `first`, so the first row per SampleID is the earliest.
subset <- subset[!duplicated(subset$SampleID),]
GYN <- subset
write.csv(GYN,'path/...', row.names = FALSE)

# Join Urology outcomes together (i.e. prostate or testicular cancer for men)
# Only the two input tables are loaded here; no join of `pros` and `test`
# follows before the mortality section begins.
pros <- read.csv('path/...')
test <- read.csv('path/...')

###############################################################################################

##### MORTALITY DATA 

###############################################################################################

# Mortality: build a table of SampleID and death month (YYYYMM, in `first`) for
# participants present in `proteins`.
t <- read.csv('path/...')
proteins <- read.csv('path/...')

# Mortality
dat <- read.csv("/home/dgadd/PPP_core_input_files/parquet_54189_260722.csv")

# Select and rename columns by name. Positional renaming of an `%in%` selection
# depends on the column order of the file, so a reordered export would be
# silently mislabelled.
# NOTE(review): this mapping reproduces the earlier positional naming only if
# the export lists these fields in ascending field-ID order - confirm.
field_map <- c(SampleID = 'f.eid',
               Sex = 'f.31.0.0',
               YOB = 'f.34.0.0',
               MOB = 'f.52.0.0',
               Age_recruitment = 'f.21022.0.0',
               DOD = 'f.40000.0.0',
               Age_death = 'f.40007.0.0')
stopifnot(all(field_map %in% colnames(dat)))
d1 <- dat[, unname(field_map)]
names(d1) <- names(field_map)
d1 <- d1[which(d1$SampleID %in% proteins$SampleID),]

death <- d1[c('SampleID', 'DOD')]
death <- na.omit(death)
# read.csv leaves blank cells of a character column as "" rather than NA, so
# drop those as well.
death <- death[nzchar(trimws(death$DOD)),]
death$DOD <- gsub('-', '', death$DOD)
names(death) <- c('SampleID', 'first')
# Sort on the full date before truncating to YYYYMM.
death <- death[order(death$first),]
death$first <- substr(death$first,1,6)
DEATH <- death # 4580
write.csv(DEATH, 'path/...', row.names = FALSE)




###############################################################################################

##### SELF REPORT

###############################################################################################

# Extract instances of self-report for diseases from verbal interview
# use instancing 0 which is baseline verbal interview
# Cancer: https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=20001
# Non-cancer: https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=20002

library(tidyverse)

# Reshape the self-report code columns from one row per participant (wide) to
# one row per participant-code pair (long), for the cancer and non-cancer
# fields, and write each long table out.
self <- read.csv('path/...')
proteins <- read.csv('path/...')
names(self)[1] <- 'SampleID'
self <- self[which(self$SampleID %in% proteins$SampleID),]

# Stack a wide table (ID in column 1, one code per remaining column) into a
# long data frame with columns SampleID and code, dropping missing codes.
# Rows stay in participant order, then column order.
#
# This replaces a per-participant loop that removed missing codes with
# `person[-which(person$code %in% NA),]`. For a participant with every slot
# filled, `which()` is empty and that expression drops all rows, so their codes
# were lost. The loop also hid any error behind a "skipped" message.
stack_self_report_codes <- function(wide) {
  # as.matrix() gives a numeric matrix when all code columns are numeric.
  codes <- as.matrix(wide[-1])
  long <- data.frame(
    SampleID = rep(wide[[1]], each = ncol(codes)),
    # Transpose so values are read row by row, matching the repeated IDs.
    code = as.vector(t(codes))
  )
  long[!is.na(long$code),]
}

## Cancers

# Subset linkage to get cancer specific codes at baseline
# The first 6 matching columns are used - presumably the instance-0 arrays;
# confirm against the column order of the export.
cancer_cols <- grep("20001", colnames(self))
can <- stack_self_report_codes(self[c(1, head(cancer_cols, 6))])
write.csv(can, 'path/...', row.names = FALSE)

## Non-cancers
# The first 34 matching columns are used - presumably the instance-0 arrays;
# confirm against the column order of the export.
noncancer_cols <- grep("20002", colnames(self))
non <- stack_self_report_codes(self[c(1, head(noncancer_cols, 34))])
write.csv(non, 'path/...', row.names = FALSE)

########################################

## Save out self-report based on the indexes created for each disease

# Write one CSV per trait containing the self-report rows whose code is in that
# trait's code list. Codes are kept in a named list so each trait name is tied
# to its codes, instead of two parallel vectors matched by position.
non <- read.csv('path/...')
can <- read.csv('path/...')

# Cancer self-report codes, keyed by trait.
list_cancer <- list(
  BRAIN = c('1032', '1033'),
  Breast = c('1002'),
  Colorectal = c('1019', '1020', '1021', '1022', '1023'),
  GYN = c('1039', '1040', '1041', '1042', '1043'),
  LUNG = c('1001', '1027', '1028'),
  Prostate = c('1044'),
  TEST = c('1045')
)
list1 <- names(list_cancer)

for (trait in list1) {
  codes <- list_cancer[[trait]]
  data <- can[which(can$code %in% codes),]
  print(trait)
  print(dim(data))
  write.csv(data, paste0('path/...', trait, '.csv'), row.names = FALSE)
}

# Non-cancer self-report codes, keyed by trait.
# AL and VD share code '1263', so both files receive the same rows.
list_codes <- list(
  AL = c('1263'),
  ALS = c('1259'),
  COPD = c('1112', '1113', '1114'),
  CYS = c('1514'),
  Dep = c('1286', '1291', '1531'),
  Diab = c('1220', '1221', '1222', '1223', '1521'),
  ENDO = c('1402'),
  IBD = c('1463', '1462'),
  IHD = c('1492', '1490', '1079', '1075', '1076'),
  LIV = c('1604', '1158'),
  LUP = c('1381'),
  MS = c('1261'),
  PCO = c('1350'),
  PD = c('1262'),
  RA = c('1464'),
  SCZ = c('1289'),
  ST = c('1583'),
  VD = c('1263')
)
list2 <- names(list_codes)

for (trait in list2) {
  codes <- list_codes[[trait]]
  data <- non[which(non$code %in% codes),]
  print(trait)
  print(dim(data))
  write.csv(data, paste0('path/...', trait, '.csv'), row.names = FALSE)
}