############################################################################################ ### Incident disease summary table ############################################################################################ # srun -p interactive --pty bash library(tidyverse) # Set disease and read in disease codes for the iteration (array) diseases <- c('AL', 'ALS', 'BRAIN', 'Breast', 'Colorectal', 'COPD', 'CYS', 'Dep', 'Diab', 'ENDO', 'GYN', 'IBD', 'IHD', 'LIV', 'LUNG', 'LUP', 'MS', 'PD', 'Prostate', 'RA', 'SCZ', 'ST', 'VD') files <- list.files('/path.../Tables/', '.csv') location <- 'path.../Tables/' # Read in and extract relevant info (i.e. smaller files minus protein data) for(i in 1:length(files)){ name <- as.character(diseases[i]) print(name) cox <- read.csv(paste0(location, name, '.csv')) data <- cox[,which(colnames(cox) %in% c('SampleID', 'pseudo_ind_id', "Age_assessment", "Sex", "dead", "aged", "Event", "age_death", "age_at_event", "tte", 'BMI', 'Dep', 'Alc', 'Smo', 'Edu'))] data$Outcome <- name write.csv(data, paste0('path.../', name, '.csv'), row.names = F) print(dim(cox)) cox = NULL data = NULL } # Read subset files back in as list and check max tte location_new <- 'path...' results <- list() files <- list.files(location_new, '.csv') for(i in 1:length(files)){ name <- files[i] name <- gsub("\\..*", "", name) results[[i]] <- read.csv(paste0(location_new, name, '.csv')) print(i) } bind <- do.call(rbind, results) max <- max(bind$tte, na.rm = T) # 15.25 max years of follow up over cases and controls ### Get the cases, controls and mean tte for each trait - basic model output <- matrix(nrow = 1*length(files), ncol = 4) output <- as.data.frame(output) j=c(1:length(files)) for(i in 1:length(files)){ df <- results[[i]] df <- as.data.frame(df) join <- df # remove missing covariates # join <- join[!is.na(join$Alc), ] # join <- join[!is.na(join$Smo), ] # join <- join[!is.na(join$Dep), ] # join <- join[!is.na(join$Edu), ] # join <- join[!is.na(join$BMI), ] df <- join # generate metrics required cases <- df[which(df$Event == "1"),] case <- cases %>% filter(!tte == 'NA') case_count <- nrow(case) control <- df[which(df$Event == "0"),] control_count <- nrow(control) mean_tte <- mean(case$tte, na.rm = T) mean_tte <- round(mean_tte, digits = 1) sd_tte <- sd(case$tte, na.rm = T) sd_tte <- round(sd_tte, digits = 1) mean_sd <- paste0(mean_tte, " ", "(", sd_tte, ")") name_dis <- unique(df$Outcome) # output metrics output[j[i],1] <- name_dis output[j[i],2] <- case_count output[j[i],3] <- control_count output[j[i],4] <- mean_sd names(output) <- c("Disease", "Cases", "Controls", "Mean time to event") } # order by cases low to high sort <- output[order(output$Cases),] write.csv(sort, file = 'path.../summary_N_basic_table.csv', row.names =F) ### Get the cases, controls and mean tte for each trait - basic model output <- matrix(nrow = 1*length(files), ncol = 4) output <- as.data.frame(output) j=c(1:length(files)) for(i in 1:length(files)){ df <- results[[i]] df <- as.data.frame(df) join <- df # remove missing covariates join <- join[!is.na(join$Alc), ] join <- join[!is.na(join$Smo), ] join <- join[!is.na(join$Dep), ] join <- join[!is.na(join$Edu), ] join <- join[!is.na(join$BMI), ] df <- join # generate metrics required cases <- df[which(df$Event == "1"),] case <- cases %>% filter(!tte == 'NA') case_count <- nrow(case) control <- df[which(df$Event == "0"),] control_count <- nrow(control) mean_tte <- mean(case$tte, na.rm = T) mean_tte <- round(mean_tte, digits = 1) sd_tte <- sd(case$tte, na.rm = T) sd_tte <- round(sd_tte, digits = 1) mean_sd <- paste0(mean_tte, " ", "(", sd_tte, ")") name_dis <- unique(df$Outcome) # output metrics output[j[i],1] <- name_dis output[j[i],2] <- case_count output[j[i],3] <- control_count output[j[i],4] <- mean_sd names(output) <- c("Disease", "Cases", "Controls", "Mean time to event") } # order by cases low to high sort <- output[order(output$Cases),] write.csv(sort, file = 'path.../summary_N_full_table.csv', row.names =F) ### JOIN THEM TOGETHER basic <- read.csv('path.../summary_N_basic_table.csv') full <- read.csv('path.../summary_N_full_table.csv') names(basic) <- c("Disease", "Cases basic", "Controls basic", "Mean time to event basic") names(full) <- c("Disease", "Cases full", "Controls full", "Mean time to event full") join <- left_join(basic, full, by = "Disease") write.csv(join, "path.../joint.csv", row.names = F) ### ADD DEATH d <- read.csv("path.../DEATH.csv") cox <- d data <- cox[,which(colnames(cox) %in% c('SampleID', 'pseudo_ind_id', "Age_assessment", "Sex", "dead", "aged", "Event", "age_death", "age_at_event", "tte", 'BMI', 'Dep', 'Alc', 'Smo', 'Edu'))] data$Outcome <- name write.csv(data, 'path.../DEATH.csv', row.names = F) output <- matrix(nrow = 1, ncol = 4) output <- as.data.frame(output) join <- data # remove missing covariates join <- join[!is.na(join$Alc), ] join <- join[!is.na(join$Smo), ] join <- join[!is.na(join$Dep), ] join <- join[!is.na(join$Edu), ] join <- join[!is.na(join$BMI), ] df <- join # generate metrics required cases <- df[which(df$Event == "1"),] case <- cases %>% filter(!tte == 'NA') case_count <- nrow(case) control <- df[which(df$Event == "0"),] control_count <- nrow(control) mean_tte <- mean(case$tte, na.rm = T) mean_tte <- round(mean_tte, digits = 1) sd_tte <- sd(case$tte, na.rm = T) sd_tte <- round(sd_tte, digits = 1) mean_sd <- paste0(mean_tte, " ", "(", sd_tte, ")") name_dis <- unique(df$Outcome) # output metrics output[1,1] <- 'DEATH' output[1,2] <- case_count output[1,3] <- control_count output[1,4] <- mean_sd names(output) <- c("Disease", "Cases", "Controls", "Mean time to event")