Analyses/stability.R · SherlockLung-EvolutionaryTrajectory-Analysis

library(dplyr)
library(survival)
library(survminer)

# read copy number signature data
cnsigs = read.table("path/to/data/Activities_Sherlock_CN_COSMIC_decomposition_v3.4.txt", header=T, stringsAsFactors=FALSE, sep='\t')
rownames(cnsigs) = cnsigs[,1]

wgd_file = "path/to/data/SherlockLung_WGDStatus.txt"
wgd_status = read.table(wgd_file, stringsAsFactors=FALSE)
rownames(wgd_status) = wgd_status[,1]

msl = read.table("path/to/data/SherlockLung_AllHQ_LUAD_LOH_mergedsegs.txt", header=T, stringsAsFactors = F, sep='\t')
ms = read.table("path/to/data/SherlockLung_AllHQ_LUAD_mergedseg_G3.txt", header=T, stringsAsFactors = F, sep='\t')
msl_compare = apply(msl, 2, function(x) { gsub("^ *", "", x) })
ms_compare = apply(ms[,1:10], 2, function(x) { gsub("^ *", "", x) })
msl_in_ms = apply(msl_compare, 1, function(x) { any(apply(ms_compare, 1, function(y) { identical(x, y) })) })
sLOH_to_add = msl[!msl_in_ms,]
sLOH_to_add = cbind(sLOH_to_add, `ID`=apply(sLOH_to_add, 1, function(x) { paste(paste0("chr", x["chr"]), x["startpos"], x["endpos"], sep="_") }), `no.chr.bearing.mut`=as.character(NA))
mergedseg_extended = rbind(ms, sLOH_to_add)

barcodes = gsub("01_01", "01.01", readLines("path/to/data/AllHQ_LUAD_2025_barcodes.txt"))
all_events = unique(mergedseg_extended[,"ID"])

# generate matrix for linear model
sample_survival_and_events = as.data.frame(t(sapply(barcodes, function(bc) {
  sw = landscape[landscape$Tumor_Barcode == bc,"Survival_Week"]
  sample_events <- mergedseg_extended[mergedseg_extended$Tumour_Name == bc,"ID"]
  event_status = sapply(all_events, function(event) {
    if(event %in% sample_events) {
	  paste0(event, "_pos")
	} else {
	  paste0(event, "_neg")
	}
  })
  out = c(sw, event_status)
  names(out) = c("time", all_events)
  out
})))
sample_survival_and_events[,"time"] = as.numeric(sample_survival_and_events[,"time"])

aber_cn_pct = sapply(barcodes, function(bc) {
  if (wgd_status[bc,2] == "nWGD") { norm_sig = "CN1" } else { norm_sig = "CN2" }
  aber_prop = 1 - (cnsigs[bc,norm_sig]/sum(cnsigs[bc,-1]))
  aber_prop
})

aber_cn_col = aber_cn_pct[rownames(sample_survival_and_events)]
sample_survival_and_events = cbind(sample_survival_and_events, `AberrantCN`=aber_cn_col)

smoking_status = substr(landscape[rownames(sample_survival_and_events),"Smoking"], 1, 1)
sample_survival_and_events_with_smoking = cbind(sample_survival_and_events, `Smoking`=smoking_status)
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$Smoking == "S"),"Smoking"] = "Smoking_pos"
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$Smoking == "N"),"Smoking"] = "Smoking_neg"

anc = landscape[rownames(sample_survival_and_events_with_smoking),"Assigned_Population"]
sample_survival_and_events_with_smoking = cbind(sample_survival_and_events_with_smoking, `Ancestry`=anc)
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$Ancestry == "EUR"),"Ancestry"] = "Ancestry_Eur"
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$Ancestry != "Ancestry_Eur"),"Ancestry"] = "Ancestry_oth"

sex = landscape[rownames(sample_survival_and_events_with_smoking),"Gender"]
sample_survival_and_events_with_smoking = cbind(sample_survival_and_events_with_smoking, `Sex`=sex)
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$Sex == "Male"),"Sex"] = "Sex_mal"
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$Sex == "Female"),"Sex"] = "Sex_fem"

wgd = wgd_status[rownames(sample_survival_and_events_with_smoking),2]
sample_survival_and_events_with_smoking = cbind(sample_survival_and_events_with_smoking, `WGD`=wgd)
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$WGD == "WGD"),"WGD"] = "WGD_pos"
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$WGD == "nWGD"),"WGD"] = "WGD_neg"

pur = sex = landscape[rownames(sample_survival_and_events_with_smoking),"Tumor_Purity"]
sample_survival_and_events_with_smoking = cbind(sample_survival_and_events_with_smoking, `Purity`=pur)

get_plot_from_lm <- function(fit_lm, plot_title, hr_label_mult=0.6, p_label_mult=1.02, xleft_mult=1.05, xright_mult=1.025) {
  fit_summary = summary(fit_lm)
  fit_conf = cbind(exp(coef(fit_lm)), exp(confint(fit_lm)))
  colnames(fit_conf) = c("exp_coef","lower95","upper95")
  plot_y_order = order(fit_conf[-1,'exp_coef'])
  forest_data = as.data.frame(cbind(`event_name`=gsub("dMut_", "", rownames(fit_conf)), fit_conf, `sig`=ifelse(fit_summary$coefficients[,'Pr(>|t|)'] < 0.05 & fit_conf[,'exp_coef'] > 1, "GREATER", "NEITHER"), `pvalue`=fit_summary$coefficients[,'Pr(>|t|)']))
  forest_data[fit_summary$coefficients[,'Pr(>|t|)'] < 0.05 & fit_conf[,'exp_coef'] < 1,'sig'] = "LESS"
  forest_data = forest_data[-1,]
  fdrn = rownames(forest_data)
  with_suffix = grep("_(.{3})$", fdrn, perl=TRUE)
  forest_data$event_name = fdrn
  if (length(with_suffix) > 0) {
    forest_data$event_name[with_suffix] = sapply(gsub("dMut_", "", fdrn[with_suffix]), function(x) { substr(x, 1, (nchar(x)-4)/2) })
  }
  rownames(forest_data) = forest_data$event_name
  forest_data[,2:4] = apply(forest_data[,2:4], 2, as.numeric)
  forest_data[,5] = factor(forest_data[,5], levels=c("LESS","NEITHER","GREATER"))
  forest_data[,1] = factor(forest_data[,1], levels=forest_data[plot_y_order,1])
  forest_data[forest_data[,6] >= 0.001 & forest_data[,6] < 0.01,6] = round(as.numeric(forest_data[forest_data[,6] >= 0.001 & forest_data[,6] < 0.01,6]), 3)
  forest_data[forest_data[,6] >= 0.01,6] = sprintf("%.3f", round(as.numeric(forest_data[forest_data[,6] >= 0.01,6]), 3))
  forest_data[forest_data[,6] < 0.001 & forest_data[,6] != "",6] = "< 0.001"
  hr_label_x = min(forest_data$lower95)*hr_label_mult
  pval_label_x = max(forest_data$upper95)*p_label_mult
  hr_text = paste0(sprintf("%.2f",round(forest_data[,2],2)), "  (", sprintf("%.2f",round(forest_data[,3],2)), "-", sprintf("%.2f",round(forest_data[,4],2)), ")")
  colour_codes = c(`LESS`="#398BE3",`NEITHER`="#444444",`GREATER`="#40D62D")
  p <- ggplot(data=forest_data,aes(x=exp_coef,y=event_name, color=sig, fill=sig))+
    geom_vline(xintercept=1,linetype="dashed", color="#A0A0A0")+
    geom_errorbarh(aes(xmin=lower95,xmax=upper95),height=0, lwd=0.75) +
    scale_color_manual(values=colour_codes[unique(forest_data$sig)]) +
    geom_point(cex=2, shape=15) +
    scale_fill_manual(values=colour_codes[unique(forest_data$sig)]) +
    theme_minimal(base_family="Roboto Condensed") +
    theme(panel.grid = element_line(color="#F3F3F3")) +
    guides(size=guide_legend(override.aes = list(fill='gray50',col='black')), fill="none", color="none") +
    labs(x = "Hazard ratio", y = NULL, title=plot_title) +
    theme(text = element_text(size=12), axis.line.y = element_blank(), axis.text.y = element_text(face="bold")) +
    annotate("text", x = pval_label_x, y = nrow(forest_data) + 1, label = "p-value", fontface=2, hjust=0, family="Roboto Condensed") +
    annotate("text", x = pval_label_x, y =1:nrow(forest_data), label = forest_data$pvalue[plot_y_order], hjust=0, family="Roboto Condensed") + 
    annotate("text", x = hr_label_x, y = nrow(forest_data) + 1, label = "HR (95% CI)", fontface=2, hjust=0, family="Roboto Condensed") +
    annotate("text", x = hr_label_x, y =1:nrow(forest_data), label = hr_text[plot_y_order], hjust=0, family="Roboto Condensed") + 
    coord_cartesian(ylim=c(1,nrow(forest_data) + 1), xlim=c(hr_label_x*xleft_mult, pval_label_x*xright_mult))
  p
}

# Fit linear model with every driver gene included in the ordering model, along with WGD, tumour purity, smoking status, genetic ancestry, and sex
fit.lm.acnp <- lm(AberrantCN ~ dMut_APC + dMut_ARID1A + dMut_BCORL1 + dMut_BIRC6 + dMut_CDKN2A + dMut_CTNNB1 + dMut_EGFR + dMut_ERBB2 + dMut_FAT4 + dMut_GNAS + dMut_KEAP1 + dMut_KMT2C + dMut_KMT2D + dMut_KRAS + dMut_LRP1B + dMut_MED12 + dMut_MGA + dMut_NCOR2 + dMut_NF1 + dMut_PIK3CA + dMut_PTEN + dMut_RB1 + dMut_RBM10 + dMut_ROBO2 + dMut_SETD2 + dMut_SMARCA4 + dMut_STAG2 + dMut_STK11 + dMut_TP53 + WGD + Purity + Smoking + Ancestry + Sex, data=sample_survival_and_events_with_smoking)

# plot and output linear model
p <- get_plot_from_lm(fit.lm.acnp, "Aberrant Copy Number Signature Proportion: All LUAD Samples", hr_label_mult=0.75, xleft_mult=1.05)
ggsave(file='path/to/figures/Fig5/stability_mvlm_with_covariates.png',plot = p,width = 6+(2/3),height = 7,device = "png")



# Test for multicollinearity

library(car)
vif_values = vif(fit.lm.acnp)
names(vif_values) = gsub("dMut_", "", names(vif_values))

pdf("path/to/supplementary/figures/vif.pdf", height=10, width=10)
par(mar=c(5.1, 8 ,4.1 ,2.1))
barplot(sort(vif_values), main = "VIF Values", horiz=TRUE, col = "#8888DD", xlim=c(0,6), las=2)
abline(v = 5, lwd = 3, lty = 2)
dev.off()

independent_variable_numeric_matrix = sample_survival_and_events_with_smoking[,c("dMut_APC", "dMut_ARID1A", "dMut_BCORL1", "dMut_BIRC6", "dMut_CDKN2A", "dMut_CTNNB1", "dMut_EGFR", "dMut_ERBB2", "dMut_FAT4", "dMut_GNAS", "dMut_KEAP1", "dMut_KMT2C", "dMut_KMT2D", "dMut_KRAS", "dMut_LRP1B", "dMut_MED12", "dMut_MGA", "dMut_NCOR2", "dMut_NF1", "dMut_PIK3CA", "dMut_PTEN", "dMut_RB1", "dMut_RBM10", "dMut_ROBO2", "dMut_SETD2", "dMut_SMARCA4", "dMut_STAG2", "dMut_STK11", "dMut_TP53", "WGD", "Purity", "Smoking", "Ancestry", "Sex")]
independent_variable_numeric_matrix_cols_1_to_30 = apply(independent_variable_numeric_matrix[,1:30], 2, function(x) { x[grep("_pos", x)] = 1; x[grep("_neg", x)] = 0; x })
independent_variable_numeric_matrix_col_32 = independent_variable_numeric_matrix[,32]
independent_variable_numeric_matrix_col_32[grep("_pos", independent_variable_numeric_matrix_col_32)] = 1
independent_variable_numeric_matrix_col_32[grep("_neg", independent_variable_numeric_matrix_col_32)] = 0
independent_variable_numeric_matrix_col_31 = independent_variable_numeric_matrix[,31]
independent_variable_numeric_matrix_col_33 = independent_variable_numeric_matrix[,33]
independent_variable_numeric_matrix_col_33[grep("_Eur", independent_variable_numeric_matrix_col_33)] = 1
independent_variable_numeric_matrix_col_33[grep("_oth", independent_variable_numeric_matrix_col_33)] = 0
independent_variable_numeric_matrix_col_34 = independent_variable_numeric_matrix[,34]
independent_variable_numeric_matrix_col_34[grep("_mal", independent_variable_numeric_matrix_col_34)] = 1
independent_variable_numeric_matrix_col_34[grep("_fem", independent_variable_numeric_matrix_col_34)] = 0
independent_variable_numeric_matrix = cbind(independent_variable_numeric_matrix_cols_1_to_30, independent_variable_numeric_matrix_col_31, independent_variable_numeric_matrix_col_32, independent_variable_numeric_matrix_col_33, independent_variable_numeric_matrix_col_34)
independent_variable_numeric_matrix = as.data.frame(independent_variable_numeric_matrix)
independent_variable_numeric_matrix = apply(independent_variable_numeric_matrix, 2, as.numeric)
colnames(independent_variable_numeric_matrix)[31:34] = c("Purity","Smoking","Ancestry","Sex")

library(corrplot)
colnames(independent_variable_numeric_matrix) = gsub("dMut_", "", colnames(independent_variable_numeric_matrix))
cor_matrix <- cor(independent_variable_numeric_matrix)
pdf("path/to/supplementary/figures/linear_model_variable_correlation.pdf", height=10, width=10)
corrplot::corrplot(cor_matrix, method = "circle")
dev.off()