library(dplyr)
library(survival)
library(survminer)
# read copy number signature data
cnsigs = read.table("path/to/data/Activities_Sherlock_CN_COSMIC_decomposition_v3.4.txt", header=T, stringsAsFactors=FALSE, sep='\t')
rownames(cnsigs) = cnsigs[,1]
wgd_file = "path/to/data/SherlockLung_WGDStatus.txt"
wgd_status = read.table(wgd_file, stringsAsFactors=FALSE)
rownames(wgd_status) = wgd_status[,1]
msl = read.table("path/to/data/SherlockLung_AllHQ_LUAD_LOH_mergedsegs.txt", header=T, stringsAsFactors = F, sep='\t')
ms = read.table("path/to/data/SherlockLung_AllHQ_LUAD_mergedseg_G3.txt", header=T, stringsAsFactors = F, sep='\t')
msl_compare = apply(msl, 2, function(x) { gsub("^ *", "", x) })
ms_compare = apply(ms[,1:10], 2, function(x) { gsub("^ *", "", x) })
msl_in_ms = apply(msl_compare, 1, function(x) { any(apply(ms_compare, 1, function(y) { identical(x, y) })) })
sLOH_to_add = msl[!msl_in_ms,]
sLOH_to_add = cbind(sLOH_to_add, `ID`=apply(sLOH_to_add, 1, function(x) { paste(paste0("chr", x["chr"]), x["startpos"], x["endpos"], sep="_") }), `no.chr.bearing.mut`=as.character(NA))
mergedseg_extended = rbind(ms, sLOH_to_add)
barcodes = gsub("01_01", "01.01", readLines("path/to/data/AllHQ_LUAD_2025_barcodes.txt"))
all_events = unique(mergedseg_extended[,"ID"])
# generate matrix for linear model
sample_survival_and_events = as.data.frame(t(sapply(barcodes, function(bc) {
sw = landscape[landscape$Tumor_Barcode == bc,"Survival_Week"]
sample_events <- mergedseg_extended[mergedseg_extended$Tumour_Name == bc,"ID"]
event_status = sapply(all_events, function(event) {
if(event %in% sample_events) {
paste0(event, "_pos")
} else {
paste0(event, "_neg")
}
})
out = c(sw, event_status)
names(out) = c("time", all_events)
out
})))
sample_survival_and_events[,"time"] = as.numeric(sample_survival_and_events[,"time"])
aber_cn_pct = sapply(barcodes, function(bc) {
if (wgd_status[bc,2] == "nWGD") { norm_sig = "CN1" } else { norm_sig = "CN2" }
aber_prop = 1 - (cnsigs[bc,norm_sig]/sum(cnsigs[bc,-1]))
aber_prop
})
aber_cn_col = aber_cn_pct[rownames(sample_survival_and_events)]
sample_survival_and_events = cbind(sample_survival_and_events, `AberrantCN`=aber_cn_col)
smoking_status = substr(landscape[rownames(sample_survival_and_events),"Smoking"], 1, 1)
sample_survival_and_events_with_smoking = cbind(sample_survival_and_events, `Smoking`=smoking_status)
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$Smoking == "S"),"Smoking"] = "Smoking_pos"
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$Smoking == "N"),"Smoking"] = "Smoking_neg"
anc = landscape[rownames(sample_survival_and_events_with_smoking),"Assigned_Population"]
sample_survival_and_events_with_smoking = cbind(sample_survival_and_events_with_smoking, `Ancestry`=anc)
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$Ancestry == "EUR"),"Ancestry"] = "Ancestry_Eur"
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$Ancestry != "Ancestry_Eur"),"Ancestry"] = "Ancestry_oth"
sex = landscape[rownames(sample_survival_and_events_with_smoking),"Gender"]
sample_survival_and_events_with_smoking = cbind(sample_survival_and_events_with_smoking, `Sex`=sex)
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$Sex == "Male"),"Sex"] = "Sex_mal"
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$Sex == "Female"),"Sex"] = "Sex_fem"
wgd = wgd_status[rownames(sample_survival_and_events_with_smoking),2]
sample_survival_and_events_with_smoking = cbind(sample_survival_and_events_with_smoking, `WGD`=wgd)
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$WGD == "WGD"),"WGD"] = "WGD_pos"
sample_survival_and_events_with_smoking[which(sample_survival_and_events_with_smoking$WGD == "nWGD"),"WGD"] = "WGD_neg"
pur = sex = landscape[rownames(sample_survival_and_events_with_smoking),"Tumor_Purity"]
sample_survival_and_events_with_smoking = cbind(sample_survival_and_events_with_smoking, `Purity`=pur)
get_plot_from_lm <- function(fit_lm, plot_title, hr_label_mult=0.6, p_label_mult=1.02, xleft_mult=1.05, xright_mult=1.025) {
fit_summary = summary(fit_lm)
fit_conf = cbind(exp(coef(fit_lm)), exp(confint(fit_lm)))
colnames(fit_conf) = c("exp_coef","lower95","upper95")
plot_y_order = order(fit_conf[-1,'exp_coef'])
forest_data = as.data.frame(cbind(`event_name`=gsub("dMut_", "", rownames(fit_conf)), fit_conf, `sig`=ifelse(fit_summary$coefficients[,'Pr(>|t|)'] < 0.05 & fit_conf[,'exp_coef'] > 1, "GREATER", "NEITHER"), `pvalue`=fit_summary$coefficients[,'Pr(>|t|)']))
forest_data[fit_summary$coefficients[,'Pr(>|t|)'] < 0.05 & fit_conf[,'exp_coef'] < 1,'sig'] = "LESS"
forest_data = forest_data[-1,]
fdrn = rownames(forest_data)
with_suffix = grep("_(.{3})$", fdrn, perl=TRUE)
forest_data$event_name = fdrn
if (length(with_suffix) > 0) {
forest_data$event_name[with_suffix] = sapply(gsub("dMut_", "", fdrn[with_suffix]), function(x) { substr(x, 1, (nchar(x)-4)/2) })
}
rownames(forest_data) = forest_data$event_name
forest_data[,2:4] = apply(forest_data[,2:4], 2, as.numeric)
forest_data[,5] = factor(forest_data[,5], levels=c("LESS","NEITHER","GREATER"))
forest_data[,1] = factor(forest_data[,1], levels=forest_data[plot_y_order,1])
forest_data[forest_data[,6] >= 0.001 & forest_data[,6] < 0.01,6] = round(as.numeric(forest_data[forest_data[,6] >= 0.001 & forest_data[,6] < 0.01,6]), 3)
forest_data[forest_data[,6] >= 0.01,6] = sprintf("%.3f", round(as.numeric(forest_data[forest_data[,6] >= 0.01,6]), 3))
forest_data[forest_data[,6] < 0.001 & forest_data[,6] != "",6] = "< 0.001"
hr_label_x = min(forest_data$lower95)*hr_label_mult
pval_label_x = max(forest_data$upper95)*p_label_mult
hr_text = paste0(sprintf("%.2f",round(forest_data[,2],2)), " (", sprintf("%.2f",round(forest_data[,3],2)), "-", sprintf("%.2f",round(forest_data[,4],2)), ")")
colour_codes = c(`LESS`="#398BE3",`NEITHER`="#444444",`GREATER`="#40D62D")
p <- ggplot(data=forest_data,aes(x=exp_coef,y=event_name, color=sig, fill=sig))+
geom_vline(xintercept=1,linetype="dashed", color="#A0A0A0")+
geom_errorbarh(aes(xmin=lower95,xmax=upper95),height=0, lwd=0.75) +
scale_color_manual(values=colour_codes[unique(forest_data$sig)]) +
geom_point(cex=2, shape=15) +
scale_fill_manual(values=colour_codes[unique(forest_data$sig)]) +
theme_minimal(base_family="Roboto Condensed") +
theme(panel.grid = element_line(color="#F3F3F3")) +
guides(size=guide_legend(override.aes = list(fill='gray50',col='black')), fill="none", color="none") +
labs(x = "Hazard ratio", y = NULL, title=plot_title) +
theme(text = element_text(size=12), axis.line.y = element_blank(), axis.text.y = element_text(face="bold")) +
annotate("text", x = pval_label_x, y = nrow(forest_data) + 1, label = "p-value", fontface=2, hjust=0, family="Roboto Condensed") +
annotate("text", x = pval_label_x, y =1:nrow(forest_data), label = forest_data$pvalue[plot_y_order], hjust=0, family="Roboto Condensed") +
annotate("text", x = hr_label_x, y = nrow(forest_data) + 1, label = "HR (95% CI)", fontface=2, hjust=0, family="Roboto Condensed") +
annotate("text", x = hr_label_x, y =1:nrow(forest_data), label = hr_text[plot_y_order], hjust=0, family="Roboto Condensed") +
coord_cartesian(ylim=c(1,nrow(forest_data) + 1), xlim=c(hr_label_x*xleft_mult, pval_label_x*xright_mult))
p
}
# Fit linear model with every driver gene included in the ordering model, along with WGD, tumour purity, smoking status, genetic ancestry, and sex
fit.lm.acnp <- lm(AberrantCN ~ dMut_APC + dMut_ARID1A + dMut_BCORL1 + dMut_BIRC6 + dMut_CDKN2A + dMut_CTNNB1 + dMut_EGFR + dMut_ERBB2 + dMut_FAT4 + dMut_GNAS + dMut_KEAP1 + dMut_KMT2C + dMut_KMT2D + dMut_KRAS + dMut_LRP1B + dMut_MED12 + dMut_MGA + dMut_NCOR2 + dMut_NF1 + dMut_PIK3CA + dMut_PTEN + dMut_RB1 + dMut_RBM10 + dMut_ROBO2 + dMut_SETD2 + dMut_SMARCA4 + dMut_STAG2 + dMut_STK11 + dMut_TP53 + WGD + Purity + Smoking + Ancestry + Sex, data=sample_survival_and_events_with_smoking)
# plot and output linear model
p <- get_plot_from_lm(fit.lm.acnp, "Aberrant Copy Number Signature Proportion: All LUAD Samples", hr_label_mult=0.75, xleft_mult=1.05)
ggsave(file='path/to/figures/Fig5/stability_mvlm_with_covariates.png',plot = p,width = 6+(2/3),height = 7,device = "png")
# Test for multicollinearity
library(car)
vif_values = vif(fit.lm.acnp)
names(vif_values) = gsub("dMut_", "", names(vif_values))
pdf("path/to/supplementary/figures/vif.pdf", height=10, width=10)
par(mar=c(5.1, 8 ,4.1 ,2.1))
barplot(sort(vif_values), main = "VIF Values", horiz=TRUE, col = "#8888DD", xlim=c(0,6), las=2)
abline(v = 5, lwd = 3, lty = 2)
dev.off()
independent_variable_numeric_matrix = sample_survival_and_events_with_smoking[,c("dMut_APC", "dMut_ARID1A", "dMut_BCORL1", "dMut_BIRC6", "dMut_CDKN2A", "dMut_CTNNB1", "dMut_EGFR", "dMut_ERBB2", "dMut_FAT4", "dMut_GNAS", "dMut_KEAP1", "dMut_KMT2C", "dMut_KMT2D", "dMut_KRAS", "dMut_LRP1B", "dMut_MED12", "dMut_MGA", "dMut_NCOR2", "dMut_NF1", "dMut_PIK3CA", "dMut_PTEN", "dMut_RB1", "dMut_RBM10", "dMut_ROBO2", "dMut_SETD2", "dMut_SMARCA4", "dMut_STAG2", "dMut_STK11", "dMut_TP53", "WGD", "Purity", "Smoking", "Ancestry", "Sex")]
independent_variable_numeric_matrix_cols_1_to_30 = apply(independent_variable_numeric_matrix[,1:30], 2, function(x) { x[grep("_pos", x)] = 1; x[grep("_neg", x)] = 0; x })
independent_variable_numeric_matrix_col_32 = independent_variable_numeric_matrix[,32]
independent_variable_numeric_matrix_col_32[grep("_pos", independent_variable_numeric_matrix_col_32)] = 1
independent_variable_numeric_matrix_col_32[grep("_neg", independent_variable_numeric_matrix_col_32)] = 0
independent_variable_numeric_matrix_col_31 = independent_variable_numeric_matrix[,31]
independent_variable_numeric_matrix_col_33 = independent_variable_numeric_matrix[,33]
independent_variable_numeric_matrix_col_33[grep("_Eur", independent_variable_numeric_matrix_col_33)] = 1
independent_variable_numeric_matrix_col_33[grep("_oth", independent_variable_numeric_matrix_col_33)] = 0
independent_variable_numeric_matrix_col_34 = independent_variable_numeric_matrix[,34]
independent_variable_numeric_matrix_col_34[grep("_mal", independent_variable_numeric_matrix_col_34)] = 1
independent_variable_numeric_matrix_col_34[grep("_fem", independent_variable_numeric_matrix_col_34)] = 0
independent_variable_numeric_matrix = cbind(independent_variable_numeric_matrix_cols_1_to_30, independent_variable_numeric_matrix_col_31, independent_variable_numeric_matrix_col_32, independent_variable_numeric_matrix_col_33, independent_variable_numeric_matrix_col_34)
independent_variable_numeric_matrix = as.data.frame(independent_variable_numeric_matrix)
independent_variable_numeric_matrix = apply(independent_variable_numeric_matrix, 2, as.numeric)
colnames(independent_variable_numeric_matrix)[31:34] = c("Purity","Smoking","Ancestry","Sex")
library(corrplot)
colnames(independent_variable_numeric_matrix) = gsub("dMut_", "", colnames(independent_variable_numeric_matrix))
cor_matrix <- cor(independent_variable_numeric_matrix)
pdf("path/to/supplementary/figures/linear_model_variable_correlation.pdf", height=10, width=10)
corrplot::corrplot(cor_matrix, method = "circle")
dev.off()