library(plotrix)
get_combined_segs_CN = function(sample_set) {
  # Build a disjoint genome segmentation from every breakpoint observed in the
  # given samples, then summarise copy number per combined segment.
  # sample_set: vector of sample identifiers indexing the global `all_scgr`
  #             list of GRanges (each carrying av_majCN / av_minCN metadata).
  # Returns: GRanges of combined segments with mean/sd/se of total, major and
  #          minor copy number across all overlapping per-sample segments.
  stat_names = c("totCN_mean", "totCN_sd", "totCN_se",
                 "majCN_mean", "majCN_sd", "majCN_se",
                 "minCN_mean", "minCN_sd", "minCN_se")
  # get all breakpoints
  all_sample_set_segs = sort(Reduce(c, all_scgr[sample_set]))
  chr_list_sample_set_segs = split(all_sample_set_segs, seqnames(all_sample_set_segs))
  # will throw warnings because we are combining GRanges with different seqnames
  # - suppress them as we know this is what we want to do
  combined_segs = suppressWarnings(Reduce(c, lapply(names(chr_list_sample_set_segs), function(chr_name) {
    # including "as.character" as a safety measure as most of the names are numbers and we do not want confusion
    chr_sample_set_segs = chr_list_sample_set_segs[[as.character(chr_name)]]
    chr_seg_starts = sort(unique(start(chr_sample_set_segs)))
    chr_seg_ends = sort(unique(end(chr_sample_set_segs)))
    # every start implies the preceding segment ends one base earlier, and
    # every end implies the next segment starts one base later
    additional_ends = chr_seg_starts[-1] - 1
    additional_starts = chr_seg_ends[-length(chr_seg_ends)] + 1
    # note: some of these will be gaps between segments
    # - this is okay and will be handled in the overlaps section
    combined_seg_starts = sort(unique(c(chr_seg_starts, additional_starts)))
    combined_seg_ends = sort(unique(c(chr_seg_ends, additional_ends)))
    GRanges(seqnames=chr_name, ranges=IRanges(start=combined_seg_starts, end=combined_seg_ends))
  })))
  # hoisted out of the per-segment loop (was recomputed on every iteration)
  num_segs = length(combined_segs)
  segment_CN_stats = t(sapply(seq_len(num_segs), function(csi) {
    ol = findOverlaps(combined_segs[csi], all_sample_set_segs)
    if (length(ol) > 0) {
      majCNs = all_sample_set_segs[subjectHits(ol)]$av_majCN
      minCNs = all_sample_set_segs[subjectHits(ol)]$av_minCN
      totCNs = majCNs + minCNs
      return_vals = setNames(c(mean(totCNs), sd(totCNs), std.error(totCNs),
                               mean(majCNs), sd(majCNs), std.error(majCNs),
                               mean(minCNs), sd(minCNs), std.error(minCNs)),
                             stat_names)
    } else {
      # gap segments with no overlapping sample segment get NA stats;
      # the NA vector must be NAMED, otherwise a gap at csi == 1 makes
      # sapply() return a matrix with no column names and the metadata
      # columns (totCN_mean etc.) silently disappear downstream
      return_vals = setNames(rep(NA_real_, 9), stat_names)
    }
    cat(paste0("\r", csi, " of ", num_segs, " (", round(((csi/num_segs) * 100), 2), "%) done"))
    return_vals
  }))
  # finish the \r-based progress line
  cat("\n")
  values(combined_segs) = as(segment_CN_stats, "DataFrame")
  combined_segs
}
plot_seg_CN_nSets = function(set_seg_CN_list, top_track="total", set_cols = c("#8adbf5","#a371cb","#5a5a69","#332A2CAA")) {
  # Plot per-segment mean copy number for several sample sets on one pair of
  # axes, with chromosomes laid end-to-end along the x axis.
  # set_seg_CN_list: list of GRanges as returned by get_combined_segs_CN
  #                  (must carry totCN_mean / majCN_mean / minCN_mean columns)
  # top_track: "total" draws totCN_mean as the upper track, "major" draws
  #            majCN_mean; minCN_mean is always drawn as the lower track
  # set_cols: one colour per sample set
  # chromosome length = furthest segment end observed in any sample set
  set_chr_lengths = lapply(set_seg_CN_list, function(set_seg_CN) { sapply(split(set_seg_CN, seqnames(set_seg_CN)), function(x) { max(end(x)) }) })
  chr_lengths = sapply(names(set_chr_lengths[[1]]), function(x) { max(sapply(set_chr_lengths, function(y) { y[x] })) })
  # cumulative offsets place each chromosome side by side
  chr_breaks = c(0, cumsum(as.numeric(chr_lengths)))
  names(chr_breaks) = c(names(chr_lengths), "end")
  # the major-CN track needs less vertical room than the total-CN track
  ymax = if (top_track == "major") 4.25 else 5.25
  xmax = chr_breaks["end"]
  plot(1, type = "n", xlab = "", ylab = "", xlim = c(1, xmax), ylim = c(0, ymax), xaxt='n', xaxs='i', yaxt="n")
  axis(2, cex.axis=2)
  # light horizontal guides at integer copy numbers
  for (vl in 0:5) {
    lines(x=c(0, xmax), y=c(vl, vl), col="lightgrey")
  }
  par(xpd=FALSE)
  # vertical chromosome boundaries
  for (cb in chr_breaks) {
    lines(x=c(cb, cb), y=c(-1, 6), col="darkgrey")
  }
  # chromosome labels at chromosome midpoints; odd and even labels go in
  # separate axis() calls so crowded neighbours are not suppressed
  xlab_x = sapply(2:length(chr_breaks), function(end_i) { chr_breaks[end_i - 1] + ((chr_breaks[end_i] - chr_breaks[end_i - 1]) / 2) })
  n = length(xlab_x)
  axis(1, at=xlab_x[seq(1, n, 2)], labels=names(chr_lengths)[seq(1, n, 2)], cex.axis=2, lwd.ticks=0)
  axis(1, at=xlab_x[seq(2, n, 2)], labels=names(chr_lengths)[seq(2, n, 2)], cex.axis=2, lwd.ticks=0)
  for (set_i in seq_along(set_seg_CN_list)) {
    message(paste0("Plotting segments for sample set ", set_i, " of ", length(set_seg_CN_list)))
    set_segs = set_seg_CN_list[[set_i]]
    # vectorised drawing: compute all genome-wide x positions at once and draw
    # each track with a single segments() call instead of one lines() call per
    # segment (same output, orders of magnitude fewer graphics calls, so the
    # old per-segment progress counter is no longer needed)
    offsets = chr_breaks[as.character(seqnames(set_segs))]
    xstarts = start(set_segs) + offsets
    xends = end(set_segs) + offsets
    top_vals = if (top_track == "major") set_segs$majCN_mean else set_segs$totCN_mean
    segments(x0=xstarts, y0=top_vals, x1=xends, y1=top_vals, col=set_cols[set_i], lwd=4)
    segments(x0=xstarts, y0=set_segs$minCN_mean, x1=xends, y1=set_segs$minCN_mean, col=set_cols[set_i], lwd=4)
  }
}
# get mean copy number for each unique segment across all samples of each de novo subset
# (sample_groups is defined earlier in the analysis; one element per de novo subset)
allhq_luad_seg_CN = lapply(sample_groups, get_combined_segs_CN)
# plot average copy number profiles for de novo subsets
fn = "Sherlock/New_Histology/2025/Figures/Fig3/AllHQ_LUAD_mean_seg_CN_large_bluePurpleGrey.png"
png(fn, width=9000, height=3600, res=300)
plot_seg_CN_nSets(allhq_luad_seg_CN)
dev.off()
# set up pairwise comparison sets
# each element pairs two de novo subsets: (1 vs 3), (2 vs 3), (1 vs 2)
sets_of_sets = list(
list(sample_groups[[1]], sample_groups[[3]]),
list(sample_groups[[2]], sample_groups[[3]]),
list(sample_groups[[1]], sample_groups[[2]])
)
# run pairwise comparisons between de novo subsets
# each element: list(combined segments, raw comparison matrix, FDR-adjusted
# p-values, 1x9 summary of significant-genome proportions and effect sizes)
comparison_lists = lapply(sets_of_sets, function(sets) {
set1 = sets[[1]]
set2 = sets[[2]]
# per-sample segments for the union and for each set separately,
# plus one combined breakpoint segmentation spanning both sets
set1_and_set2_sample_set_segs = get_all_sample_set_segs(c(set1, set2))
set1_sample_set_segs = get_all_sample_set_segs(set1)
set2_sample_set_segs = get_all_sample_set_segs(set2)
set1_and_set2_combined_segs = get_combined_segs(set1_and_set2_sample_set_segs)
# require at least 25% of the samples in each set to cover a segment for it to be tested
set1_vs_set2_CN_comparison = compare_total_CN(set1_and_set2_combined_segs, set1_sample_set_segs, set2_sample_set_segs, c(length(set1)*0.25, length(set2)*0.25))
# FDR-adjust the three p-value columns (total, major, minor CN) ...
set1_vs_set2_CN_comparison_FDR = apply(set1_vs_set2_CN_comparison[,1:3], 2, p.adjust, method="fdr")
# ... and treat untestable segments (NA p-values) as non-significant
set1_vs_set2_CN_comparison_FDR[is.na(set1_vs_set2_CN_comparison_FDR)] = 1
seg_widths = width(set1_and_set2_combined_segs)
total_width = sum(seg_widths)
# proportion of the genome (by segment width) significant at `threshold`
# for FDR p-value column `col_i` (1=total, 2=major, 3=minor)
sig_width_prop = function(col_i, threshold) {
sum(seg_widths[set1_vs_set2_CN_comparison_FDR[,col_i] < threshold]) / total_width
}
# width-weighted mean absolute difference between the set-mean CN columns
# `col_a` and `col_b`; segments with NA differences are excluded from both
# numerator and denominator
effect_size = function(col_a, col_b) {
adj_abs_diff = abs(set1_vs_set2_CN_comparison[,col_a] - set1_vs_set2_CN_comparison[,col_b]) * seg_widths
sum(adj_abs_diff, na.rm = TRUE) / sum(seg_widths[which(!is.na(adj_abs_diff))])
}
# columns 4/5 = total-CN means, 6/7 = major-CN means, 8/9 = minor-CN means
sig_diff_proportions = t(c(
sig_width_prop(1, 0.05), sig_width_prop(2, 0.05), sig_width_prop(3, 0.05),
sig_width_prop(1, 0.001), sig_width_prop(2, 0.001), sig_width_prop(3, 0.001),
effect_size(4, 5), effect_size(6, 7), effect_size(8, 9)
))
colnames(sig_diff_proportions) = c("total_p_lt_0pt05", "major_p_lt_0pt05", "minor_p_lt_0pt05", "total_p_lt_0pt001", "major_p_lt_0pt001", "minor_p_lt_0pt001", "total_CN_effect_size", "major_CN_effect_size", "minor_CN_effect_size")
list(set1_and_set2_combined_segs, set1_vs_set2_CN_comparison, set1_vs_set2_CN_comparison_FDR, sig_diff_proportions)
})
library(heatmaply)
plot_sig_and_graded_effect_size = function(comparison, sig_threshold=0.05, effect_size_upper_range=2, effect_size_threshold=0, pal=unique(YlGnBu(6000))) {
  # Draw three horizontal genome-wide tracks (total / major / minor CN, top to
  # bottom), colouring each segment by effect size wherever its FDR-adjusted
  # p-value is below sig_threshold.
  # comparison: list(combined segs GRanges, raw comparison matrix, FDR matrix, summary)
  #             as produced by the pairwise comparison step
  segs = comparison[[1]]
  FDR_pvals = comparison[[3]]
  comp_stats = comparison[[2]]
  # effect size per segment = absolute difference between the two set means
  total_effect_size = abs(comp_stats[,4] - comp_stats[,5])
  major_effect_size = abs(comp_stats[,6] - comp_stats[,7])
  minor_effect_size = abs(comp_stats[,8] - comp_stats[,9])
  # lay chromosomes end to end along the x axis
  chr_lengths = sapply(split(segs, seqnames(segs)), function(chr_segs) max(end(chr_segs)))
  chr_breaks = c(0, cumsum(as.numeric(chr_lengths)))
  names(chr_breaks) = c(names(chr_lengths), "end")
  xmax = chr_breaks["end"]
  plot(1, type = "n", xlab = "", ylab = "", xlim = c(1, xmax), ylim = c(0, 3), xaxt='n', xaxs='i', yaxt='n', yaxs='i', axes=FALSE)
  par(xpd=FALSE)
  par(oma=c(0,0,0,0))
  # chromosome labels at chromosome midpoints; odd and even labels in separate
  # axis() calls so crowded neighbours are not dropped
  xlab_x = head(chr_breaks, -1) + diff(chr_breaks) / 2
  n = length(xlab_x)
  odd = seq(1, n, 2)
  even = seq(2, n, 2)
  axis(1, at=xlab_x[odd], labels=names(chr_lengths)[odd], cex.axis=2, lwd.ticks=0, lwd=0)
  axis(1, at=xlab_x[even], labels=names(chr_lengths)[even], cex.axis=2, lwd.ticks=0, lwd=0)
  # grey background band for each of the three tracks
  for (track_bottom in c(2.1, 1.1, 0.1)) {
    polygon(c(0, max(chr_breaks), max(chr_breaks), 0),
            c(track_bottom, track_bottom, track_bottom + 0.8, track_bottom + 0.8),
            col="#DADADA", border=NA)
  }
  # cap the colour ramp at the largest observed effect size
  observed_max = max(c(total_effect_size, major_effect_size, minor_effect_size), na.rm=TRUE)
  effect_size_upper_range = min(effect_size_upper_range, observed_max)
  for (seg_i in seq_along(segs)) {
    seg = segs[seg_i]
    offset = chr_breaks[as.character(seqnames(seg))]
    x0 = start(seg) + offset
    x1 = end(seg) + offset
    # one call per track: total (top), major (middle), minor (bottom)
    draw_ramped_effect_size_if_sig(FDR_pvals[seg_i,1], total_effect_size[seg_i], x0, x1, 2.1, 2.9, sig_threshold, effect_size_upper_range, pal=pal)
    draw_ramped_effect_size_if_sig(FDR_pvals[seg_i,2], major_effect_size[seg_i], x0, x1, 1.1, 1.9, sig_threshold, effect_size_upper_range, pal=pal)
    draw_ramped_effect_size_if_sig(FDR_pvals[seg_i,3], minor_effect_size[seg_i], x0, x1, 0.1, 0.9, sig_threshold, effect_size_upper_range, pal=pal)
  }
  # chromosome boundary lines drawn last so they sit on top of the tracks
  for (cb in chr_breaks) {
    lines(x=c(cb, cb), y=c(0, 3), col="#040404")
  }
}
# plot effect size of significantly different segments for each pairwise comparison
# (file order matches comparison_lists: 1 vs 3, 2 vs 3, 1 vs 2)
comparison_plot_fns = c(
"Sherlock/New_Histology/2025/Figures/Fig3/NSDA_SD_pos_neg_sig_and_effect_graded_Greens.png",
"Sherlock/New_Histology/2025/Figures/Fig3/NSDB_SD_pos_neg_sig_and_effect_graded_Greens.png",
"Sherlock/New_Histology/2025/Figures/Fig3/NSDA_NSDB_pos_neg_sig_and_effect_graded_Greens.png"
)
for (comp_i in seq_along(comparison_plot_fns)) {
png(comparison_plot_fns[comp_i], width=9000, height=1000, res=300)
plot_sig_and_graded_effect_size(comparison_lists[[comp_i]], pal=unique(Greens(6000)))
dev.off()
}
# get copy number of KRAS +/- samples
# (mergedseg is defined earlier; assumed one row per mutation/sample pairing
#  with ID = driver label and Tumour_Name = sample identifier -- TODO confirm)
kras_pos = mergedseg[mergedseg$ID == "dMut_KRAS","Tumour_Name"]
# negatives = every other sample seen in mergedseg
kras_neg = setdiff(unique(mergedseg$Tumour_Name), kras_pos)
kras_pos_neg_seg_CN = lapply(list(kras_pos, kras_neg), get_combined_segs_CN)
# plot mean copy number for KRAS + vs - samples
fn = "Sherlock/New_Histology/2025/Figures/Fig5/KRAS_pos_neg_mean_seg_tot_and_min_CN_large.png"
png(fn, width=9000, height=3600, res=300)
plot_seg_CN_nSets(kras_pos_neg_seg_CN, set_cols = c("#E48C19AA","#08189EAA","#FFD23FAA","#332A2C"))
dev.off()
# get copy number of EGFR +/- samples (same approach as KRAS above)
egfr_pos = mergedseg[mergedseg$ID == "dMut_EGFR","Tumour_Name"]
egfr_neg = setdiff(unique(mergedseg$Tumour_Name), egfr_pos)
egfr_pos_neg_seg_CN = lapply(list(egfr_pos, egfr_neg), get_combined_segs_CN)
# plot mean copy number for EGFR + vs - samples
fn = "Sherlock/New_Histology/2025/Figures/Fig5/EGFR_pos_neg_mean_seg_tot_and_min_CN_large.png"
png(fn, width=9000, height=3600, res=300)
plot_seg_CN_nSets(egfr_pos_neg_seg_CN, set_cols = c("#E48C19AA","#08189EAA","#FFD23FAA","#332A2C"))
dev.off()