SherlockLung-EvolutionaryTrajectory-Analysis / Analyses / average_copy_number_plots.R
average_copy_number_plots.R
Raw
library(plotrix)

get_combined_segs_CN = function(sample_set) {
    # get all breakpoints
    all_sample_set_segs = sort(Reduce(c, all_scgr[sample_set]))
    
    chr_list_sample_set_segs = split(all_sample_set_segs, seqnames(all_sample_set_segs))
    
    # will throw warnings because we are combining GRanges with different seqnames
    #  - suppress them as we know this is what we want to do
    combined_segs = suppressWarnings(Reduce(c, lapply(names(chr_list_sample_set_segs), function(chr_name) {
        # including "as.character" as a safety measure as most of the names are numbers and we do not want confusion
        chr_sample_set_segs = chr_list_sample_set_segs[[as.character(chr_name)]]
        chr_seg_starts = sort(unique(start(chr_sample_set_segs)))
        chr_seg_ends = sort(unique(end(chr_sample_set_segs)))
        
        additional_ends = chr_seg_starts[-1] - 1
        additional_starts = chr_seg_ends[-length(chr_seg_ends)] + 1
        
        # note: some of these will be gaps between segments
        #  - this is okay and will be handled in the overlaps section
        combined_seg_starts = sort(unique(c(chr_seg_starts, additional_starts)))
        combined_seg_ends = sort(unique(c(chr_seg_ends, additional_ends)))
        
        combined_chr_segs = GRanges(seqnames=chr_name, ranges=IRanges(start=combined_seg_starts, end=combined_seg_ends))
        
        combined_chr_segs
    })))
    
    segment_CN_stats = t(sapply(seq_along(combined_segs), function(csi) {
        ol = findOverlaps(combined_segs[csi], all_sample_set_segs)
        if (length(ol) > 0) {
            majCNs = all_sample_set_segs[subjectHits(ol)]$av_majCN
            minCNs = all_sample_set_segs[subjectHits(ol)]$av_minCN
            totCNs = majCNs + minCNs
            return_vals = c(`totCN_mean`=mean(totCNs), `totCN_sd`=sd(totCNs), `totCN_se`=std.error(totCNs), `majCN_mean`=mean(majCNs), `majCN_sd`=sd(majCNs), `majCN_se`=std.error(majCNs), `minCN_mean`=mean(minCNs), `minCN_sd`=sd(minCNs), `minCN_se`=std.error(minCNs))
        } else {
            return_vals = rep(NA, 9)
        }
        num_segs = length(combined_segs)
        cat(paste0("\r", csi, " of ", num_segs, " (", round(((csi/num_segs) * 100), 2), "%) done"))
        return_vals
    }))
    
    values(combined_segs) = as(segment_CN_stats, "DataFrame")
    
    combined_segs
}

plot_seg_CN_nSets = function(set_seg_CN_list, top_track="total", set_cols = c("#8adbf5","#a371cb","#5a5a69","#332A2CAA")) {
  set_chr_lengths = lapply(set_seg_CN_list, function(set_seg_CN) { sapply(split(set_seg_CN, seqnames(set_seg_CN)), function(x) { max(end(x)) }) })
  chr_lengths = sapply(names(set_chr_lengths[[1]]), function(x) { max(sapply(set_chr_lengths, function(y) { y[x] })) })
  chr_breaks = c(0, cumsum(as.numeric(chr_lengths)))
  names(chr_breaks) = c(names(chr_lengths), "end")
  if (top_track == "major") {
    ymax = 4.25
  } else {
    ymax = 5.25
  }
  xmax = chr_breaks["end"]
  plot(1, type = "n", xlab = "", ylab = "", xlim = c(1, xmax), ylim = c(0, ymax), xaxt='n', xaxs='i', yaxt="n")
  axis(2,cex.axis=2)
  for (vl in 0:5) {
    lines(x=c(0,xmax), y=c(vl,vl), col="lightgrey")
  }
  par(xpd=FALSE)
  for (cb in chr_breaks) {
    lines(x=c(cb,cb), y=c(-1,6), col="darkgrey")
  }
  xlab_x = sapply(2:length(chr_breaks), function(end_i) { chr_breaks[end_i - 1] + ((chr_breaks[end_i] - chr_breaks[end_i - 1]) / 2) })
  n = length(xlab_x)
  axis(1, at=xlab_x[seq(1,n,2)], labels=names(chr_lengths)[seq(1,n,2)], cex.axis=2, lwd.ticks=0)
  axis(1, at=xlab_x[seq(2,n,2)], labels=names(chr_lengths)[seq(2,n,2)], cex.axis=2, lwd.ticks=0)
  for (set_i in seq_along(set_seg_CN_list)) {
    message(paste0("Plotting segments for sample set ", set_i, " of ", length(set_seg_CN_list)))
    latest_pct_done = -1
	for (segment_i in seq_along(set_seg_CN_list[[set_i]])) {
      pct_done = round((segment_i / length(set_seg_CN_list[[set_i]])) * 100, 1)
	  if (pct_done > latest_pct_done) {
	    cat('\rPercent complete: ',sprintf("%.1f", pct_done))
        flush.console()
	    latest_pct_done = pct_done
	  }
      seg = set_seg_CN_list[[set_i]][segment_i]
      xstart = start(seg) + chr_breaks[as.character(seqnames(seg))]
      xend = end(seg) + chr_breaks[as.character(seqnames(seg))]
      if (top_track=="major") {
	    lines(x=c(xstart, xend), y=c(seg$majCN_mean, seg$majCN_mean), col=set_cols[set_i], lwd=4)
	  } else {
	    lines(x=c(xstart, xend), y=c(seg$totCN_mean, seg$totCN_mean), col=set_cols[set_i], lwd=4)
	  }
      lines(x=c(xstart, xend), y=c(seg$minCN_mean, seg$minCN_mean), col=set_cols[set_i], lwd=4)
    }
  }
}

# get mean copy number for each unique segment across all samples of each de novo subset
allhq_luad_seg_CN = lapply(sample_groups, get_combined_segs_CN)

# plot average copy number profiles for de novo subsets
fn = "Sherlock/New_Histology/2025/Figures/Fig3/AllHQ_LUAD_mean_seg_CN_large_bluePurpleGrey.png"
png(fn, width=9000, height=3600, res=300)
plot_seg_CN_nSets(allhq_luad_seg_CN)
dev.off()

# set up pairwise comparison sets
sets_of_sets = list(
list(sample_groups[[1]], sample_groups[[3]]),
list(sample_groups[[2]], sample_groups[[3]]),
list(sample_groups[[1]], sample_groups[[2]])
)

# run pairwise comparisons between de novo subsets
comparison_lists = lapply(sets_of_sets, function(sets) {
  set1 = sets[[1]]
  set2 = sets[[2]]
  set1_and_set2_sample_set_segs = get_all_sample_set_segs(c(set1, set2))
  set1_sample_set_segs = get_all_sample_set_segs(set1)
  set2_sample_set_segs = get_all_sample_set_segs(set2)
  set1_and_set2_combined_segs = get_combined_segs(set1_and_set2_sample_set_segs)
  #set1_vs_set2_CN_comparison = compare_total_CN(set1_and_set2_combined_segs, set1_sample_set_segs, set2_sample_set_segs)
  set1_vs_set2_CN_comparison = compare_total_CN(set1_and_set2_combined_segs, set1_sample_set_segs, set2_sample_set_segs, c(length(set1)*0.25, length(set2)*0.25))
  set1_vs_set2_CN_comparison_FDR = apply(set1_vs_set2_CN_comparison[,1:3], 2, p.adjust, method="fdr")
  set1_vs_set2_CN_comparison_FDR = apply(set1_vs_set2_CN_comparison_FDR, 2, function(comp_col) {
    comp_col[which(is.na(comp_col))] = 1
    comp_col
  })
  total_p_lt_0pt05 = sum(width(set1_and_set2_combined_segs[set1_vs_set2_CN_comparison_FDR[,1] < 0.05])) / sum(width(set1_and_set2_combined_segs))
  major_p_lt_0pt05 = sum(width(set1_and_set2_combined_segs[set1_vs_set2_CN_comparison_FDR[,2] < 0.05])) / sum(width(set1_and_set2_combined_segs))
  minor_p_lt_0pt05 = sum(width(set1_and_set2_combined_segs[set1_vs_set2_CN_comparison_FDR[,3] < 0.05])) / sum(width(set1_and_set2_combined_segs))
  total_p_lt_0pt001 = sum(width(set1_and_set2_combined_segs[set1_vs_set2_CN_comparison_FDR[,1] < 0.001])) / sum(width(set1_and_set2_combined_segs))
  major_p_lt_0pt001 = sum(width(set1_and_set2_combined_segs[set1_vs_set2_CN_comparison_FDR[,2] < 0.001])) / sum(width(set1_and_set2_combined_segs))
  minor_p_lt_0pt001 = sum(width(set1_and_set2_combined_segs[set1_vs_set2_CN_comparison_FDR[,3] < 0.001])) / sum(width(set1_and_set2_combined_segs))
  abs_diff_tot = abs(set1_vs_set2_CN_comparison[,4] - set1_vs_set2_CN_comparison[,5])
  adj_abs_diff_tot = abs_diff_tot * width(set1_and_set2_combined_segs)
  eff_size_tot = sum(adj_abs_diff_tot, na.rm = TRUE) / sum(width(set1_and_set2_combined_segs)[which(!is.na(adj_abs_diff_tot))])
  abs_diff_maj = abs(set1_vs_set2_CN_comparison[,6] - set1_vs_set2_CN_comparison[,7])
  adj_abs_diff_maj = abs_diff_maj * width(set1_and_set2_combined_segs)
  eff_size_maj = sum(adj_abs_diff_maj, na.rm = TRUE) / sum(width(set1_and_set2_combined_segs)[which(!is.na(adj_abs_diff_maj))])
  abs_diff_min = abs(set1_vs_set2_CN_comparison[,8] - set1_vs_set2_CN_comparison[,9])
  adj_abs_diff_min = abs_diff_min * width(set1_and_set2_combined_segs)
  eff_size_min = sum(adj_abs_diff_min, na.rm = TRUE) / sum(width(set1_and_set2_combined_segs)[which(!is.na(adj_abs_diff_min))])
  sig_diff_proportions = t(c(total_p_lt_0pt05, major_p_lt_0pt05, minor_p_lt_0pt05, total_p_lt_0pt001, major_p_lt_0pt001, minor_p_lt_0pt001, eff_size_tot, eff_size_maj, eff_size_min))
  colnames(sig_diff_proportions) = c("total_p_lt_0pt05", "major_p_lt_0pt05", "minor_p_lt_0pt05", "total_p_lt_0pt001", "major_p_lt_0pt001", "minor_p_lt_0pt001", "total_CN_effect_size", "major_CN_effect_size", "minor_CN_effect_size")
  comparison_list = list(set1_and_set2_combined_segs, set1_vs_set2_CN_comparison, set1_vs_set2_CN_comparison_FDR, sig_diff_proportions)
})

library(heatmaply)

plot_sig_and_graded_effect_size = function(comparison, sig_threshold=0.05, effect_size_upper_range=2, effect_size_threshold=0, pal=unique(YlGnBu(6000))) {
  segs = comparison[[1]]
  FDR_pvals = comparison[[3]]
  total_effect_size = abs(comparison[[2]][,4] - comparison[[2]][,5])
  major_effect_size = abs(comparison[[2]][,6] - comparison[[2]][,7])
  minor_effect_size = abs(comparison[[2]][,8] - comparison[[2]][,9])
  
  chr_lengths = sapply(split(segs, seqnames(segs)), function(x) { max(end(x)) })
  chr_breaks = c(0, cumsum(as.numeric(chr_lengths)))
  names(chr_breaks) = c(names(chr_lengths), "end")
  xmax = chr_breaks["end"]
  plot(1, type = "n", xlab = "", ylab = "", xlim = c(1, xmax), ylim = c(0, 3), xaxt='n', xaxs='i', yaxt='n', yaxs='i', axes=FALSE)
  par(xpd=FALSE)
  par(oma=c(0,0,0,0))
  xlab_x = sapply(2:length(chr_breaks), function(end_i) { chr_breaks[end_i - 1] + ((chr_breaks[end_i] - chr_breaks[end_i - 1]) / 2) })
  n = length(xlab_x)
  axis(1, at=xlab_x[seq(1,n,2)], labels=names(chr_lengths)[seq(1,n,2)], cex.axis=2, lwd.ticks=0, lwd=0)
  axis(1, at=xlab_x[seq(2,n,2)], labels=names(chr_lengths)[seq(2,n,2)], cex.axis=2, lwd.ticks=0, lwd=0)
  polygon(c(0, max(chr_breaks), max(chr_breaks), 0), c(2.1, 2.1, 2.9, 2.9), col="#DADADA", border=NA)
  polygon(c(0, max(chr_breaks), max(chr_breaks), 0), c(1.1, 1.1, 1.9, 1.9), col="#DADADA", border=NA)
  polygon(c(0, max(chr_breaks), max(chr_breaks), 0), c(0.1, 0.1, 0.9, 0.9), col="#DADADA", border=NA)
  max_effect_size = max(c(total_effect_size, major_effect_size, minor_effect_size), na.rm=T)
  if (max_effect_size < effect_size_upper_range) {
    effect_size_upper_range = max_effect_size
  }
  for (segment_i in seq_along(segs)) {
    seg = segs[segment_i]
	tot_pval = FDR_pvals[segment_i,1]
	maj_pval = FDR_pvals[segment_i,2]
	min_pval = FDR_pvals[segment_i,3]
    tot_es = total_effect_size[segment_i]
	maj_es = major_effect_size[segment_i]
	min_es = minor_effect_size[segment_i]
	xstart = start(seg) + chr_breaks[as.character(seqnames(seg))]
	xend = end(seg) + chr_breaks[as.character(seqnames(seg))]
	
	draw_ramped_effect_size_if_sig(tot_pval, tot_es, xstart, xend, 2.1, 2.9, sig_threshold, effect_size_upper_range, pal=pal)
	draw_ramped_effect_size_if_sig(maj_pval, maj_es, xstart, xend, 1.1, 1.9, sig_threshold, effect_size_upper_range, pal=pal)
	draw_ramped_effect_size_if_sig(min_pval, min_es, xstart, xend, 0.1, 0.9, sig_threshold, effect_size_upper_range, pal=pal)
  }
  for (cb in chr_breaks) {
    lines(x=c(cb,cb), y=c(0,3), col="#040404")
  }
}


# plot effect size of significantly different segments for each pairwise comparison
png("Sherlock/New_Histology/2025/Figures/Fig3/NSDA_SD_pos_neg_sig_and_effect_graded_Greens.png", width=9000, height=1000, res=300)
plot_sig_and_graded_effect_size(comparison_lists[[1]], pal=unique(Greens(6000)))
dev.off()
png("Sherlock/New_Histology/2025/Figures/Fig3/NSDB_SD_pos_neg_sig_and_effect_graded_Greens.png", width=9000, height=1000, res=300)
plot_sig_and_graded_effect_size(comparison_lists[[2]], pal=unique(Greens(6000)))
dev.off()
png("Sherlock/New_Histology/2025/Figures/Fig3/NSDA_NSDB_pos_neg_sig_and_effect_graded_Greens.png", width=9000, height=1000, res=300)
plot_sig_and_graded_effect_size(comparison_lists[[3]], pal=unique(Greens(6000)))
dev.off()


# get copy number of KRAS +/- samples
kras_pos = mergedseg[mergedseg$ID == "dMut_KRAS","Tumour_Name"]
kras_neg = setdiff(unique(mergedseg$Tumour_Name), kras_pos)
kras_pos_neg_seg_CN = lapply(list(kras_pos, kras_neg), get_combined_segs_CN)

# plot mean copy number for KRAS + vs - samples
fn = "Sherlock/New_Histology/2025/Figures/Fig5/KRAS_pos_neg_mean_seg_tot_and_min_CN_large.png"
png(fn, width=9000, height=3600, res=300)
plot_seg_CN_nSets(kras_pos_neg_seg_CN, set_cols = c("#E48C19AA","#08189EAA","#FFD23FAA","#332A2C"))
dev.off()

# get copy number of EGFR +/- samples
egfr_pos = mergedseg[mergedseg$ID == "dMut_EGFR","Tumour_Name"]
egfr_neg = setdiff(unique(mergedseg$Tumour_Name), egfr_pos)
egfr_pos_neg_seg_CN = lapply(list(egfr_pos, egfr_neg), get_combined_segs_CN)

# plot mean copy number for EGFR + vs - samples
fn = "Sherlock/New_Histology/2025/Figures/Fig5/EGFR_pos_neg_mean_seg_tot_and_min_CN_large.png"
png(fn, width=9000, height=3600, res=300)
plot_seg_CN_nSets(egfr_pos_neg_seg_CN, set_cols = c("#E48C19AA","#08189EAA","#FFD23FAA","#332A2C"))
dev.off()