SherlockLung-EvolutionaryTrajectory-Analysis / Analyses / metric_boxplots.R
metric_boxplots.R
Raw
library(GenomicRanges)
library(ggplot2)
library(reshape2)

landscape = read.csv("path/to/data/sherlock_landscape_matrix.csv", header=T, stringsAsFactors=F)
rownames(landscape) = landscape$Tumor_Barcode
wgd_status = read.table("path/to/data/SherlockLung_WGDStatus.txt", header=F, stringsAsFactors=F, sep='\t')
rownames(wgd_status) = wgd_status[,1]
load("path/to/data/SherlockLung_AllHQ_LUAD_2025_sample_groups_G3.Rdata")

# by default, sample groups are ordered (arbitrarily) by number of samples
# re-order so that they are 1. NSD-A, 2. NSD-B, 3. SD
sg = list(sample_groups[[2]], sample_groups[[3]], sample_groups[[1]])
sg = lapply(sg, function(x) { gsub("01_01", "01.01", x) })
sample_groups = sg

# add de novo trajectory information to landscape matrix
dn_landscape = rbind(cbind(landscape[sample_groups[[1]],], `DN_group`="NSD-A"), cbind(landscape[sample_groups[[2]],], `DN_group`="NSD-B"), cbind(landscape[sample_groups[[3]],], `DN_group`="SD"))

all_sc = list.files("path/to/data/subclones_files/", full.names=T)
# Update PGA calculations
dn_pgaInfo = t(sapply(unlist(sample_groups), function(bc) {
    bc = gsub('01_01', '01.01', bc)
    x = all_sc[grep(bc, all_sc)]
    scMat = read.delim(x)
	sample_wgd_status = wgd_status[wgd_status[,1] == bc,2]
	if (sample_wgd_status == "nWGD") { normal_mcn = 1 } else { normal_mcn = 2 }	
    
	scGR = GRanges(seqnames=scMat$chr, ranges=IRanges(start=scMat$startpos, end=scMat$endpos), majCN=scMat$nMaj1_A, minCN=scMat$nMin1_A, frac1_A=scMat$frac1_A, majCN2=scMat$nMaj2_A, minCN2=scMat$nMin2_A, frac2_A=scMat$frac2_A)
    
	aber1 = scGR$majCN != normal_mcn | scGR$minCN != normal_mcn
    aber1[is.na(aber1)] = F
    aber2 = scGR$majCN2 != normal_mcn | scGR$minCN2 != normal_mcn
    aber2[is.na(aber2)] = F
    aber = aber1 | aber2
    
	clonalAber = aber & scGR$frac1_A == 1
    clonalAber[is.na(clonalAber)] = F
		
	PGA_clonal = sum(width(scGR[clonalAber])) / sum(width(scGR))
	PGA = sum(width(scGR[aber])) / sum(width(scGR))
		
    PGA
}))
names(dn_pgaInfo) = unlist(sample_groups)

dn_landscape[,"PGA"] = dn_pgaInfo[dn_landscape$Tumor_Barcode]


###### All LUAD ###########

# SNV 

options(scipen=999)
p <- ggplot(dn_landscape, aes(x=DN_group, y=SNV, fill=DN_group), show.legend=FALSE) +
    geom_boxplot() +
    theme_bw(base_family = "Roboto Condensed") +
    theme(legend.position="none", panel.border=element_rect(linewidth = 1.25, fill=NA), axis.ticks.x=element_blank(), axis.ticks.y=element_blank(), text = element_text(size=20), axis.text.x = element_text(size=20), axis.text.y = element_text(size=18)) +
    scale_fill_manual(values=c("#8adbf590","#a371cb90","#5a5a6990")) + 
	scale_x_discrete(labels=c("NSD-A","NSD-B","SD")) +
	scale_y_continuous(trans='log10', minor_breaks=rep(1:9, 21)*(10^rep(-10:10, each=9))) +
	xlab('') +
	ylab('SNV (log scale)')

p
ggsave(file='path/to/figures/Fig3/DN_SNV_LUAD_boxplot_bluePurpleGrey_slim.png',plot = p,width = 5.25,height = 7,device = "png")
options(scipen=0)

# PGA

p <- ggplot(dn_landscape, aes(x=DN_group, y=PGA, fill=DN_group), show.legend=FALSE) +
    geom_boxplot() +
    theme_bw(base_family = "Roboto Condensed") +
    theme(legend.position="none", panel.border=element_rect(linewidth = 1.25, fill=NA), axis.ticks.x=element_blank(), axis.ticks.y=element_blank(), text = element_text(size=20), axis.text.x = element_text(size=20), axis.text.y = element_text(size=18)) +
    scale_fill_manual(values=c("#8adbf590","#a371cb90","#5a5a6990")) + 
    scale_x_discrete(labels=c("NSD-A","NSD-B","SD")) +
	scale_y_continuous() +
	xlab('') +
	ylab('PGA')

p
ggsave(file='path/to/figures/Fig3/DN_PGA_LUAD_boxplot_bluePurpleGrey_slim.png',plot = p,width = 5.25,height = 7,device = "png")

# Ploidy

p <- ggplot(dn_landscape, aes(x=DN_group, y=Tumor_Ploidy, fill=DN_group), show.legend=FALSE) +
    geom_boxplot() +
    theme_bw(base_family = "Roboto Condensed") +
    theme(legend.position="none", panel.border=element_rect(linewidth = 1.25, fill=NA), axis.ticks.x=element_blank(), axis.ticks.y=element_blank(), text = element_text(size=20), axis.text.x = element_text(size=20), axis.text.y = element_text(size=18)) +
    scale_fill_manual(values=c("#8adbf590","#a371cb90","#5a5a6990")) + 
    scale_x_discrete(labels=c("NSD-A","NSD-B","SD")) +
	scale_y_continuous() +
	xlab('') +
	ylab('Ploidy')

p
ggsave(file='path/to/figures/Fig3/DN_Ploidy_LUAD_boxplot_bluePurpleGrey_slim.png',plot = p,width = 5.25,height = 7,device = "png")

# SV

p <- ggplot(dn_landscape, aes(x=DN_group, y=SV, fill=DN_group), show.legend=FALSE) +
    geom_boxplot() +
    theme_bw(base_family = "Roboto Condensed") +
    theme(legend.position="none", panel.border=element_rect(linewidth = 1.25, fill=NA), axis.ticks.x=element_blank(), axis.ticks.y=element_blank(), text = element_text(size=20), axis.text.x = element_text(size=20), axis.text.y = element_text(size=18)) +
    scale_fill_manual(values=c("#8adbf590","#a371cb90","#5a5a6990")) + 
    scale_x_discrete(labels=c("NSD-A","NSD-B","SD")) +
	scale_y_continuous() +
	xlab('') +
	ylab('SV')

p
ggsave(file='path/to/figures/Fig3/DN_SV_LUAD_boxplot_bluePurpleGrey_slim.png',plot = p,width = 5.25,height = 7,device = "png")

# Latency

lat = read.table("Sherlock/Latency_data.txt", header=T, stringsAsFactors = F, sep='\t')
dn_latencies = lapply(split(dn_landscape[,"Tumor_Barcode"], dn_landscape[,"DN_group"]), function(bc) { lat[lat[,1] %in% bc,"Latency"] })
dn_ns_latencies = lapply(split(dn_landscape[dn_landscape$Smoking_SBS4 == "NonSmoker_NonSBS4","Tumor_Barcode"], dn_landscape[dn_landscape$Smoking_SBS4 == "NonSmoker_NonSBS4","DN_group"]), function(bc) { lat[lat[,1] %in% bc,"Latency"] })

dn_ns_latency_df <- melt(dn_ns_latencies)
dn_latency_df <- melt(dn_latencies)
colnames(dn_latency_df) <- colnames(dn_ns_latency_df) <- c("Latency","Trajectory")

p <- ggplot(dn_latency_df, aes(x=Trajectory, y=Latency, fill=Trajectory), show.legend=FALSE) +
    geom_boxplot() +
    theme_bw(base_family = "Roboto Condensed") +
    theme(legend.position="none", panel.border=element_rect(linewidth = 1.25, fill=NA), axis.ticks.x=element_blank(), axis.ticks.y=element_blank(), text = element_text(size=20), axis.text.x = element_text(size=20), axis.text.y = element_text(size=18)) +
    scale_fill_manual(values=c("#8adbf590","#a371cb90","#5a5a6990")) +
    scale_x_discrete(labels=c("NSD-A","NSD-B","SD")) +
	scale_y_continuous() +
	xlab('') +
	ylab('Latency (years)')

p
ggsave(file='path/to/figures/Fig3/DN_Latency_LUAD_boxplot_slim.png',plot = p,width = 5.25,height = 7,device = "png")


snv_list = split(dn_landscape[,"SNV"], dn_landscape[,"DN_group"])
p.adjust(c(wilcox.test(snv_list[[1]], snv_list[[2]])$p.value, wilcox.test(snv_list[[2]], snv_list[[3]])$p.value, wilcox.test(snv_list[[2]], snv_list[[3]])$p.value), method="fdr")


###### NS-LUAD only ###########

# SNV 

options(scipen=999)
p <- ggplot(dn_landscape[dn_landscape$Smoking_SBS4 == "NonSmoker_NonSBS4",], aes(x=DN_group, y=SNV, fill=DN_group), show.legend=FALSE) +
    geom_boxplot() +
    theme_bw(base_family = "Roboto Condensed") +
    theme(legend.position="none", panel.border=element_rect(linewidth = 1.25, fill=NA), axis.ticks.x=element_blank(), axis.ticks.y=element_blank(), text = element_text(size=20), axis.text.x = element_text(size=20), axis.text.y = element_text(size=18)) +
    scale_fill_manual(values=c("#8adbf590","#a371cb90","#5a5a6990")) + 
	scale_x_discrete(labels=c("NSD-A","NSD-B","SD")) +
	scale_y_continuous(trans='log10', minor_breaks=rep(1:9, 21)*(10^rep(-10:10, each=9))) +
	xlab('') +
	ylab('SNV (log scale)')

p
ggsave(file='path/to/figures/Fig3/DN_SNV_NS_LUAD_boxplot_bluePurpleGrey_slim.png',plot = p,width = 5.25,height = 7,device = "png")
options(scipen=0)

# PGA

p <- ggplot(dn_landscape[dn_landscape$Smoking_SBS4 == "NonSmoker_NonSBS4",], aes(x=DN_group, y=PGA, fill=DN_group), show.legend=FALSE) +
    geom_boxplot() +
    theme_bw(base_family = "Roboto Condensed") +
    theme(legend.position="none", panel.border=element_rect(linewidth = 1.25, fill=NA), axis.ticks.x=element_blank(), axis.ticks.y=element_blank(), text = element_text(size=20), axis.text.x = element_text(size=20), axis.text.y = element_text(size=18)) +
    scale_fill_manual(values=c("#8adbf590","#a371cb90","#5a5a6990")) + 
    scale_x_discrete(labels=c("NSD-A","NSD-B","SD")) +
	scale_y_continuous() +
	xlab('') +
	ylab('PGA')

p
ggsave(file='path/to/figures/Fig3/DN_PGA_NS_LUAD_boxplot_bluePurpleGrey_slim.png',plot = p,width = 5.25,height = 7,device = "png")

# Ploidy

p <- ggplot(dn_landscape[dn_landscape$Smoking_SBS4 == "NonSmoker_NonSBS4",], aes(x=DN_group, y=Tumor_Ploidy, fill=DN_group), show.legend=FALSE) +
    geom_boxplot() +
    theme_bw(base_family = "Roboto Condensed") +
    theme(legend.position="none", panel.border=element_rect(linewidth = 1.25, fill=NA), axis.ticks.x=element_blank(), axis.ticks.y=element_blank(), text = element_text(size=20), axis.text.x = element_text(size=20), axis.text.y = element_text(size=18)) +
    scale_fill_manual(values=c("#8adbf590","#a371cb90","#5a5a6990")) + 
    scale_x_discrete(labels=c("NSD-A","NSD-B","SD")) +
	scale_y_continuous() +
	xlab('') +
	ylab('Ploidy')

p
ggsave(file='path/to/figures/Fig3/DN_Ploidy_NS_LUAD_boxplot_bluePurpleGrey_slim.png',plot = p,width = 5.25,height = 7,device = "png")

# SV

p <- ggplot(dn_landscape[dn_landscape$Smoking_SBS4 == "NonSmoker_NonSBS4",], aes(x=DN_group, y=SV, fill=DN_group), show.legend=FALSE) +
    geom_boxplot() +
    theme_bw(base_family = "Roboto Condensed") +
    theme(legend.position="none", panel.border=element_rect(linewidth = 1.25, fill=NA), axis.ticks.x=element_blank(), axis.ticks.y=element_blank(), text = element_text(size=20), axis.text.x = element_text(size=20), axis.text.y = element_text(size=18)) +
    scale_fill_manual(values=c("#8adbf590","#a371cb90","#5a5a6990")) + 
    scale_x_discrete(labels=c("NSD-A","NSD-B","SD")) +
	scale_y_continuous() +
	xlab('') +
	ylab('SV')

p
ggsave(file='path/to/figures/Fig3/DN_SV_NS_LUAD_boxplot_bluePurpleGrey_slim.png',plot = p,width = 5.25,height = 7,device = "png")

# Latency

p <- ggplot(dn_ns_latency_df, aes(x=Trajectory, y=Latency, fill=Trajectory), show.legend=FALSE) +
    geom_boxplot() +
    theme_bw(base_family = "Roboto Condensed") +
    theme(legend.position="none", panel.border=element_rect(linewidth = 1.25, fill=NA), axis.ticks.x=element_blank(), axis.ticks.y=element_blank(), text = element_text(size=20), axis.text.x = element_text(size=20), axis.text.y = element_text(size=18)) +
    scale_fill_manual(values=c("#8adbf590","#a371cb90","#5a5a6990")) +
    scale_x_discrete(labels=c("NSD-A","NSD-B","SD")) +
	scale_y_continuous() +
	xlab('') +
	ylab('Latency (years)')

p
ggsave(file='path/to/figures/Fig3/DN_NS_Latency_LUAD_boxplot_slim.png',plot = p,width = 5.25,height = 7,device = "png")


# statistical comparisons

latency_vec = unlist(sapply(rownames(dn_landscape), function(bc) { lat[lat[,1] == bc,"Latency"] }))
dn_landscape = cbind(dn_landscape, `Latency`=latency_vec[rownames(dn_landscape)])

comparitors = c("SNV", "PGA", "Tumor_Ploidy", "SV", "Latency", "Kataegis_Level", "Survival_Week", "Age", "NRPCC", "Tumor_Purity")

library(rstatix)
get_comparison_table = function(landscape_matrix, comparitors) {
  comp_tab = t(sapply(comparitors, function(comparitor) {
    splitty = split(landscape_matrix[,comparitor], landscape_matrix[,"DN_group"])
    means = c(mean(splitty[["NSD-A"]], na.rm=T), mean(splitty[["NSD-B"]], na.rm=T), mean(splitty[["SD"]], na.rm=T))
    medians = c(median(splitty[["NSD-A"]], na.rm=T), median(splitty[["NSD-B"]], na.rm=T), median(splitty[["SD"]], na.rm=T))
    AB_p = wilcox.test(splitty[["NSD-A"]], splitty[["NSD-B"]])$p.value
    AS_p = wilcox.test(splitty[["NSD-A"]], splitty[["SD"]])$p.value
    BS_p = wilcox.test(splitty[["NSD-B"]], splitty[["SD"]])$p.value
    out_row = c(means, medians, AB_p, AS_p, BS_p)
    names(out_row) = c("NSD-A_mean", "NSD-B_mean", "SD_mean", "NSD-A_median", "NSD-B_median", "SD_median", "A_B_p", "A_S_p", "B_S_p")
    out_row
  }))
  comp_tab = cbind(comp_tab, `A_B_fdr`=p.adjust(comp_tab[,"A_B_p"], method="fdr"), `A_S_fdr`=p.adjust(comp_tab[,"A_S_p"], method="fdr"), `B_S_fdr`=p.adjust(comp_tab[,"B_S_p"], method="fdr"))
  comp_tab
}

comparison_table_all_LUAD = get_comparison_table(dn_landscape, comparitors)
comparison_table_NS_LUAD = get_comparison_table(dn_landscape[dn_landscape$Smoking_SBS4 == "NonSmoker_NonSBS4",], comparitors)
comparison_table_S_LUAD = get_comparison_table(dn_landscape[dn_landscape$Smoking_SBS4 == "Smoker_SBS4",], comparitors)

write.table(comparison_table_all_LUAD, file='Sherlock/New_Histology/2025/Figures/Fig3/All_LUAD_metric_table.txt', quote=FALSE, col.names=TRUE, row.names=TRUE, sep='\t')
write.table(comparison_table_NS_LUAD, file='Sherlock/New_Histology/2025/Figures/Fig3/SBS4negative_NS_LUAD_metric_table.txt', quote=FALSE, col.names=TRUE, row.names=TRUE, sep='\t')
write.table(comparison_table_S_LUAD, file='Sherlock/New_Histology/2025/Figures/Fig3/SBS4positive_S_LUAD_metric_table.txt', quote=FALSE, col.names=TRUE, row.names=TRUE, sep='\t')