tumour_set = "SherlockLung_LUAD_2025_DN3"
input_dir = paste0("path/to/data/", tumour_set, "/")
driver_file = "path/to/data/sherlock_driver_symbols.txt"
cyto_file = "path/to/data/cytoBand_hg38.txt"
assembly = "hg38"
PL_method = "PlackettLuce"
wgd_file = "path/to/data/SherlockLung_WGDStatus.txt"
including_wgd = TRUE
including_mrca = FALSE
output_dir = "path/to/figures/Fig1/"
horizontal = TRUE
cat(paste0("Running with the following variable values:
tumour_set = ", tumour_set, "
input_dir = ", input_dir, "
driver_file = ", driver_file, "
cyto_file = ", cyto_file, "
assembly = ", assembly, "
PL_method = ", PL_method, "
wgd_file = ", wgd_file, "
including_wgd = ", including_wgd, "
including_mrca = ", including_mrca, "
"))
### INITIALISATION ###
library(GenomicRanges)
library(magrittr)
library(hrbrthemes)
library(extrafont)
library(ggplot2)
library(gtools)
geom_stripes <- function(mapping = NULL,
data = NULL,
stat = "identity",
position = "identity",
...,
show.legend = NA,
inherit.aes = TRUE) {
ggplot2::layer(
data = data,
mapping = mapping,
stat = stat,
geom = GeomStripes,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(...)
)
}
GeomStripes <- ggplot2::ggproto("GeomStripes", ggplot2::Geom,
required_aes = c("y"),
default_aes = ggplot2::aes(
ymin = -Inf, ymax = Inf,
#odd = "#22222222", even = "#00000000",
odd = "#99999911", even = "#FFFFFF00",
# Change 'size' below from 0 to NA.
# When not NA then when *printing in pdf device* borders are there despite
# requested 0th size. Seems to be some ggplot2 bug caused by grid overriding
# an lwd parameter somewhere, unless the size is set to NA. Found solution here
# https://stackoverflow.com/questions/43417514/getting-rid-of-border-in-pdf-output-for-geom-label-for-ggplot2-in-r
alpha = NA, colour = "black", linetype = "solid", size = NA
),
# draw_key = ggplot2::draw_key_blank,
draw_key = ggplot2::draw_key_rect,
draw_panel = function(data, panel_params, coord) {
ggplot2::GeomRect$draw_panel(
data %>%
dplyr::mutate(
x = round(.data$x),
xmin = .data$x - 0.5,
xmax = .data$x + 0.5
) %>%
dplyr::select(
.data$ymin, .data$ymax,
.data$xmin, .data$xmax,
.data$odd, .data$even,
.data$alpha, .data$colour, .data$linetype, .data$size
) %>%
unique() %>%
dplyr::arrange(.data$xmin) %>%
dplyr::mutate(
.n = dplyr::row_number(),
fill = dplyr::if_else(
.data$.n %% 2L == 1L,
true = .data$odd,
false = .data$even
)
) %>%
dplyr::select(-.data$.n, -.data$odd, -.data$even),
panel_params,
coord
)
}
)
cyto = read.table(cyto_file, stringsAsFactors=FALSE, header=FALSE)
cyto_gr = GRanges(seqnames=gsub("chr", "", cyto[,1]), ranges=IRanges(start=cyto[,2], end=cyto[,3]), cytoband=cyto[,4])
### GENERATE PLOTS ###
G = 1
mergedseg=read.table(paste0(input_dir, "/", tumour_set,"_mergedseg_G1.txt"),header=T,stringsAsFactors = F)
matrix_coresp=read.table(paste0(input_dir, "/", tumour_set,"_matrix_coresp_G1.txt"),header=T,stringsAsFactors = F)
matrix_out=read.table(paste0(input_dir, "/", tumour_set,"_matrix_out_G1.txt"),header=T,stringsAsFactors = F)
cnai = which(mergedseg$CNA %in% c("dGain","dLOH","dHD"))
mergedseg$ID[cnai] = sapply(cnai, function(i) { paste0(gsub("d", "", mergedseg$CNA[i]), "_", mergedseg$ID[i]) })
if (wgd_file != "none") {
wgd_status = read.table(wgd_file, stringsAsFactors=FALSE)
all_wgd_samples = wgd_status[wgd_status[,2] == "WGD",1]
wgd_samples = list(intersect(unique(mergedseg$Tumour_Name), all_wgd_samples))
} else {
wgd_samples = list(unique(mergedseg[mergedseg$tumour_ploidy > 2,'Tumour_Name']))
}
if (length(wgd_samples[[1]]) == 0) {
including_wgd = FALSE
print("No WGD samples found - will not attempt to include WGD")
}
event.id=unique(mergedseg[,c("ID","noevent","CNA")])
if (including_mrca) {
event.id=rbind(data.frame(ID="MRCA",noevent=(length(unique(mergedseg$ID))+1),CNA="cMRCA",stringsAsFactors = F),event.id)
}
if (including_wgd) {
event.id=rbind(data.frame(ID="Whole_Genome_Duplication",noevent=(length(unique(mergedseg$ID))+1),CNA="cWGD",stringsAsFactors = F),event.id)
}
event.id$cna=base::substring(event.id$CNA,first = 2)
event.id=unique(event.id[,c("ID","noevent","cna")])
names(matrix_coresp)=c("noevent","event")
if (PL_method == "PLMIX") {
matrix_coresp$event=paste("p",matrix_coresp$event,sep = "_")
}
coresp=merge(event.id,matrix_coresp,"noevent")
values <- reshape2::melt(matrix_out,id.vars=NULL)
values$Var1=NULL
names(values)=c("event","value")
values$event = gsub("X","",values$event)
out <- merge(values,coresp,"event")
PLOTDF=data.frame()
unoevent=unique(out$noevent)
for (i in 1:length(unique(out$noevent))){
event=out[out$noevent==unoevent[i],]
event=event[order(event$value),]
event=event[26:975,]
PLOTDF=rbind(PLOTDF,event)
}
all_event_i = unique(PLOTDF$event)
event_medians = sapply(all_event_i, function(event_i) {
median(PLOTDF[PLOTDF$event == event_i,'value'])
})
PLOTDF$ID=factor(PLOTDF$ID,levels=unique(PLOTDF$ID)[order(event_medians, decreasing=T)])
unique_IDs = unique(mergedseg$ID)
samples_with_each_event = lapply(unique_IDs, function(ID) {
unique(mergedseg[mergedseg$ID == ID,'Tumour_Name'])
})
if (including_wgd) {
samples_with_each_event = append(samples_with_each_event, wgd_samples)
names(samples_with_each_event) = c(unique_IDs, "WGD")
} else {
names(samples_with_each_event) = unique_IDs
}
unique_samples = unique(mergedseg$Tumour_Name)
n_samples = length(unique_samples)
if (including_mrca) {
new_names = c(names(samples_with_each_event), "MRCA")
samples_with_each_event = append(samples_with_each_event, list(unique_samples))
names(samples_with_each_event) = new_names
}
event_frequency = sapply(samples_with_each_event, function(x) { round((length(x) / n_samples) * 100, 1) })
names(event_frequency) = gsub("WGD", "Whole_Genome_Duplication", names(event_frequency))
event_frequency = event_frequency[levels(PLOTDF$ID)]
if (driver_file != "none") {
if (endsWith(driver_file, ".tsv")) {
all_drivers = unique(read.table(driver_file, header=T, sep='\t', stringsAsFactors=FALSE)$SYMBOL)
} else {
all_drivers = unique(readLines(driver_file))
}
}
if (use_remote_db) {
if (driver_file != "none") {
driver_df = biomaRt::getBM(attributes = c("chromosome_name", "start_position", "end_position", "hgnc_symbol"),
mart = thisMart,
filters = "hgnc_symbol",
values = all_drivers)
} else {
driver_df = c()
}
driver_gr = GRanges(seqnames=driver_df[,1], ranges=IRanges(start=driver_df[,2], end=driver_df[,3]), symbol=driver_df[,4])
} else {
if (assembly == "hg38") {
load("code/all_gene_gr_hg38.Rdata")
} else {
load("code/all_gene_gr_hg19.Rdata")
}
driver_gr = all_gene_gr[all_gene_gr$symbol %in% all_drivers]
}
levels(PLOTDF$ID) = sapply(levels(PLOTDF$ID), function(id) {
text_to_add = ""
if (length(grep("_chr", id)) > 0) {
locus_split = strsplit(id, "_")[[1]]
locus_gr = GRanges(seqnames=gsub(".*chr", "", locus_split[2]), ranges=IRanges(start=as.numeric(locus_split[3]), end=as.numeric(locus_split[4])))
ol = suppressWarnings(findOverlaps(locus_gr, driver_gr))
if (length(ol) > 0) {
text_to_add = paste0(" (", paste(driver_gr$symbol[subjectHits(ol)], collapse=", "), ")")
}
}
paste0(id, text_to_add)
})
levels(PLOTDF$ID) = gsub("dMut_", "", levels(PLOTDF$ID))
levels(PLOTDF$ID) = gsub("Whole_Genome_Duplication", "WGD", levels(PLOTDF$ID))
names(event_frequency) = levels(PLOTDF$ID)
if (including_wgd) {
WGD_linepos = -log10(sort(event_medians, decreasing=T)[which(levels(PLOTDF$ID) == "WGD")])
}
levels(PLOTDF$ID) = paste0(levels(PLOTDF$ID), " [", format(event_frequency, nsmall=1), "%]")
cna_id_is = grep("_chr", levels(PLOTDF$ID))
ids = sapply(strsplit(levels(PLOTDF$ID)[cna_id_is], " "), function(x) { paste(strsplit(x[1], "_")[[1]][2:4], collapse="_") })
seg_mat = Reduce(rbind, strsplit(ids, "_"))
seg_gr = GRanges(seqnames=gsub("chr", "", seg_mat[,1]), ranges=IRanges(start=as.numeric(seg_mat[,2]), end=as.numeric(seg_mat[,3])), id=ids)
chr_cbs = lapply(split(cyto, cyto[,1]), function(x) {
list(`p`=x[grep("p", x[,4]),4], `q`=x[grep("q", x[,4]),4])
})
names(chr_cbs) = gsub("chr", "", names(chr_cbs))
ov = findOverlaps(seg_gr, cyto_gr)
cyto_ids = sapply(unique(queryHits(ov)), function(qhi) {
cbs = mcols(cyto_gr)[subjectHits(ov)[queryHits(ov) == qhi],'cytoband']
chr = as.vector(seqnames(cyto_gr)[subjectHits(ov)[queryHits(ov) == qhi]])[1]
if (length(cbs) > 1) {
arm = substr(cbs[1], 1, 1)
if (cbs[1] == chr_cbs[[chr]][[arm]][1] && cbs[length(cbs)] == chr_cbs[[chr]][[arm]][length(chr_cbs[[chr]][[arm]])]) {
paste0(chr, arm)
} else {
last_cb = cbs[length(cbs)]
last_cb = substr(last_cb, 2, nchar(last_cb))
paste0(chr, cbs[1], "-", last_cb)
}
} else {
paste0(chr, cbs[1])
}
})
levels(PLOTDF$ID)[cna_id_is] = sapply(1:length(cna_id_is), function(cna_i) {
cna_id_i = cna_id_is[cna_i]
splitty = strsplit(levels(PLOTDF$ID)[cna_id_i], " ")[[1]]
part1 = strsplit(splitty[1], "_")[[1]][1]
part2 = cyto_ids[cna_i]
part3 = paste(splitty[2:length(splitty)], collapse=" ")
paste0(part1, ": ", part2, " ", part3)
})
library(tidyverse)
tdata = as_tibble(PLOTDF)
min_freq <- 3
tdata <- tdata %>%
arrange(ID) %>%
mutate(freq=str_remove(str_remove(ID,"^.*\\["),"%\\].*")) %>%
mutate(freq=as.numeric(freq)) %>%
filter(freq>=min_freq)
# filtering by minimum frequency may get rid of WGD
if (length(grep("WGD", unique(tdata$ID))) == 0) {
including_wgd = FALSE
}
tdata$plotpos = -log10(tdata$value)
tdata$plotpos = (tdata$plotpos - min(tdata$plotpos)) + 0.11
if (including_wgd) {
WGD_linepos <- tdata %>% filter(str_detect(ID,'WGD')) %>% select(ID,value,plotpos) %>% group_by(ID) %>% summarise(pos=median(plotpos)) %>% pull(pos)
} else {
WGD_linepos <- -log10(median(event_medians))
}
maxpos <- max(tdata$plotpos)
midpoint <- median(tdata$plotpos)
tdata2 <- tdata %>%
group_by(ID) %>%
summarise(mvalue=median(plotpos),max=(max(plotpos)),min=(min(plotpos))) %>%
mutate(plotpos=if_else(mvalue>midpoint,min,max),cna='LOH') %>%
mutate(ID2=str_remove(ID,' \\[.*')) %>%
mutate(ID2=str_remove(ID2,'^.*: '))
tdata2.1 <- tdata2 %>% filter(mvalue>midpoint)
tdata2.2 <- tdata2 %>% filter(mvalue<=midpoint)
tdata2.3 <- tdata %>% group_by(ID,freq) %>% dplyr::slice(1) %>% ungroup()
tdata2.3$freq <- tdata2.3$freq / 1000
event_type_colours = c(`Mut`="#30C358",`Gain`="#E60751",`LOH`="#398BE3",`HD`="#664499",`WGD`="#FFA500", `MRCA`="#555555")
event_type_colours = event_type_colours[names(event_type_colours) %in% tdata$cna]
event_type_labels = c(`Mut`="Mutation",`Gain`="Gain",`LOH`="Loss",`HD`="Homozygous Deletion",`WGD`="WGD",`MRCA`="MRCA")
event_type_labels = event_type_labels[names(event_type_labels) %in% tdata$cna]
if(horizontal) {
gt_angle = 90
gt_vjust = 0.5
save_height = 7
save_width = 10
} else {
gt_angle = 0
gt_vjust=0.5
save_height = 10
save_width = 7
}
text_size <- 3.8
p2 <- tdata %>%
ggplot(aes(x = ID, y = plotpos, fill=cna)) +
geom_stripes(odd = "#11111106", even = "#00000000")+
geom_hline(yintercept=WGD_linepos, linetype='dashed', col = ifelse(including_wgd, 'gray70', rgb(1,1,1,0)),linewidth=0.25) +
geom_violin(color=NA) + #'black',size=0.001
geom_hline(yintercept=0, linetype='solid', col = 'gray50',linewidth=0.1) +
geom_hline(yintercept=0.05, linetype='dotted', col = 'gray50',linewidth=0.1) +
geom_hline(yintercept=0.1, linetype='dotted', col = 'gray25',linewidth=0.1) +
geom_bar(data=tdata2.3, aes(y=freq), width=2/3, color='black', linewidth = 0.001, stat="identity")+
geom_text(data=tdata2.3,aes(y=-0.02, label=paste0(format(round(freq*1000,1), nsmall=1),"%")),hjust=1,angle=gt_angle,size=text_size,family = 'Roboto Condensed' )+
stat_summary(fun = "median", geom = "crossbar", width = 0.4,linewidth=0.18, colour = "black", show.legend=FALSE) +
geom_text(data = tdata2.1,aes(label=ID2),vjust = 0.5,hjust=1,nudge_y = -0.05,size=text_size,angle=gt_angle,family = 'Roboto Condensed' )+
geom_text(data = tdata2.2,aes(label=ID2),vjust = 0.5,hjust=0,nudge_y = 0.05,size=text_size,angle=gt_angle,family = 'Roboto Condensed' )+
labs(x="",y="",fill="Event",size='Frequency (%)') +
scale_fill_manual(breaks=names(event_type_colours), values=event_type_colours, labels=event_type_labels) +
scale_size_binned(n.breaks = 8)+
guides(fill = guide_legend(byrow = TRUE,override.aes = list(size=2.5,pch=21)),size=guide_legend(override.aes = list(fill='gray50',col='black'))) +
theme_ipsum_rc(base_family="Roboto Condensed",ticks = FALSE,grid = FALSE,axis_title_just = 'm',axis_title_size = 14)+
theme(axis.text.x = element_blank(),axis.text.y = element_blank()) +
theme(legend.position="none") +
theme(plot.margin = unit(c(0, 0, 0, 0), "null")) +
scale_y_discrete(expand = c(0.1, 0.1))
dim1 = 6
dim2 = 13
if(horizontal) {
wid = dim2
hei = dim1
} else {
hei = dim2
wid = dim1
p2 <- p2 + coord_flip(clip = 'off')
}
cairo_pdf(paste0(output_dir, '/', tumour_set, "_PLPlotSlim.pdf"), width = wid,height = hei, family="Roboto Condensed")
p2
dev.off()
ggsave(file=paste0(output_dir, '/', tumour_set, "_PLPlotSlim.png") ,plot = p2,width = wid,height = hei,device = png())
dev.off()