### CLONALITY PREFERENCES
# This code follows through the ordering model code to the point of events being ordered as clonal (early/late/unspecified) or subclonal
# We then compare statistical preference for occurring clonally or subclonally
library(reshape2)
library(ggplot2)
var_defaults = c(
input_dir=".",
output_dir=".",
unobserved_events="notIncluded", #"end", # if model 2 is chosen (to include unobserved items in ordering), the unobserved events will be randomly shuffled and added either at the 'end' or 'interspersed'
losses="TRUE", # include loss events? Default=FALSE # using this will skew the position of WGD to far too early
PLmethod="PlackettLuce", # could also use PLmethod="PLMIX" if interested in detecting partitions/components only
G="1", # number of partitions - ONLY FOR "PLMIX"
add_mut="FALSE", # do you have driver mutations to add, if so, add_mut=T
genes="all", # top gene list to be included
gene_stratify="FALSE", # stratify samples by driver genes?
gene_for_stratification="TP53",
mut_file="driver_mut_file_for_timing.txt",
wgd_file="none"
)
### RUN THE FOLLOWING SECTION FOR EACH DE NOVO SUBSET (DN1, DN2, DN3)
name="SherlockLung_LUAD_2025_DN2"
basedir="path/to/data/"
args = c(name, paste0("input_dir=", basedir, name, "/"), paste0("output_dir=", basedir, name, "/"), "PLmethod=PlackettLuce", paste0("mut_file=", basedir, name, "/", name, "_driver_mut_file_for_timing.txt"), "wgd_file=path/to/data/SherlockLung_WGDStatus.txt", "G=1", "unobserved_events=notIncluded", "genes=all", "add_mut=TRUE")
# pass name of tumour set as first argument
tumour_set = args[1]
if (length(args) > 1) {
split_args = strsplit(args[2:length(args)], "=")
for (x in split_args) { assign(x[1], x[2]) }
}
for (x in names(var_defaults)) {
if (!exists(x)) {
assign(x, var_defaults[x])
}
}
if (!(unobserved_events %in% c("notIncluded", "end", "interspersed"))) {
stop('unobserved_events must be one of "notIncluded", "end", or "interspersed" (default "notIncluded")')
}
cat(paste0("Running with the following variable values:
tumour_set = ", tumour_set, "
input_dir = ", input_dir, "
output_dir = ", output_dir, "
unobserved_events = ", unobserved_events, "
losses = ", losses, "
PLmethod = ", PLmethod, "
G = ", G, "
add_mut = ", add_mut, "
genes = ", genes, "
gene_stratify = ", gene_stratify, "
gene_for_stratification = ", gene_for_stratification, "
mut_file = ", mut_file, "
wgd_file = ", wgd_file, "
"))
#model = as.numeric(model)
G = as.numeric(G)
losses = as.logical(losses)
add_mut = as.logical(add_mut)
gene_stratify = as.logical(gene_stratify)
genes = strsplit(gsub(" ", "", genes), ",")[[1]]
#library(PLmethod,character.only = T)
groups=tumour_set # Group name (only one at a time in this step) as used in previous postFDR steps 1-4
#### helper function - intersperse_unobserved ####
intersperse_unobserved <- function(observed, unobserved) {
# get total number of events
total_events = length(observed) + length(unobserved)
observed_i = sort(sample(total_events, length(observed)))
remaining_i = setdiff(1:total_events, observed_i)
unobserved_i = sample(remaining_i, length(remaining_i))
all_ordered = vector(mode="numeric", length=10)
all_ordered[observed_i] = observed
all_ordered[unobserved_i] = unobserved
all_ordered
}
#' Insert new item into vector with positional constraint
#'
#' Allows a new item to be inserted into a vector, maintining the order of the items before and after its insertion position, with a minimum index specified
#'
#' @param new_item the new element of the vector
#' @param vec the existing vector into which new_item should be inserted
#' @param first_possible the lowest index at which new_item should be allowed to be inserted into vec
#'
#' @return the new vector, containg all items in vec and new_item inserted at an index of at least first_possible
insert_constrained = function(new_item, vec, first_possible) {
# get vector position of new item, between first_possible and 1 higher than current vector length
insert_pos = sample(first_possible:(length(vec)+1), 1)
# if position of new item is 1, put it first, followed by all existing
if (insert_pos == 1) {
vec = c(new_item, vec)
# if position of new item is neither first nor last, put existing items before it first, then new item, then existing items after it
} else if (insert_pos <= length(vec)) {
vec = c(vec[1:(insert_pos-1)], new_item, vec[insert_pos:length(vec)])
# if position of new item is last, put all existing items, then new item
} else {
vec = c(vec, new_item)
}
vec
}
#' Adds homozygous deletion event numbers to a vector of other ordered events
#'
#' Inserts HD events into a vector of clonal events - If the vector contains a clonal LOH whose position encompasses an HD, the HD will be ordered after the LOH
#'
#' @param hd_vector a vector of homozygous deletion event numbers
#' @param ordered_vector a vector of other clonal event numbers
#' @param merged_seg the merged segments data.frame containing event information including colnames chr, startpos, endpos, and CNA
#'
#' @return the new vector of ordered events, containg HD events inserted at appropriate positions among the existing clonal events
add_hds_to_ordered_vector = function(hd_vector, ordered_vector, merged_seg) {
# IF we have clonal HDs:
if (length(hd_vector) > 0) {
# For each clonal HD:
for (HD_noevent in hd_vector) {
# If there are other pre-WGD events:
if (length(ordered_vector) > 0) {
# check for LOH segments which entirely encompass the HD locus
HD_locus = merged_seg[merged_seg[,"noevent"] == HD_noevent,c("chr", "startpos", "endpos")]
encompassing_i = which(sapply(ordered_vector, function(before_noevent) {
before_seg = merged_seg[merged_seg[,"noevent"] == before_noevent,]
before_seg["chr"] == HD_locus["chr"] && before_seg["startpos"] <= HD_locus["startpos"] && before_seg["endpos"] >= HD_locus["endpos"] & before_seg["CNA"] == "dLOH"
}))
# If any LOH segments have been found that encompass the HD, the HD must occur AFTER these
if (length(encompassing_i) > 0) {
ordered_vector = insert_constrained(HD_noevent, ordered_vector, max(encompassing_i)+1)
# Otherwise, it can occur anywhere
} else {
ordered_vector = insert_constrained(HD_noevent, ordered_vector, 1)
}
# If no other pre-WGD events, set this HD as the first one
} else {
ordered_vector = HD_noevent
}
}
}
ordered_vector
}
######INPUT DATA############################### ONLY ONE GROUP AT A TIME ####
cna_type=c("LOH","Gain","HD")
mergedseg=data.frame() # this will contain all CNA types with new noevent numbers assigned
MAX=0
for (cna in 1:length(cna_type)){
if(file.exists(paste0(input_dir, "/", groups,"_",cna_type[cna],"_mergedsegs.txt"))){
#paste0(groups[grp],"_",cna_type[cna],"_mergedsegs.txt")
CNA=read.table(paste0(input_dir, "/", groups,"_",cna_type[cna],"_mergedsegs.txt"),header=T,stringsAsFactors = F,sep="\t")
print(paste(paste0(input_dir, "/", groups,"_",cna_type[cna],"_mergedsegs.txt"),nrow(CNA)))
print(length(unique((CNA$noevent))))
CNA$noevent=CNA$noevent+MAX
print(range(CNA$noevent))
mergedseg=rbind(mergedseg,CNA)
MAX=max(CNA$noevent)
print(MAX)
}
}
mergedseg$ID=paste(paste0("chr",mergedseg$chr),mergedseg$startpos,mergedseg$endpos,sep="_")
mergedseg$no.chr.bearing.mut=NA # dummy for rbind/merging only
## ADD MUTATIONAL DRIVERS ####
if (add_mut){
mutations=read.table(mut_file,header=T,stringsAsFactors = F) #### NAP: need to import your own MUTATION TABLE ####
#mutations=mutations[!grepl("TCGA",mutations$ID),] # for NIG
#mutations=mutations[which(!is.na(match(mutations$ID,mergedseg$Tumour_Name))),] # for HER2
#func=c("frameshift deletion","frameshift insertion","frameshift substitution","nonsynonymous SNV","Splice_Site","stopgain")
mut_genes = gsub("[cs]Mut_", "", mutations$CNA)
mut_drivers=data.frame()
if (genes == "all") {
genes = unique(mut_genes)
}
for (i in 1:length(genes)){
MUTf=mutations[which(mut_genes==genes[i]),]
if (nrow(MUTf) > 0) {
#MUTf$noevent = MUTf$noevent + MAX
MUTf$noevent = MAX + 1
MAX = MAX + 1
ccf_order = order(MUTf$w.mean, decreasing=T)
MUTf = MUTf[ccf_order,]
#ID = mut_genes[ccf_order]
ID = gsub("[cs]Mut", "dMut", MUTf$CNA)
MUTf$CNA = "dMut"
no_dups = !duplicated(cbind(MUTf[,'Tumour_Name'], ID))
MUTf = MUTf[no_dups,]
ID = ID[no_dups]
MUT_out = cbind(MUTf[,1:10], ID, no.chr.bearing.mut=MUTf[,11])
mut_drivers = rbind(mut_drivers, MUT_out)
}
}
mergedseg=rbind(mergedseg,mut_drivers)
}
# MUT DRIVER-BASED STRATIFICATION:
if (gene_stratify){
# GATA3
mergedseg=mergedseg[which(!is.na(match(mergedseg$Tumour_Name,mergedseg[which(mergedseg$ID==gene_for_stratification),]$Tumour_Name))),]
mergedseg=mergedseg[which(mergedseg$CNA!="dMut"),]
}
mergedseg$Tumour_Name = basename(mergedseg$Tumour_Name)
tumour.names<-unique(gsub("\\.", "_", mergedseg$Tumour_Name))
# For incorporating Loss events in WGD samples: LOH --> WGD --> Loss
allsegs=read.table(paste0(input_dir, "/", groups,"_allsegments.txt"),header=T,stringsAsFactors = F)
if (losses){
loss=allsegs[which(!is.na(match(allsegs$CNA,c("cLoss","sLoss"))) & allsegs$tumour_ploidy>2 & allsegs$nMin1_A>0),]
enrichedLOH=read.table(paste0(input_dir, "/", groups,"_enriched_regions_LOH_noevent.txt"),header=T,stringsAsFactors = F)
enrichedLOH$chr=as.character(enrichedLOH$chr)
enrichedLOH=enrichedLOH[enrichedLOH$startpos!=enrichedLOH$endpos,]
loss=loss[which(!is.na(match(loss$chr,enrichedLOH$chr))),]
data.table::setDT(enrichedLOH)
data.table::setDT(loss)
data.table::setkey(enrichedLOH,"chr","startpos","endpos")
data.table::setkey(loss,"chr","startpos","endpos")
enrichedLoss=data.frame(data.table::foverlaps(loss,enrichedLOH,type="any",nomatch = 0),stringsAsFactors = F) # type='any' overlap
enrichedLoss$ID=paste(paste0("chr",enrichedLoss$chr),enrichedLoss$startpos,enrichedLoss$endpos,sep="_")
enrichedLoss$no.chr.bearing.mut=NA # dummy for rbind only
# REDUCE LOH/Loss events to 1 per sample per noevent, according to the following order of precedence:
# 1) Clonal LOH
# 2) Clonal Loss
# 3) Subclonal loss or LOH with highest CCF (LOH takes precedence if equal)
for (barcode in tumour.names) {
sample_enrichedLoss = cbind(enrichedLoss[enrichedLoss[,"Tumour_Name"] == barcode,c("noevent","CNA","frac1_A")], `rowname`=rownames(enrichedLoss[enrichedLoss[,"Tumour_Name"] == barcode,]))
sample_LOH = cbind(mergedseg[mergedseg[,"Tumour_Name"] == barcode & mergedseg[,"CNA"] == "dLOH",c("noevent","CNA","w.mean")], `rowname`=rownames(mergedseg[mergedseg[,"Tumour_Name"] == barcode & mergedseg[,"CNA"] == "dLOH",]))
colnames(sample_LOH)[3] = "frac1_A"
sample_L = rbind(sample_LOH, sample_enrichedLoss)
for (noev in unique(sample_L$noevent)) {
sample_L_noev = sample_L[sample_L$noevent == noev,]
if (nrow(sample_L_noev) > 1) {
clonal_LOH_row_i = which(sample_L_noev$CNA == "dLOH" & sample_L_noev$frac1_A == 1)
if (length(clonal_LOH_row_i) > 0) {
to_remove_row_i = setdiff(1:nrow(sample_L_noev), clonal_LOH_row_i[1])
} else {
clonal_loss_row_i = which(sample_L_noev$CNA == "cLoss")
if (length(clonal_loss_row_i) > 0) {
to_remove_row_i = setdiff(1:nrow(sample_L_noev), clonal_loss_row_i[1])
} else {
highest_ccf_row_i = which(sample_L_noev$frac1_A == max(sample_L_noev$frac1_A))
if (length(highest_ccf_row_i) > 1) {
highest_ccf_LOH = which(sample_L_noev$CNA == "dLOH" & sample_L_noev$frac1_A == max(sample_L_noev$frac1_A))
if (length(highest_ccf_LOH) > 0) {
highest_ccf_row_i = highest_ccf_LOH
}
if (length(highest_ccf_row_i) > 1) {
highest_ccf_row_i = highest_ccf_row_i[1]
}
}
to_remove_row_i = setdiff(1:nrow(sample_L_noev), highest_ccf_row_i)
}
}
# REMOVE to_remove_row_i
for (remove_i in to_remove_row_i) {
if (sample_L_noev[remove_i,"CNA"] == "dLOH") {
mergedseg = mergedseg[-which(rownames(mergedseg) == sample_L_noev[remove_i,"rowname"]),]
} else {
enrichedLoss = enrichedLoss[-which(rownames(enrichedLoss) == sample_L_noev[remove_i,"rowname"]),]
}
}
}
}
}
}
###
#### ORDERING THE EVENTS###################################################################
events<-c(unique(mergedseg$noevent),max(mergedseg$noevent)+1) # The final event represents WGD
matrix_coresp<-cbind(sort(events),c(1:length(events)))
colnames(matrix_coresp)<-c("true","ordered")
matrix_coresp<-as.data.frame(matrix_coresp)
matrix_out<-NULL
matrixclass<-NULL
allbics<-NULL
classification<-NULL
timing_categories = lapply(1:length(tumour.names), function(i) {
v<-NULL
vectoroforderedeventssubclonal.initial<-NULL
vectoroforderedeventssubclonal<-NULL
vector.of.ordered.events.clonal<-NULL
list.events.subclonal<-list()
mergedsegforsample<-mergedseg[as.character(mergedseg$Tumour_Name)==as.character(tumour.names[i]), ]
vector.of.total.events<-unique(mergedsegforsample$noevent)
#take subclonal events separately and order them according to trees
mergedsegforsamplesubclonal<-mergedsegforsample[mergedsegforsample$w.mean<1,]
vectorofeventssubclonal<-mergedsegforsamplesubclonal$noevent
if (losses) {
sLoss_to_add = unique(enrichedLoss[enrichedLoss[,"Tumour_Name"] == tumour.names[i] & enrichedLoss[,"CNA"] == "sLoss","noevent"])
if (length(sLoss_to_add) > 0) {
vectorofeventssubclonal = c(vectorofeventssubclonal, sLoss_to_add)
}
}
if(length(vectorofeventssubclonal)>0){
vectoroforderedeventssubclonal<-vectorofeventssubclonal
} else {
vectoroforderedeventssubclonal<-NULL
}
#analysis of clonal events-
if (wgd_file == "none") {
tumour.ploidy=unique(mergedsegforsample$tumour_ploidy)
} else {
# if supplied WGD status is "WGD", set ploidy as 4 as this will execute code for samples that have undergone WGD
# otherwise, set ploidy to 2
wgd_status = read.table(wgd_file, stringsAsFactors=FALSE)
#if (wgd_status[wgd_status[, 1] == as.character(tumour.names[i]),2] == "WGD") {
if (wgd_status[gsub("\\.", "_", wgd_status[, 1]) == as.character(tumour.names[i]),2] == "WGD") {
## OPTIONALLY, could still allow ploidy to be > 4 if data suggest this, but ONLY if called as WGD by proper metrics
#if (unique(mergedsegforsample$tumour_ploidy) > 4) {
# tumour.ploidy = unique(mergedsegforsample$tumour_ploidy)
#} else {
tumour.ploidy = 4
#}
} else {
tumour.ploidy = 2
}
}
if(length(tumour.ploidy)>1){
print("different ploidies error")
}
if(tumour.ploidy==2){
#analysis of clonal events in diploid samples- we take all events and then randomly order them (we don't know their timing)
mergedseg.for.sample.clonal_noHD <- mergedsegforsample[mergedsegforsample[,"w.mean"] == 1 & mergedsegforsample[,"CNA"] != "dHD",]
vectorofeventsclonal_noHD <- mergedseg.for.sample.clonal_noHD$noevent
mergedseg.for.sample.clonal_HD <- mergedsegforsample[mergedsegforsample[,"w.mean"] == 1 & mergedsegforsample[,"CNA"] == "dHD",]
vectorofeventsclonal_HD <- mergedseg.for.sample.clonal_HD$noevent
if(length(vectorofeventsclonal_noHD)==1){
vector.of.ordered.events.clonal<-vectorofeventsclonal_noHD
}else if (length(vectorofeventsclonal_noHD)>1){
vector.of.ordered.events.clonal<-sample(vectorofeventsclonal_noHD, size=length(vectorofeventsclonal_noHD) ,replace=FALSE)
}else{
vector.of.ordered.events.clonal<-NULL
}
vector.of.ordered.events.clonal <- add_hds_to_ordered_vector(vectorofeventsclonal_HD, vector.of.ordered.events.clonal, mergedsegforsample)
}
if(tumour.ploidy==4){
#analysis of clonal events in tetraploid samples (TAKING INTO ACCOUNT WHOLE GENOME DUPLICATION (WGD))
mergedsegforsampleWGD<-mergedsegforsample[mergedsegforsample$w.mean==1,]
# HOM DEL events
mergedsegforsampleHD<-mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dHD",]
vector.of.events.HD<-mergedsegforsampleHD$noevent
# Mutational Driver events
mergedsegforsampleWGD.MUT<- mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dMut",]
mergedsegforsamplebeforeWGD.MUT<-mergedsegforsampleWGD.MUT[mergedsegforsampleWGD.MUT$no.chr.bearing.mut>=2,] # > for no.chr. having non-integer values
# LOH events
mergedsegforsampleWGD.LOH<- mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dLOH",]
mergedsegforsamplebeforeWGD.LOH<-mergedsegforsampleWGD.LOH[mergedsegforsampleWGD.LOH$nMin1_A==0,]
# Gain events
mergedsegforsampleWGD.Gain<-mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dGain",]
mergedsegforsamplebeforeWGD.Gain<-mergedsegforsampleWGD.Gain[mergedsegforsampleWGD.Gain$nMaj1_A>3,] #if 2:1 -> min nMaj after WGD -> 4 (4:2)
# merge LOH and GAIN pre-WGD events
mergedsegforsamplebeforeWGD<-rbind(mergedsegforsamplebeforeWGD.LOH, mergedsegforsamplebeforeWGD.Gain, mergedsegforsamplebeforeWGD.MUT)
#we select all events that are before WGD=> nMin1_A==0 and then randomly order them # randomly shuffle the pre-WGD events
vector.of.events.before.WGD<- mergedsegforsamplebeforeWGD$noevent
if(length(vector.of.events.before.WGD)==1){
vector.of.ordered.events.before.WGD<-vector.of.events.before.WGD
}else if(length(vector.of.events.before.WGD)>1){
vector.of.ordered.events.before.WGD<-sample(vector.of.events.before.WGD, size=length(vector.of.events.before.WGD) ,replace=FALSE)
}else {
vector.of.ordered.events.before.WGD<-NULL
}
vector.of.ordered.events.before.WGD <- add_hds_to_ordered_vector(vector.of.events.HD, vector.of.ordered.events.before.WGD, mergedsegforsample)
#we select all events that are before (post-WGD) WGD=> nMin1_A==1 and then randomly order them
# Picks events that have occurred after WGD (e.g. 2:2 has become 4:2 (gain) or 2:1 (LOH after gain))
# LOH events post-WGD
mergedsegforsampleafterWGD.LOH<-mergedsegforsampleWGD.LOH[mergedsegforsampleWGD.LOH$nMin1_A==1,]
# LOSS events that match LOH enriched regions as post-WGD LOH events - IMPORTANT: only CLONAL added here
if (losses){
lossforsampleWGD=enrichedLoss[which(enrichedLoss$Tumour_Name==tumour.names[i] & enrichedLoss$CNA=="cLoss"),]
if (nrow(lossforsampleWGD) > 0) {
lossforsampleWGD=data.frame(lossforsampleWGD[,c("Tumour_Name","chr","startpos","endpos","nMaj1_A","nMin1_A","tumour_ploidy","CNA","noevent")],w.mean=1,ID=lossforsampleWGD$ID,no.chr.bearing.mut=NA)
lossforsampleWGD=lossforsampleWGD[which(!duplicated(lossforsampleWGD$noevent) & is.na(match(lossforsampleWGD$noevent,c(mergedsegforsamplebeforeWGD.LOH$noevent,mergedsegforsampleafterWGD.LOH)))),]
}
}
#Mutational Drivers post-WGD
mergedsegforsampleafterWGD.MUT<-mergedsegforsampleWGD.MUT[mergedsegforsampleWGD.MUT$no.chr.bearing.mut==1,]
# Gain events post-WGD
#mergedsegforsampleWGD.Gain<-mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="cGain" | mergedsegforsampleWGD$CNA=="sGain",]
mergedsegforsampleafterWGD.Gain<-mergedsegforsampleWGD.Gain[mergedsegforsampleWGD.Gain$nMaj1_A<=3,] #if after WGD, likely to be 3:2
# merge LOH and Gain post-WGD events
mergedsegforsampleafterWGD<-rbind(mergedsegforsampleafterWGD.LOH, mergedsegforsampleafterWGD.Gain,mergedsegforsampleafterWGD.MUT)
if (losses) {
mergedsegforsampleafterWGD = rbind(mergedsegforsampleafterWGD, lossforsampleWGD)
}
vector.of.events.after.WGD<- mergedsegforsampleafterWGD$noevent
if(length(vector.of.events.after.WGD)==1){
vector.of.ordered.events.after.WGD<-vector.of.events.after.WGD
}else if(length(vector.of.events.after.WGD)>1){
vector.of.ordered.events.after.WGD<-sample(vector.of.events.after.WGD, size=length(vector.of.events.after.WGD) ,replace=FALSE)
}else{
vector.of.ordered.events.after.WGD<-NULL
}
#we create the final order of events in which events that took place before the WGD, put in random order, are placed before the events which took place after WGD
# Uses max(events) as the noevent for WGD
vector.of.ordered.events.clonal <- c(vector.of.ordered.events.before.WGD,max(events),vector.of.ordered.events.after.WGD)
#print(paste(length(vector.of.ordered.events.before.WGD)/length(c(vector.of.ordered.events.after.WGD,vector.of.events.HD))))
}
if(tumour.ploidy==6){
#analysis of clonal events in hexaploid samples (TAKING INTO ACCOUNT WHOLE GENOME DUPLICATION (WGD))
mergedsegforsampleWGD<-mergedsegforsample[mergedsegforsample$w.mean==1,]
# HOM DEL events
mergedsegforsampleHD<-mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dHD",]
vector.of.events.HD<-mergedsegforsampleHD$noevent
# Mutational Driver events
mergedsegforsampleWGD.MUT<- mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dMut",]
mergedsegforsamplebeforeWGD.MUT<-mergedsegforsampleWGD.MUT[mergedsegforsampleWGD.MUT$no.chr.bearing.mut>=3,] # > for no.chr. having non-integer values
# LOH events
mergedsegforsampleWGD.LOH<- mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dLOH",]
mergedsegforsamplebeforeWGD.LOH<-mergedsegforsampleWGD.LOH[mergedsegforsampleWGD.LOH$nMin1_A==0,]
# Gain events
mergedsegforsampleWGD.Gain<-mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dGain",]
mergedsegforsamplebeforeWGD.Gain<-mergedsegforsampleWGD.Gain[mergedsegforsampleWGD.Gain$nMaj1_A>4,] # assuming 3:3
# merge LOH and GAIN and MUT pre-WGD events
mergedsegforsamplebeforeWGD<-rbind(mergedsegforsamplebeforeWGD.LOH, mergedsegforsamplebeforeWGD.Gain, mergedsegforsamplebeforeWGD.MUT)
#we select all events that are before WGD=> nMin1_A==0 and then randomly order them # randomly shuffle the pre-WGD events
vector.of.events.before.WGD<- mergedsegforsamplebeforeWGD$noevent
if(length(vector.of.events.before.WGD)==1){
vector.of.ordered.events.before.WGD<-vector.of.events.before.WGD
}else if(length(vector.of.events.before.WGD)>1){
vector.of.ordered.events.before.WGD<-sample(vector.of.events.before.WGD, size=length(vector.of.events.before.WGD) ,replace=FALSE)
}else {
vector.of.ordered.events.before.WGD<-NULL
}
vector.of.ordered.events.before.WGD <- add_hds_to_ordered_vector(vector.of.events.HD, vector.of.ordered.events.before.WGD, mergedsegforsample)
#we select all events that are before ( AFTER not before; post-WGD) WGD=> nMin1_A==1 and then randomly order them
# picks events that have occurred after WGD (e.g. 2:2 has become 4:2 (gain) or 2:1 (LOH after gain))
# LOH events post-WGD
mergedsegforsampleafterWGD.LOH<-mergedsegforsampleWGD.LOH[mergedsegforsampleWGD.LOH$nMin1_A==1,]
# LOSS events that match LOH enriched regions as post-WGD LOH events - IMPORTANT: only CLONAL added here
if (losses){
lossforsampleWGD=enrichedLoss[which(enrichedLoss$Tumour_Name==tumour.names[i] & enrichedLoss$CNA=="cLoss"),]
if (nrow(lossforsampleWGD) > 0) {
lossforsampleWGD=data.frame(lossforsampleWGD[,c("Tumour_Name","chr","startpos","endpos","nMaj1_A","nMin1_A","tumour_ploidy","CNA","noevent")],w.mean=1,ID=lossforsampleWGD$ID,no.chr.bearing.mut=NA)
lossforsampleWGD=lossforsampleWGD[which(!duplicated(lossforsampleWGD$noevent) & is.na(match(lossforsampleWGD$noevent,c(mergedsegforsamplebeforeWGD.LOH$noevent,mergedsegforsampleafterWGD.LOH)))),]
}
}
# Mutational Drivers post-WGD
mergedsegforsampleafterWGD.MUT<-mergedsegforsampleWGD.MUT[mergedsegforsampleWGD.MUT$no.chr.bearing.mut<3,]
# Gain events post-WGD
mergedsegforsampleafterWGD.Gain<-mergedsegforsampleWGD.Gain[mergedsegforsampleWGD.Gain$nMaj1_A<=4,] # assuming 3:3
# merge LOH and Gain and MUT post-WGD events
mergedsegforsampleafterWGD<-rbind(mergedsegforsampleafterWGD.LOH, mergedsegforsampleafterWGD.Gain,mergedsegforsampleafterWGD.MUT)
if (losses) {
mergedsegforsampleafterWGD = rbind(mergedsegforsampleafterWGD, lossforsampleWGD)
}
vector.of.events.after.WGD<- mergedsegforsampleafterWGD$noevent
if(length(vector.of.events.after.WGD)==1){
vector.of.ordered.events.after.WGD<-vector.of.events.after.WGD
}else if(length(vector.of.events.after.WGD)>1){
vector.of.ordered.events.after.WGD<-sample(vector.of.events.after.WGD, size=length(vector.of.events.after.WGD) ,replace=FALSE)
}else{
vector.of.ordered.events.after.WGD<-NULL
}
#we create the final order of events in which events that took place before the WGD, put in random order, are placed before the events which took place after WGD
# Uses max(events) as the noevent for WGD
vector.of.ordered.events.clonal <- c(vector.of.ordered.events.before.WGD,max(events),vector.of.ordered.events.after.WGD)
}
if(length(vector.of.ordered.events.clonal)==0){
vector.of.ordered.events.clonal=NULL
}
if (tumour.ploidy > 2) {
list(`clonal_early`=vector.of.events.before.WGD, `clonal_late`=vector.of.events.after.WGD, `clonal_unspecified`=c(), `subclonal`=vectorofeventssubclonal)
} else {
list(`clonal_early`=c(), `clonal_late`=c(), `clonal_unspecified`=setdiff(vector.of.ordered.events.clonal, max(events)), `subclonal`=vectorofeventssubclonal)
}
})
names(timing_categories) = tumour.names
if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive=TRUE)
}
event_types = c("clonal_early", "clonal_late", "clonal_unspecified", "subclonal")
timing_category_matrix = t(sapply(unique(mergedseg$noevent), function(event_i) {
table(factor(sapply(timing_categories, function(tc) {
this_cat = names(which(sapply(tc, function(x) { event_i %in% x })))
if (length(this_cat) == 1) {
this_cat
} else if (length(this_cat) == 0) {
NA
} else {
stop(paste0("A result for event ", event_i, " was reported with more than 1 timing category"))
}
}), levels=event_types))
}))
event_names = sapply(unique(mergedseg$noevent), function(event_i) {
ms_row = mergedseg[which(mergedseg$noevent == event_i)[1],]
paste(gsub("^d", "", ms_row["CNA"]), gsub("^dMut_", "", ms_row["ID"]), sep="_")
})
rownames(timing_category_matrix) = event_names
timing_category_matrices = lapply(split(1:nrow(timing_category_matrix), sapply(rownames(timing_category_matrix), function(x) { strsplit(x, "_")[[1]][1] })), function(rownums) { timing_category_matrix[rownums,] })
calculate_timing_preferences = function(timing_category_matrix) {
if(is.null(nrow(timing_category_matrix))) {
timing_category_matrix = t(timing_category_matrix)
}
event_category_sums = apply(timing_category_matrix, 2, sum)
pref = t(apply(timing_category_matrix, 1, function(event_times) {
clonal_early_late_conf_mat = rbind(event_times[1:2], event_category_sums[1:2] - event_times[1:2])
clonal_early_late_ft = fisher.test(clonal_early_late_conf_mat)
event_clonal = sum(event_times[1:3])
sums_clonal = sum(event_category_sums[1:3])
event_clonal_subclonal = c(event_clonal, event_times[4])
clonal_subclonal_conf_mat = rbind(event_clonal_subclonal, c(sums_clonal, event_category_sums[4]) - event_clonal_subclonal)
clonal_subclonal_ft = fisher.test(clonal_subclonal_conf_mat)
c(`clonal_early_late_p`=clonal_early_late_ft[["p.value"]], `clonal_early_late_or`=clonal_early_late_ft[["estimate"]]["odds ratio"], `clonal_subclonal_p`=clonal_subclonal_ft[["p.value"]], `clonal_subclonal_or`=clonal_subclonal_ft[["estimate"]]["odds ratio"])
}))
pref = cbind(pref, `clonal_early_late_fdr`=p.adjust(pref[,'clonal_early_late_p'], method="fdr"), `clonal_subclonal_fdr`=p.adjust(pref[,'clonal_subclonal_p'], method="fdr"))
}
timing_preferences_within_type = lapply(timing_category_matrices, calculate_timing_preferences)
for (i in 1:length(timing_preferences_within_type)) {
print(timing_preferences_within_type[[i]][order(timing_preferences_within_type[[i]][,1]),1:2])
print(timing_preferences_within_type[[i]][order(timing_preferences_within_type[[i]][,3]),3:4])
}
timing_preferences_overall = calculate_timing_preferences(timing_category_matrix)
timing_preferences_overall[order(timing_preferences_overall[,1]),1:2]
timing_preferences_overall[order(timing_preferences_overall[,3]),3:4]
event_timing_sample_sets = lapply(unique(mergedseg$noevent), function(idx) {
etss = sapply(event_types, function(et) {
unlist(sapply(tumour.names, function(bc) {
if (idx %in% timing_categories[[bc]][[et]]) { bc } else { NULL }
}))
})
names(etss) = event_types
etss
})
names(event_timing_sample_sets) = event_names
event_sample_sets = lapply(event_timing_sample_sets, unlist)
source("path/to/Code/get_event_ids.R")
timing_category_matrix_DN2 = timing_category_matrix
timing_preferences_overall_DN2 = timing_preferences_overall
event_ids_DN2 = event_ids
########### END OF PER-SUBSET PART ###################
colour_map_from_logs = function(nums, gt0_max=NULL, lt0_min=NULL) {
if (any(is.infinite(nums))) {
non_inf_nums = nums[-which(is.infinite(nums))]
} else {
non_inf_nums = nums
}
gt0 = non_inf_nums[non_inf_nums > 0]
lt0 = non_inf_nums[non_inf_nums < 0]
if (length(gt0) > 0) {
gt0_ramp = colorRamp(c("white","red"))
if (is.null(gt0_max)) { gt0_max = max(gt0) }
}
if (length(lt0) > 0) {
lt0_ramp = colorRamp(c("white","blue"))
if (is.null(lt0_min)) { lt0_min = min(lt0) }
}
sapply(nums, function(x) {
if (is.infinite(x) && x > 0) {
gt0_ramp(1)
} else if (is.infinite(x) && x < 0) {
lt0_ramp(0)
} else if (x > 0) {
gt0_ramp(x / gt0_max)
} else if (x < 0) {
lt0_ramp(x / lt0_min)
} else {
c(255,255,255)
}
})
}
get_clonality_barplot = function(tcm, tpo, eid) {
clm = t(apply(tcm, 1, function(row) { c(`clonal`=sum(row[1:3]), row[4]) }))
names(eid) = gsub("dMut_", "", names(eid))
melted_timing_categories = melt(clm)
colnames(melted_timing_categories) = c("Event","Timing","Value")
sig_pref <- names(which(p.adjust(tpo[,"clonal_subclonal_p"], method="fdr") < 0.05))
or_order <- names(sort(tpo[sig_pref,"clonal_subclonal_or.odds ratio"], decreasing=TRUE))
mtc_sig <- melted_timing_categories[melted_timing_categories$Event %in% sig_pref,]
mtc_sig[,1] <- eid[gsub("^[^_]*_", "", mtc_sig[,1])]
or_order <- eid[gsub("^[^_]*_", "", or_order)]
mtc_sig[,1] = factor(mtc_sig[,1], levels=or_order)
mtc_sig[,2] = factor(mtc_sig[,2], levels=c("subclonal","clonal"))
p <- ggplot(mtc_sig, aes(fill=Timing, y=Value, x=Event)) +
geom_bar(position="stack", stat="identity", show.legend=FALSE) +
theme_bw() +
scale_fill_manual(values=c("#000000","#60EFE1")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=14, family = 'Roboto Condensed'), axis.text.y = element_text(size=14, family = 'Roboto Condensed'), axis.title=element_blank(), title=element_text(size=16, family = 'Roboto Condensed'))
p
}
png("path/to/figures/Fig4/Clonality_preferences_NSDA.png", height=2210, width=4000, res=300)
get_clonality_barplot(timing_category_matrix_DN2, timing_preferences_overall_DN2, event_ids_DN2)
dev.off()
png("path/to/figures/Fig4/Timing_preferences_SD.png", height=2100, width=2444, res=300)
get_clonality_barplot(timing_category_matrix_DN1, timing_preferences_overall_DN1, event_ids_DN1)
dev.off()
png("path/to/figures/Fig4/Clonality_preferences_NSDB.png", height=1840, width=892, res=300)
get_clonality_barplot(timing_category_matrix_DN3, timing_preferences_overall_DN3, event_ids_DN3)
dev.off()
selected_NSDA_timing_prefs = timing_preferences_overall_DN2[timing_preferences_overall_DN2[,"clonal_subclonal_fdr"] < 0.05,]
selected_NSDB_timing_prefs = timing_preferences_overall_DN3[timing_preferences_overall_DN3[,"clonal_subclonal_fdr"] < 0.05,]
selected_SD_timing_prefs = timing_preferences_overall_DN1[timing_preferences_overall_DN1[,"clonal_subclonal_fdr"] < 0.05,]
selected_NSDA_timing_prefs = selected_NSDA_timing_prefs[order(selected_NSDA_timing_prefs[,"clonal_subclonal_or.odds ratio"], decreasing=T),]
selected_NSDB_timing_prefs = selected_NSDB_timing_prefs[order(selected_NSDB_timing_prefs[,"clonal_subclonal_or.odds ratio"], decreasing=T),]
selected_SD_timing_prefs = selected_SD_timing_prefs[order(selected_SD_timing_prefs[,"clonal_subclonal_or.odds ratio"], decreasing=T),]
selected_NSDA_timing_prefs = cbind(`clonal_subclonal_OR`=selected_NSDA_timing_prefs[,4], `clonal_subclonal_FDR`=selected_NSDA_timing_prefs[,6])
selected_NSDB_timing_prefs = cbind(`clonal_subclonal_OR`=selected_NSDB_timing_prefs[,4], `clonal_subclonal_FDR`=selected_NSDB_timing_prefs[,6])
selected_SD_timing_prefs = cbind(`clonal_subclonal_OR`=selected_SD_timing_prefs[,4], `clonal_subclonal_FDR`=selected_SD_timing_prefs[,6])
prefs_list = list(selected_NSDA_timing_prefs, selected_NSDB_timing_prefs, selected_SD_timing_prefs)
all_selected_prefs = Reduce(rbind, prefs_list)
max_or = max(all_selected_prefs[!is.infinite(all_selected_prefs[,1]),1])
min_or = min(timing_preferences_overall[timing_preferences_overall[,2] > 0,2])
log_max = round(log2(max_or) + 0.5) + 1
log_min = round(log2(min_or) - 0.5) - 1
extent = max(abs(c(log_max, log_min)))
par(mar=c(0,0,0,0))
for (prefs_i in seq_along(prefs_list)) {
these_prefs = prefs_list[[prefs_i]]
if (prefs_i == 1) { traj = "NSD-A" } else if (prefs_i == 2) { traj = "NSD-B" } else { traj = "SD" }
wd = nrow(these_prefs) + 1
col_num = 1
comparison = "clonal_subclonal"
max_or = extent
min_or = -extent
pdf(paste0("path/to/figures/Fig4/", traj, "_", comparison, "_timing_categorisation_log2OR_colours.pdf"), height=2, width=wd)
or_nums = sapply(log2(these_prefs[,col_num]), function(x) { max(min(x, max_or), min_or) })
pref_cols = t(colour_map_from_logs(or_nums, gt0_max=max_or, lt0_min=min_or) / 255)
plot(x=NULL, xlim=c(0,100*nrow(these_prefs)), ylim=c(0,50), axes=FALSE, xlab="", ylab="")
for (i in 1:nrow(these_prefs)) {
rect(((i-1)*100)+5,0,(i*100)-5,50,col=rgb(pref_cols[i,1], pref_cols[i,2], pref_cols[i,3]))
}
dev.off()
}
t(t(sapply(selected_NSDA_timing_prefs[,"clonal_subclonal_FDR"], function(x) { if (x < 0.0001) { "***" } else if (x < 0.001) { "**" } else if (x < 0.05) { "*" } else { "" }})))
t(t(sapply(selected_NSDB_timing_prefs[,"clonal_subclonal_FDR"], function(x) { if (x < 0.0001) { "***" } else if (x < 0.001) { "**" } else if (x < 0.05) { "*" } else { "" }})))
t(t(sapply(selected_SD_timing_prefs[,"clonal_subclonal_FDR"], function(x) { if (x < 0.0001) { "***" } else if (x < 0.001) { "**" } else if (x < 0.05) { "*" } else { "" }})))
full_cr = colorRamp(c("red","white","blue"))
rampcols = full_cr(seq(0,1,by=0.001))
par(mar=c(0,0,0,0))
pdf("path/to/figures/Fig4/blueWhiteRed_log2OR_scale.pdf", height=3, width=20)
plot(NA, ylim=c(0,1), xlim=c(0,nrow(rampcols)), xlab="", ylab="", axes=FALSE)
for (i in 1:nrow(rampcols)) { polygon(y = c(0,0,1,1), x=c(i-1,i,i,i-1), col=rgb(rampcols[i,1]/255, rampcols[i,2]/255, rampcols[i,3]/255), border = NA) }
polygon(y=c(0,0,1,1), x=c(0, nrow(rampcols), nrow(rampcols), 0), col=NA, lwd=4)
dev.off()