Analyses/clonality_preferences.R · SherlockLung-EvolutionaryTrajectory-Analysis

### CLONALITY PREFERENCES

# This code follows through the ordering model code to the point of events being ordered as clonal (early/late/unspecified) or subclonal
# We then compare statistical preference for occurring clonally or subclonally

library(reshape2)
library(ggplot2)

var_defaults = c(
  input_dir=".",
  output_dir=".",
  unobserved_events="notIncluded", #"end", # if model 2 is chosen (to include unobserved items in ordering), the unobserved events will be randomly shuffled and added either at the 'end' or 'interspersed'
  losses="TRUE", # include loss events? Default=FALSE # using this will skew the position of WGD to far too early
  PLmethod="PlackettLuce", # could also use PLmethod="PLMIX" if interested in detecting partitions/components only
  G="1", # number of partitions - ONLY FOR "PLMIX"
  add_mut="FALSE", # do you have driver mutations to add, if so, add_mut=T
  genes="all", # top gene list to be included
  gene_stratify="FALSE", # stratify samples by driver genes?
  gene_for_stratification="TP53",
  mut_file="driver_mut_file_for_timing.txt",
  wgd_file="none"
)

### RUN THE FOLLOWING SECTION FOR EACH DE NOVO SUBSET (DN1, DN2, DN3)

name="SherlockLung_LUAD_2025_DN2"
basedir="path/to/data/"

args = c(name, paste0("input_dir=", basedir, name, "/"), paste0("output_dir=", basedir, name, "/"), "PLmethod=PlackettLuce", paste0("mut_file=", basedir, name, "/", name, "_driver_mut_file_for_timing.txt"), "wgd_file=path/to/data/SherlockLung_WGDStatus.txt", "G=1", "unobserved_events=notIncluded", "genes=all", "add_mut=TRUE")

# pass name of tumour set as first argument
tumour_set = args[1]
if (length(args) > 1) {
  split_args = strsplit(args[2:length(args)], "=")
  for (x in split_args) { assign(x[1], x[2]) }
}
for (x in names(var_defaults)) {
  if (!exists(x)) {
    assign(x, var_defaults[x])
  }
}

if (!(unobserved_events %in% c("notIncluded", "end", "interspersed"))) {
  stop('unobserved_events must be one of "notIncluded", "end", or "interspersed" (default "notIncluded")')
}

cat(paste0("Running with the following variable values:

tumour_set = ", tumour_set, "
input_dir = ", input_dir, "
output_dir = ", output_dir, "
unobserved_events = ", unobserved_events, "
losses = ", losses, "
PLmethod = ", PLmethod, "
G = ", G, "
add_mut = ", add_mut, "
genes = ", genes, "
gene_stratify = ", gene_stratify, "
gene_for_stratification = ", gene_for_stratification, "
mut_file = ", mut_file, "
wgd_file = ", wgd_file, "

"))

#model = as.numeric(model)
G = as.numeric(G)
losses = as.logical(losses)
add_mut = as.logical(add_mut)
gene_stratify = as.logical(gene_stratify)
genes = strsplit(gsub(" ", "", genes), ",")[[1]]

#library(PLmethod,character.only = T)

groups=tumour_set # Group name (only one at a time in this step) as used in previous postFDR steps 1-4



#### helper function - intersperse_unobserved ####

intersperse_unobserved <- function(observed, unobserved) {
  # get total number of events
  total_events = length(observed) + length(unobserved)
  observed_i = sort(sample(total_events, length(observed)))
  remaining_i = setdiff(1:total_events, observed_i)
  unobserved_i = sample(remaining_i, length(remaining_i))
  all_ordered = vector(mode="numeric", length=10)
  all_ordered[observed_i] = observed
  all_ordered[unobserved_i] = unobserved
  all_ordered
}

#' Insert new item into vector with positional constraint
#'
#' Allows a new item to be inserted into a vector, maintining the order of the items before and after its insertion position, with a minimum index specified
#' 
#' @param new_item the new element of the vector
#' @param vec the existing vector into which new_item should be inserted
#' @param first_possible the lowest index at which new_item should be allowed to be inserted into vec
#'
#' @return the new vector, containg all items in vec and new_item inserted at an index of at least first_possible
insert_constrained = function(new_item, vec, first_possible) {
  # get vector position of new item, between first_possible and 1 higher than current vector length
  insert_pos = sample(first_possible:(length(vec)+1), 1)
  # if position of new item is 1, put it first, followed by all existing
  if (insert_pos == 1) {
    vec = c(new_item, vec)
  # if position of new item is neither first nor last, put existing items before it first, then new item, then existing items after it
  } else if (insert_pos <= length(vec)) {
    vec = c(vec[1:(insert_pos-1)], new_item, vec[insert_pos:length(vec)])
  # if position of new item is last, put all existing items, then new item
  } else {
    vec = c(vec, new_item)
  }
  vec
}

#' Adds homozygous deletion event numbers to a vector of other ordered events
#'
#' Inserts HD events into a vector of clonal events - If the vector contains a clonal LOH whose position encompasses an HD, the HD will be ordered after the LOH
#'
#' @param hd_vector a vector of homozygous deletion event numbers
#' @param ordered_vector a vector of other clonal event numbers
#' @param merged_seg the merged segments data.frame containing event information including colnames chr, startpos, endpos, and CNA
#'
#' @return the new vector of ordered events, containg HD events inserted at appropriate positions among the existing clonal events
add_hds_to_ordered_vector = function(hd_vector, ordered_vector, merged_seg) {
  # IF we have clonal HDs:
  if (length(hd_vector) > 0) {

    # For each clonal HD:
    for (HD_noevent in hd_vector) {
      # If there are other pre-WGD events:
      if (length(ordered_vector) > 0) {
	    # check for LOH segments which entirely encompass the HD locus
        HD_locus = merged_seg[merged_seg[,"noevent"] == HD_noevent,c("chr", "startpos", "endpos")]
  	    encompassing_i = which(sapply(ordered_vector, function(before_noevent) {
          before_seg = merged_seg[merged_seg[,"noevent"] == before_noevent,]
          before_seg["chr"] == HD_locus["chr"] && before_seg["startpos"] <= HD_locus["startpos"] && before_seg["endpos"] >= HD_locus["endpos"] & before_seg["CNA"] == "dLOH"
        }))
        # If any LOH segments have been found that encompass the HD, the HD must occur AFTER these
        if (length(encompassing_i) > 0) {
          ordered_vector = insert_constrained(HD_noevent, ordered_vector, max(encompassing_i)+1)
        # Otherwise, it can occur anywhere
        } else {
          ordered_vector = insert_constrained(HD_noevent, ordered_vector, 1)
        }
      # If no other pre-WGD events, set this HD as the first one
      } else {
        ordered_vector = HD_noevent
      }
    }
  }
  ordered_vector
}


######INPUT DATA###############################  ONLY ONE GROUP AT A TIME ####

cna_type=c("LOH","Gain","HD")

mergedseg=data.frame() # this will contain all CNA types with new noevent numbers assigned
MAX=0
for (cna in 1:length(cna_type)){
  if(file.exists(paste0(input_dir, "/", groups,"_",cna_type[cna],"_mergedsegs.txt"))){
    #paste0(groups[grp],"_",cna_type[cna],"_mergedsegs.txt")
    CNA=read.table(paste0(input_dir, "/", groups,"_",cna_type[cna],"_mergedsegs.txt"),header=T,stringsAsFactors = F,sep="\t")
    print(paste(paste0(input_dir, "/", groups,"_",cna_type[cna],"_mergedsegs.txt"),nrow(CNA)))
    print(length(unique((CNA$noevent))))
    CNA$noevent=CNA$noevent+MAX
    print(range(CNA$noevent))
    mergedseg=rbind(mergedseg,CNA)
    MAX=max(CNA$noevent)
    print(MAX)
  }
}
mergedseg$ID=paste(paste0("chr",mergedseg$chr),mergedseg$startpos,mergedseg$endpos,sep="_")
mergedseg$no.chr.bearing.mut=NA # dummy for rbind/merging only

## ADD MUTATIONAL DRIVERS #### 

if (add_mut){
  mutations=read.table(mut_file,header=T,stringsAsFactors = F) #### NAP: need to import your own MUTATION TABLE ####

  #mutations=mutations[!grepl("TCGA",mutations$ID),] # for NIG
  #mutations=mutations[which(!is.na(match(mutations$ID,mergedseg$Tumour_Name))),] # for HER2
  #func=c("frameshift deletion","frameshift insertion","frameshift substitution","nonsynonymous SNV","Splice_Site","stopgain")
  mut_genes = gsub("[cs]Mut_", "", mutations$CNA)
  mut_drivers=data.frame()
  if (genes == "all") {
    genes = unique(mut_genes)
  }
  for (i in 1:length(genes)){
    MUTf=mutations[which(mut_genes==genes[i]),]

    if (nrow(MUTf) > 0) {
      #MUTf$noevent = MUTf$noevent + MAX
      MUTf$noevent = MAX + 1
      MAX = MAX + 1
	  
      ccf_order = order(MUTf$w.mean, decreasing=T)
      MUTf = MUTf[ccf_order,]
      #ID = mut_genes[ccf_order]
      ID = gsub("[cs]Mut", "dMut", MUTf$CNA)
      MUTf$CNA = "dMut"
	  
      no_dups = !duplicated(cbind(MUTf[,'Tumour_Name'], ID))
      MUTf = MUTf[no_dups,]
      ID = ID[no_dups]
      MUT_out = cbind(MUTf[,1:10], ID, no.chr.bearing.mut=MUTf[,11])
      mut_drivers = rbind(mut_drivers, MUT_out)
    }
  }
  mergedseg=rbind(mergedseg,mut_drivers)
}


# MUT DRIVER-BASED STRATIFICATION:
if (gene_stratify){
# GATA3
  mergedseg=mergedseg[which(!is.na(match(mergedseg$Tumour_Name,mergedseg[which(mergedseg$ID==gene_for_stratification),]$Tumour_Name))),]
  mergedseg=mergedseg[which(mergedseg$CNA!="dMut"),]
}

mergedseg$Tumour_Name = basename(mergedseg$Tumour_Name)

tumour.names<-unique(gsub("\\.", "_", mergedseg$Tumour_Name))


# For incorporating Loss events in WGD samples: LOH --> WGD --> Loss
allsegs=read.table(paste0(input_dir, "/", groups,"_allsegments.txt"),header=T,stringsAsFactors = F)
if (losses){
  loss=allsegs[which(!is.na(match(allsegs$CNA,c("cLoss","sLoss"))) & allsegs$tumour_ploidy>2 & allsegs$nMin1_A>0),]
  enrichedLOH=read.table(paste0(input_dir, "/", groups,"_enriched_regions_LOH_noevent.txt"),header=T,stringsAsFactors = F)
  enrichedLOH$chr=as.character(enrichedLOH$chr)
  enrichedLOH=enrichedLOH[enrichedLOH$startpos!=enrichedLOH$endpos,]
  loss=loss[which(!is.na(match(loss$chr,enrichedLOH$chr))),]
  data.table::setDT(enrichedLOH)
  data.table::setDT(loss)
  data.table::setkey(enrichedLOH,"chr","startpos","endpos")
  data.table::setkey(loss,"chr","startpos","endpos")
  enrichedLoss=data.frame(data.table::foverlaps(loss,enrichedLOH,type="any",nomatch = 0),stringsAsFactors = F) # type='any' overlap
  enrichedLoss$ID=paste(paste0("chr",enrichedLoss$chr),enrichedLoss$startpos,enrichedLoss$endpos,sep="_")
  enrichedLoss$no.chr.bearing.mut=NA # dummy for rbind only

  # REDUCE LOH/Loss events to 1 per sample per noevent, according to the following order of precedence:
  #	1) Clonal LOH
  #	2) Clonal Loss
  #	3) Subclonal loss or LOH with highest CCF (LOH takes precedence if equal)
  for (barcode in tumour.names) {

    sample_enrichedLoss = cbind(enrichedLoss[enrichedLoss[,"Tumour_Name"] == barcode,c("noevent","CNA","frac1_A")], `rowname`=rownames(enrichedLoss[enrichedLoss[,"Tumour_Name"] == barcode,]))
    sample_LOH = cbind(mergedseg[mergedseg[,"Tumour_Name"] == barcode & mergedseg[,"CNA"] == "dLOH",c("noevent","CNA","w.mean")], `rowname`=rownames(mergedseg[mergedseg[,"Tumour_Name"] == barcode & mergedseg[,"CNA"] == "dLOH",]))
    colnames(sample_LOH)[3] = "frac1_A"
    sample_L = rbind(sample_LOH, sample_enrichedLoss)

    for (noev in unique(sample_L$noevent)) {
      sample_L_noev = sample_L[sample_L$noevent == noev,]
      if (nrow(sample_L_noev) > 1) {
        clonal_LOH_row_i = which(sample_L_noev$CNA == "dLOH" & sample_L_noev$frac1_A == 1)
        if (length(clonal_LOH_row_i) > 0) {
          to_remove_row_i = setdiff(1:nrow(sample_L_noev), clonal_LOH_row_i[1])
        } else {
          clonal_loss_row_i = which(sample_L_noev$CNA == "cLoss")
	  if (length(clonal_loss_row_i) > 0) {
            to_remove_row_i = setdiff(1:nrow(sample_L_noev), clonal_loss_row_i[1])
          } else {
            highest_ccf_row_i = which(sample_L_noev$frac1_A == max(sample_L_noev$frac1_A))
            if (length(highest_ccf_row_i) > 1) {
              highest_ccf_LOH = which(sample_L_noev$CNA == "dLOH" & sample_L_noev$frac1_A == max(sample_L_noev$frac1_A))
              if (length(highest_ccf_LOH) > 0) {
                highest_ccf_row_i = highest_ccf_LOH
              }
              if (length(highest_ccf_row_i) > 1) {
                highest_ccf_row_i = highest_ccf_row_i[1]
              }
            }
            to_remove_row_i = setdiff(1:nrow(sample_L_noev), highest_ccf_row_i)
          }
        }
        # REMOVE to_remove_row_i
        for (remove_i in to_remove_row_i) {
          if (sample_L_noev[remove_i,"CNA"] == "dLOH") {
            mergedseg = mergedseg[-which(rownames(mergedseg) == sample_L_noev[remove_i,"rowname"]),]
          } else {
            enrichedLoss = enrichedLoss[-which(rownames(enrichedLoss) == sample_L_noev[remove_i,"rowname"]),]
          }
        }
      }
    }

  }

}
###


#### ORDERING THE EVENTS###################################################################

events<-c(unique(mergedseg$noevent),max(mergedseg$noevent)+1) # The final event represents WGD

matrix_coresp<-cbind(sort(events),c(1:length(events)))
colnames(matrix_coresp)<-c("true","ordered")
matrix_coresp<-as.data.frame(matrix_coresp)
matrix_out<-NULL
matrixclass<-NULL
allbics<-NULL
classification<-NULL

  timing_categories = lapply(1:length(tumour.names), function(i) {
     
    v<-NULL
    vectoroforderedeventssubclonal.initial<-NULL
    vectoroforderedeventssubclonal<-NULL
    vector.of.ordered.events.clonal<-NULL
    list.events.subclonal<-list()
    
    mergedsegforsample<-mergedseg[as.character(mergedseg$Tumour_Name)==as.character(tumour.names[i]), ]
    vector.of.total.events<-unique(mergedsegforsample$noevent)
    #take subclonal events separately and order them according to trees
    mergedsegforsamplesubclonal<-mergedsegforsample[mergedsegforsample$w.mean<1,]
    vectorofeventssubclonal<-mergedsegforsamplesubclonal$noevent

    if (losses) {
      sLoss_to_add = unique(enrichedLoss[enrichedLoss[,"Tumour_Name"] == tumour.names[i] & enrichedLoss[,"CNA"] == "sLoss","noevent"])
      if (length(sLoss_to_add) > 0) {
        vectorofeventssubclonal = c(vectorofeventssubclonal, sLoss_to_add)
      }
    }
    
    if(length(vectorofeventssubclonal)>0){
      vectoroforderedeventssubclonal<-vectorofeventssubclonal
    } else {
      vectoroforderedeventssubclonal<-NULL
    }
    
    #analysis of clonal events-
    if (wgd_file == "none") {
      tumour.ploidy=unique(mergedsegforsample$tumour_ploidy)
    } else {
      # if supplied WGD status is "WGD", set ploidy as 4 as this will execute code for samples that have undergone WGD
      # otherwise, set ploidy to 2
      wgd_status = read.table(wgd_file, stringsAsFactors=FALSE)
      #if (wgd_status[wgd_status[, 1] == as.character(tumour.names[i]),2] == "WGD") {
      if (wgd_status[gsub("\\.", "_", wgd_status[, 1]) == as.character(tumour.names[i]),2] == "WGD") {
        ## OPTIONALLY, could still allow ploidy to be > 4 if data suggest this, but ONLY if called as WGD by proper metrics
        #if (unique(mergedsegforsample$tumour_ploidy) > 4) {
        #  tumour.ploidy = unique(mergedsegforsample$tumour_ploidy)
        #} else {
        tumour.ploidy = 4
        #}
      } else {
        tumour.ploidy = 2
      }
    }  
    
    if(length(tumour.ploidy)>1){
      print("different ploidies error")
    }
    
    if(tumour.ploidy==2){
      #analysis of clonal events in diploid samples- we take all events and then randomly order them (we don't know their timing)
      mergedseg.for.sample.clonal_noHD <- mergedsegforsample[mergedsegforsample[,"w.mean"] == 1 & mergedsegforsample[,"CNA"] != "dHD",]
      vectorofeventsclonal_noHD <- mergedseg.for.sample.clonal_noHD$noevent
      mergedseg.for.sample.clonal_HD <- mergedsegforsample[mergedsegforsample[,"w.mean"] == 1 & mergedsegforsample[,"CNA"] == "dHD",]
      vectorofeventsclonal_HD <- mergedseg.for.sample.clonal_HD$noevent
      if(length(vectorofeventsclonal_noHD)==1){
        vector.of.ordered.events.clonal<-vectorofeventsclonal_noHD
      }else if (length(vectorofeventsclonal_noHD)>1){
        vector.of.ordered.events.clonal<-sample(vectorofeventsclonal_noHD, size=length(vectorofeventsclonal_noHD) ,replace=FALSE)
      }else{
        vector.of.ordered.events.clonal<-NULL
      }
      vector.of.ordered.events.clonal <- add_hds_to_ordered_vector(vectorofeventsclonal_HD, vector.of.ordered.events.clonal, mergedsegforsample)
 
    }
    if(tumour.ploidy==4){
      #analysis of clonal events in tetraploid samples (TAKING INTO ACCOUNT WHOLE GENOME DUPLICATION (WGD))
      mergedsegforsampleWGD<-mergedsegforsample[mergedsegforsample$w.mean==1,]
      # HOM DEL events
      mergedsegforsampleHD<-mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dHD",]
      vector.of.events.HD<-mergedsegforsampleHD$noevent
      # Mutational Driver events
      mergedsegforsampleWGD.MUT<- mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dMut",]
      mergedsegforsamplebeforeWGD.MUT<-mergedsegforsampleWGD.MUT[mergedsegforsampleWGD.MUT$no.chr.bearing.mut>=2,] # > for no.chr. having non-integer values
      # LOH events
      mergedsegforsampleWGD.LOH<- mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dLOH",]
      mergedsegforsamplebeforeWGD.LOH<-mergedsegforsampleWGD.LOH[mergedsegforsampleWGD.LOH$nMin1_A==0,]
      # Gain events
      mergedsegforsampleWGD.Gain<-mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dGain",]
      mergedsegforsamplebeforeWGD.Gain<-mergedsegforsampleWGD.Gain[mergedsegforsampleWGD.Gain$nMaj1_A>3,] #if 2:1 -> min nMaj after WGD -> 4 (4:2)
      # merge LOH and GAIN pre-WGD events
      mergedsegforsamplebeforeWGD<-rbind(mergedsegforsamplebeforeWGD.LOH,  mergedsegforsamplebeforeWGD.Gain, mergedsegforsamplebeforeWGD.MUT)
      
      #we select all events that are before WGD=> nMin1_A==0 and then randomly order them # randomly shuffle the pre-WGD events
      vector.of.events.before.WGD<- mergedsegforsamplebeforeWGD$noevent
      if(length(vector.of.events.before.WGD)==1){
        vector.of.ordered.events.before.WGD<-vector.of.events.before.WGD
      }else if(length(vector.of.events.before.WGD)>1){
        vector.of.ordered.events.before.WGD<-sample(vector.of.events.before.WGD, size=length(vector.of.events.before.WGD) ,replace=FALSE)
      }else {
        vector.of.ordered.events.before.WGD<-NULL
      }
      vector.of.ordered.events.before.WGD <- add_hds_to_ordered_vector(vector.of.events.HD, vector.of.ordered.events.before.WGD, mergedsegforsample)

      #we select all events that are before (post-WGD) WGD=> nMin1_A==1 and then randomly order them
      # Picks events that have occurred after WGD (e.g. 2:2 has become 4:2 (gain) or 2:1 (LOH after gain))
      # LOH events post-WGD
      mergedsegforsampleafterWGD.LOH<-mergedsegforsampleWGD.LOH[mergedsegforsampleWGD.LOH$nMin1_A==1,]
      
      # LOSS events that match LOH enriched regions as post-WGD LOH events - IMPORTANT: only CLONAL added here
      if (losses){
        lossforsampleWGD=enrichedLoss[which(enrichedLoss$Tumour_Name==tumour.names[i] & enrichedLoss$CNA=="cLoss"),]
        if (nrow(lossforsampleWGD) > 0) {      
          lossforsampleWGD=data.frame(lossforsampleWGD[,c("Tumour_Name","chr","startpos","endpos","nMaj1_A","nMin1_A","tumour_ploidy","CNA","noevent")],w.mean=1,ID=lossforsampleWGD$ID,no.chr.bearing.mut=NA)
          lossforsampleWGD=lossforsampleWGD[which(!duplicated(lossforsampleWGD$noevent) & is.na(match(lossforsampleWGD$noevent,c(mergedsegforsamplebeforeWGD.LOH$noevent,mergedsegforsampleafterWGD.LOH)))),]
        }
      }
      
      #Mutational Drivers post-WGD
      mergedsegforsampleafterWGD.MUT<-mergedsegforsampleWGD.MUT[mergedsegforsampleWGD.MUT$no.chr.bearing.mut==1,]
      # Gain events post-WGD
      #mergedsegforsampleWGD.Gain<-mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="cGain" | mergedsegforsampleWGD$CNA=="sGain",]
      mergedsegforsampleafterWGD.Gain<-mergedsegforsampleWGD.Gain[mergedsegforsampleWGD.Gain$nMaj1_A<=3,] #if after WGD, likely to be 3:2
      # merge LOH and Gain post-WGD events
      mergedsegforsampleafterWGD<-rbind(mergedsegforsampleafterWGD.LOH, mergedsegforsampleafterWGD.Gain,mergedsegforsampleafterWGD.MUT)

      if (losses) {
        mergedsegforsampleafterWGD = rbind(mergedsegforsampleafterWGD, lossforsampleWGD)
      }
      
      vector.of.events.after.WGD<- mergedsegforsampleafterWGD$noevent
      if(length(vector.of.events.after.WGD)==1){
        vector.of.ordered.events.after.WGD<-vector.of.events.after.WGD
      }else if(length(vector.of.events.after.WGD)>1){
        vector.of.ordered.events.after.WGD<-sample(vector.of.events.after.WGD, size=length(vector.of.events.after.WGD) ,replace=FALSE)
      }else{
        vector.of.ordered.events.after.WGD<-NULL
      }
      
      #we create the final order of events in which events that took place before the WGD, put in random order, are placed before the events which took place after WGD
      # Uses max(events) as the noevent for WGD
      vector.of.ordered.events.clonal <- c(vector.of.ordered.events.before.WGD,max(events),vector.of.ordered.events.after.WGD)
      #print(paste(length(vector.of.ordered.events.before.WGD)/length(c(vector.of.ordered.events.after.WGD,vector.of.events.HD))))
    } 
    if(tumour.ploidy==6){
      #analysis of clonal events in hexaploid samples (TAKING INTO ACCOUNT WHOLE GENOME DUPLICATION (WGD))
      mergedsegforsampleWGD<-mergedsegforsample[mergedsegforsample$w.mean==1,]
      # HOM DEL events
      mergedsegforsampleHD<-mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dHD",]
      vector.of.events.HD<-mergedsegforsampleHD$noevent
      # Mutational Driver events
      mergedsegforsampleWGD.MUT<- mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dMut",]
      mergedsegforsamplebeforeWGD.MUT<-mergedsegforsampleWGD.MUT[mergedsegforsampleWGD.MUT$no.chr.bearing.mut>=3,] # > for no.chr. having non-integer values
      # LOH events
      mergedsegforsampleWGD.LOH<- mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dLOH",]
      mergedsegforsamplebeforeWGD.LOH<-mergedsegforsampleWGD.LOH[mergedsegforsampleWGD.LOH$nMin1_A==0,]
      # Gain events
      mergedsegforsampleWGD.Gain<-mergedsegforsampleWGD[mergedsegforsampleWGD$CNA=="dGain",]
      mergedsegforsamplebeforeWGD.Gain<-mergedsegforsampleWGD.Gain[mergedsegforsampleWGD.Gain$nMaj1_A>4,] # assuming 3:3
      # merge LOH and GAIN and MUT pre-WGD events
      mergedsegforsamplebeforeWGD<-rbind(mergedsegforsamplebeforeWGD.LOH,  mergedsegforsamplebeforeWGD.Gain, mergedsegforsamplebeforeWGD.MUT)
      
      #we select all events that are before WGD=> nMin1_A==0 and then randomly order them # randomly shuffle the pre-WGD events
      vector.of.events.before.WGD<- mergedsegforsamplebeforeWGD$noevent
      if(length(vector.of.events.before.WGD)==1){
        vector.of.ordered.events.before.WGD<-vector.of.events.before.WGD
      }else if(length(vector.of.events.before.WGD)>1){
        vector.of.ordered.events.before.WGD<-sample(vector.of.events.before.WGD, size=length(vector.of.events.before.WGD) ,replace=FALSE)
      }else {
        vector.of.ordered.events.before.WGD<-NULL
      }
      vector.of.ordered.events.before.WGD <- add_hds_to_ordered_vector(vector.of.events.HD, vector.of.ordered.events.before.WGD, mergedsegforsample)

      #we select all events that are before ( AFTER not before; post-WGD) WGD=> nMin1_A==1 and then randomly order them
      # picks events that have occurred after WGD (e.g. 2:2 has become 4:2 (gain) or 2:1 (LOH after gain))
      # LOH events post-WGD
      mergedsegforsampleafterWGD.LOH<-mergedsegforsampleWGD.LOH[mergedsegforsampleWGD.LOH$nMin1_A==1,]
      
      #  LOSS events that match LOH enriched regions as post-WGD LOH events - IMPORTANT: only CLONAL added here
       if (losses){
        lossforsampleWGD=enrichedLoss[which(enrichedLoss$Tumour_Name==tumour.names[i] & enrichedLoss$CNA=="cLoss"),]
        if (nrow(lossforsampleWGD) > 0) {
          lossforsampleWGD=data.frame(lossforsampleWGD[,c("Tumour_Name","chr","startpos","endpos","nMaj1_A","nMin1_A","tumour_ploidy","CNA","noevent")],w.mean=1,ID=lossforsampleWGD$ID,no.chr.bearing.mut=NA)
          lossforsampleWGD=lossforsampleWGD[which(!duplicated(lossforsampleWGD$noevent) & is.na(match(lossforsampleWGD$noevent,c(mergedsegforsamplebeforeWGD.LOH$noevent,mergedsegforsampleafterWGD.LOH)))),]
        }
      }

      # Mutational Drivers post-WGD
      mergedsegforsampleafterWGD.MUT<-mergedsegforsampleWGD.MUT[mergedsegforsampleWGD.MUT$no.chr.bearing.mut<3,]
      # Gain events post-WGD
      mergedsegforsampleafterWGD.Gain<-mergedsegforsampleWGD.Gain[mergedsegforsampleWGD.Gain$nMaj1_A<=4,] # assuming 3:3
      # merge LOH and Gain and MUT post-WGD events
      mergedsegforsampleafterWGD<-rbind(mergedsegforsampleafterWGD.LOH, mergedsegforsampleafterWGD.Gain,mergedsegforsampleafterWGD.MUT)

      if (losses) {
        mergedsegforsampleafterWGD = rbind(mergedsegforsampleafterWGD, lossforsampleWGD)
      }
      
      vector.of.events.after.WGD<- mergedsegforsampleafterWGD$noevent
      if(length(vector.of.events.after.WGD)==1){
        vector.of.ordered.events.after.WGD<-vector.of.events.after.WGD
      }else if(length(vector.of.events.after.WGD)>1){
        vector.of.ordered.events.after.WGD<-sample(vector.of.events.after.WGD, size=length(vector.of.events.after.WGD) ,replace=FALSE)
      }else{
        vector.of.ordered.events.after.WGD<-NULL
      }
      
      #we create the final order of events in which events that took place before the WGD, put in random order, are placed before the events which took place after WGD
      # Uses max(events) as the noevent for WGD
      vector.of.ordered.events.clonal <- c(vector.of.ordered.events.before.WGD,max(events),vector.of.ordered.events.after.WGD)
    }
    
    if(length(vector.of.ordered.events.clonal)==0){
      vector.of.ordered.events.clonal=NULL
    }
    
    if (tumour.ploidy > 2) {
      list(`clonal_early`=vector.of.events.before.WGD, `clonal_late`=vector.of.events.after.WGD, `clonal_unspecified`=c(), `subclonal`=vectorofeventssubclonal)
    } else {
      list(`clonal_early`=c(), `clonal_late`=c(), `clonal_unspecified`=setdiff(vector.of.ordered.events.clonal, max(events)), `subclonal`=vectorofeventssubclonal)
    }

  })
  names(timing_categories) = tumour.names

if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive=TRUE)
}

event_types = c("clonal_early", "clonal_late", "clonal_unspecified", "subclonal")

timing_category_matrix = t(sapply(unique(mergedseg$noevent), function(event_i) {
  table(factor(sapply(timing_categories, function(tc) {
    this_cat = names(which(sapply(tc, function(x) { event_i %in% x })))
    if (length(this_cat) == 1) {
      this_cat
    } else if (length(this_cat) == 0) {
      NA
    } else {
      stop(paste0("A result for event ", event_i, " was reported with more than 1 timing category"))
    }
  }), levels=event_types))
}))
event_names = sapply(unique(mergedseg$noevent), function(event_i) {
  ms_row = mergedseg[which(mergedseg$noevent == event_i)[1],]
  paste(gsub("^d", "", ms_row["CNA"]), gsub("^dMut_", "", ms_row["ID"]), sep="_")
})
rownames(timing_category_matrix) = event_names

timing_category_matrices = lapply(split(1:nrow(timing_category_matrix), sapply(rownames(timing_category_matrix), function(x) { strsplit(x, "_")[[1]][1] })), function(rownums) { timing_category_matrix[rownums,] })

calculate_timing_preferences = function(timing_category_matrix) {
  if(is.null(nrow(timing_category_matrix))) {
    timing_category_matrix = t(timing_category_matrix)
  }
  event_category_sums = apply(timing_category_matrix, 2, sum)
  pref = t(apply(timing_category_matrix, 1, function(event_times) {
    clonal_early_late_conf_mat = rbind(event_times[1:2], event_category_sums[1:2] - event_times[1:2])
    clonal_early_late_ft = fisher.test(clonal_early_late_conf_mat)
    event_clonal = sum(event_times[1:3])
    sums_clonal = sum(event_category_sums[1:3])
    event_clonal_subclonal = c(event_clonal, event_times[4])
    clonal_subclonal_conf_mat = rbind(event_clonal_subclonal, c(sums_clonal, event_category_sums[4]) - event_clonal_subclonal)
    clonal_subclonal_ft = fisher.test(clonal_subclonal_conf_mat)
    c(`clonal_early_late_p`=clonal_early_late_ft[["p.value"]], `clonal_early_late_or`=clonal_early_late_ft[["estimate"]]["odds ratio"], `clonal_subclonal_p`=clonal_subclonal_ft[["p.value"]], `clonal_subclonal_or`=clonal_subclonal_ft[["estimate"]]["odds ratio"])
  }))
  pref = cbind(pref, `clonal_early_late_fdr`=p.adjust(pref[,'clonal_early_late_p'], method="fdr"), `clonal_subclonal_fdr`=p.adjust(pref[,'clonal_subclonal_p'], method="fdr"))
}

timing_preferences_within_type = lapply(timing_category_matrices, calculate_timing_preferences)

for (i in 1:length(timing_preferences_within_type)) {
  print(timing_preferences_within_type[[i]][order(timing_preferences_within_type[[i]][,1]),1:2])
  print(timing_preferences_within_type[[i]][order(timing_preferences_within_type[[i]][,3]),3:4])
}

timing_preferences_overall = calculate_timing_preferences(timing_category_matrix)

timing_preferences_overall[order(timing_preferences_overall[,1]),1:2]
timing_preferences_overall[order(timing_preferences_overall[,3]),3:4]

event_timing_sample_sets = lapply(unique(mergedseg$noevent), function(idx) {
  etss = sapply(event_types, function(et) {
    unlist(sapply(tumour.names, function(bc) { 
      if (idx %in% timing_categories[[bc]][[et]]) { bc } else { NULL }
    }))
  })
  names(etss) = event_types
  etss
})
names(event_timing_sample_sets) = event_names

event_sample_sets = lapply(event_timing_sample_sets, unlist)

source("path/to/Code/get_event_ids.R")
timing_category_matrix_DN2 = timing_category_matrix
timing_preferences_overall_DN2 = timing_preferences_overall
event_ids_DN2 = event_ids


########### END OF PER-SUBSET PART ###################



colour_map_from_logs = function(nums, gt0_max=NULL, lt0_min=NULL) {
  if (any(is.infinite(nums))) {
    non_inf_nums = nums[-which(is.infinite(nums))]
  } else {
    non_inf_nums = nums
  }
  gt0 = non_inf_nums[non_inf_nums > 0]
  lt0 = non_inf_nums[non_inf_nums < 0]
  if (length(gt0) > 0) {
    gt0_ramp = colorRamp(c("white","red"))
	if (is.null(gt0_max)) { gt0_max = max(gt0) }
  }
  if (length(lt0) > 0) {
    lt0_ramp = colorRamp(c("white","blue"))
    if (is.null(lt0_min)) { lt0_min = min(lt0) }
  }
  sapply(nums, function(x) {
    if (is.infinite(x) && x > 0) {
	  gt0_ramp(1)
	} else if (is.infinite(x) && x < 0) {
	  lt0_ramp(0)
	} else if (x > 0) {
	  gt0_ramp(x / gt0_max)
	} else if (x < 0) {
	  lt0_ramp(x / lt0_min)
	} else {
	  c(255,255,255)
	}
  })
}

get_clonality_barplot = function(tcm, tpo, eid) {
  clm = t(apply(tcm, 1, function(row) { c(`clonal`=sum(row[1:3]), row[4]) }))
  names(eid) = gsub("dMut_", "", names(eid))
  melted_timing_categories = melt(clm)
  colnames(melted_timing_categories) = c("Event","Timing","Value")
  sig_pref <- names(which(p.adjust(tpo[,"clonal_subclonal_p"], method="fdr") < 0.05))
  or_order <- names(sort(tpo[sig_pref,"clonal_subclonal_or.odds ratio"], decreasing=TRUE))
  mtc_sig <- melted_timing_categories[melted_timing_categories$Event %in% sig_pref,]
  mtc_sig[,1] <- eid[gsub("^[^_]*_", "", mtc_sig[,1])]
  or_order <- eid[gsub("^[^_]*_", "", or_order)]
  mtc_sig[,1] = factor(mtc_sig[,1], levels=or_order)
  mtc_sig[,2] = factor(mtc_sig[,2], levels=c("subclonal","clonal"))
  p <- ggplot(mtc_sig, aes(fill=Timing, y=Value, x=Event)) +
  geom_bar(position="stack", stat="identity", show.legend=FALSE) +
  theme_bw() +
  scale_fill_manual(values=c("#000000","#60EFE1")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=14, family = 'Roboto Condensed'), axis.text.y = element_text(size=14, family = 'Roboto Condensed'), axis.title=element_blank(), title=element_text(size=16, family = 'Roboto Condensed'))
  p
}

png("path/to/figures/Fig4/Clonality_preferences_NSDA.png", height=2210, width=4000, res=300)
get_clonality_barplot(timing_category_matrix_DN2, timing_preferences_overall_DN2, event_ids_DN2)
dev.off()
png("path/to/figures/Fig4/Timing_preferences_SD.png", height=2100, width=2444, res=300)
get_clonality_barplot(timing_category_matrix_DN1, timing_preferences_overall_DN1, event_ids_DN1)
dev.off()
png("path/to/figures/Fig4/Clonality_preferences_NSDB.png", height=1840, width=892, res=300)
get_clonality_barplot(timing_category_matrix_DN3, timing_preferences_overall_DN3, event_ids_DN3)
dev.off()

selected_NSDA_timing_prefs = timing_preferences_overall_DN2[timing_preferences_overall_DN2[,"clonal_subclonal_fdr"] < 0.05,]
selected_NSDB_timing_prefs = timing_preferences_overall_DN3[timing_preferences_overall_DN3[,"clonal_subclonal_fdr"] < 0.05,]
selected_SD_timing_prefs   = timing_preferences_overall_DN1[timing_preferences_overall_DN1[,"clonal_subclonal_fdr"] < 0.05,]

selected_NSDA_timing_prefs = selected_NSDA_timing_prefs[order(selected_NSDA_timing_prefs[,"clonal_subclonal_or.odds ratio"], decreasing=T),]
selected_NSDB_timing_prefs = selected_NSDB_timing_prefs[order(selected_NSDB_timing_prefs[,"clonal_subclonal_or.odds ratio"], decreasing=T),]
selected_SD_timing_prefs   = selected_SD_timing_prefs[order(selected_SD_timing_prefs[,"clonal_subclonal_or.odds ratio"], decreasing=T),]

selected_NSDA_timing_prefs = cbind(`clonal_subclonal_OR`=selected_NSDA_timing_prefs[,4], `clonal_subclonal_FDR`=selected_NSDA_timing_prefs[,6])
selected_NSDB_timing_prefs = cbind(`clonal_subclonal_OR`=selected_NSDB_timing_prefs[,4], `clonal_subclonal_FDR`=selected_NSDB_timing_prefs[,6])
selected_SD_timing_prefs = cbind(`clonal_subclonal_OR`=selected_SD_timing_prefs[,4], `clonal_subclonal_FDR`=selected_SD_timing_prefs[,6])

prefs_list = list(selected_NSDA_timing_prefs, selected_NSDB_timing_prefs, selected_SD_timing_prefs)
all_selected_prefs = Reduce(rbind, prefs_list)

max_or = max(all_selected_prefs[!is.infinite(all_selected_prefs[,1]),1])
min_or = min(timing_preferences_overall[timing_preferences_overall[,2] > 0,2])
log_max = round(log2(max_or) + 0.5) + 1
log_min = round(log2(min_or) - 0.5) - 1
extent = max(abs(c(log_max, log_min)))

par(mar=c(0,0,0,0))

for (prefs_i in seq_along(prefs_list)) {
  these_prefs = prefs_list[[prefs_i]]
  if (prefs_i == 1) { traj = "NSD-A" } else if (prefs_i == 2) { traj = "NSD-B" } else { traj = "SD" }
  wd = nrow(these_prefs) + 1
  col_num = 1
  comparison = "clonal_subclonal"
  
  max_or = extent
  min_or = -extent
  pdf(paste0("path/to/figures/Fig4/", traj, "_", comparison, "_timing_categorisation_log2OR_colours.pdf"), height=2, width=wd)
  or_nums = sapply(log2(these_prefs[,col_num]), function(x) { max(min(x, max_or), min_or) })
  pref_cols = t(colour_map_from_logs(or_nums, gt0_max=max_or, lt0_min=min_or) / 255)
  plot(x=NULL, xlim=c(0,100*nrow(these_prefs)), ylim=c(0,50), axes=FALSE, xlab="", ylab="")  
  for (i in 1:nrow(these_prefs)) {
    rect(((i-1)*100)+5,0,(i*100)-5,50,col=rgb(pref_cols[i,1], pref_cols[i,2], pref_cols[i,3]))
  }
  dev.off()
}

t(t(sapply(selected_NSDA_timing_prefs[,"clonal_subclonal_FDR"], function(x) { if (x < 0.0001) { "***" } else if (x < 0.001) { "**" } else if (x < 0.05) { "*" } else { "" }})))
t(t(sapply(selected_NSDB_timing_prefs[,"clonal_subclonal_FDR"], function(x) { if (x < 0.0001) { "***" } else if (x < 0.001) { "**" } else if (x < 0.05) { "*" } else { "" }})))
t(t(sapply(selected_SD_timing_prefs[,"clonal_subclonal_FDR"], function(x) { if (x < 0.0001) { "***" } else if (x < 0.001) { "**" } else if (x < 0.05) { "*" } else { "" }})))

full_cr = colorRamp(c("red","white","blue"))
rampcols = full_cr(seq(0,1,by=0.001))
par(mar=c(0,0,0,0))
pdf("path/to/figures/Fig4/blueWhiteRed_log2OR_scale.pdf", height=3, width=20)
plot(NA, ylim=c(0,1), xlim=c(0,nrow(rampcols)), xlab="", ylab="", axes=FALSE)
for (i in 1:nrow(rampcols)) { polygon(y = c(0,0,1,1), x=c(i-1,i,i,i-1), col=rgb(rampcols[i,1]/255, rampcols[i,2]/255, rampcols[i,3]/255), border = NA) }
polygon(y=c(0,0,1,1), x=c(0, nrow(rampcols), nrow(rampcols), 0), col=NA, lwd=4)
dev.off()