SherlockLung-EvolutionaryTrajectory-Analysis / Ordering_Model / R / prepare_driver_mutations.R
prepare_driver_mutations.R
Raw
var_defaults = c(
  output_dir = ".",
  dp_data_dir = "none",
  wc_dir = "none",
  sample_list = "none",
  all_gene_gr_file = "./reference_files/all_gene_gr_hg19.Rdata",
  strip_vs_when_matching = FALSE
)

args = commandArgs(trailingOnly = T)

if (length(args) < 4) {
  stop(paste0("At least 4 arguments must be passed, in the following order: name, dp_annotated_input_folder, driver_symbol_file, nonsyn_mut_file.

USAGE:
Rscript prepare_driver_mutations.R --args name dp_annotated_input_folder driver_symbol_file nonsyn_mut_file [output_dir=output_dir] [dp_data_dir=dp_data_dir] [wc_dir=wc_dir] [sample_list=sample_list]

"))
}

# pass name of tumour set as first argument (after --args)
name = args[1]
dp_annotated_input_folder = args[2]
driver_symbol_file = args[3]
nonsyn_mut_file = args[4]

# optional: pass file pattern (if different to "subclones.txt") as second argument (after --args)
if (length(args) > 4) {
  split_args = strsplit(args[5:length(args)], "=")
  for (x in split_args) { assign(x[1], x[2]) }
}
for (x in names(var_defaults)) {
  if (!exists(x)) {
    assign(x, var_defaults[x])
  }
}

cat(paste0("Running with the following variable values:

dp_annotated_input_folder = ", dp_annotated_input_folder, "
driver_symbol_file = ", driver_symbol_file, "
nonsyn_mut_file = ", nonsyn_mut_file, "
output_dir = ", output_dir, "
dp_data_dir = ", dp_data_dir, "
wc_dir = ", wc_dir, "
sample_list = ", sample_list, "
all_gene_gr_file = ", all_gene_gr_file, "

"))

library(GenomicRanges)

if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive=TRUE)
}

driver_mut_file_for_timing = paste0(output_dir, "/", name, "_driver_mut_file_for_timing.txt")

nonsyn_muts = read.table(nonsyn_mut_file, header=T, stringsAsFactors = F, sep="\t")
stripped_nonsyn_mut_bcs = gsub("_vs.*", "", nonsyn_muts$Tumor_Barcode)

if (!dir.exists(dp_annotated_input_folder)) {
  dir.create(dp_annotated_input_folder, recursive=TRUE)
}

if (sample_list != "none") {
  samples_from_list=readLines(sample_list)    
}

add_clonality_ccf_DPinput <- function(dp_input_file, dp_output_file, optima_info_file, dp_annotated_input){
    dpinfo <- read.table(dp_input_file, header = T, sep = '\t', stringsAsFactors = F)
    # load the optima info
    optima_info <- read.table(optima_info_file, header = T, sep='\t',stringsAsFactors = F)
    # load the cluster info
    cluster_info <- read.table(dp_output_file, header = T, sep='\t',stringsAsFactors = F)
    ltez = which(dpinfo$subclonal.fraction<=0)
        if (length(ltez) > 0) {
          for (ltezIndex in ltez) {
            ciIndex = which(cluster_info$chr == dpinfo$chr[ltezIndex] & cluster_info$end == dpinfo$end[ltezIndex])
                cluster_info = cluster_info[-ciIndex,]
          }
        }
    # filter out VAF=0 and mutations without copy number info
    dpinfo = dpinfo[!is.na(dpinfo$subclonal.fraction) & dpinfo$subclonal.fraction>0,]

    # remove duplicate loci
    dp_info_loci = paste(dpinfo[,"chr"], dpinfo[,"end"], sep="_")
    cluster_info_loci = paste(cluster_info[,"chr"], cluster_info[,"end"], sep="_")
    dp_info_dup_rows = which(dp_info_loci %in% dp_info_loci[duplicated(dp_info_loci)])
    cluster_info_dup_rows = which(cluster_info_loci %in% cluster_info_loci[duplicated(cluster_info_loci)])
    if (length(dp_info_dup_rows) > 0) { dpinfo = dpinfo[-dp_info_dup_rows,] }
    if (length(cluster_info_dup_rows) > 0) { cluster_info = cluster_info[-cluster_info_dup_rows,] }

    # filter out any mutations that are not included in BOTH DPClust input and output files (by chromosome and NT location)
    # warn the user if any are filtered out
    rownames(dpinfo) = paste(dpinfo[,"chr"], dpinfo[,"end"], sep="_")
    rownames(cluster_info) = paste(cluster_info[,"chr"], cluster_info[,"end"], sep="_")
    shared_rows = intersect(rownames(dpinfo), rownames(cluster_info))
    input_only = setdiff(rownames(dpinfo), shared_rows)
    output_only = setdiff(rownames(cluster_info), shared_rows)
    if (length(input_only) > 0) {
      chrs_exc = unique(dpinfo[input_only, "chr"])
      if (length(chrs_exc) > 1) {
        chr_word = "chromosomes"
        chrs_exc_text = paste(chrs_exc, collapse=", ")
      } else {
        chr_word = "chromosome"
        chrs_exc_text = chrs_exc
      }
      warning(paste0(length(input_only), " mutations on ", chr_word, " ", chrs_exc_text, " were excluded as they were not present in the DPClust output file."))
    }
    if (length(output_only) > 0) {
      chrs_exc = unique(dpinfo[output_only, "chr"])
      if (length(chrs_exc) > 1) {
        chr_word = "chromosomes"
        chrs_exc_text = paste(chrs_exc, collapse=", ")
      } else {
        chr_word = "chromosome"
        chrs_exc_text = chrs_exc
      }
      warning(paste0(length(output_only), " mutations on ", chr_word, " ", chrs_exc_text, " were excluded as they were not present in the DPClust input file. That is unexpected."))
    }
    dpinfo = dpinfo[shared_rows,]
    cluster_info = cluster_info[shared_rows,]
    # check if the number of rows in the filtered dpinfo and cluster_info match - NOTE: the above line mean this should always be true
    if (identical(nrow(dpinfo),nrow(cluster_info))){
      dpinfo$cluster <- cluster_info$cluster
      # position and number of clonal cluster (picking the cluster closest to 1)
      dists = abs(optima_info[,2]-1)
      position.clonalcluster = optima_info[which(dists==min(dists)),2]
      clonalcluster = optima_info[which(dists==min(dists)),1]
      # number mutations superclonal cluster - going to assign any mutations in the superclonal clusters CCF=1
      superclonal = which(optima_info[,2]>1)
      if (length(superclonal)>0){
        superclonalcluster = optima_info[superclonal,1]
        clonalcluster <- unique(c(clonalcluster,superclonalcluster))
      }
      clonality <- c()
      ccf <- c()
      # loop over the cluster allocation
      for (j in 1:nrow(dpinfo)){
        if (dpinfo$cluster[j]%in%clonalcluster){
          clonality[j] <- 'clonal'
          ccf[j] <- 1
        } else if (!is.na(dpinfo$cluster[j])) {
          clonality[j] <- 'subclonal'
          ccf[j] <- optima_info[which(optima_info[,1]==dpinfo$cluster[j]),2]
        } else {
                  clonality[j] = NA
                  ccf[j] = NA
                }
      }
      dpinfo$clonality <-  clonality
      dpinfo$ccf <- ccf
      theseAreNA = which(sapply(dpinfo$ccf, is.na))
      if (length(theseAreNA) > 0) { dpinfo = dpinfo[-theseAreNA,] }
      write.table(dpinfo, file = dp_annotated_input, row.names = F, quote = F, sep = '\t')
    } else {
      print(paste0('The number of mutations in the input for DPclust differs from the number of mutations in the output for files ', dp_input_file, ' and ', dp_output_file))
    }
}

prepare_driver_mutations_file <- function(file_with_mutations, dp_annotated_input_folder, allsegments_file, driver_mut_file_for_timing){
  # load the driver mutations we want to prepare for the ordering script
  mutations_to_annotate <- read.table(file_with_mutations, header = T, sep = '\t', stringsAsFactors = F)

  # load allsegments_file with ploidy info
  allsegments <- read.table(allsegments_file, header = T, sep = '\t', stringsAsFactors = F)

  # Tumour_Name, chr, startpos, endpos, nMaj1_A, nMin1_A, tumour_ploidy, CNA, noevent, w.mean, no.chrs.bearing.mut
  annotated_mutations <- NULL

  unique_samples <- unique(mutations_to_annotate$Tumour_Sample_Barcode)
  print(unique_samples)
  all_annotated_DPinput <- list.files(dp_annotated_input_folder)

  # how many unique driver genes do we have to annotate?
  number_genes <- length(unique(mutations_to_annotate$Hugo_Symbol))

  # make a table to match the driver genes to the noevent
  drivers_table <- cbind(unique(mutations_to_annotate$Hugo_Symbol), 1:number_genes)
  drivers_table <- data.frame(drivers_table, stringsAsFactors = F)
  colnames(drivers_table) <- c('gene', 'noevent')

  # loop over the unique samples
  for (i in 1:length(unique_samples)){
    print(i)
    mutations_to_annotate_sample <- mutations_to_annotate[mutations_to_annotate$Tumour_Sample_Barcode==unique_samples[i],]

    # pick the annotated dpinput file for this sample
    file_index <- grep(paste0(unique_samples[i], "_"),all_annotated_DPinput)
    dpinput_sample <- read.table(file.path(dp_annotated_input_folder,all_annotated_DPinput[file_index]), header = T, sep = '\t', stringsAsFactors = F)

    dpinput_mutations <- dpinput_sample[which(dpinput_sample[,1]%in%mutations_to_annotate_sample$Chromosome &  dpinput_sample[,2]%in%mutations_to_annotate_sample$Start_Position),]

    # make sure that dpinput_mutation and mutations_to_annotate_sample have the same order
    mutations_to_annotate_sample <- mutations_to_annotate_sample[order(mutations_to_annotate_sample$Start_Position),]
    dpinput_mutations <- dpinput_mutations[order(dpinput_mutations[,2]),]

    # check if the positions match
    ident_check = identical(mutations_to_annotate_sample$Start_Position, dpinput_mutations[,2])
    print(ident_check)
    # filter down to just those that are in DPC input AND are nonsynonymous mutations hitting genes in the driver list
    if (!ident_check) {
      mtas_id = apply(mutations_to_annotate_sample, 1, function(x) { paste(gsub(" *", "", x[c("Chromosome", "Start_Position")]), collapse="_") })
      dpim_id = apply(dpinput_mutations, 1, function(x) { paste(gsub(" *", "", x[c("chr", "end")]), collapse="_") })
      in_both = intersect(mtas_id, dpim_id)
      mtas_in_both = mtas_id %in% in_both
      dpim_in_both = dpim_id %in% in_both
      mutations_to_annotate_sample = mutations_to_annotate_sample[mtas_in_both,]
      dpinput_mutations = dpinput_mutations[dpim_in_both,]
    }

    n_rows <- nrow(dpinput_mutations)

    # determine the noevent_vector for this sample - match each gene with the corresponding noevent from the table
    noevent_vector <- drivers_table[match(mutations_to_annotate_sample$Hugo_Symbol, drivers_table[,1]),2]

    # determine the tumour_ploidy for that sample
    tumour_ploidy <- unique(allsegments$tumour_ploidy[grep(unique_samples[i], allsegments$Tumour_Name)])

    if (length(tumour_ploidy) > 0) {
      # clonality prefix
      clonality_prefix <- ifelse(dpinput_mutations$clonality=='clonal','c','s')

      nMajMin = as.data.frame(t(sapply(1:nrow(mutations_to_annotate_sample), function(j) {
        allsegments[ which(allsegments$Tumour_Name==unique_samples[i] & allsegments$chr==mutations_to_annotate_sample$Chromosome[j] & allsegments$startpos<=mutations_to_annotate_sample$Start_Position[j] & allsegments$endpos>=mutations_to_annotate_sample$End_Position[j])[1], c("nMaj1_A","nMin1_A") ]    # note this version has [1] after the "which" statement in order to catch an unusual case during testing, in which multiple fits exist for some samples. This should probably be removed, reverting to the previous (commented) line, when running properly, as one fit should be selected for each sample.
      })))

      to_add <- cbind(mutations_to_annotate_sample$Tumour_Sample_Barcode, # Tumour_Name
                    mutations_to_annotate_sample$Chromosome,            #  chr
                    mutations_to_annotate_sample$Start_Position,        # startpos
                    mutations_to_annotate_sample$Start_Position,        # endpos
                    unlist(nMajMin$nMaj1_A),                            # nMaj1_A
                    unlist(nMajMin$nMin1_A),                            # nMin1_A
                    rep(tumour_ploidy, n_rows),                         # tumour_ploidy
                    paste0(clonality_prefix,'Mut_',mutations_to_annotate_sample$Hugo_Symbol), # CNA
                    noevent_vector,                                     # noevent
                    dpinput_mutations$ccf,                              # w.mean
                    dpinput_mutations$no.chrs.bearing.mut)              # no.chrs.bearing.mut
      annotated_mutations <- rbind(annotated_mutations, to_add)
    }
  }
  # save the driver mutation data for the timing model
  annotated_mutations <- data.frame(annotated_mutations, stringsAsFactors = F)
  colnames(annotated_mutations) <- c('Tumour_Name', 'chr', 'startpos', 'endpos', 'nMaj1_A', 'nMin1_A', 'tumour_ploidy', 'CNA', 'noevent', 'w.mean', 'no.chrs.bearing.mut')
  annotated_mutations$noevent = 1:nrow(annotated_mutations)
  write.table(annotated_mutations,driver_mut_file_for_timing, row.names = F, col.names = T, sep = '\t', quote = F)
}

# If the annotated input file does not already exist, a value should be passed for dp_data_dir
# In this case, the following code block is run to generate the annotated input file
# Otherwise (i.e. if no value is passed for dp_data_dir) the annotated input file is assumed to exist already
run_annotation = FALSE
if (!dir.exists(dp_annotated_input_folder)) {
  run_annotation = TRUE
} else {
  if (length(dir(dp_annotated_input_folder, all.files=TRUE, no..=TRUE)) == 0) {
    run_annotation = TRUE
  }
}
if (run_annotation) {
  dp_input_files = sort(list.files(dp_data_dir, pattern="_dpclust_input_data.txt", full.names=TRUE, recursive=TRUE))
  dp_output_files = sort(list.files(dp_data_dir, pattern="_bestConsensusAssignments.bed", full.names=TRUE, recursive=TRUE))
  if (wc_dir != "none") {
    dp_optima_files = sort(list.files(wc_dir, pattern="_WCC_ccf_nMut.txt", full.names=TRUE, recursive=TRUE))
  } else {
    dp_optima_files = sort(list.files(dp_data_dir, pattern="_bestClusterInfo.txt", full.names=TRUE, recursive=TRUE))
  }
  if (sample_list != "none") {
    dp_input_files = dp_input_files[sapply(samples_from_list, grep, dp_input_files)]
    dp_output_files = dp_output_files[sapply(samples_from_list, grep, dp_output_files)]
    dp_optima_files = dp_optima_files[sapply(samples_from_list, grep, dp_optima_files)]
  }
  if ((length(dp_input_files) != length(dp_output_files)) || (length(dp_output_files) != length(dp_optima_files))) {
    stop(paste0("There are not an equal number of (consistently named) files ending _dpclust_input_data.txt, _bestConsensusAssignments.bed, and _bestClusterInfo.txt in ", dp_data_dir, ". Please copy the ones you need into a new folder and run using that."))
  }
  for (i in seq_along(dp_input_files)) {
    bc = gsub("_dpclust_input_data.txt", "", basename(dp_input_files[i]))
    print(paste0(i, ": ", bc))
    add_clonality_ccf_DPinput(dp_input_files[i], dp_output_files[i], dp_optima_files[i], paste0(dp_annotated_input_folder, "/", bc, "_dp_annotated_input.txt"))
  }  
}

driver_symbols = unique(readLines(driver_symbol_file))

load(all_gene_gr_file)
driver_gr = all_gene_gr[all_gene_gr$symbol %in% driver_symbols]

driver_genes = driver_gr$symbol

dp_annotated_infiles = list.files(dp_annotated_input_folder, pattern="dp_annotated_input.txt", recursive=T, full.names=T)
if (sample_list != "none") {
  dp_annotated_infiles = dp_annotated_infiles[ gsub("_dp_annotated_input.txt", "", basename(dp_annotated_infiles)) %in% gsub("_DNA$", "", samples_from_list) ]
}

gene_present_counts = rep(0, length(driver_genes))
for (dpc_file in dp_annotated_infiles) {
  sample_df = read.table(dpc_file, header=T, stringsAsFactors=F)
  sample_gr = GRanges(seqnames=sample_df[,1], ranges=IRanges(start=sample_df[,2], end=sample_df[,2]))
  bc = gsub("_dp_annotated_input.txt", "", basename(dpc_file))
  # NOTE: some datasets we have worked with have inconsistent naming conventions between the nonsyn muts file and the DPClust data, which included a _vs_[normal] suffix
  # If this is the case, your comparison of the two will need to substring one of them in order to find the matches, e.g.:
  #nonsyn = nonsyn_muts[nonsyn_muts$Tumor_Barcode == substr(bc,1,13),]
  # Otherwise, just find matches directly:
  #nonsyn = nonsyn_muts[nonsyn_muts$Tumor_Barcode == bc,]
  # NEW APPROACH: allow optional stripping of everything after _vs (inclusive) when matching:
  if (strip_vs_when_matching) {
    match_bc = gsub("_vs.*", "", bc)
	nonsyn_mut_bcs = stripped_nonsyn_mut_bcs
  } else {
    match_bc = bc
    nonsyn_mut_bcs = nonsyn_muts$Tumor_Barcode
  }
  nonsyn = nonsyn_muts[which(nonsyn_mut_bcs == match_bc),]
  nonsyn_gr = GRanges(seqnames=gsub("chr", "", nonsyn$Chromosome), ranges=IRanges(start=nonsyn$Start_Position, end=nonsyn$End_Position))
  nonsyn_sample_gr = sample_gr[subjectHits(suppressWarnings(findOverlaps(nonsyn_gr, sample_gr))),]
  ov = suppressWarnings(findOverlaps(driver_gr, nonsyn_sample_gr))
  gene_present_counts[unique(queryHits(ov))] = gene_present_counts[unique(queryHits(ov))] + 1
}
names(gene_present_counts) = driver_genes
gene_freq = sort((gene_present_counts / length(dp_annotated_infiles)) * 100, decreasing=T)
gene_present_counts = sort(gene_present_counts, decreasing=T)

driver_symbols = names(gene_freq[gene_freq >= 3 & gene_present_counts >= 3])
driver_gr = driver_gr[driver_gr$symbol %in% driver_symbols]

driver_gr = driver_gr[seqnames(driver_gr) %in% c(1:22, "X"),]
driver_genes = driver_gr$symbol

driver_mutations = Reduce(rbind, lapply(dp_annotated_infiles, function(dp_in) {
  sample_df = read.table(dp_in, header=T, stringsAsFactors=F)
  sample_gr = GRanges(seqnames=sample_df[,1], ranges=IRanges(start=sample_df[,2], end=sample_df[,2]))
  bc = gsub("_dp_annotated_input.txt", "", basename(dp_in))
  # NOTE: some datasets we have worked with have inconsistent naming conventions between the nonsyn muts file and the DPClust data, which included a _vs_[normal] suffix
  # If this is the case, your comparison of the two will need to substring one of them in order to find the matches, e.g.:
  #nonsyn = nonsyn_muts[nonsyn_muts$Tumor_Barcode == substr(bc,1,13),]
  # Otherwise, just find matches directly:
  #nonsyn = nonsyn_muts[nonsyn_muts$Tumor_Barcode == bc,]
  # NEW APPROACH: allow optional stripping of everything after _vs (inclusive) when matching:
  if (strip_vs_when_matching) {
    match_bc = gsub("_vs.*", "", bc)
	nonsyn_mut_bcs = stripped_nonsyn_mut_bcs
  } else {
    match_bc = bc
    nonsyn_mut_bcs = nonsyn_muts$Tumor_Barcode
  }
  nonsyn = nonsyn_muts[which(nonsyn_mut_bcs == match_bc),]
  nonsyn_gr = GRanges(seqnames=gsub("chr", "", nonsyn$Chromosome), ranges=IRanges(start=nonsyn$Start_Position, end=nonsyn$End_Position))
  nonsyn_sample_gr = sample_gr[subjectHits(suppressWarnings(findOverlaps(nonsyn_gr, sample_gr))),]
  ov = suppressWarnings(findOverlaps(driver_gr, nonsyn_sample_gr))
  if (length(ov) > 0) {
    sample_driver_muts = nonsyn_sample_gr[subjectHits(ov),]
    cbind(Hugo_Symbol=mcols(driver_gr[queryHits(ov),])[,'symbol'], Chromosome=as.character(seqnames(sample_driver_muts)), Start_Position=start(sample_driver_muts), End_Position=end(sample_driver_muts), Tumour_Sample_Barcode=bc)
  } else {
    NULL
  }
}))

driver_mut_input_file = paste0(output_dir, "/driver_mut_input_file.txt")

write.table(driver_mutations, file=driver_mut_input_file, col.names=TRUE, row.names=FALSE, quote=FALSE, sep='\t')

all_segments_file=paste0(output_dir, "/", name, "_allsegments.txt")

prepare_driver_mutations_file(driver_mut_input_file, dp_annotated_input_folder, all_segments_file, driver_mut_file_for_timing)