var_defaults = c(
output_dir = ".",
dp_data_dir = "none",
wc_dir = "none",
sample_list = "none",
all_gene_gr_file = "./reference_files/all_gene_gr_hg19.Rdata",
strip_vs_when_matching = FALSE
)
args = commandArgs(trailingOnly = T)
if (length(args) < 4) {
stop(paste0("At least 4 arguments must be passed, in the following order: name, dp_annotated_input_folder, driver_symbol_file, nonsyn_mut_file.
USAGE:
Rscript prepare_driver_mutations.R --args name dp_annotated_input_folder driver_symbol_file nonsyn_mut_file [output_dir=output_dir] [dp_data_dir=dp_data_dir] [wc_dir=wc_dir] [sample_list=sample_list]
"))
}
# pass name of tumour set as first argument (after --args)
name = args[1]
dp_annotated_input_folder = args[2]
driver_symbol_file = args[3]
nonsyn_mut_file = args[4]
# optional: pass file pattern (if different to "subclones.txt") as second argument (after --args)
if (length(args) > 4) {
split_args = strsplit(args[5:length(args)], "=")
for (x in split_args) { assign(x[1], x[2]) }
}
for (x in names(var_defaults)) {
if (!exists(x)) {
assign(x, var_defaults[x])
}
}
cat(paste0("Running with the following variable values:
dp_annotated_input_folder = ", dp_annotated_input_folder, "
driver_symbol_file = ", driver_symbol_file, "
nonsyn_mut_file = ", nonsyn_mut_file, "
output_dir = ", output_dir, "
dp_data_dir = ", dp_data_dir, "
wc_dir = ", wc_dir, "
sample_list = ", sample_list, "
all_gene_gr_file = ", all_gene_gr_file, "
"))
library(GenomicRanges)
if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive=TRUE)
}
driver_mut_file_for_timing = paste0(output_dir, "/", name, "_driver_mut_file_for_timing.txt")
nonsyn_muts = read.table(nonsyn_mut_file, header=T, stringsAsFactors = F, sep="\t")
stripped_nonsyn_mut_bcs = gsub("_vs.*", "", nonsyn_muts$Tumor_Barcode)
if (!dir.exists(dp_annotated_input_folder)) {
dir.create(dp_annotated_input_folder, recursive=TRUE)
}
if (sample_list != "none") {
samples_from_list=readLines(sample_list)
}
add_clonality_ccf_DPinput <- function(dp_input_file, dp_output_file, optima_info_file, dp_annotated_input){
dpinfo <- read.table(dp_input_file, header = T, sep = '\t', stringsAsFactors = F)
# load the optima info
optima_info <- read.table(optima_info_file, header = T, sep='\t',stringsAsFactors = F)
# load the cluster info
cluster_info <- read.table(dp_output_file, header = T, sep='\t',stringsAsFactors = F)
ltez = which(dpinfo$subclonal.fraction<=0)
if (length(ltez) > 0) {
for (ltezIndex in ltez) {
ciIndex = which(cluster_info$chr == dpinfo$chr[ltezIndex] & cluster_info$end == dpinfo$end[ltezIndex])
cluster_info = cluster_info[-ciIndex,]
}
}
# filter out VAF=0 and mutations without copy number info
dpinfo = dpinfo[!is.na(dpinfo$subclonal.fraction) & dpinfo$subclonal.fraction>0,]
# remove duplicate loci
dp_info_loci = paste(dpinfo[,"chr"], dpinfo[,"end"], sep="_")
cluster_info_loci = paste(cluster_info[,"chr"], cluster_info[,"end"], sep="_")
dp_info_dup_rows = which(dp_info_loci %in% dp_info_loci[duplicated(dp_info_loci)])
cluster_info_dup_rows = which(cluster_info_loci %in% cluster_info_loci[duplicated(cluster_info_loci)])
if (length(dp_info_dup_rows) > 0) { dpinfo = dpinfo[-dp_info_dup_rows,] }
if (length(cluster_info_dup_rows) > 0) { cluster_info = cluster_info[-cluster_info_dup_rows,] }
# filter out any mutations that are not included in BOTH DPClust input and output files (by chromosome and NT location)
# warn the user if any are filtered out
rownames(dpinfo) = paste(dpinfo[,"chr"], dpinfo[,"end"], sep="_")
rownames(cluster_info) = paste(cluster_info[,"chr"], cluster_info[,"end"], sep="_")
shared_rows = intersect(rownames(dpinfo), rownames(cluster_info))
input_only = setdiff(rownames(dpinfo), shared_rows)
output_only = setdiff(rownames(cluster_info), shared_rows)
if (length(input_only) > 0) {
chrs_exc = unique(dpinfo[input_only, "chr"])
if (length(chrs_exc) > 1) {
chr_word = "chromosomes"
chrs_exc_text = paste(chrs_exc, collapse=", ")
} else {
chr_word = "chromosome"
chrs_exc_text = chrs_exc
}
warning(paste0(length(input_only), " mutations on ", chr_word, " ", chrs_exc_text, " were excluded as they were not present in the DPClust output file."))
}
if (length(output_only) > 0) {
chrs_exc = unique(dpinfo[output_only, "chr"])
if (length(chrs_exc) > 1) {
chr_word = "chromosomes"
chrs_exc_text = paste(chrs_exc, collapse=", ")
} else {
chr_word = "chromosome"
chrs_exc_text = chrs_exc
}
warning(paste0(length(output_only), " mutations on ", chr_word, " ", chrs_exc_text, " were excluded as they were not present in the DPClust input file. That is unexpected."))
}
dpinfo = dpinfo[shared_rows,]
cluster_info = cluster_info[shared_rows,]
# check if the number of rows in the filtered dpinfo and cluster_info match - NOTE: the above line mean this should always be true
if (identical(nrow(dpinfo),nrow(cluster_info))){
dpinfo$cluster <- cluster_info$cluster
# position and number of clonal cluster (picking the cluster closest to 1)
dists = abs(optima_info[,2]-1)
position.clonalcluster = optima_info[which(dists==min(dists)),2]
clonalcluster = optima_info[which(dists==min(dists)),1]
# number mutations superclonal cluster - going to assign any mutations in the superclonal clusters CCF=1
superclonal = which(optima_info[,2]>1)
if (length(superclonal)>0){
superclonalcluster = optima_info[superclonal,1]
clonalcluster <- unique(c(clonalcluster,superclonalcluster))
}
clonality <- c()
ccf <- c()
# loop over the cluster allocation
for (j in 1:nrow(dpinfo)){
if (dpinfo$cluster[j]%in%clonalcluster){
clonality[j] <- 'clonal'
ccf[j] <- 1
} else if (!is.na(dpinfo$cluster[j])) {
clonality[j] <- 'subclonal'
ccf[j] <- optima_info[which(optima_info[,1]==dpinfo$cluster[j]),2]
} else {
clonality[j] = NA
ccf[j] = NA
}
}
dpinfo$clonality <- clonality
dpinfo$ccf <- ccf
theseAreNA = which(sapply(dpinfo$ccf, is.na))
if (length(theseAreNA) > 0) { dpinfo = dpinfo[-theseAreNA,] }
write.table(dpinfo, file = dp_annotated_input, row.names = F, quote = F, sep = '\t')
} else {
print(paste0('The number of mutations in the input for DPclust differs from the number of mutations in the output for files ', dp_input_file, ' and ', dp_output_file))
}
}
prepare_driver_mutations_file <- function(file_with_mutations, dp_annotated_input_folder, allsegments_file, driver_mut_file_for_timing){
# load the driver mutations we want to prepare for the ordering script
mutations_to_annotate <- read.table(file_with_mutations, header = T, sep = '\t', stringsAsFactors = F)
# load allsegments_file with ploidy info
allsegments <- read.table(allsegments_file, header = T, sep = '\t', stringsAsFactors = F)
# Tumour_Name, chr, startpos, endpos, nMaj1_A, nMin1_A, tumour_ploidy, CNA, noevent, w.mean, no.chrs.bearing.mut
annotated_mutations <- NULL
unique_samples <- unique(mutations_to_annotate$Tumour_Sample_Barcode)
print(unique_samples)
all_annotated_DPinput <- list.files(dp_annotated_input_folder)
# how many unique driver genes do we have to annotate?
number_genes <- length(unique(mutations_to_annotate$Hugo_Symbol))
# make a table to match the driver genes to the noevent
drivers_table <- cbind(unique(mutations_to_annotate$Hugo_Symbol), 1:number_genes)
drivers_table <- data.frame(drivers_table, stringsAsFactors = F)
colnames(drivers_table) <- c('gene', 'noevent')
# loop over the unique samples
for (i in 1:length(unique_samples)){
print(i)
mutations_to_annotate_sample <- mutations_to_annotate[mutations_to_annotate$Tumour_Sample_Barcode==unique_samples[i],]
# pick the annotated dpinput file for this sample
file_index <- grep(paste0(unique_samples[i], "_"),all_annotated_DPinput)
dpinput_sample <- read.table(file.path(dp_annotated_input_folder,all_annotated_DPinput[file_index]), header = T, sep = '\t', stringsAsFactors = F)
dpinput_mutations <- dpinput_sample[which(dpinput_sample[,1]%in%mutations_to_annotate_sample$Chromosome & dpinput_sample[,2]%in%mutations_to_annotate_sample$Start_Position),]
# make sure that dpinput_mutation and mutations_to_annotate_sample have the same order
mutations_to_annotate_sample <- mutations_to_annotate_sample[order(mutations_to_annotate_sample$Start_Position),]
dpinput_mutations <- dpinput_mutations[order(dpinput_mutations[,2]),]
# check if the positions match
ident_check = identical(mutations_to_annotate_sample$Start_Position, dpinput_mutations[,2])
print(ident_check)
# filter down to just those that are in DPC input AND are nonsynonymous mutations hitting genes in the driver list
if (!ident_check) {
mtas_id = apply(mutations_to_annotate_sample, 1, function(x) { paste(gsub(" *", "", x[c("Chromosome", "Start_Position")]), collapse="_") })
dpim_id = apply(dpinput_mutations, 1, function(x) { paste(gsub(" *", "", x[c("chr", "end")]), collapse="_") })
in_both = intersect(mtas_id, dpim_id)
mtas_in_both = mtas_id %in% in_both
dpim_in_both = dpim_id %in% in_both
mutations_to_annotate_sample = mutations_to_annotate_sample[mtas_in_both,]
dpinput_mutations = dpinput_mutations[dpim_in_both,]
}
n_rows <- nrow(dpinput_mutations)
# determine the noevent_vector for this sample - match each gene with the corresponding noevent from the table
noevent_vector <- drivers_table[match(mutations_to_annotate_sample$Hugo_Symbol, drivers_table[,1]),2]
# determine the tumour_ploidy for that sample
tumour_ploidy <- unique(allsegments$tumour_ploidy[grep(unique_samples[i], allsegments$Tumour_Name)])
if (length(tumour_ploidy) > 0) {
# clonality prefix
clonality_prefix <- ifelse(dpinput_mutations$clonality=='clonal','c','s')
nMajMin = as.data.frame(t(sapply(1:nrow(mutations_to_annotate_sample), function(j) {
allsegments[ which(allsegments$Tumour_Name==unique_samples[i] & allsegments$chr==mutations_to_annotate_sample$Chromosome[j] & allsegments$startpos<=mutations_to_annotate_sample$Start_Position[j] & allsegments$endpos>=mutations_to_annotate_sample$End_Position[j])[1], c("nMaj1_A","nMin1_A") ] # note this version has [1] after the "which" statement in order to catch an unusual case during testing, in which multiple fits exist for some samples. This should probably be removed, reverting to the previous (commented) line, when running properly, as one fit should be selected for each sample.
})))
to_add <- cbind(mutations_to_annotate_sample$Tumour_Sample_Barcode, # Tumour_Name
mutations_to_annotate_sample$Chromosome, # chr
mutations_to_annotate_sample$Start_Position, # startpos
mutations_to_annotate_sample$Start_Position, # endpos
unlist(nMajMin$nMaj1_A), # nMaj1_A
unlist(nMajMin$nMin1_A), # nMin1_A
rep(tumour_ploidy, n_rows), # tumour_ploidy
paste0(clonality_prefix,'Mut_',mutations_to_annotate_sample$Hugo_Symbol), # CNA
noevent_vector, # noevent
dpinput_mutations$ccf, # w.mean
dpinput_mutations$no.chrs.bearing.mut) # no.chrs.bearing.mut
annotated_mutations <- rbind(annotated_mutations, to_add)
}
}
# save the driver mutation data for the timing model
annotated_mutations <- data.frame(annotated_mutations, stringsAsFactors = F)
colnames(annotated_mutations) <- c('Tumour_Name', 'chr', 'startpos', 'endpos', 'nMaj1_A', 'nMin1_A', 'tumour_ploidy', 'CNA', 'noevent', 'w.mean', 'no.chrs.bearing.mut')
annotated_mutations$noevent = 1:nrow(annotated_mutations)
write.table(annotated_mutations,driver_mut_file_for_timing, row.names = F, col.names = T, sep = '\t', quote = F)
}
# If the annotated input file does not already exist, a value should be passed for dp_data_dir
# In this case, the following code block is run to generate the annotated input file
# Otherwise (i.e. if no value is passed for dp_data_dir) the annotated input file is assumed to exist already
run_annotation = FALSE
if (!dir.exists(dp_annotated_input_folder)) {
run_annotation = TRUE
} else {
if (length(dir(dp_annotated_input_folder, all.files=TRUE, no..=TRUE)) == 0) {
run_annotation = TRUE
}
}
if (run_annotation) {
dp_input_files = sort(list.files(dp_data_dir, pattern="_dpclust_input_data.txt", full.names=TRUE, recursive=TRUE))
dp_output_files = sort(list.files(dp_data_dir, pattern="_bestConsensusAssignments.bed", full.names=TRUE, recursive=TRUE))
if (wc_dir != "none") {
dp_optima_files = sort(list.files(wc_dir, pattern="_WCC_ccf_nMut.txt", full.names=TRUE, recursive=TRUE))
} else {
dp_optima_files = sort(list.files(dp_data_dir, pattern="_bestClusterInfo.txt", full.names=TRUE, recursive=TRUE))
}
if (sample_list != "none") {
dp_input_files = dp_input_files[sapply(samples_from_list, grep, dp_input_files)]
dp_output_files = dp_output_files[sapply(samples_from_list, grep, dp_output_files)]
dp_optima_files = dp_optima_files[sapply(samples_from_list, grep, dp_optima_files)]
}
if ((length(dp_input_files) != length(dp_output_files)) || (length(dp_output_files) != length(dp_optima_files))) {
stop(paste0("There are not an equal number of (consistently named) files ending _dpclust_input_data.txt, _bestConsensusAssignments.bed, and _bestClusterInfo.txt in ", dp_data_dir, ". Please copy the ones you need into a new folder and run using that."))
}
for (i in seq_along(dp_input_files)) {
bc = gsub("_dpclust_input_data.txt", "", basename(dp_input_files[i]))
print(paste0(i, ": ", bc))
add_clonality_ccf_DPinput(dp_input_files[i], dp_output_files[i], dp_optima_files[i], paste0(dp_annotated_input_folder, "/", bc, "_dp_annotated_input.txt"))
}
}
driver_symbols = unique(readLines(driver_symbol_file))
load(all_gene_gr_file)
driver_gr = all_gene_gr[all_gene_gr$symbol %in% driver_symbols]
driver_genes = driver_gr$symbol
dp_annotated_infiles = list.files(dp_annotated_input_folder, pattern="dp_annotated_input.txt", recursive=T, full.names=T)
if (sample_list != "none") {
dp_annotated_infiles = dp_annotated_infiles[ gsub("_dp_annotated_input.txt", "", basename(dp_annotated_infiles)) %in% gsub("_DNA$", "", samples_from_list) ]
}
gene_present_counts = rep(0, length(driver_genes))
for (dpc_file in dp_annotated_infiles) {
sample_df = read.table(dpc_file, header=T, stringsAsFactors=F)
sample_gr = GRanges(seqnames=sample_df[,1], ranges=IRanges(start=sample_df[,2], end=sample_df[,2]))
bc = gsub("_dp_annotated_input.txt", "", basename(dpc_file))
# NOTE: some datasets we have worked with have inconsistent naming conventions between the nonsyn muts file and the DPClust data, which included a _vs_[normal] suffix
# If this is the case, your comparison of the two will need to substring one of them in order to find the matches, e.g.:
#nonsyn = nonsyn_muts[nonsyn_muts$Tumor_Barcode == substr(bc,1,13),]
# Otherwise, just find matches directly:
#nonsyn = nonsyn_muts[nonsyn_muts$Tumor_Barcode == bc,]
# NEW APPROACH: allow optional stripping of everything after _vs (inclusive) when matching:
if (strip_vs_when_matching) {
match_bc = gsub("_vs.*", "", bc)
nonsyn_mut_bcs = stripped_nonsyn_mut_bcs
} else {
match_bc = bc
nonsyn_mut_bcs = nonsyn_muts$Tumor_Barcode
}
nonsyn = nonsyn_muts[which(nonsyn_mut_bcs == match_bc),]
nonsyn_gr = GRanges(seqnames=gsub("chr", "", nonsyn$Chromosome), ranges=IRanges(start=nonsyn$Start_Position, end=nonsyn$End_Position))
nonsyn_sample_gr = sample_gr[subjectHits(suppressWarnings(findOverlaps(nonsyn_gr, sample_gr))),]
ov = suppressWarnings(findOverlaps(driver_gr, nonsyn_sample_gr))
gene_present_counts[unique(queryHits(ov))] = gene_present_counts[unique(queryHits(ov))] + 1
}
names(gene_present_counts) = driver_genes
gene_freq = sort((gene_present_counts / length(dp_annotated_infiles)) * 100, decreasing=T)
gene_present_counts = sort(gene_present_counts, decreasing=T)
driver_symbols = names(gene_freq[gene_freq >= 3 & gene_present_counts >= 3])
driver_gr = driver_gr[driver_gr$symbol %in% driver_symbols]
driver_gr = driver_gr[seqnames(driver_gr) %in% c(1:22, "X"),]
driver_genes = driver_gr$symbol
driver_mutations = Reduce(rbind, lapply(dp_annotated_infiles, function(dp_in) {
sample_df = read.table(dp_in, header=T, stringsAsFactors=F)
sample_gr = GRanges(seqnames=sample_df[,1], ranges=IRanges(start=sample_df[,2], end=sample_df[,2]))
bc = gsub("_dp_annotated_input.txt", "", basename(dp_in))
# NOTE: some datasets we have worked with have inconsistent naming conventions between the nonsyn muts file and the DPClust data, which included a _vs_[normal] suffix
# If this is the case, your comparison of the two will need to substring one of them in order to find the matches, e.g.:
#nonsyn = nonsyn_muts[nonsyn_muts$Tumor_Barcode == substr(bc,1,13),]
# Otherwise, just find matches directly:
#nonsyn = nonsyn_muts[nonsyn_muts$Tumor_Barcode == bc,]
# NEW APPROACH: allow optional stripping of everything after _vs (inclusive) when matching:
if (strip_vs_when_matching) {
match_bc = gsub("_vs.*", "", bc)
nonsyn_mut_bcs = stripped_nonsyn_mut_bcs
} else {
match_bc = bc
nonsyn_mut_bcs = nonsyn_muts$Tumor_Barcode
}
nonsyn = nonsyn_muts[which(nonsyn_mut_bcs == match_bc),]
nonsyn_gr = GRanges(seqnames=gsub("chr", "", nonsyn$Chromosome), ranges=IRanges(start=nonsyn$Start_Position, end=nonsyn$End_Position))
nonsyn_sample_gr = sample_gr[subjectHits(suppressWarnings(findOverlaps(nonsyn_gr, sample_gr))),]
ov = suppressWarnings(findOverlaps(driver_gr, nonsyn_sample_gr))
if (length(ov) > 0) {
sample_driver_muts = nonsyn_sample_gr[subjectHits(ov),]
cbind(Hugo_Symbol=mcols(driver_gr[queryHits(ov),])[,'symbol'], Chromosome=as.character(seqnames(sample_driver_muts)), Start_Position=start(sample_driver_muts), End_Position=end(sample_driver_muts), Tumour_Sample_Barcode=bc)
} else {
NULL
}
}))
driver_mut_input_file = paste0(output_dir, "/driver_mut_input_file.txt")
write.table(driver_mutations, file=driver_mut_input_file, col.names=TRUE, row.names=FALSE, quote=FALSE, sep='\t')
all_segments_file=paste0(output_dir, "/", name, "_allsegments.txt")
prepare_driver_mutations_file(driver_mut_input_file, dp_annotated_input_folder, all_segments_file, driver_mut_file_for_timing)