args = commandArgs(trailingOnly = T)
if (length(args) < 1) {
stop(paste0("Please pass at least 1 argument - the name of the tumour set.
USAGE:
Rscript Binomial.R TUMOUR_SET_NAME [input_dir=INPUT_DIR] [subclones_dir=SUBCLONES_DIR] [output_dir=OUTPUT_DIR] [sample_list=SAMPLE_LIST]
"))
}
# pass name of tumour set as first argument (after --args)
tumour_set = args[1]
var_defaults = c(
input_dir=".",
subclones_dir=".",
sample_list="none"
)
if (length(args) > 1) {
split_args = strsplit(args[2:length(args)], "=")
for (x in split_args) { assign(x[1], x[2]) }
}
for (x in names(var_defaults)) {
if (!exists(x)) {
assign(x, var_defaults[x])
}
}
if (!exists("output_dir")) {
assign("output_dir", input_dir)
}
cat(paste0("Running with the following variable values:
tumour_set = ", tumour_set, "
input_dir = ", input_dir, "
subclones_dir = ", subclones_dir, "
output_dir = ", output_dir, "
sample_list = ", sample_list, "
"))
library(GenomicRanges)
sc_files = list.files(subclones_dir, full.names=TRUE)
allsegsa <- read.table(paste0(input_dir, "/", tumour_set,"_allsegments.txt"),header=T,stringsAsFactors=F,sep="\t")
all_data <- allsegsa[allsegsa$frac1_A>0&allsegsa$frac1_A<=1&allsegsa$tumour_ploidy>0,]
if (sample_list != "none") {
bc = readLines(sample_list)
} else {
bc = unique(all_data$Tumour_Name)
}
CNA_types = c("Gain", "LOH", "HD")
cna_segment_counts = lapply(CNA_types, function(CNA_type) {
if (CNA_type == "Gain") {
CNA_data = all_data[(all_data$CNA=="sGain"|all_data$CNA=="cGain"|all_data$CNA=="sAmp"|all_data$CNA=="cAmp"),]
} else {
CNA_data = all_data[(all_data$CNA==paste0("s", CNA_type)|all_data$CNA==paste0("c", CNA_type)),]
}
CNA_ratios = sapply(bc, function(barcode) {
sample_CNA_data = CNA_data[CNA_data$Tumour_Name == barcode,]
cna_w = sum(width(reduce(GRanges(seqnames=sample_CNA_data$chr, ranges=IRanges(start=sample_CNA_data$startpos, end=sample_CNA_data$endpos)))))
# grep command must include "fixed=TRUE" otherwise any dots in barcodes act as wildcards
sample_sc = read.table(sc_files[grep(barcode, sc_files, fixed=TRUE)], header=TRUE, stringsAsFactors=FALSE)
sc_w = sum(width(reduce(GRanges(seqnames=sample_sc$chr, ranges=IRanges(start=sample_sc$startpos, end=sample_sc$endpos)))))
cna_w / sc_w
})
background_level = mean(CNA_ratios)
n_samples = length(bc)
segment_counts = read.table(paste0(input_dir, "/", tumour_set,"_", CNA_type, "_refsegs.txt"),header=T,stringsAsFactors=F)
#segment_counts = read.table(paste0(input_dir, "/", tumour_set,"_", CNA_type, "_refsegs_fromGR.txt"),header=T,stringsAsFactors=F)
# val - 1 as we want the probability of seeing AT LEAST q. With lower.tail=FALSE, pbinom gives the probability of observing > q instances.
segment_counts$p.val = pbinom(q=segment_counts[,"val"] - 1, size=n_samples, prob=background_level, lower.tail=FALSE)
segment_counts$bonf = p.adjust(segment_counts$p.val,method="bonferroni")
segment_counts$fdr = p.adjust(segment_counts$p.val,method="fdr")
segment_counts
})
for (i in seq_along(CNA_types)) {
write.table(cna_segment_counts[[i]], paste0(output_dir, "/", tumour_set, "_samples_pvals_", CNA_types[i], ".txt"),sep="\t",quote=F)
write.table(cna_segment_counts[[i]][cna_segment_counts[[i]]$bonf<=0.05,], paste0(output_dir, "/", tumour_set, "_samples_significant_pvals_Bonferroni_", CNA_types[i], ".txt"),sep="\t",quote=F)
write.table(cna_segment_counts[[i]][cna_segment_counts[[i]]$fdr<=0.05,], paste0(output_dir, "/", tumour_set, "_samples_significant_pvals_FDR_", CNA_types[i], ".txt"),sep="\t",quote=F)
}