SherlockLung-EvolutionaryTrajectory-Analysis / Ordering_Model / R / Binomial.R
Binomial.R
Raw
args = commandArgs(trailingOnly = T)

if (length(args) < 1) {
  stop(paste0("Please pass at least 1 argument - the name of the tumour set.

USAGE:
Rscript Binomial.R TUMOUR_SET_NAME [input_dir=INPUT_DIR] [subclones_dir=SUBCLONES_DIR] [output_dir=OUTPUT_DIR] [sample_list=SAMPLE_LIST]

"))
}
# pass name of tumour set as first argument (after --args)
tumour_set = args[1]

var_defaults = c(
  input_dir=".",
  subclones_dir=".",
  sample_list="none"
)

if (length(args) > 1) {
  split_args = strsplit(args[2:length(args)], "=")
  for (x in split_args) { assign(x[1], x[2]) }
}
for (x in names(var_defaults)) {
  if (!exists(x)) {
    assign(x, var_defaults[x])
  }
}

if (!exists("output_dir")) {
  assign("output_dir", input_dir)
}

cat(paste0("Running with the following variable values:

tumour_set = ", tumour_set, "
input_dir = ", input_dir, "
subclones_dir = ", subclones_dir, "
output_dir = ", output_dir, "
sample_list = ", sample_list, "

"))

library(GenomicRanges)

sc_files = list.files(subclones_dir, full.names=TRUE)

allsegsa <- read.table(paste0(input_dir, "/", tumour_set,"_allsegments.txt"),header=T,stringsAsFactors=F,sep="\t")
all_data <- allsegsa[allsegsa$frac1_A>0&allsegsa$frac1_A<=1&allsegsa$tumour_ploidy>0,]
if (sample_list != "none") {
  bc = readLines(sample_list)
} else {
  bc = unique(all_data$Tumour_Name)
}

CNA_types = c("Gain", "LOH", "HD")
cna_segment_counts = lapply(CNA_types, function(CNA_type) {
  if (CNA_type == "Gain") {
    CNA_data = all_data[(all_data$CNA=="sGain"|all_data$CNA=="cGain"|all_data$CNA=="sAmp"|all_data$CNA=="cAmp"),]
  } else {
    CNA_data = all_data[(all_data$CNA==paste0("s", CNA_type)|all_data$CNA==paste0("c", CNA_type)),]
  }

  CNA_ratios = sapply(bc, function(barcode) {
    sample_CNA_data = CNA_data[CNA_data$Tumour_Name == barcode,]
    cna_w = sum(width(reduce(GRanges(seqnames=sample_CNA_data$chr, ranges=IRanges(start=sample_CNA_data$startpos, end=sample_CNA_data$endpos)))))
	# grep command must include "fixed=TRUE" otherwise any dots in barcodes act as wildcards
        sample_sc = read.table(sc_files[grep(barcode, sc_files, fixed=TRUE)], header=TRUE, stringsAsFactors=FALSE)
    sc_w = sum(width(reduce(GRanges(seqnames=sample_sc$chr, ranges=IRanges(start=sample_sc$startpos, end=sample_sc$endpos)))))
	cna_w / sc_w
  })
  background_level = mean(CNA_ratios)
  
  n_samples = length(bc)

  segment_counts = read.table(paste0(input_dir, "/", tumour_set,"_", CNA_type, "_refsegs.txt"),header=T,stringsAsFactors=F)
  #segment_counts = read.table(paste0(input_dir, "/", tumour_set,"_", CNA_type, "_refsegs_fromGR.txt"),header=T,stringsAsFactors=F)

  # val - 1 as we want the probability of seeing AT LEAST q. With lower.tail=FALSE, pbinom gives the probability of observing > q instances.
  segment_counts$p.val = pbinom(q=segment_counts[,"val"] - 1, size=n_samples, prob=background_level, lower.tail=FALSE)
  segment_counts$bonf = p.adjust(segment_counts$p.val,method="bonferroni")
  segment_counts$fdr = p.adjust(segment_counts$p.val,method="fdr")

  segment_counts
})

for (i in seq_along(CNA_types)) {
  write.table(cna_segment_counts[[i]], paste0(output_dir, "/", tumour_set, "_samples_pvals_", CNA_types[i], ".txt"),sep="\t",quote=F)
  write.table(cna_segment_counts[[i]][cna_segment_counts[[i]]$bonf<=0.05,], paste0(output_dir, "/", tumour_set, "_samples_significant_pvals_Bonferroni_", CNA_types[i], ".txt"),sep="\t",quote=F)
  write.table(cna_segment_counts[[i]][cna_segment_counts[[i]]$fdr<=0.05,], paste0(output_dir, "/", tumour_set, "_samples_significant_pvals_FDR_", CNA_types[i], ".txt"),sep="\t",quote=F)
}