SherlockLung-EvolutionaryTrajectory-Analysis / Ordering_Model / R / TM_3_CN_landscape_CW.R
TM_3_CN_landscape_CW.R
Raw
var_defaults = c(
  input_dir=".",
  output_dir=".",
  chr_file="chromosome_coordinates_hg19.txt"
)

args = commandArgs(trailingOnly = T)

if (length(args) == 0) {
  stop(paste0("Please pass at least one arguments - the name of the tumour set.

USAGE:
Rscript TM_3_CN_landscape_CW_SGE.R TUMOUR_SET_NAME [input_dir=INPUT_DIR] [output_dir=OUTPUT_DIR] [chr_file=CHR_FILE]

"))
}
# pass name of tumour set as first argument
tumour_set = args[1]
# optional: pass file pattern (if different to "subclones.txt") as second argument
if (length(args) > 1) {
  split_args = strsplit(args[2:length(args)], "=")
  for (x in split_args) { assign(x[1], x[2]) }
}
for (x in names(var_defaults)) {
  if (!exists(x)) {
    assign(x, var_defaults[x])
  }
}

cat(paste0("Running with the following variable values:

tumour_set = ", tumour_set, "
input_dir = ", input_dir, "
output_dir = ", output_dir, "
chr_file = ", chr_file, "

"))

library(GenomicRanges)

if (!dir.exists(output_dir)) {
  cat(paste0("output_dir ", output_dir, " does not exist - creating."))
  dir.create(output_dir)
}

CNAs <- c("LOH","HD", "Gain","cLOH","cHD", "cGain","sLOH","sHD", "sGain")

#### script collates each segment for individual tumours to give an overall landscape for each copy number state
#### writes CNA_refsegs output

L=read.table(chr_file,header=T,stringsAsFactors=F)
maxchr=L$end
copy.number.data <- read.table(paste0(input_dir, "/", tumour_set, "_allsegments.txt"),header=T, stringsAsFactors=F)
copy.number.data <- copy.number.data[copy.number.data$frac1_A>0&copy.number.data$frac1_A<=1&copy.number.data$tumour_ploidy>0,]

chr_names <- c(1:22,"X")

for (CNA in CNAs) {

  # Include Amp segments as gains
  if (length(grep("Gain", CNA)) > 0) {
    pat = paste0(CNA, "|", gsub("Gain", "Amp", CNA))
  } else {
    pat = CNA
  }
  input.data <- copy.number.data[grep(pat, copy.number.data$CNA),]
  reduced_gr = suppressWarnings( Reduce(c, lapply(unique(input.data$Tumour_Name), function(barcode) {
    sample_data = input.data[input.data$Tumour_Name == barcode,]
    sample_gr = reduce(GRanges(seqnames=sample_data$chr, ranges=IRanges(start=sample_data$startpos, end=sample_data$endpos)))
  })) )
  data_rle = coverage(reduced_gr)

  data = Reduce(rbind, lapply(chr_names, function(chr_name) {
           if (chr_name %in% names(data_rle)) {
             chr_i = which(chr_names == chr_name)
             chr_mat = cbind(chr_name, NA, cumsum(runLength(data_rle)[[chr_name]]), runValue(data_rle)[[chr_name]])
             if (chr_mat[nrow(chr_mat),3] != maxchr[chr_i]) {
               chr_mat = rbind(chr_mat, c(chr_name, NA, maxchr[chr_i], 0))
             }
             chr_mat[,2] = c(0, as.numeric(chr_mat[1:(nrow(chr_mat)-1),3])+1)
             chr_mat
           } else {
             c(chr_name, 0, maxchr[which(chr_names == chr_name)], 0)
           }
         }))
  colnames(data) = c("chr", "sp", "ep", "val")
  data = as.data.frame(data)
  write.table(data, file = paste0(output_dir, "/", tumour_set, "_", CNA, "_refsegs.txt"), sep="\t", quote=F)

}