SherlockLung-EvolutionaryTrajectory-Analysis / Ordering_Model / R / TM_2_CNA_logic_CW.R
TM_2_CNA_logic_CW.R
Raw
#### script determines copy number states allocated to each segment
#### writes allsegments output

var_defaults = c(
  input_dir=".",
  output_dir=".",
  wgd_file="none"  
)

args = commandArgs(trailingOnly = T)

if (length(args) == 0) {
  stop(paste0("Please pass at least one argument - the name of the tumour set.

USAGE:
Rscript TM_2_CNA_logic_CW.R TUMOUR_SET_NAME [input_dir=INPUT_DIR] [output_dir=OUTPUT_DIR]

"))
}

# pass name of tumour set as first argument (after --args)
tumour_set = args[1]
# optional: pass file pattern (if different to "subclones.txt") as second argument (after --args)
if (length(args) > 1) {
  split_args = strsplit(args[2:length(args)], "=")
  for (x in split_args) { assign(x[1], x[2]) }
}
for (x in names(var_defaults)) {
  if (!exists(x)) {
    assign(x, var_defaults[x])
  }
}

cat(paste0("Running with the following variable values:

tumour_set = ", tumour_set, "
input_dir = ", input_dir, "
output_dir = ", output_dir, "
wgd_file = ", wgd_file, "

"))

# Annotate segments as their appropriate copy number varaiant type, or NoCNV
CNA_logic = function (tumour.type, input_dir, output_dir, wgd_file) {  
  hd <- c("Tumour_Name",  "chr", "startpos", "endpos","nMaj1_A","nMin1_A", "frac1_A", "SDfrac_A", "tumour_ploidy")
  
  print(tumour.type)
  
  allsegs <- read.table(paste0(input_dir, "/", tumour.type,"_allsegs.txt"), header = T, stringsAsFactors = F)
  tot=dim(allsegs)[1]

  if (wgd_file != "none" && tot > 0) {
    wgd_status = read.table(wgd_file, sep="\t", stringsAsFactors=FALSE)
    sample_ploidies = unique(allsegs[,c("Tumour_Name","tumour_ploidy")])
    for (row_i in 1:nrow(sample_ploidies)) {
      barcode = sample_ploidies[row_i,1]
      ploidy = sample_ploidies[row_i,2]
      sample_wgd_status = wgd_status[wgd_status[,1] == barcode,2]
      # in some sample sets, dots have been replaced with underscores
      # if a WGD status is not found 
      if (length(sample_wgd_status) == 0) {
        sample_wgd_status = wgd_status[gsub("\\.", "_", wgd_status[,1]) == barcode,2]
      }
      if (sample_wgd_status == "nWGD") {
        new_ploidy = 2
      } else if (ploidy > 4) {
        new_ploidy = ploidy
      } else {
        new_ploidy = 4
      }
      if (new_ploidy != ploidy) {
        allsegs[allsegs[,"Tumour_Name"] == barcode,"tumour_ploidy"] = new_ploidy
      }
    }
  }

  # If you have clonal LOH, subclonal LOH is not counted as we look for the first occurrence of a given event
  # Losses annotated but will not be counted in calculations of enriched regions
  allsegsa <- NULL
  CNA <- NULL
  for (i in 1:dim(allsegs)[1]) {
    if (is.na(allsegs$nMaj2_A[i])) {
      if (allsegs$nMin1_A[i] == 0 & allsegs$nMaj1_A[i] == 0) {
        allsegsa <- rbind(allsegsa, allsegs[i,c(1:7,11:12)])
        CNA <- c(CNA, "cHD")
      }
      if (xor(allsegs$nMin1_A[i] == 0, allsegs$nMaj1_A[i] == 0)) {
        allsegsa <- rbind(allsegsa, allsegs[i,c(1:7,11:12)])
        CNA <- c(CNA, "cLOH")
      }
      if ((allsegs$nMaj1_A[i] > allsegs$tumour_ploidy[i]/2) |
            (allsegs$nMin1_A[i] > allsegs$tumour_ploidy[i]/2)) {
        if ((allsegs$nMaj1_A[i] > allsegs$tumour_ploidy[i]*2) |
              (allsegs$nMin1_A[i] > allsegs$tumour_ploidy[i]*2)) {
          allsegsa <- rbind(allsegsa, allsegs[i,c(1:7,11:12)])
          CNA <- c(CNA, "cAmp")
        }
        else {
          allsegsa <- rbind(allsegsa, allsegs[i,c(1:7,11:12)])
          CNA <- c(CNA, "cGain")
        }
      }
      if ((allsegs$nMaj1_A[i] == allsegs$tumour_ploidy[i]/2) &
            (allsegs$nMin1_A[i] == allsegs$tumour_ploidy[i]/2)) {
        allsegsa <- rbind(allsegsa, allsegs[i,c(1:7,11:12)])
        CNA <- c(CNA, "NoCNV")
      }
      if ((allsegs$nMaj1_A[i] < allsegs$tumour_ploidy[i]/2) |
            (allsegs$nMin1_A[i] < allsegs$tumour_ploidy[i]/2)) {
        allsegsa <- rbind(allsegsa, allsegs[i,c(1:7,11:12)])
        CNA <- c(CNA, "cLoss")
      }
    }
    if (!is.na(allsegs$nMaj2_A[i])) {
      if (allsegs$nMin1_A[i] == 0 & allsegs$nMaj1_A[i] == 0) {
        CNA <- c(CNA, "sHD")
        allsegsa <- rbind(allsegsa, allsegs[i,c(1:7,11:12)])
      }
      if ((allsegs$nMin1_A[i] == 0 | allsegs$nMaj1_A[i] == 0) &
            xor(allsegs$nMin2_A[i] == 0, allsegs$nMaj2_A[i] == 0)) {
        CNA <- c(CNA, "cLOH")
        tmp <- allsegs[i,c(1:4,8:12)]
        names(tmp) <- hd
        allsegsa <- rbind(allsegsa, tmp[,hd])
      }
      else if (xor(allsegs$nMin1_A[i] == 0, allsegs$nMaj1_A[i] == 0)) {
        CNA <- c(CNA, "sLOH")
        allsegsa <- rbind(allsegsa, allsegs[i,c(1:7,11:12)])
      }
      if ((allsegs$nMaj1_A[i] > allsegs$tumour_ploidy[i]/2 |
             allsegs$nMin1_A[i] > allsegs$tumour_ploidy[i]/2) &
            (allsegs$nMaj2_A[i] > allsegs$tumour_ploidy[i]/2 |
               allsegs$nMin2_A[i] > allsegs$tumour_ploidy[i]/2)) {
        if ((allsegs$nMaj1_A[i] > allsegs$tumour_ploidy[i]*2 |
               allsegs$nMin1_A[i] > allsegs$tumour_ploidy[i]*2) &
              (allsegs$nMaj2_A[i] > allsegs$tumour_ploidy[i]*2 |
                 allsegs$nMin2_A[i] > allsegs$tumour_ploidy[i]*2)) {
          CNA <- c(CNA, "cAmp")
          allsegsa <- rbind(allsegsa, allsegs[i,c(1:7,11:12)])
        }
        else {
          CNA <- c(CNA, "cGain")
          allsegsa <- rbind(allsegsa, allsegs[i,c(1:7,11:12)])
        }
      }
      if ((allsegs$nMin2_A[i] > allsegs$tumour_ploidy[i]/2 & allsegs$nMin1_A[i] != allsegs$nMin2_A[i])|
            (allsegs$nMaj2_A[i] > allsegs$tumour_ploidy[i]/2 & allsegs$nMaj1_A[i] != allsegs$nMaj2_A[i])) {
        if ((allsegs$nMin2_A[i] > allsegs$tumour_ploidy[i]*2 & allsegs$nMin1_A[i] != allsegs$nMin2_A[i]) |
              (allsegs$nMaj2_A[i] > allsegs$tumour_ploidy[i]*2 & allsegs$nMaj1_A[i] != allsegs$nMaj2_A[i])) {
          CNA <- c(CNA, "sAmp")
          tmp <- allsegs[i,c(1:4,8:12)]
          names(tmp) <- hd
          allsegsa <- rbind(allsegsa, tmp[,hd])
        }
        else {
          CNA <- c(CNA, "sGain")
          tmp <- allsegs[i,c(1:4,8:12)]
          names(tmp) <- hd
          allsegsa <- rbind(allsegsa, tmp[,hd])
        }
      }
      if ((allsegs$nMaj1_A[i] < allsegs$tumour_ploidy[i]/2 |
             allsegs$nMin1_A[i] < allsegs$tumour_ploidy[i]/2) &
            (allsegs$nMaj2_A[i] < allsegs$tumour_ploidy[i]/2 |
               allsegs$nMin2_A[i] < allsegs$tumour_ploidy[i]/2)) {
        CNA <- c(CNA, "cLoss")
        tmp <- allsegs[i,c(1:4,8:12)]
        names(tmp) <- hd
        allsegsa <- rbind(allsegsa, tmp[,hd])
      }
      if (((allsegs$nMaj1_A[i] < allsegs$tumour_ploidy[i]/2 & allsegs$nMaj1_A[i] != allsegs$nMaj2_A[i]) |
             (allsegs$nMin1_A[i] < allsegs$tumour_ploidy[i]/2 & allsegs$nMin1_A[i] != allsegs$nMin2_A[i]))) {
        allsegsa <- rbind(allsegsa, allsegs[i,c(1:7,11:12)])
        CNA <- c(CNA, "sLoss")      
      }
    }
    if(i %% 100 ==0){
      print(paste(i,"/",tot))
    }
  }  
  
  allsegsa <- cbind(allsegsa, CNA)
  allsegsa$frac1_A[allsegsa$CNA == "cGain"|allsegsa$CNA == "cAmp"|allsegsa$CNA == "cLOH"|allsegsa$CNA == "cHD"|allsegsa$CNA == "cLoss"] <- 1
  write.table(allsegsa, file = paste0(output_dir, "/", tumour.type,"_allsegments.txt"),sep="\t",quote = F,row.names = F)
  print("CNALogic done")
}
CNA_logic(tumour.type=tumour_set, input_dir, output_dir, wgd_file)