Ordering_Model/R/TM_postFDR_1_enriched_CW.R · SherlockLung-EvolutionaryTrajectory-Analysis

# NAP: this script merges segments with significant enrichment (FDR<0.05) into bigger enriched regions and removes known artefacts

var_defaults = c(
  chr_file="chromosome_coordinates_hg19.txt",
  input_dir=".",
  output_dir=".",
  min_region=1e4, # minimum length of enriched region; default at 1e4 to remove artefact peaks
  minN=3, # minimum of maximum N per segment to be used; takes care of rare/singleton events
  centro_file="none",
  remove_acrocentric="TRUE"
)

args = commandArgs(trailingOnly = T)

if (length(args) == 0) {
  stop(paste0("Please pass at least one argument - the name of the tumour set.

USAGE:
Rscript TM_postFDR_1_enriched_CW.R TUMOUR_SET_NAME [input_dir=INPUT_DIR] [output_dir=OUTPUT_DIR] [chr_file=CHR_FILE]

"))
}

# pass name of tumour set as first argument
tumour_set = args[1]
if (length(args) > 1) {
  split_args = strsplit(args[2:length(args)], "=")
  for (x in split_args) { assign(x[1], x[2]) }
}
for (x in names(var_defaults)) {
  if (!exists(x)) {
    assign(x, var_defaults[x])
  }
}

cat(paste0("Running with the following variable values:

tumour_set = ", tumour_set, "
input_dir = ", input_dir, "
output_dir = ", output_dir, "
chr_file = ", chr_file, "
min_region = ", min_region, "
minN = ", minN, "
centro_file = ", centro_file, "
remove_acrocentric = ", remove_acrocentric, "

"))

min_region = as.numeric(min_region)
minN = as.numeric(minN)
remove_acrocentric = as.logical(remove_acrocentric)

library(dplyr)
chr_loc=read.table(chr_file,header=T,stringsAsFactors = F)
groups=tumour_set
#groups=c("her2_nig_only_bc","her2_white_only_bc")
cna_type=c("Gain","LOH","HD")
#
#
# LOOP TO GENERATE OUTPUT FOR ALL THREE CNA TYPES
for (grp in 1:length(groups)){
  allsegments<-read.table(paste0(input_dir, "/", groups[grp],"_allsegments.txt"),sep="\t",stringsAsFactors=F,header=T)
  Ntumour=length(unique(allsegments$Tumour_Name))
  for (cna in 1:length(cna_type)){
    data=read.table(paste0(input_dir, "/", groups[grp],"_samples_significant_pvals_FDR_",cna_type[cna],".txt"),header=T,stringsAsFactors=F)
    print(paste(groups[grp],cna_type[cna],"file read"))
    data<-data.frame("chr" = data$chr, "sp" = data$sp, "ep"= data$ep, "val"= data$val, "fdr"= data$fdr)
    data=data[which(data$fdr<0.05),] #just checking - all fdr must be below 0.05 from FDRSummary_*.R code
    print(dim(data))
    if (nrow(data)>0){
      #cumulative chromosome coordinate generation
      v=NULL
      for (i in c(1:22,"X")){
        data.chr<-data[data$chr %in% i,]
        norow<-nrow(data.chr)
        if(i=="X"){
          j=23
        }else{
          j=i
        }
        v[j]=norow #number of rows in data for each chromomsome
      }
      v2=cumsum(v)

      all.sig.data=data.frame()
      for (i in c(1:22,"X")){
        data.chr<-data[data$chr %in% i,]
  
        if (nrow(data.chr)>0){
          segnumber<-nrow(data.chr)
          tempvector=NULL
          sig.data=NULL
          no.chr=ifelse(i=="X",22,as.numeric(i)-1)
          for( j in 1:segnumber){
            k=ifelse(no.chr==0,j,v2[no.chr]+j) #NAP: cumulative row numbering
            sig.data.segment<-data.frame("chr"=i, "startpos"= data.chr$sp[j], "endpos"= data.chr$ep[j], "eventindices"= k)
            all.sig.data<-rbind(all.sig.data,sig.data.segment)
            sig.data.segment=NULL
          }
          print(nrow(all.sig.data))
        } else {
          print(paste("chr",i,"has no enriched segments"))
        }
      }
      all.sig.data$chr=as.character(all.sig.data$chr)
      
      enriched=data.frame()
      chrs=unique(all.sig.data$chr)
      
      for (i in 1:length(chrs)){
        seg_count=nrow(enriched)
        CHR=all.sig.data[which(all.sig.data$chr==chrs[i]),]
        if (nrow(CHR)==1){
          enriched=rbind(enriched,data.frame(chr=chrs[i],startpos=CHR$startpos,endpos=CHR$endpos,length=CHR$endpos-CHR$startpos))
        } else {
          start=CHR$startpos[1] 
          for (j in 2:nrow(CHR)){
            if (CHR$startpos[j]<=CHR$endpos[j-1]+1e4){ # change interval to appropriate level (e.g. 1e4 or 1e5)
              end=CHR$endpos[j] # include the new row (j) in the merge
            } else {
              end=CHR$endpos[j-1] # stop merge at the previous row (j-1)
              enriched=rbind(enriched,data.frame(chr=chrs[i],startpos=start,endpos=end,length=(end-start)+1))
              start=CHR$startpos[j]
            }
          }
          if (start==CHR$startpos[1] & end==CHR$endpos[nrow(CHR)]) {
            enriched=rbind(enriched,data.frame(chr=chrs[i],startpos=start,endpos=end,length=(end-start)+1)) # if all segments were adjacent to each other
            print("one large segment")
          } else {
            enriched=rbind(enriched,data.frame(chr=chrs[i],startpos=start,endpos=CHR$endpos[j],length=(CHR$endpos[j]-start)+1)) # if final segment is distant from previous
            print("final segment distant from previous")
          }
        }
        segs_per_chr=nrow(enriched)-seg_count
        print(paste(chrs[i],"= ",segs_per_chr,"enriched region(s)"))
      }
      enriched$chr=as.character(enriched$chr)
      
      #identify enriched regions which are at telomeres and REMOVE ######################################################
      chrom=as.vector(unique(enriched$chr))
      ENRICHED=data.frame()

      for (i in 1:length(chrom)) {
        J=NULL
        CHROM=enriched[which(enriched$chr==chrom[i]),]
        for (j in 1:nrow(CHROM)) {
          if (CHROM$endpos[j]>=chr_loc[ifelse(chrom[i]!="X",chrom[i],23),]$end-1e6 & CHROM$length[j]<=5e6){ # length increased to 5Mb from 2Mb
            J=append(J,j)
          }
          if (CHROM$startpos[j]<1e6 & CHROM$length[j]<=2e6){
            J=append(J,j)
          }
        }
        if (!is.null(J)){
          print(paste("enriched segment",J,"removed from chromosome",chrom[i]))
          CHROM=CHROM[-J,]
        }
        if (nrow(CHROM)>0){
          ENRICHED=rbind(ENRICHED,CHROM)
        }  else {
          print(paste("Chromosome",chrom[i],": only telomeric and removed"))
        }
      }
      
      #identify enriched regions which are at centromeres and REMOVE ######################################################
      if (centro_file != "none") {
        centromere_loci = read.table(centro_file, header=T, sep='\t', stringsAsFactors=FALSE)
        if (nrow(ENRICHED)>0){
          library(GenomicRanges)
          enriched_gr = GRanges(seqnames = ENRICHED$chr, ranges=IRanges(start = ENRICHED$startpos, end = ENRICHED$endpos))
          centro_gr = as(centromere_loci, "GRanges")
          #to_remove = queryHits(suppressWarnings(findOverlaps(enriched_gr, centro_gr)))
          hits = suppressWarnings(findOverlaps(enriched_gr, centro_gr))
          if (length(hits) > 0) {
            p <- Pairs(enriched_gr, centro_gr, hits = hits)
            ol_inter <- pintersect(p)
            # remove segments in which over 50% overlaps with centromere
            to_remove = queryHits(hits)[width(ol_inter) / width(enriched_gr[queryHits(hits),]) > 0.5]
          } else {
            to_remove = c()
          }
		  
          if (length(to_remove)>0){
            ENRICHED = ENRICHED[-to_remove,]
            print(paste("number of centromeric segments removed =",length(to_remove)))
          } else {
            print("no centromere overlap")
          }
        }
      }
      
      if (remove_acrocentric) {
        if (nrow(ENRICHED)>0) {
          acro_chrs = c("13","14","15","21","22")
          to_remove = which(apply(ENRICHED, 1, function(seg) {
            if (as.character(seg['chr']) %in% acro_chrs) {
              as.numeric(seg['startpos']) < as.numeric(centromere_loci[as.character(centromere_loci[,'chr']) == as.character(seg['chr']),'start'])
            } else {
              FALSE
            }
          }))
          if (length(to_remove)>0){
            ENRICHED = ENRICHED[-to_remove,]
            print(paste("number of segments removed due to hitting acrocentric p arms =",length(to_remove)))
          } else {
            print("no segments hitting acrocentric p arms")
          }
        }
      }
      
      #identify enriched regions which are at the HLA region and REMOVE ######################################################
      #https://www.ncbi.nlm.nih.gov/grc/human/regions/MHC?asm=GRCh37 MHC region based on hg37
      if (nrow(ENRICHED)>0){
        #hla=c(28477797,33448354)  # hg19 coords
        hla=c(28510120,33480577)  #hg38 coords
        HLA=ENRICHED[which(ENRICHED$chr==6 & ((ENRICHED$startpos>hla[1] & ENRICHED$endpos<hla[2])|(ENRICHED$endpos>hla[1] & ENRICHED$startpos<hla[1])|(ENRICHED$startpos<hla[2] & ENRICHED$endpos>hla[2]))),]
        if (nrow(HLA)>0){
          print(HLA)
          print(paste("number of HLA segments removed =",nrow(HLA)))
        } else {
          print("no HLA overlap")
        }
        ENRICHED=setdiff(ENRICHED,HLA)
      }
      
      # REMOVE SINGLETONS/RARE DOUBLETS when N is high:
      rare=NULL
      if (nrow(ENRICHED)>0){
        for (i in 1:nrow(ENRICHED)){
          print(i)
          ENRICHEDi=ENRICHED[i,]
          idata=data[which(data$chr==ENRICHEDi$chr & data$sp<=ENRICHEDi$endpos & data$ep>=ENRICHEDi$startpos),]
          
          if (max(idata$val)<minN){
            rare=append(rare,i)
            print(paste("Event",i,"removed"))
          } else {
            print(paste("Event",i,"max =",max(idata$val)))
          }
        }
      }
      if (!is.null(rare)){
        ENRICHEDclean=ENRICHED[-rare,]
      } else {
        ENRICHEDclean=ENRICHED
      }
      
      # for noevent:
      if (nrow(ENRICHEDclean)>0){
        ENRICHEDclean=ENRICHEDclean[which(ENRICHEDclean$length>min_region),]
        print(range(ENRICHEDclean$length))
        ENRICHEDclean$length=NULL
        ENRICHEDclean$noevent=1:nrow(ENRICHEDclean)
        print(paste("No. of enriched regions=",nrow(ENRICHEDclean)))
        print(ENRICHEDclean)
        write.table(ENRICHEDclean, paste0(output_dir, "/", groups[grp],"_enriched_regions_",cna_type[cna],"_noevent.txt"), sep="\t",quote=F,row.names = F)
      } else {
        print(paste(groups[grp],cna_type[cna],":no file written out - no enriched region remained after correction"))
      }
    } else {
      print(paste(paste0(groups[grp],"_samples_significant_pvals_FDR_",cna_type[cna],".txt"),"is empty"))
    }
  }
}