# NAP: this script merges segments with significant enrichment (FDR<0.05) into bigger enriched regions and removes known artefacts
var_defaults = c(
chr_file="chromosome_coordinates_hg19.txt",
input_dir=".",
output_dir=".",
min_region=1e4, # minimum length of enriched region; default at 1e4 to remove artefact peaks
minN=3, # minimum of maximum N per segment to be used; takes care of rare/singleton events
centro_file="none",
remove_acrocentric="TRUE"
)
args = commandArgs(trailingOnly = T)
if (length(args) == 0) {
stop(paste0("Please pass at least one argument - the name of the tumour set.
USAGE:
Rscript TM_postFDR_1_enriched_CW.R TUMOUR_SET_NAME [input_dir=INPUT_DIR] [output_dir=OUTPUT_DIR] [chr_file=CHR_FILE]
"))
}
# pass name of tumour set as first argument
tumour_set = args[1]
if (length(args) > 1) {
split_args = strsplit(args[2:length(args)], "=")
for (x in split_args) { assign(x[1], x[2]) }
}
for (x in names(var_defaults)) {
if (!exists(x)) {
assign(x, var_defaults[x])
}
}
cat(paste0("Running with the following variable values:
tumour_set = ", tumour_set, "
input_dir = ", input_dir, "
output_dir = ", output_dir, "
chr_file = ", chr_file, "
min_region = ", min_region, "
minN = ", minN, "
centro_file = ", centro_file, "
remove_acrocentric = ", remove_acrocentric, "
"))
min_region = as.numeric(min_region)
minN = as.numeric(minN)
remove_acrocentric = as.logical(remove_acrocentric)
library(dplyr)
chr_loc=read.table(chr_file,header=T,stringsAsFactors = F)
groups=tumour_set
#groups=c("her2_nig_only_bc","her2_white_only_bc")
cna_type=c("Gain","LOH","HD")
#
#
# LOOP TO GENERATE OUTPUT FOR ALL THREE CNA TYPES
for (grp in 1:length(groups)){
allsegments<-read.table(paste0(input_dir, "/", groups[grp],"_allsegments.txt"),sep="\t",stringsAsFactors=F,header=T)
Ntumour=length(unique(allsegments$Tumour_Name))
for (cna in 1:length(cna_type)){
data=read.table(paste0(input_dir, "/", groups[grp],"_samples_significant_pvals_FDR_",cna_type[cna],".txt"),header=T,stringsAsFactors=F)
print(paste(groups[grp],cna_type[cna],"file read"))
data<-data.frame("chr" = data$chr, "sp" = data$sp, "ep"= data$ep, "val"= data$val, "fdr"= data$fdr)
data=data[which(data$fdr<0.05),] #just checking - all fdr must be below 0.05 from FDRSummary_*.R code
print(dim(data))
if (nrow(data)>0){
#cumulative chromosome coordinate generation
v=NULL
for (i in c(1:22,"X")){
data.chr<-data[data$chr %in% i,]
norow<-nrow(data.chr)
if(i=="X"){
j=23
}else{
j=i
}
v[j]=norow #number of rows in data for each chromomsome
}
v2=cumsum(v)
all.sig.data=data.frame()
for (i in c(1:22,"X")){
data.chr<-data[data$chr %in% i,]
if (nrow(data.chr)>0){
segnumber<-nrow(data.chr)
tempvector=NULL
sig.data=NULL
no.chr=ifelse(i=="X",22,as.numeric(i)-1)
for( j in 1:segnumber){
k=ifelse(no.chr==0,j,v2[no.chr]+j) #NAP: cumulative row numbering
sig.data.segment<-data.frame("chr"=i, "startpos"= data.chr$sp[j], "endpos"= data.chr$ep[j], "eventindices"= k)
all.sig.data<-rbind(all.sig.data,sig.data.segment)
sig.data.segment=NULL
}
print(nrow(all.sig.data))
} else {
print(paste("chr",i,"has no enriched segments"))
}
}
all.sig.data$chr=as.character(all.sig.data$chr)
enriched=data.frame()
chrs=unique(all.sig.data$chr)
for (i in 1:length(chrs)){
seg_count=nrow(enriched)
CHR=all.sig.data[which(all.sig.data$chr==chrs[i]),]
if (nrow(CHR)==1){
enriched=rbind(enriched,data.frame(chr=chrs[i],startpos=CHR$startpos,endpos=CHR$endpos,length=CHR$endpos-CHR$startpos))
} else {
start=CHR$startpos[1]
for (j in 2:nrow(CHR)){
if (CHR$startpos[j]<=CHR$endpos[j-1]+1e4){ # change interval to appropriate level (e.g. 1e4 or 1e5)
end=CHR$endpos[j] # include the new row (j) in the merge
} else {
end=CHR$endpos[j-1] # stop merge at the previous row (j-1)
enriched=rbind(enriched,data.frame(chr=chrs[i],startpos=start,endpos=end,length=(end-start)+1))
start=CHR$startpos[j]
}
}
if (start==CHR$startpos[1] & end==CHR$endpos[nrow(CHR)]) {
enriched=rbind(enriched,data.frame(chr=chrs[i],startpos=start,endpos=end,length=(end-start)+1)) # if all segments were adjacent to each other
print("one large segment")
} else {
enriched=rbind(enriched,data.frame(chr=chrs[i],startpos=start,endpos=CHR$endpos[j],length=(CHR$endpos[j]-start)+1)) # if final segment is distant from previous
print("final segment distant from previous")
}
}
segs_per_chr=nrow(enriched)-seg_count
print(paste(chrs[i],"= ",segs_per_chr,"enriched region(s)"))
}
enriched$chr=as.character(enriched$chr)
#identify enriched regions which are at telomeres and REMOVE ######################################################
chrom=as.vector(unique(enriched$chr))
ENRICHED=data.frame()
for (i in 1:length(chrom)) {
J=NULL
CHROM=enriched[which(enriched$chr==chrom[i]),]
for (j in 1:nrow(CHROM)) {
if (CHROM$endpos[j]>=chr_loc[ifelse(chrom[i]!="X",chrom[i],23),]$end-1e6 & CHROM$length[j]<=5e6){ # length increased to 5Mb from 2Mb
J=append(J,j)
}
if (CHROM$startpos[j]<1e6 & CHROM$length[j]<=2e6){
J=append(J,j)
}
}
if (!is.null(J)){
print(paste("enriched segment",J,"removed from chromosome",chrom[i]))
CHROM=CHROM[-J,]
}
if (nrow(CHROM)>0){
ENRICHED=rbind(ENRICHED,CHROM)
} else {
print(paste("Chromosome",chrom[i],": only telomeric and removed"))
}
}
#identify enriched regions which are at centromeres and REMOVE ######################################################
if (centro_file != "none") {
centromere_loci = read.table(centro_file, header=T, sep='\t', stringsAsFactors=FALSE)
if (nrow(ENRICHED)>0){
library(GenomicRanges)
enriched_gr = GRanges(seqnames = ENRICHED$chr, ranges=IRanges(start = ENRICHED$startpos, end = ENRICHED$endpos))
centro_gr = as(centromere_loci, "GRanges")
#to_remove = queryHits(suppressWarnings(findOverlaps(enriched_gr, centro_gr)))
hits = suppressWarnings(findOverlaps(enriched_gr, centro_gr))
if (length(hits) > 0) {
p <- Pairs(enriched_gr, centro_gr, hits = hits)
ol_inter <- pintersect(p)
# remove segments in which over 50% overlaps with centromere
to_remove = queryHits(hits)[width(ol_inter) / width(enriched_gr[queryHits(hits),]) > 0.5]
} else {
to_remove = c()
}
if (length(to_remove)>0){
ENRICHED = ENRICHED[-to_remove,]
print(paste("number of centromeric segments removed =",length(to_remove)))
} else {
print("no centromere overlap")
}
}
}
if (remove_acrocentric) {
if (nrow(ENRICHED)>0) {
acro_chrs = c("13","14","15","21","22")
to_remove = which(apply(ENRICHED, 1, function(seg) {
if (as.character(seg['chr']) %in% acro_chrs) {
as.numeric(seg['startpos']) < as.numeric(centromere_loci[as.character(centromere_loci[,'chr']) == as.character(seg['chr']),'start'])
} else {
FALSE
}
}))
if (length(to_remove)>0){
ENRICHED = ENRICHED[-to_remove,]
print(paste("number of segments removed due to hitting acrocentric p arms =",length(to_remove)))
} else {
print("no segments hitting acrocentric p arms")
}
}
}
#identify enriched regions which are at the HLA region and REMOVE ######################################################
#https://www.ncbi.nlm.nih.gov/grc/human/regions/MHC?asm=GRCh37 MHC region based on hg37
if (nrow(ENRICHED)>0){
#hla=c(28477797,33448354) # hg19 coords
hla=c(28510120,33480577) #hg38 coords
HLA=ENRICHED[which(ENRICHED$chr==6 & ((ENRICHED$startpos>hla[1] & ENRICHED$endpos<hla[2])|(ENRICHED$endpos>hla[1] & ENRICHED$startpos<hla[1])|(ENRICHED$startpos<hla[2] & ENRICHED$endpos>hla[2]))),]
if (nrow(HLA)>0){
print(HLA)
print(paste("number of HLA segments removed =",nrow(HLA)))
} else {
print("no HLA overlap")
}
ENRICHED=setdiff(ENRICHED,HLA)
}
# REMOVE SINGLETONS/RARE DOUBLETS when N is high:
rare=NULL
if (nrow(ENRICHED)>0){
for (i in 1:nrow(ENRICHED)){
print(i)
ENRICHEDi=ENRICHED[i,]
idata=data[which(data$chr==ENRICHEDi$chr & data$sp<=ENRICHEDi$endpos & data$ep>=ENRICHEDi$startpos),]
if (max(idata$val)<minN){
rare=append(rare,i)
print(paste("Event",i,"removed"))
} else {
print(paste("Event",i,"max =",max(idata$val)))
}
}
}
if (!is.null(rare)){
ENRICHEDclean=ENRICHED[-rare,]
} else {
ENRICHEDclean=ENRICHED
}
# for noevent:
if (nrow(ENRICHEDclean)>0){
ENRICHEDclean=ENRICHEDclean[which(ENRICHEDclean$length>min_region),]
print(range(ENRICHEDclean$length))
ENRICHEDclean$length=NULL
ENRICHEDclean$noevent=1:nrow(ENRICHEDclean)
print(paste("No. of enriched regions=",nrow(ENRICHEDclean)))
print(ENRICHEDclean)
write.table(ENRICHEDclean, paste0(output_dir, "/", groups[grp],"_enriched_regions_",cna_type[cna],"_noevent.txt"), sep="\t",quote=F,row.names = F)
} else {
print(paste(groups[grp],cna_type[cna],":no file written out - no enriched region remained after correction"))
}
} else {
print(paste(paste0(groups[grp],"_samples_significant_pvals_FDR_",cna_type[cna],".txt"),"is empty"))
}
}
}