library(GenomicRanges)
cyto_file = "path/to/data/cytoBand_hg38.txt"
driver_file = "path/to/data/sherlock_driver_symbols.txt"
gene_gr_file = "path/to/reference_files/all_gene_gr_hg38.Rdata"
get_event_ids <- function(mergedseg, cyto_file, driver_file, gene_gr_file) {
cyto = read.table(cyto_file, stringsAsFactors=FALSE, header=FALSE)
cyto_gr = GRanges(seqnames=gsub("chr", "", cyto[,1]), ranges=IRanges(start=cyto[,2], end=cyto[,3]), cytoband=cyto[,4])
ids = unique(mergedseg$ID)
names(ids) = ids
cna_i = grep("chr", ids)
cna_ids = ids[cna_i]
seg_mat = Reduce(rbind, strsplit(cna_ids, "_"))
if (ncol(seg_mat) == 3) {
colnames(seg_mat) = c("chr", "start", "end")
} else {
colnames(seg_mat) = c("event_type", "chr", "start", "end")
}
seg_gr = GRanges(seqnames=gsub("chr", "", seg_mat[,"chr"]), ranges=IRanges(start=as.numeric(seg_mat[,"start"]), end=as.numeric(seg_mat[,"end"])), id=cna_ids)
chr_cbs = lapply(split(cyto, cyto[,1]), function(x) {
list(`p`=x[grep("p", x[,4]),4], `q`=x[grep("q", x[,4]),4])
})
names(chr_cbs) = gsub("chr", "", names(chr_cbs))
ov = findOverlaps(seg_gr, cyto_gr)
cyto_ids = sapply(unique(queryHits(ov)), function(qhi) {
cbs = mcols(cyto_gr)[subjectHits(ov)[queryHits(ov) == qhi],'cytoband']
chr = as.vector(seqnames(cyto_gr)[subjectHits(ov)[queryHits(ov) == qhi]])[1]
if (length(cbs) > 1) {
arm = substr(cbs[1], 1, 1)
if (cbs[1] == chr_cbs[[chr]][[arm]][1] && cbs[length(cbs)] == chr_cbs[[chr]][[arm]][length(chr_cbs[[chr]][[arm]])]) {
paste0(chr, arm)
} else {
last_cb <- cbs[length(cbs)]
last_cb <- substr(last_cb, 2, nchar(last_cb))
paste0(chr, cbs[1], "-", last_cb)
}
} else {
paste0(chr, cbs[1])
}
})
ids[cna_i] = cyto_ids
ids = gsub("dMut_", "", ids)
ids[cna_i] = paste(sapply(names(ids)[cna_i], function(x) { gsub("d", "", unique(mergedseg[mergedseg$ID == x,"CNA"])) }), ids[cna_i], sep=": ")
all_drivers = unique(readLines(driver_file))
load(gene_gr_file)
driver_gr = all_gene_gr[all_gene_gr$symbol %in% all_drivers,]
drivers_in_region = sapply(names(ids), function(id) {
text_to_add = ""
if (length(grep("chr", id)) > 0) {
locus_split = strsplit(id, "_")[[1]]
locus_gr = GRanges(seqnames=gsub("chr", "", locus_split[1]), ranges=IRanges(start=as.numeric(locus_split[2]), end=as.numeric(locus_split[3])))
ol = suppressWarnings(findOverlaps(locus_gr, driver_gr))
if (length(ol) > 0) {
text_to_add = paste0("(", paste(driver_gr$symbol[subjectHits(ol)], collapse=", "), ")")
}
}
text_to_add
})
event_ids = gsub(" $", "", paste(ids, drivers_in_region[names(ids)]))
names(event_ids) = names(ids)
event_ids
}
event_ids = get_event_ids(mergedseg, cyto_file, driver_file, gene_gr_file)