SherlockLung-EvolutionaryTrajectory-Analysis / Analyses / pre_wgd_event_distribution.R
pre_wgd_event_distribution.R
Raw
# load per-iteration ordering matrices
load("path/to/data/SherlockLung_LUAD_2025_DN2/SherlockLung_LUAD_2025_DN2_ordering_matrices_G1.Rdata")
nsda_om = ordering_matrices
load("path/to/data/SherlockLung_LUAD_2025_DN3/SherlockLung_LUAD_2025_DN3_ordering_matrices_G1.Rdata")
nsdb_om = ordering_matrices

### RUN FROM HERE FOR tumour_set = "SherlockLung_LUAD_2025_DN2" with nsda_prop_prewgd = get_proportion_prewgd(nsda_om, wgd_i, cna_i) AND THEN FOR tumour_set = "SherlockLung_LUAD_2025_DN3" with nsdb_prop_prewgd = get_proportion_prewgd(nsdb_om, wgd_i, cna_i)

tumour_set = "SherlockLung_LUAD_2025_DN2"
#tumour_set = "SherlockLung_LUAD_2025_DN3"
input_dir = paste0("path/to/data/", tumour_set, "/")
wgd_file = "path/to/data/SherlockLung_WGDStatus.txt"

G = 1

mergedseg=read.table(paste0(input_dir, "/", tumour_set,"_mergedseg_G1.txt"),header=T,stringsAsFactors = F)
matrix_coresp=read.table(paste0(input_dir, "/", tumour_set,"_matrix_coresp_G1.txt"),header=T,stringsAsFactors = F)
matrix_out=read.table(paste0(input_dir, "/", tumour_set,"_matrix_out_G1.txt"),header=T,stringsAsFactors = F)

cnai = which(mergedseg$CNA %in% c("dGain","dLOH","dHD"))
mergedseg$ID[cnai] = sapply(cnai, function(i) { paste0(gsub("d", "", mergedseg$CNA[i]), "_", mergedseg$ID[i]) })

if (wgd_file != "none") {
  wgd_status = read.table(wgd_file, stringsAsFactors=FALSE)
  all_wgd_samples = wgd_status[wgd_status[,2] == "WGD",1]
  wgd_samples = list(intersect(unique(mergedseg$Tumour_Name), all_wgd_samples))
} else {
  wgd_samples = list(unique(mergedseg[mergedseg$tumour_ploidy > 2,'Tumour_Name']))
}

event.id=unique(mergedseg[,c("ID","noevent","CNA")])
if (including_wgd) {
  event.id=rbind(data.frame(ID="Whole_Genome_Duplication",noevent=(length(unique(mergedseg$ID))+1),CNA="cWGD",stringsAsFactors = F),event.id)
}
event.id$cna=base::substring(event.id$CNA,first = 2)
event.id=unique(event.id[,c("ID","noevent","cna")])

wgd_i = event.id[event.id$cna == "WGD","noevent"]
cna_i = event.id[event.id$cna %in% c("LOH","Gain","HD"),"noevent"]

get_num_prewgd_cna = function(om, wgd_i, cna_i) {
  wgd_rn = which(om[[1]][,wgd_i] > 0)
  sapply(wgd_rn, function(samp_i) {
    mean(sapply(om, function(ordering_mat) {
      pre_wgd = which(ordering_mat[samp_i,] > 0 & ordering_mat[samp_i,] < ordering_mat[samp_i,wgd_i])
	  sum(pre_wgd %in% cna_i)
    }))
  })
}

nsda_prewgd_events = nsda_om[[1]][nsda_om[[1]][,ncol(nsda_om[[1]])] > 0,ncol(nsda_om[[1]])] - 1
nsdb_prewgd_events = nsdb_om[[1]][nsdb_om[[1]][,ncol(nsdb_om[[1]])] > 0,ncol(nsdb_om[[1]])] - 1

get_proportion_prewgd = function(om, wgd_i, cna_i) {
  wgd_rn = which(om[[1]][,wgd_i] > 0)
  sapply(wgd_rn, function(samp_i) {
    ordering_mat = om[[1]]
    pre_wgd = length(which(ordering_mat[samp_i,] > 0 & ordering_mat[samp_i,] < ordering_mat[samp_i,wgd_i]))
	tot = length(which(ordering_mat[samp_i,] > 0))
    pre_wgd / tot
  })
}

nsda_prop_prewgd = get_proportion_prewgd(nsda_om, wgd_i, cna_i)
#nsdb_prop_prewgd = get_proportion_prewgd(nsdb_om, wgd_i, cna_i)

### END OF SECTION TO RUN FOR EACH NSD TRAJECTORY

# plot preWGD distributions

library(reshape2)

pre_wgd_proportions = melt(list(`NSD-A`=nsda_prop_prewgd, `NSD-B`=nsdb_prop_prewgd))
colnames(pre_wgd_proportions) = c("proportion", "trajectory")
p <- ggplot(pre_wgd_proportions, aes(x=proportion, fill=trajectory)) +
  geom_density(alpha=0.5) +
  theme_bw(base_family = "Roboto Condensed") +
  scale_fill_manual(values=c("#8adbf5","#a371cb")) +
  xlab('Proportion of events occurring pre-WGD')

ggsave(file='path/to/figures/Fig4/NSDA_vs_NSDB_pre_wgd_proportion_density.png',plot = p,width = 4,height = 3.5,device = "png")