Analyses/demographics.R · SherlockLung-EvolutionaryTrajectory-Analysis

library(reshape2)
library(dplyr)
library(forcats)
library(ggplot2)

# generate demographic confusion matrices
sns_ancestry_traj_cm = table(dn_landscape[dn_landscape$Assigned_Population %in% c("EAS","EUR"),c("Assigned_Population","DN_group")])
ns_ancestry_traj_cm = table(dn_landscape[dn_landscape$Smoking == "Non-Smoker" & dn_landscape$Assigned_Population %in% c("EAS","EUR"),c("Assigned_Population","DN_group")])
eu_smoking_traj_cm = table(dn_landscape[dn_landscape$Assigned_Population %in% "EUR",c("Smoking","DN_group")])
ns_gender_traj_cm = table(dn_landscape[dn_landscape$Smoking == "Non-Smoker",c("Gender","DN_group")])
s_gender_traj_cm = table(dn_landscape[dn_landscape$Smoking == "Smoker",c("Gender","DN_group")])

# convert to ggplot-friendly format
melted_ns_gender_traj_df = melt(ns_gender_traj_cm)
melted_sns_ancestry_traj_df = melt(sns_ancestry_traj_cm)
melted_ns_ancestry_traj_df = melt(ns_ancestry_traj_cm)
melted_eu_smoking_traj_df = melt(eu_smoking_traj_cm)
melted_s_gender_traj_df = melt(s_gender_traj_cm)

melted_ns_gender_traj_df$DN_group <- factor(melted_ns_gender_traj_df$DN_group, levels=c("SD", "NSD-B", "NSD-A"))
melted_sns_ancestry_traj_df$DN_group <- factor(melted_sns_ancestry_traj_df$DN_group, levels=c("SD", "NSD-B", "NSD-A"))
melted_ns_ancestry_traj_df$DN_group <- factor(melted_ns_ancestry_traj_df$DN_group, levels=c("SD", "NSD-B", "NSD-A"))
melted_eu_smoking_traj_df$DN_group <- factor(melted_eu_smoking_traj_df$DN_group, levels=c("SD", "NSD-B", "NSD-A"))
melted_s_gender_traj_df$DN_group <- factor(melted_s_gender_traj_df$DN_group, levels=c("SD", "NSD-B", "NSD-A"))

# adjust labels
melted_sns_ancestry_traj_df <- melted_sns_ancestry_traj_df %>%  
  mutate(Assigned_Population = fct_recode(Assigned_Population,
    "East Asian" = "EAS",
    "European" = "EUR"
  )) 

melted_ns_ancestry_traj_df <- melted_ns_ancestry_traj_df %>%  
  mutate(Assigned_Population = fct_recode(Assigned_Population,
    "East Asian" = "EAS",
    "European" = "EUR"
  )) 

# prepare for SD vs combined NSD trajectories comparisons
ns_gender_nsd_vs_sd_cm = t(apply(ns_gender_traj_cm, 1, function(x) { c(`NSD`=sum(x[1:2]), x[3]) }))
sns_ancestry_nsd_vs_sd_cm = t(apply(sns_ancestry_traj_cm, 1, function(x) { c(`NSD`=sum(x[1:2]), x[3]) }))
ns_ancestry_nsd_vs_sd_cm = t(apply(ns_ancestry_traj_cm, 1, function(x) { c(`NSD`=sum(x[1:2]), x[3]) }))
eu_smoking_nsd_vs_sd_cm = t(apply(eu_smoking_traj_cm, 1, function(x) { c(`NSD`=sum(x[1:2]), x[3]) }))
s_gender_nsd_vs_sd_cm = t(apply(s_gender_traj_cm, 1, function(x) { c(`NSD`=sum(x[1:2]), x[3]) }))

get_risk_ratio = function(mat) {
  (mat[2,2] / sum(mat[2,])) / (mat[1,2] / sum(mat[1,]))
}

colour_codes = c("#5A5A69","#9C66C7","#8ADBF5")

get_percentage_barplot = function(data, xlabel="", colour_codes, main="") {
  colnames(data) = c("xvar", "fillvar", "value")
  ggplot(data, aes(fill=fillvar, y=value, x=xvar)) + 
    geom_bar(position="fill", stat="identity") +
	theme_bw(base_family="Roboto Condensed") +
	theme(legend.position="none", axis.title.x = element_text(size=20), axis.text.x = element_text(size=18), axis.title.y = element_text(size=20), axis.text.y = element_text(size=18), plot.title = element_text(size = 20)) +
	xlab(xlabel) +
	ylab("Proportion") +
	scale_fill_manual(values=colour_codes) +
	ggtitle(main)
}

# plot and output
p <- get_percentage_barplot(melted_ns_gender_traj_df, "Sex", colour_codes, main="Non-smoker tumours")
ggsave(file='path/to/supplementary/figures/ns_sex_trajectory_bars.png',plot = p,width = 7,height = 9,units="in",device = "png")

p <- get_percentage_barplot(melted_sns_ancestry_traj_df, "Ancestry", colour_codes, main="Non-smoker tumours")
ggsave(file='path/to/supplementary/figures/ns_ancestry_trajectory_bars.png',plot = p,width = 7,height = 9,units="in",device = "png")

p <- get_percentage_barplot(melted_ns_ancestry_traj_df, "Ancestry", colour_codes, main="Non-smoker tumours")
ggsave(file='path/to/supplementary/figures/ns_ancestry_trajectory_bars.png',plot = p,width = 7,height = 9,units="in",device = "png")

p <- get_percentage_barplot(melted_eu_smoking_traj_df, "Smoking", colour_codes, main="Tumours in patients of European ancestry")
ggsave(file='path/to/supplementary/figures/eur_smoking_trajectory_bars.png',plot = p,width = 7,height = 9,units="in",device = "png")

p <- get_percentage_barplot(melted_s_gender_traj_df, "Sex", colour_codes, main="Non-smoker tumours")
ggsave(file='path/to/supplementary/figures/s_sex_trajectory_bars.png',plot = p,width = 7,height = 9,units="in",device = "png")

# Passive smoking vs Trajectory
# No significant association between passive smoking and trajectory
ns_ps_traj_cm = table(as.data.frame(t(sapply(dn_landscape[dn_landscape$Smoking == "Non-Smoker","Tumor_Barcode"], function(bc) {
  c(dn_landscape[bc,"DN_group"], passive[bc,"Passive_Smoking"])
}))))

melted_ns_ps_traj_df = melt(t(ns_ps_traj_cm))
melted_ns_ps_traj_df[,2] <- factor(melted_ns_ps_traj_df[,2], levels=c("SD", "NSD-B", "NSD-A"))

p <- get_percentage_barplot(melted_ns_ps_traj_df, "Passive Smoking", colour_codes, main="Non-smoker tumours")
ggsave(file='path/to/supplementary/figures/ns_passive_trajectory_bars.png',plot = p,width = 7,height = 9,units="in",device = "png")

ns_ps_nsd_sd_cm = t(apply(ns_ps_traj_cm, 2, function(x) { c(`NSD`=sum(x[1:2]), x[3]) }))

# NS-LUAD confusion matrix by SBS4 present vs absent
ns_s4_traj_cm = table(dn_landscape[dn_landscape$Smoking == "Non-Smoker",c("SBS4","DN_group")])
rownames(ns_s4_traj_cm) = c("Absent","Present")
melted_ns_s4_traj_df = melt(ns_s4_traj_cm)
melted_ns_s4_traj_df[,2] <- factor(melted_ns_s4_traj_df[,2], levels=c("SD", "NSD-B", "NSD-A"))

p <- get_percentage_barplot(melted_ns_s4_traj_df, "SBS4", colour_codes, main="Non-smoker tumours")
ggsave(file='path/to/supplementary/figures/ns_SBS4_trajectory_bars.png',plot = p,width = 7,height = 9,units="in",device = "png")

ns_s4_nsd_sd_cm = t(apply(ns_s4_traj_cm, 1, function(x) { c(`NSD`=sum(x[1:2]), x[3]) }))

# statistical comparisons with multiple testing correction across demographics
p.adjust(sapply(list(eu_smoking_nsd_vs_sd_cm, ns_s4_nsd_sd_cm, ns_ancestry_nsd_vs_sd_cm, ns_gender_nsd_vs_sd_cm, s_gender_nsd_vs_sd_cm, ns_ps_nsd_sd_cm), function(x) { fisher.test(x)$p.value }), method="fdr")
sapply(list(eu_smoking_nsd_vs_sd_cm, ns_s4_nsd_sd_cm, ns_ancestry_nsd_vs_sd_cm, ns_gender_nsd_vs_sd_cm, s_gender_nsd_vs_sd_cm, ns_ps_nsd_sd_cm), function(x) { fisher.test(x)$estimate[["odds ratio"]] })
sapply(list(eu_smoking_nsd_vs_sd_cm, ns_s4_nsd_sd_cm, ns_ancestry_nsd_vs_sd_cm, ns_gender_nsd_vs_sd_cm, s_gender_nsd_vs_sd_cm, ns_ps_nsd_sd_cm), get_risk_ratio)