library(data.table)
library(plyr)
library(dplyr)
library(ggplot2)
library(scales)
library(RColorBrewer)
library(PMCMRplus)
library(gridExtra)
library(patchwork)
library(grid)
library(gtable)
mypath = "/Users/jankatalinic/FAKS/PhD/Rci_variants/Morgan/Nanopore/Data/ANALYSIS/14Rci/R_plots/Repl1_140kANDrepl2/m34/txt_sampled_4k"
setwd(mypath)
#create list of files. Recursive allows to enter subdirectories if true. Pattern allows for regex.
list_of_files = list.files(path = mypath, recursive = FALSE, pattern = "\\.txt$", full.names = FALSE)
txtdf <- lapply(list_of_files, function(x) {read.table(file = x, header = F, sep ="_")})
# Combine them.
combined_df <- do.call("rbind", lapply(txtdf, as.data.frame))
# This rbindlist alternative works even if nr of columns is different.
#combined <- rbindlist(txtdf, fill = TRUE)
#following naming of columns to be used for plots!!
#cols = c("1", "2", "3", "4", "5")
#following naming needed if I want to check frequency of each permutation via 'reps' !
cols = c("pos1", "pos2", "pos3", "pos4", "pos5")
colnames(combined_df_Punj) = cols
#summarise and get proportions
smry = lapply(combined_df, table)
dt = rbindlist(lapply(smry, function(x) as.data.frame.list(x)), fill=TRUE)
# !!!!!! check dt and if it has all 10 columns before proceeding !!!!!!!
############ only to execute if missing a specific mod! ##################
# Transform function allows addition of columns and filling them with values.
#To add multiple, just specify and separate each by a coma.
#dt <- transform(dt, M1r = NA, M2r = NA, M3r = NA, M4r = NA, M5r = NA)
################### rest of data formatting #####################
col_order = c("M1", "M1r", "M2", "M2r", "M3", "M3r", "M4", "M4r", "M5", "M5r")
ref_order = c("M1", "M2", "M3", "M4", "M5")
setcolorder(dt, col_order)
#Calculate proportions of each mod in each of the 5 positions
props = as.data.table(apply(dt, 1, function(x){x/sum(x, na.rm = TRUE)}))
setnames(props, cols)
#long format and prep for plot
longdt = melt(props)
longdt[, "mod_id" := rep(col_order, times = (nrow(longdt)/10))]
longdt$percentage <- longdt$value * 100
setcolorder(longdt,c(1,4,2,3))
lgnd = factor(longdt$mod_id, levels = col_order)
###################### WATERMELON PLOT ###################
#for when some modules missing in occurrence. drop = FALSE allows to keep missing values in the legend.
Krad_pooled = ggplot(longdt, aes(x=variable, y=percentage, color = I("black"))) +
geom_col(aes(fill = lgnd)) +
scale_fill_brewer(palette="PiYG", drop = FALSE, name = "Module", labels = c(expression(paste("M"^"1 ")),
expression(paste("M"^"1r")),
expression(paste("M"^"2 ")),
expression(paste("M"^"2r")),
expression(paste("M"^"3 ")),
expression(paste("M"^"3r")),
expression(paste("M"^"4 ")),
expression(paste("M"^"4r")),
expression(paste("M"^"5 ")),
expression(paste("M"^"5r")))) +
scale_x_discrete(limits=cols, expand = c(0,0.6)) +
labs(y = "occupancy percentage", x = "position in the shufflon") +
theme(axis.title = element_text(size = 17),
axis.text = element_text(size = 14),
legend.title = element_text(size = 17),
legend.text = element_text(size = 17),
legend.spacing.y = unit(0.1, "cm"),
panel.background = element_blank()) +
guides(fill = guide_legend(byrow = TRUE)) +
scale_y_continuous(expand = c(0, 1))
# Rci1_new = ggplot(longdt, aes(x=variable, y=percentage, color = I("black"))) +
# geom_col(aes(fill = lgnd)) +
# scale_fill_brewer(palette="PiYG", drop = FALSE, name = "Module", labels = c(expression(bquote(M^fontsize(10)(1))),
# expression(bquote(M^(parse(text = "fontsize(10) (1r)")))),
# expression(bquote(M^(parse(text = "fontsize(10) (2)")))),
# expression(bquote(M^(parse(text = "fontsize(10) (2r)")))),
# expression(bquote(M^(parse(text = "fontsize(10) (3)")))),
# expression(bquote(M^(parse(text = "fontsize(10) (3r)")))),
# expression(bquote(M^(parse(text = "fontsize(10) (4)")))),
# expression(bquote(M^(parse(text = "fontsize(10) (4r)")))),
# expression(bquote(M^(parse(text = "fontsize(10) (5)")))),
# expression(bquote(M^(parse(text = "fontsize(10) (5r)")))))) +
# scale_x_discrete(limits=cols) +
# labs(y = "occupancy percentage", x = "position in the shufflon") +
# theme(axis.title = element_text(size = 16),
# axis.text = element_text(size = 14),
# legend.text = element_text(size = 14),
# legend.spacing.y = unit(0.2, "cm")) +
# guides(fill = guide_legend(byrow = TRUE))
# PuOr is a decent alternative to PiYG palette
#################### search for unshuffled #############
#the following counts number of repetitions of each unique row i.e. each variant
reps_Rci7rep7_repl1 <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
sum_single <- sum(table(cut(reps_Rci1$V1,breaks=seq.int(from=1,to=9))))
sum_double <- sum(table(cut(reps_Rci1$V1,breaks=seq.int(from=10,to=99))))
sum_triple <- sum(table(cut(reps_Rci1$V1,breaks=seq.int(from=100,to=999))))
sum_quad <- sum(table(cut(reps_Rci1$V1,breaks=seq.int(from=1000,to=9999))))
#The following function can count nr of rows w/ n-digit numbers in V1 of 'reps'
count_rows <- function(df, col_name){
sum(nchar(as.character(df[, col_name])) == 3)
}
#shuffling rate per lib
Rci7rep9 <- (1 - (reps_Rci7rep9[1,6] / nrow(combined_df))) * 100
# Store all shuffling rates into a table and export
SR <- rbind(Rci1rep1, Rci1rep3, Rci1rep5, Rci1rep7, Rci1rep8, Rci1rep10, Rci7rep1, Rci7rep3, Rci7rep5, Rci7rep7, Rci7rep8, Rci7rep10)
SR <- rbind(Rci1, Rci3, Rci4, Rci5, Rci6, Rci7, Rci8, Rci9, Rci10, Rci11, Rci12, m8, m18, m34)
SR <- rbind(Rci9rep1, Rci9rep3, Rci9rep5, Rci9rep7, Rci9rep8, Rci9rep9, Rci9rep10, Rci1rep9, Rci7rep9)
write.csv(SR, "/Users/jankatalinic/FAKS/PhD/Rci_variants/Morgan/Nanopore/Data/Orthogonality3_130324/SR_Orthog3.csv")
####### SHUFFLING RATE BAR PLOT ####
## 1. Make bar plot in ggplot2 with SD error bars.
induced_dt = setDT(induced_variants)
colnames(induced_dt) = c("iRci", "iRci - no arabinose", "P. parmentieri", "P. punjabense", "K. radicincitans")
induced_long = melt(induced_dt)
cols2 = c("group", "value")
colnames(induced_long) = cols2
summary_data_unind <- uninduced_long %>%
group_by(group) %>%
summarise(
mean = mean(value),
sd = sd(value)
)
#Change names for uninduced
summary_data_unind$group = as.character(summary_data_unind$group)
newnames_ordered = c("Rci", "SI-H. paralvei", "SI-Y. pseudotuberculosis", "SI-E. cloacae",
"SI-E. hormaechei", "SI-E. E76", "SI-M. glucosetrophus", "SI-B. ubonensis",
"SI-P. parvum", "SI-P. citronellolis", "SI-P. koreensis", "SI-P. aeruginosa",
"Rci m8", "Rci m18", "Rci m34")
summary_data_unind$group = newnames_ordered
summary_data_unind$group <- factor(summary_data_unind$group, levels = newnames_ordered)
#Change names for induced
summary_data$group = as.character(summary_data$group)
newnames2_ordered = c("Rci", "iRci", "SI-P. parmentieri", "SI-P. punjabense", "SI-K. radicincitans")
summary_data$group = newnames2_ordered
summary_data$group = factor(summary_data$group, levels = newnames2_ordered)
#colour for uninduced: "turquoise3"
#colour for induced: "#ff6600"
barV1 = ggplot(summary_data, aes(x = factor(group), y = mean)) +
geom_bar(stat = "identity", fill = "turquoise3", width = 0.8) +
geom_segment(aes(x = factor(group), xend = factor(group), y = mean, yend = mean + sd),
size = 0.5) +
geom_segment(aes(x = as.numeric(factor(group)) - 0.2,
xend = as.numeric(factor(group)) + 0.2,
y = mean + sd, yend = mean + sd),
linewidth = 0.5) +
labs(x = "SSR", y = "Shuffling rate (%)") +
theme(axis.title = element_text(size = 16),
axis.text = element_text(size = 14),
panel.background = element_blank(),
axis.line.x = element_line(color = "black", linewidth = 0.5),
axis.line.y = element_line(color = "black", linewidth = 0.5),
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, color = "black")) +
scale_y_continuous(expand = c(0, 1), limits = c(0, 100))
## 2. Perform Welch's ANOVA, then post hoc Dunnet's C test for pairwise comp. of each group's mean to Rci mean.
# Remove P cintronellolis whose SD = 0.
#uninduced_long_noRci11 = uninduced_long[-(19:20),]
# Factor groups
uninduced_long_noRci11$group <- as.factor(uninduced_long_noRci11$group)
# Perform Welch's ANOVA
fit_uninduced = oneway.test(value ~ group, data = uninduced_long_noRci11, var.equal = FALSE)
# Perform Dunnet's C test
DunnettC_uninduced = dunnettTest(uninduced_long_noRci11$value, uninduced_long_noRci11$group, control = "iRci")
summary_dunnett_unind = summary(DunnettC_uninduced)
## 3. Introduce significance values to bar plot
# Extract p-values
p_values <- DunnettC_induced$p.value[,1] # Dunnett's C test results
# Assign significance labels based on p-value thresholds
significance_labels <- sapply(p_values, function(p) {
if (p <= 0.001) {
"***"
} else if (p <= 0.01) {
"**"
} else if (p <= 0.05) {
"*"
} else {
"ns"
}
})
# Prepare a dataframe for plotting with significance labels
significance_df <- data.frame(
group = names(p_values),
p_value = p_values,
label = significance_labels
)
# Merge significance labels with sd summary data
newnames_ordered_noRci11 = c("SI-H. paralvei", "SI-Y. pseudotuberculosis", "SI-E. cloacae",
"SI-E. hormaechei", "SI-E. E76", "SI-M. glucosetrophus", "SI-B. ubonensis",
"SI-P. parvum", "SI-P. koreensis", "SI-P. aeruginosa",
"Rci m8", "Rci m18", "Rci m34")
significance_df_unind$group = newnames_ordered_noRci11
plot_data_unind <- merge(summary_data_unind, significance_df_unind, by.x = "group", by.y = "group", all.x = TRUE)
# Add significance labels above the bars
barV2 <- barV1 +
geom_text(data = plot_data, aes(x = group, y = mean + sd + 5, label = label),
size = 7, color = "black", na.rm = TRUE)
###### BARPLOT INDUCED #######
barV1 = ggplot(summary_data, aes(x = factor(group), y = mean)) +
geom_bar(stat = "identity", fill = "#ff6600", width = 0.4) +
geom_segment(aes(x = factor(group), xend = factor(group), y = mean, yend = mean + sd),
size = 0.5) +
geom_segment(aes(x = as.numeric(factor(group)) - 0.1,
xend = as.numeric(factor(group)) + 0.1,
y = mean + sd, yend = mean + sd),
linewidth = 0.5) +
labs(x = "SI", y = "Shuffling rate (%)") +
theme(axis.title = element_text(size = 16),
axis.text = element_text(size = 14),
panel.background = element_blank(),
axis.line.x = element_line(color = "black", linewidth = 0.5),
axis.line.y = element_line(color = "black", linewidth = 0.5),
axis.text.x = element_blank(),
axis.title.x = element_blank()) +
scale_y_continuous(expand = c(0, 1), limits = c(0, 100))
# Change names of recombinases
newnames3 = (c("iRci", "SI-P. parmentieri", "SI-P. punjabense", "SI-K. radicincitans"))
significance_df$group = newnames3
plot_data = merge(summary_data, significance_df, by.x = "group", by.y = "group", all.x = TRUE)
# Add significance labels above the bars
barV2 <- barV1 +
geom_text(data = plot_data, aes(x = group, y = mean + sd + 5, label = label),
size = 7, color = "black", na.rm = TRUE)
# For adding table for induced plot #
data <- data.frame(
SI = c("Rci", "iRci", "SI-P. parmentieri", "SI-P. punjabense", "SI-K. radicincitans"),
arabinose = c(TRUE, FALSE, TRUE, TRUE, TRUE)
)
# Replace TRUE/FALSE with +/-
data_new <- data %>%
mutate(arabinose = ifelse(arabinose, "+", "-")) %>% t()
# Define a custom theme using ttheme_default and setting all borders explicitly
custom_theme <- ttheme_default(
core = list(
fg_params = list(col = "black", fontsize = 11), # Text color
bg_params = list(fill = "white"), # Background color
hline = list(gpar(col = "black", lwd = 1)), # Horizontal lines
vline = list(gpar(col = "black", lwd = 1)) # Vertical lines
),
colhead = list(
fg_params = list(col = "black", fontsize = 11), # Column header text color
bg_params = list(fill = "white"), # Background color
hline = list(gpar(col = "black", lwd = 1)), # Horizontal lines for headers
vline = list(gpar(col = "black", lwd = 1)) # Vertical lines for headers
),
rowhead = list(
fg_params = list(col = "black", fontface = "bold", fontsize = 11), # Row header text color
bg_params = list(fill = "white"), # Background color
hline = list(gpar(col = "black", lwd = 1)), # Horizontal lines for row headers
vline = list(gpar(col = "black", lwd = 1)) # Vertical lines for row headers
)
)
# Create the table grob using the custom theme
table_grob <- tableGrob(data_new, rows = row.names(data_new), theme = custom_theme)
# Convert the table grob to a gtable object for more control
gtable_table <- gtable::gtable_add_grob(
table_grob,
grobs = replicate(nrow(table_grob) * ncol(table_grob),
rectGrob(gp = gpar(fill = NA, col = "black")),
simplify = FALSE),
t = rep(seq_len(nrow(table_grob)), ncol(table_grob)),
l = rep(seq_len(ncol(table_grob)), each = nrow(table_grob)),
b = rep(seq_len(nrow(table_grob)), ncol(table_grob)),
r = rep(seq_len(ncol(table_grob)), each = nrow(table_grob))
)
# Calculate the number of columns including the row names column
total_cols <- ncol(data_new) + 1
# Adjust the table column widths to align with the bar plot
# Use "null" units to let the grid layout system allocate space equally
gtable_table$widths <- unit(rep(1, total_cols), "null")
# Combine the bar plot and the table using patchwork
barV3 <- barV2 / gtable_table + plot_layout(heights = c(3, 1))
# Display the combined plot
print(barV3)
############# ORTHOGONALITY HEATMAP ########
# Import table from Excel first
y <- c("Rci", "E. E76", "B. ubonensis")
xaxis <- c("Rci", "H. paralvei", "E. cloacae", "E. E76", "M. glucosetrophus", "B.ubonensis", "P. parvum", "P. aeruginosa")
reps <- c("Rci", "Rci", "H. paralvei", "H. paralvei", "E. cloacae", "E. cloacae", "E. E76", "E. E76", "M. glucosetrophus", "M. glucosetrophus", "B. ubonensis", "B. ubonensis", "P. parvum", "P. parvum", "P. aeruginosa", "P. aeruginosa")
exp <- c("Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76",
"B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis")
#orthog <- shufflingrates_complete
orthog <- as.data.table(shufflingrates_complete)
orthog_long <- melt(orthog)
orthog_long <- data.table(orthog_long, sfx = reps, SSR = exp)
clean <- orthog_long[,2:4]
clean <- clean[ ,c("SSR", "sfx", "value")]
# Calculate mean shuffling rates
orthog_long_means = orthog_long %>% group_by(variable) %>% summarize(Average = mean(value))
reps_short <- c("Rci", "H. paralvei", "E. cloacae", "E. E76", "M. glucosetrophus", "B. ubonensis", "P. parvum", "P. aeruginosa")
exp_short <- c("Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76",
"B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis")
orthog_means_clean = cbind(orthog_long_means[,2], reps_short, exp_short)
names = c("rate", "reporter", "SSR")
setnames(orthog_means_clean, names)
# Limit nr of decimal points
means_formatted <- orthog_means_clean %>% mutate_at(vars(rate), ~ round(., 1))
lgnd_orthog = factor(means_formatted$reporter, levels = reps_short)
##### Plots ##
# heatmap2 <- ggplot(orthog_means_clean, aes(x = factor(reporter, level = xaxis),y = SSR,fill = rate)) +
# geom_tile() + scale_fill_viridis_c()
# Following produces a landscape heatmap (SSR on y-axis)
# heatmapV2 <- ggplot(means_formatted, aes(x = factor(reporter, level = xaxis),y = SSR,fill = rate)) +
# geom_tile() + geom_text(aes(label = rate), color = "white", size = 8) + scale_fill_viridis_c() +
# theme(axis.title = element_text(size = 16),
# axis.text = element_text(size = 14),
# legend.text = element_text(size = 12),
# panel.background = element_blank())
# Following produces a vertical heatmap (SSR on x-axis)
# heatmapV3 <- ggplot(means_formatted, aes(x = SSR, y = factor(reporter, level = xaxis),fill = rate)) +
# geom_tile() + geom_text(aes(label = rate), color = "white", size = 8) + scale_fill_viridis_c() +
# theme(axis.title = element_text(size = 16),
# axis.text = element_text(size = 14),
# legend.text = element_text(size = 12),
# panel.background = element_blank(),
# legend.position = "bottom") +
# labs(x = "SSR", y = "Reporter")
# Following produces a vertical heatmap (SSR on x-axis) and fixes reporter labels
heatmapV4 <- ggplot(means_formatted, aes(x = SSR, y = lgnd_orthog, fill = rate)) +
geom_tile() + geom_text(aes(label = sprintf("%.1f", rate)), color = "white", size = 8) + scale_fill_viridis_c() +
theme(axis.title = element_text(size = 16),
axis.text = element_text(size = 14),
legend.text = element_text(size = 12),
panel.background = element_blank(),
legend.position = "bottom") +
labs(x = "SSR", y = "Reporter", fill = "Shuffling rate (%)")
gradient_palette <- colorRampPalette(c("white", "#1560BD"))
gradient_colors <- gradient_palette(101)
heatmapV5 <- ggplot(means_formatted, aes(x = SSR, y = lgnd_orthog, fill = rate)) +
geom_tile(color = "black", size = 0.2) + geom_text(aes(label = sprintf("%.1f", rate)), color = "black", size = 8) +
scale_fill_gradientn(colors = gradient_colors) +
theme(axis.title = element_text(size = 16),
axis.text = element_text(size = 14),
legend.text = element_text(size = 12),
panel.background = element_blank(),
legend.position = "bottom") +
labs(x = "SSR", y = "Reporter", fill = "Shuffling rate (%)")
#### Orthogonality stat test #####
rep1_test = t.test(orthog$`1 + 1`, orthog$`7 + 1`, var.equal = FALSE)
rep5_test = t.test(orthog$`1 + 5`, orthog$`7 + 5`, var.equal = FALSE)
rep7_test = t.test(orthog$`1 + 7`, orthog$`7 + 7`, var.equal = FALSE)
rep9_test = t.test(orthog$`1 + 9`, orthog$`7 + 9`, var.equal = FALSE)
rep13_test = t.test(orthog$`1 + 13`, orthog$`7 + 13`, var.equal = FALSE)
SSR_test = t.test(orthog$`1 + 1`, orthog$`7 + 9`, var.equal = FALSE)
SSR_test2 = t.test(orthog$`1 + 1`, orthog$`7 + 5`, var.equal = FALSE)
colnames(orthog_long) = c("group", "value", "sfx", "SSR")
orthog_long$group <- as.factor(orthog_long$group)
fit_orthog = oneway.test(value ~ group, data = orthog_long, var.equal = FALSE)
# Why is ANOVA not working here??
######## Permutation representations ######
# For all 9 shufflers, 4k reads were randomly sampled.
reps_Rci <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_E76 <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Bubo <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_m18 <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_m34 <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_iRci <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Krad <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Parm <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Punj <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_E76rep5 <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_E76rep9 <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Rcirep1_orthog <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
# Here, for all 9 shufflers, all reads pooled from both biological replicates
reps_Rci <- ddply(combined_df_Rci,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_E76 <- ddply(combined_df_E76,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Bubo <- ddply(combined_df_Bubo,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_m18 <- ddply(combined_df_m18,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_m34 <- ddply(combined_df_m34,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_iRci <- ddply(combined_df_iRci,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Krad <- ddply(combined_df_Krad,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Parm <- ddply(combined_df_Parm,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Punj <- ddply(combined_df_Punj,.(pos1, pos2, pos3, pos4, pos5),nrow)
#The following function can count nr of rows w/ n-digit numbers in V1 of 'reps'.
#Execute in console for when n=1, n=2, n=3, n=4. Manually insert results into an Excel table.
# e.g. count_rows(reps_Rci, "V1")
count_rows <- function(df, col_name){
sum(nchar(as.character(df[, col_name])) == 1)
}
mod1_reps = table(combined_df$pos1)
mod2_reps = table(combined_df$pos2)
mod3_reps = table(combined_df$pos3)
mod4_reps = table(combined_df$pos4)
mod5_reps = table(combined_df$pos5)
print(mod1_reps[1:2])
print(mod2_reps[1:2])
print(mod4_reps[3:4])
print(mod5_reps[5:6])
# # Rci7
#
# reps_Rci7 <- ddply(combined_df_Rci7,.(pos1, pos2, pos3, pos4, pos5),nrow)
#
# mod1_reps = table(combined_df_Rci7$pos1)
# mod2_reps = table(combined_df_Rci7$pos2)
# mod3_reps = table(combined_df_Rci7$pos3)
# mod4_reps = table(combined_df_Rci7$pos4)
# mod5_reps = table(combined_df_Rci7$pos5)
#
# print(mod1_reps[1:2])
# print(mod2_reps[1:2])
# print(mod4_reps[3:4])
# print(mod5_reps[5:6])
#
#
# # Rci9
#
# reps_m34 <- ddply(combined_df_m34,.(pos1, pos2, pos3, pos4, pos5),nrow)
#
# mod1_reps = table(combined_df_m34$pos1)
# mod2_reps = table(combined_df_m34$pos2)
# mod3_reps = table(combined_df_m34$pos3)
# mod4_reps = table(combined_df_m34$pos4)
# mod5_reps = table(combined_df_m34$pos5)
#
# print(mod1_reps[1:2])
# print(mod2_reps[1:2])
# print(mod4_reps[3:4])
# print(mod5_reps[5:6])
# STAT TEST
Rci1_invfreq = as.data.table(Rci1_invfreq)
Rci1_invfreq_long = melt(Rci1_invfreq)
Rci7_invfreq = as.data.table(Rci7_invfreq)
Rci7_invfreq_long = melt(Rci7_invfreq)
Rci9_invfreq = as.data.table(Rci9_invfreq)
Rci9_invfreq_long = melt(Rci9_invfreq)
m18_invfreq = as.data.table(m18_invfreq)
m18_invfreq_long = melt(m18_invfreq)
m34_invfreq = as.data.table(m34_invfreq)
m34_invfreq_long = melt(m34_invfreq)
cols2 = c("event", "group", "count")
colnames(Rci1_invfreq_long) = cols2
colnames(Rci7_invfreq_long) = cols2
colnames(Rci9_invfreq_long) = cols2
colnames(m18_invfreq_long) = cols2
colnames(m34_invfreq_long) = cols2
contingency_table <- dcast(m34_invfreq_long, event ~ group, value.var = "count", fun.aggregate = sum, fill = 0)
contingency_table <- xtabs(count ~ event + group, data = m34_invfreq_long)
chisq_result <- chisq.test(contingency_table)
pairwise_chisq <- function(tab) {
n_groups <- ncol(tab)
p_values <- matrix(NA, nrow = n_groups, ncol = n_groups)
for (i in 1:(n_groups - 1)) {
for (j in (i + 1):n_groups) {
sub_table <- tab[, c(i, j)]
chi_squared_result <- chisq.test(sub_table)
p_values[i, j] <- chi_squared_result$p.value
p_values[j, i] <- chi_squared_result$p.value
}
}
rownames(p_values) <- colnames(p_values) <- colnames(tab)
return(p_values)
}
# Perform pairwise Chi-squared comparisons
pairwise_p_values <- pairwise_chisq(contingency_table)
# Print the pairwise Chi-squared comparisons
print(pairwise_p_values)
# Apply Bonferroni correction to the pairwise p-values
alpha <- 0.05 # Set your desired alpha level
n_comparisons <- ncol(contingency_table) * (ncol(contingency_table) - 1) / 2
adjusted_alpha <- alpha / n_comparisons
# Print a sentence for each p-value
n_groups <- ncol(contingency_table)
for (i in 1:(n_groups - 1)) {
for (j in (i + 1):n_groups) {
p_value <- pairwise_p_values[i, j]
significant <- ifelse(p_value < adjusted_alpha, "statistically significant", "not statistically significant")
cat(sprintf("Comparison between %s and %s: p-value = %.4f (%s)\n", colnames(contingency_table)[i], colnames(contingency_table)[j], p_value, significant))
}
}
#### Extract reads of reference module order and count inversions in each shufflon position ####
reference <- "12345"
extract_digits <- function(x, reference) {
# Remove non-digit characters using gsub
x <- gsub("[^1-5]", "", x)
# Concatenate remaining digits using paste and store the resulting string
digits <- paste(x, collapse="")
# Compare string to reference string
identical(digits, reference)
}
# Apply the extract_digits function to each row of the dataframe
same_rows_Rci <- apply(combined_df_Rci, 1, extract_digits, reference)
same_rows_E76 <- apply(combined_df_E76, 1, extract_digits, reference)
same_rows_Bubo <- apply(combined_df_Bubo, 1, extract_digits, reference)
same_rows_m18 <- apply(combined_df_m18, 1, extract_digits, reference)
same_rows_m34 <- apply(combined_df_m34, 1, extract_digits, reference)
same_rows_iRci <- apply(combined_df_iRci, 1, extract_digits, reference)
same_rows_Krad <- apply(combined_df_Krad, 1, extract_digits, reference)
same_rows_Parm <- apply(combined_df_Parm, 1, extract_digits, reference)
same_rows_Punj <- apply(combined_df_Punj, 1, extract_digits, reference)
# Subset rows of reference module order from original dataframe (by applying logical output from extract_digits to df)
Rci_same <- combined_df_Rci[same_rows_Rci,]
E76_same <- combined_df_E76[same_rows_E76,]
Bubo_same <- combined_df_Bubo[same_rows_Bubo,]
m18_same <- combined_df_m18[same_rows_m18,]
m34_same <- combined_df_m34[same_rows_m34,]
iRci_same <- combined_df_iRci[same_rows_iRci,]
Krad_same <- combined_df_Krad[same_rows_Krad,]
Parm_same <- combined_df_Parm[same_rows_Parm,]
Punj_same <- combined_df_Punj[same_rows_Punj,]
# Randomly pick n reads from larger subsets (normalise subsets according to smallest)
Rci_same_norm = sample_n(Rci_same, 2201)
E76_same_norm = sample_n(E76_same, 2201)
Bubo_same_norm = sample_n(Bubo_same, 2201)
m18_same_norm = sample_n(m18_same, 2201)
m34_same_norm = sample_n(m34_same, 2201)
iRci_same_norm = sample_n(iRci_same, 2201)
Krad_same_norm = sample_n(Krad_same, 2201)
Parm_same_norm = sample_n(Parm_same, 2201)
# Count reps of inverted and uninverted module in each shufflon position
mod1_reps = table(Punj_same$V1)
mod2_reps = table(Punj_same$V2)
mod3_reps = table(Punj_same$V3)
mod4_reps = table(Punj_same$V4)
mod5_reps = table(Punj_same$V5)
print(mod1_reps)
print(mod2_reps)
print(mod3_reps)
print(mod4_reps)
print(mod5_reps)
##### Regression model for mod length vs inv freq ####
# All the values need to be numeric, so instead of SSRs displayed by their names, they were arbitrarily numbered.
# Following produced approximately the same adjusted R^2 value as Excel, whose R^2 = 0.687.
# reg_model2 = lm(formula = inv_freq ~ SSR + mod_len, data = len_vs_invfreqV2)
# model_stats2 = summary(reg_model2)
# model2_coef = coefficients(reg_model2)
#
# cor(len_vs_invfreqV2$inv_freq, len_vs_invfreqV2$mod_len)
#
#
#
# # Test w/ unadjusted data to see if I obtain same linear regression characteristics
#
# reg_model3 = lm(formula = inv_freq ~ SSR + mod_len, data = len_vs_invfreq_unadjusted)
# model_stats3 = summary(reg_model3)
#
# # Test regression when only mod_len and inv freq considered, no SSR
#
# reg_model4 = lm(formula = inv_freq ~ mod_len, data = len_vs_invfreqV2)
# model_stats4 = summary(reg_model4)
# Regression model with normalised data where each subset contained 2201 reads.
reg_model5 = lm(formula = inv_freq ~ mod_len + SI, data = len_vs_invfreq_norm)
model_stats5 = summary(reg_model5)
# Plot
len_vs_invfreq_norm$asfraction = len_vs_invfreq_norm$inv_freq / 2201
legend = c("Rci constitutive", "E. E76", "B. ubonensis", "Rci m18", "Rci m34", "Rci induced", "K. radicincitans", "P. parmentieri", "P. punjabense")
len_vs_invfreq_norm$SI <- factor(len_vs_invfreq_norm$SI, levels = legend)
# In colour and same shape of points
# scatter <- ggplot(data = len_vs_invfreq_norm, aes(x = mod_len, y = inv_freq, fill = SI)) +
# geom_point(shape = 21, size = 3, stroke = 0.5) +
# geom_smooth(aes(group = 1), method = "lm", se = FALSE, color = "black", linewidth = 0.7) +
# coord_cartesian(xlim = c(0, 600), ylim = c(0, 1250)) +
# #theme_minimal() + # Use a minimal theme (customize as needed)
# labs(x = "module length (bp)", y = "inversion frequency")+
# theme(legend.box.background = element_blank(),
# panel.background = element_blank(),
# panel.grid = element_blank(),
# axis.ticks = element_line(linewidth = 0.5), # Add ticks on both axes
# axis.line = element_line(linewidth = 0.5),
# axis.title = element_text(size = 14),
# axis.text = element_text(size = 14),
# legend.text = element_text(size = 12),
# legend.key.height = unit(1, "cm")) +
# scale_y_continuous(expand = c(0, 0)) + # Start y-axis from zero
# scale_x_continuous(expand = c(0, 0)) +
# scale_fill_manual(values = c("#648FFF", "#FFB000", "#DC267F", "#00D4CC", "#AFABAB", "yellow", "green", "black", "purple"),
# name = "SSR:",
# labels = c("Rci constitutive", "SI-E. E76", "SI-B. ubonensis",
# "Rci m18", "Rci m34", "Rci induced", "SI-K. radicincitans", "SI-P. parmentieri", "SI-P. punjabense"))
# Following version of scatter keeps every point black but each group of points has a different shape.
scatter2 <- ggplot(data = len_vs_invfreq_norm, aes(x = mod_len, y = asfraction, shape = SI)) +
geom_point(size = 3, stroke = 0.5, colour = "black") +
geom_smooth(aes(group = 1), method = "lm", se = FALSE, color = "black", linewidth = 0.7) +
coord_cartesian(xlim = c(100, 600), ylim = c(0, 0.5)) +
#theme_minimal() + # Use a minimal theme (customize as needed)
labs(x = "module length (bp)", y = "inversion frequency")+
theme(legend.box.background = element_blank(),
panel.background = element_blank(),
panel.grid = element_blank(),
axis.ticks = element_line(linewidth = 0.5), # Add ticks on both axes
axis.line = element_line(linewidth = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 14, color = "black"),
legend.text = element_text(size = 14),
legend.key.height = unit(1, "cm")) +
scale_y_continuous(expand = c(0, 0)) + # Start y-axis from zero
scale_x_continuous(expand = c(0, 0)) +
scale_shape_manual(values = 1:9, name = "SI:",
labels = c("Rci constitutive", "SI-E. E76", "SI-B. ubonensis",
"Rci m18", "Rci m34", "Rci induced", "SI-K. radicincitans", "SI-P. parmentieri", "SI-P. punjabense"))
#SSR_name = rep(c("Rci", "E. E76", "B. ubonensis", "m18", "m34"), each = 5)
#len_vs_invfreq_unadjusted = cbind(len_vs_invfreq_unadjusted, SSR_name)
#len_vs_invfreqV2 = len_vs_invfreqV2[,1:3]
# custom_colours = c("#648FFF", "#FFB000", "#DC267F", "#00D4CC", "#AFABAB")
#
# scatter2 <- ggplot()+
# geom_point(data = len_vs_invfreq_norm, aes(x = mod_len, y = inv_freq, fill = SSR_name), shape = 21, size = 3, stroke = 0.5) +
# #geom_smooth(method = "lm", se = FALSE, color = "black", linewidth = 0.7, formula = y ~ x) +
# #coord_cartesian(xlim = c(0, 14.5), ylim = c(0, 1)) +
# #theme_minimal() + # Use a minimal theme (customize as needed)
# labs(x = "inversion frequency", y = "module length (bp)")+
# theme(legend.box.background = element_blank(),
# panel.background = element_blank(),
# panel.grid = element_blank(),
# axis.ticks = element_line(linewidth = 0.5), # Add ticks on both axes
# axis.line = element_line(linewidth = 0.5),
# axis.title = element_text(size = 14),
# axis.text = element_text(size = 14),
# legend.text = element_text(size = 12),
# legend.key.height = unit(1, "cm")) +
# #scale_y_continuous(expand = c(0, 0)) + # Start y-axis from zero
# #scale_x_continuous(expand = c(0, 0), breaks = custom_x_ticks3) +
# scale_fill_manual(values = custom_colours,
# name = "SSR",
# labels = c("Rci", "E. E76", "B. ubonensis", "Rci m18", "Rci m34"))
#
##### Revised test ######
# Turn categorical variable into factors and set reference level.
len_vs_invfreq_norm$SSR <- relevel(as.factor(len_vs_invfreq_norm$SSR), ref = "Rci")
# Create linear regression model with multiple independent variables (mod length and SSR)
model_new <- lm(inv_freq ~ SI + mod_len, data = len_vs_invfreq_norm)
# Print details of linear regression (p-values, F-test etc.)
print(summary(model_new))
# Gives a single p-value for both mod length and SSR.
anova(model_new)
# Welch's ANOVA to double-check effect of changing SSR on inversion frequency of individual modules
oneway.test(inv_freq ~ SSR, data = len_vs_invfreq_norm, var.equal = FALSE)
##### Shuffling rate for >1 inv ######
# Exclude rows which match:
# 1) unshuffled permutation
# 2) all 9 permutations which can be obtained from 1 inversion only.
exclusion_sets = list(
c("M1", "M2", "M3", "M4", "M5"),
c("M1r", "M2", "M3", "M4", "M5"),
c("M1", "M2r", "M3", "M4", "M5"),
c("M1", "M2", "M3r", "M4", "M5"),
c("M1", "M2", "M3", "M4r", "M5"),
c("M1", "M2", "M3", "M4", "M5r"),
c("M3r", "M2r", "M1r", "M4", "M5"),
c("M1", "M4r", "M3r", "M2r", "M5"),
c("M1", "M2", "M5r", "M4r", "M3r"),
c("M5r", "M4r", "M3r", "M2r", "M1r")
)
# Function to filter out rows matching exclusion sets
filter_rows <- function(df, exclusion_sets) {
for (set in exclusion_sets) {
df <- df[!(df$V1 == set[1] & df$V2 == set[2] & df$V3 == set[3] & df$V4 == set[4] & df$V5 == set[5]), ]
}
return(df)
}
# Apply the filtering function
filtered_df_Bubo <- filter_rows(combined_df_Bubo, exclusion_sets)
filtered_df_E76 <- filter_rows(combined_df_E76, exclusion_sets)
filtered_df_iRci <- filter_rows(combined_df_iRci, exclusion_sets)
filtered_df_Krad <- filter_rows(combined_df_Krad, exclusion_sets)
filtered_df_m18 <- filter_rows(combined_df_m18, exclusion_sets)
filtered_df_m34 <- filter_rows(combined_df_m34, exclusion_sets)
filtered_df_Parm <- filter_rows(combined_df_Parm, exclusion_sets)
filtered_df_Punj <- filter_rows(combined_df_Punj, exclusion_sets)
filtered_df_Rci <- filter_rows(combined_df_Rci, exclusion_sets)