easywrangler.R · Rci-Variants

library(data.table)
library(plyr)
library(dplyr)
library(ggplot2)
library(scales)
library(RColorBrewer)
library(PMCMRplus)
library(gridExtra)
library(patchwork)
library(grid)
library(gtable)


mypath = "/Users/jankatalinic/FAKS/PhD/Rci_variants/Morgan/Nanopore/Data/ANALYSIS/14Rci/R_plots/Repl1_140kANDrepl2/m34/txt_sampled_4k"
setwd(mypath)

#create list of files. Recursive allows to enter subdirectories if true. Pattern allows for regex.
list_of_files = list.files(path = mypath, recursive = FALSE, pattern = "\\.txt$", full.names = FALSE)

txtdf <- lapply(list_of_files, function(x) {read.table(file = x, header = F, sep ="_")})

# Combine them.
combined_df <- do.call("rbind", lapply(txtdf, as.data.frame))

# This rbindlist alternative works even if nr of columns is different.
#combined <- rbindlist(txtdf, fill = TRUE)

#following naming of columns to be used for plots!!
#cols = c("1", "2", "3", "4", "5")
#following naming needed if I want to check frequency of each permutation via 'reps' !
cols = c("pos1", "pos2", "pos3", "pos4", "pos5")
colnames(combined_df_Punj) = cols

#summarise and get proportions
smry = lapply(combined_df, table)

dt = rbindlist(lapply(smry, function(x) as.data.frame.list(x)), fill=TRUE)


# !!!!!! check dt and if it has all 10 columns before proceeding !!!!!!!

############ only to execute if missing a specific mod! ##################

# Transform function allows addition of columns and filling them with values. 
#To add multiple, just specify and separate each by a coma. 

#dt <- transform(dt, M1r = NA, M2r = NA, M3r = NA, M4r = NA, M5r = NA)



################### rest of data formatting #####################
col_order = c("M1", "M1r", "M2", "M2r", "M3", "M3r", "M4", "M4r", "M5", "M5r")
ref_order = c("M1", "M2", "M3", "M4", "M5")

setcolorder(dt, col_order)



#Calculate proportions of each mod in each of the 5 positions
props = as.data.table(apply(dt, 1, function(x){x/sum(x, na.rm = TRUE)}))
setnames(props, cols)


#long format and prep for plot
longdt = melt(props)
longdt[, "mod_id" := rep(col_order, times = (nrow(longdt)/10))]
longdt$percentage <- longdt$value * 100

setcolorder(longdt,c(1,4,2,3))

lgnd = factor(longdt$mod_id, levels = col_order)


###################### WATERMELON PLOT ###################

#for when some modules missing in occurrence. drop = FALSE allows to keep missing values in the legend.
Krad_pooled = ggplot(longdt, aes(x=variable, y=percentage, color = I("black"))) + 
  geom_col(aes(fill = lgnd)) +
  scale_fill_brewer(palette="PiYG", drop = FALSE, name = "Module", labels = c(expression(paste("M"^"1 ")),
                                                           expression(paste("M"^"1r")),
                                                           expression(paste("M"^"2 ")),
                                                           expression(paste("M"^"2r")),
                                                           expression(paste("M"^"3 ")),
                                                           expression(paste("M"^"3r")),
                                                           expression(paste("M"^"4 ")),
                                                           expression(paste("M"^"4r")),
                                                           expression(paste("M"^"5 ")),
                                                           expression(paste("M"^"5r")))) +
  scale_x_discrete(limits=cols, expand = c(0,0.6)) + 
  labs(y = "occupancy percentage", x = "position in the shufflon") +
  theme(axis.title = element_text(size = 17),
        axis.text = element_text(size = 14), 
        legend.title = element_text(size = 17), 
        legend.text = element_text(size = 17),
        legend.spacing.y = unit(0.1, "cm"),
        panel.background = element_blank()) +
  guides(fill = guide_legend(byrow = TRUE)) +
  scale_y_continuous(expand = c(0, 1)) 



# Rci1_new = ggplot(longdt, aes(x=variable, y=percentage, color = I("black"))) + 
#   geom_col(aes(fill = lgnd)) +
#   scale_fill_brewer(palette="PiYG", drop = FALSE, name = "Module", labels = c(expression(bquote(M^fontsize(10)(1))),
#                                                                               expression(bquote(M^(parse(text = "fontsize(10) (1r)")))),
#                                                                               expression(bquote(M^(parse(text = "fontsize(10) (2)")))),
#                                                                               expression(bquote(M^(parse(text = "fontsize(10) (2r)")))),
#                                                                               expression(bquote(M^(parse(text = "fontsize(10) (3)")))),
#                                                                               expression(bquote(M^(parse(text = "fontsize(10) (3r)")))),
#                                                                               expression(bquote(M^(parse(text = "fontsize(10) (4)")))),
#                                                                               expression(bquote(M^(parse(text = "fontsize(10) (4r)")))),
#                                                                               expression(bquote(M^(parse(text = "fontsize(10) (5)")))),
#                                                                               expression(bquote(M^(parse(text = "fontsize(10) (5r)")))))) +
#   scale_x_discrete(limits=cols) + 
#   labs(y = "occupancy percentage", x = "position in the shufflon") +
#   theme(axis.title = element_text(size = 16), 
#         axis.text = element_text(size = 14), 
#         legend.text = element_text(size = 14),
#         legend.spacing.y = unit(0.2, "cm")) +
#   guides(fill = guide_legend(byrow = TRUE))




# PuOr is a decent alternative to PiYG palette

#################### search for unshuffled #############


#the following counts number of repetitions of each unique row i.e. each variant
reps_Rci7rep7_repl1 <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)

sum_single <- sum(table(cut(reps_Rci1$V1,breaks=seq.int(from=1,to=9))))
sum_double <- sum(table(cut(reps_Rci1$V1,breaks=seq.int(from=10,to=99))))
sum_triple <- sum(table(cut(reps_Rci1$V1,breaks=seq.int(from=100,to=999))))
sum_quad <- sum(table(cut(reps_Rci1$V1,breaks=seq.int(from=1000,to=9999))))

#The following function can count nr of rows w/ n-digit numbers in V1 of 'reps'
count_rows <- function(df, col_name){
  sum(nchar(as.character(df[, col_name])) == 3)
}


#shuffling rate per lib
Rci7rep9 <- (1 - (reps_Rci7rep9[1,6] / nrow(combined_df))) * 100


# Store all shuffling rates into a table and export
SR <- rbind(Rci1rep1, Rci1rep3, Rci1rep5, Rci1rep7, Rci1rep8, Rci1rep10, Rci7rep1, Rci7rep3, Rci7rep5, Rci7rep7, Rci7rep8, Rci7rep10)
SR <- rbind(Rci1, Rci3, Rci4, Rci5, Rci6, Rci7, Rci8, Rci9, Rci10, Rci11, Rci12, m8, m18, m34)
SR <- rbind(Rci9rep1, Rci9rep3, Rci9rep5, Rci9rep7, Rci9rep8, Rci9rep9, Rci9rep10, Rci1rep9, Rci7rep9)
write.csv(SR, "/Users/jankatalinic/FAKS/PhD/Rci_variants/Morgan/Nanopore/Data/Orthogonality3_130324/SR_Orthog3.csv")




####### SHUFFLING RATE BAR PLOT ####

## 1. Make bar plot in ggplot2 with SD error bars.

induced_dt = setDT(induced_variants)
colnames(induced_dt) = c("iRci", "iRci - no arabinose", "P. parmentieri", "P. punjabense", "K. radicincitans")
induced_long = melt(induced_dt)
cols2 = c("group", "value")
colnames(induced_long) = cols2

summary_data_unind <- uninduced_long %>%
  group_by(group) %>%
  summarise(
    mean = mean(value),
    sd = sd(value)
  )

#Change names for uninduced
summary_data_unind$group = as.character(summary_data_unind$group)
newnames_ordered = c("Rci", "SI-H. paralvei", "SI-Y. pseudotuberculosis", "SI-E. cloacae",
                             "SI-E. hormaechei", "SI-E. E76", "SI-M. glucosetrophus", "SI-B. ubonensis",
                             "SI-P. parvum", "SI-P. citronellolis", "SI-P. koreensis", "SI-P. aeruginosa",
                             "Rci m8", "Rci m18", "Rci m34")

summary_data_unind$group = newnames_ordered
summary_data_unind$group <- factor(summary_data_unind$group, levels = newnames_ordered)

#Change names for induced
summary_data$group = as.character(summary_data$group)
newnames2_ordered = c("Rci", "iRci", "SI-P. parmentieri", "SI-P. punjabense", "SI-K. radicincitans")
summary_data$group = newnames2_ordered
summary_data$group = factor(summary_data$group, levels = newnames2_ordered)

#colour for uninduced: "turquoise3"
#colour for induced: "#ff6600"
barV1 = ggplot(summary_data, aes(x = factor(group), y = mean)) + 
  geom_bar(stat = "identity", fill = "turquoise3", width = 0.8) +
  geom_segment(aes(x = factor(group), xend = factor(group), y = mean, yend = mean + sd), 
               size = 0.5) +
  geom_segment(aes(x = as.numeric(factor(group)) - 0.2, 
                   xend = as.numeric(factor(group)) + 0.2, 
                   y = mean + sd, yend = mean + sd), 
               linewidth = 0.5) +
  labs(x = "SSR", y = "Shuffling rate (%)") +
  theme(axis.title = element_text(size = 16),
        axis.text = element_text(size = 14), 
        panel.background = element_blank(),
        axis.line.x = element_line(color = "black", linewidth = 0.5),
        axis.line.y = element_line(color = "black", linewidth = 0.5),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, color = "black")) +
  scale_y_continuous(expand = c(0, 1), limits = c(0, 100)) 


## 2. Perform Welch's ANOVA, then post hoc Dunnet's C test for pairwise comp. of each group's mean to Rci mean.


# Remove P cintronellolis whose SD = 0.
#uninduced_long_noRci11 = uninduced_long[-(19:20),]

# Factor groups
uninduced_long_noRci11$group <- as.factor(uninduced_long_noRci11$group)

# Perform Welch's ANOVA
fit_uninduced = oneway.test(value ~ group, data = uninduced_long_noRci11, var.equal = FALSE)


# Perform Dunnet's C test
DunnettC_uninduced = dunnettTest(uninduced_long_noRci11$value, uninduced_long_noRci11$group, control = "iRci")

summary_dunnett_unind = summary(DunnettC_uninduced)

## 3. Introduce significance values to bar plot


# Extract p-values
p_values <- DunnettC_induced$p.value[,1]  # Dunnett's C test results

# Assign significance labels based on p-value thresholds
significance_labels <- sapply(p_values, function(p) {
  if (p <= 0.001) {
    "***"
  } else if (p <= 0.01) {
    "**"
  } else if (p <= 0.05) {
    "*"
  } else {
    "ns"
  }
})

# Prepare a dataframe for plotting with significance labels
significance_df <- data.frame(
  group = names(p_values),
  p_value = p_values,
  label = significance_labels
)

# Merge significance labels with sd summary data
newnames_ordered_noRci11 = c("SI-H. paralvei", "SI-Y. pseudotuberculosis", "SI-E. cloacae",
                     "SI-E. hormaechei", "SI-E. E76", "SI-M. glucosetrophus", "SI-B. ubonensis",
                     "SI-P. parvum", "SI-P. koreensis", "SI-P. aeruginosa",
                     "Rci m8", "Rci m18", "Rci m34")
significance_df_unind$group = newnames_ordered_noRci11
plot_data_unind <- merge(summary_data_unind, significance_df_unind, by.x = "group", by.y = "group", all.x = TRUE)



# Add significance labels above the bars
barV2 <- barV1 +
  geom_text(data = plot_data, aes(x = group, y = mean + sd + 5, label = label), 
            size = 7, color = "black", na.rm = TRUE)



###### BARPLOT INDUCED #######


barV1 = ggplot(summary_data, aes(x = factor(group), y = mean)) + 
  geom_bar(stat = "identity", fill = "#ff6600", width = 0.4) +
  geom_segment(aes(x = factor(group), xend = factor(group), y = mean, yend = mean + sd), 
               size = 0.5) +
  geom_segment(aes(x = as.numeric(factor(group)) - 0.1, 
                   xend = as.numeric(factor(group)) + 0.1, 
                   y = mean + sd, yend = mean + sd), 
               linewidth = 0.5) +
  labs(x = "SI", y = "Shuffling rate (%)") +
  theme(axis.title = element_text(size = 16),
        axis.text = element_text(size = 14), 
        panel.background = element_blank(),
        axis.line.x = element_line(color = "black", linewidth = 0.5),
        axis.line.y = element_line(color = "black", linewidth = 0.5),
        axis.text.x = element_blank(),
        axis.title.x = element_blank()) +
  scale_y_continuous(expand = c(0, 1), limits = c(0, 100)) 

# Change names of recombinases
newnames3 = (c("iRci", "SI-P. parmentieri", "SI-P. punjabense", "SI-K. radicincitans"))
significance_df$group = newnames3
plot_data = merge(summary_data, significance_df, by.x = "group", by.y = "group", all.x = TRUE)


# Add significance labels above the bars
barV2 <- barV1 +
  geom_text(data = plot_data, aes(x = group, y = mean + sd + 5, label = label), 
            size = 7, color = "black", na.rm = TRUE)


# For adding table for induced plot #
data <- data.frame(
  SI = c("Rci", "iRci", "SI-P. parmentieri", "SI-P. punjabense", "SI-K. radicincitans"),
  arabinose = c(TRUE, FALSE, TRUE, TRUE, TRUE)
)

# Replace TRUE/FALSE with +/-
data_new <- data %>%
  mutate(arabinose = ifelse(arabinose, "+", "-")) %>% t()

# Define a custom theme using ttheme_default and setting all borders explicitly
custom_theme <- ttheme_default(
  core = list(
    fg_params = list(col = "black", fontsize = 11),   # Text color
    bg_params = list(fill = "white"),  # Background color
    hline = list(gpar(col = "black", lwd = 1)),  # Horizontal lines
    vline = list(gpar(col = "black", lwd = 1))   # Vertical lines
  ),
  colhead = list(
    fg_params = list(col = "black", fontsize = 11),   # Column header text color
    bg_params = list(fill = "white"),  # Background color
    hline = list(gpar(col = "black", lwd = 1)),  # Horizontal lines for headers
    vline = list(gpar(col = "black", lwd = 1))   # Vertical lines for headers
  ),
  rowhead = list(
    fg_params = list(col = "black", fontface = "bold", fontsize = 11),   # Row header text color
    bg_params = list(fill = "white"),  # Background color
    hline = list(gpar(col = "black", lwd = 1)),  # Horizontal lines for row headers
    vline = list(gpar(col = "black", lwd = 1))   # Vertical lines for row headers
  )
)

# Create the table grob using the custom theme
table_grob <- tableGrob(data_new, rows = row.names(data_new), theme = custom_theme)

# Convert the table grob to a gtable object for more control
gtable_table <- gtable::gtable_add_grob(
  table_grob,
  grobs = replicate(nrow(table_grob) * ncol(table_grob), 
                    rectGrob(gp = gpar(fill = NA, col = "black")), 
                    simplify = FALSE),
  t = rep(seq_len(nrow(table_grob)), ncol(table_grob)),
  l = rep(seq_len(ncol(table_grob)), each = nrow(table_grob)),
  b = rep(seq_len(nrow(table_grob)), ncol(table_grob)),
  r = rep(seq_len(ncol(table_grob)), each = nrow(table_grob))
)

# Calculate the number of columns including the row names column
total_cols <- ncol(data_new) + 1

# Adjust the table column widths to align with the bar plot
# Use "null" units to let the grid layout system allocate space equally
gtable_table$widths <- unit(rep(1, total_cols), "null")


# Combine the bar plot and the table using patchwork
barV3 <- barV2 / gtable_table + plot_layout(heights = c(3, 1))

# Display the combined plot
print(barV3)




############# ORTHOGONALITY HEATMAP ########

# Import table from Excel first

y <- c("Rci", "E. E76", "B. ubonensis")
xaxis <- c("Rci", "H. paralvei", "E. cloacae", "E. E76", "M. glucosetrophus", "B.ubonensis", "P. parvum", "P. aeruginosa")
reps <- c("Rci", "Rci", "H. paralvei", "H. paralvei", "E. cloacae", "E. cloacae", "E. E76", "E. E76", "M. glucosetrophus", "M. glucosetrophus", "B. ubonensis", "B. ubonensis", "P. parvum", "P. parvum", "P. aeruginosa", "P. aeruginosa")
exp <- c("Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76",
         "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis")

#orthog <- shufflingrates_complete
orthog <- as.data.table(shufflingrates_complete)
orthog_long <- melt(orthog)

orthog_long <- data.table(orthog_long, sfx = reps, SSR = exp)
clean <- orthog_long[,2:4]
clean <- clean[ ,c("SSR", "sfx", "value")]

# Calculate mean shuffling rates
orthog_long_means = orthog_long %>% group_by(variable) %>% summarize(Average = mean(value))



reps_short <- c("Rci", "H. paralvei", "E. cloacae", "E. E76", "M. glucosetrophus", "B. ubonensis", "P. parvum", "P. aeruginosa")
exp_short <- c("Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "Rci", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", "E. E76", 
               "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis", "B. ubonensis")

orthog_means_clean = cbind(orthog_long_means[,2], reps_short, exp_short)

names = c("rate", "reporter", "SSR")
setnames(orthog_means_clean, names)

# Limit nr of decimal points
means_formatted <- orthog_means_clean %>% mutate_at(vars(rate), ~ round(., 1))

lgnd_orthog = factor(means_formatted$reporter, levels = reps_short)


##### Plots ##

# heatmap2 <- ggplot(orthog_means_clean, aes(x = factor(reporter, level = xaxis),y = SSR,fill = rate)) +
#   geom_tile() + scale_fill_viridis_c()

# Following produces a landscape heatmap (SSR on y-axis)
# heatmapV2 <- ggplot(means_formatted, aes(x = factor(reporter, level = xaxis),y = SSR,fill = rate)) +
#   geom_tile() + geom_text(aes(label = rate), color = "white", size = 8) + scale_fill_viridis_c() +
#   theme(axis.title = element_text(size = 16),
#         axis.text = element_text(size = 14),
#         legend.text = element_text(size = 12),
#         panel.background = element_blank())

# Following produces a vertical heatmap (SSR on x-axis)
# heatmapV3 <- ggplot(means_formatted, aes(x = SSR, y = factor(reporter, level = xaxis),fill = rate)) +
#   geom_tile() + geom_text(aes(label = rate), color = "white", size = 8) + scale_fill_viridis_c() +
#   theme(axis.title = element_text(size = 16),
#         axis.text = element_text(size = 14),
#         legend.text = element_text(size = 12),
#         panel.background = element_blank(),
#         legend.position = "bottom") +
#   labs(x = "SSR", y = "Reporter")

# Following produces a vertical heatmap (SSR on x-axis) and fixes reporter labels
heatmapV4 <- ggplot(means_formatted, aes(x = SSR, y = lgnd_orthog, fill = rate)) +
  geom_tile() + geom_text(aes(label = sprintf("%.1f", rate)), color = "white", size = 8) + scale_fill_viridis_c() +
  theme(axis.title = element_text(size = 16),
        axis.text = element_text(size = 14),
        legend.text = element_text(size = 12),
        panel.background = element_blank(),
        legend.position = "bottom") +
  labs(x = "SSR", y = "Reporter", fill = "Shuffling rate (%)")


gradient_palette <- colorRampPalette(c("white", "#1560BD"))
gradient_colors <- gradient_palette(101) 


heatmapV5 <- ggplot(means_formatted, aes(x = SSR, y = lgnd_orthog, fill = rate)) +
  geom_tile(color = "black", size = 0.2) + geom_text(aes(label = sprintf("%.1f", rate)), color = "black", size = 8) + 
  scale_fill_gradientn(colors = gradient_colors) +
  theme(axis.title = element_text(size = 16),
        axis.text = element_text(size = 14),
        legend.text = element_text(size = 12),
        panel.background = element_blank(),
        legend.position = "bottom") +
  labs(x = "SSR", y = "Reporter", fill = "Shuffling rate (%)")

#### Orthogonality stat test #####

rep1_test = t.test(orthog$`1 + 1`, orthog$`7 + 1`, var.equal = FALSE)
rep5_test = t.test(orthog$`1 + 5`, orthog$`7 + 5`, var.equal = FALSE)
rep7_test = t.test(orthog$`1 + 7`, orthog$`7 + 7`, var.equal = FALSE)
rep9_test = t.test(orthog$`1 + 9`, orthog$`7 + 9`, var.equal = FALSE)
rep13_test = t.test(orthog$`1 + 13`, orthog$`7 + 13`, var.equal = FALSE)
SSR_test = t.test(orthog$`1 + 1`, orthog$`7 + 9`, var.equal = FALSE)
SSR_test2 = t.test(orthog$`1 + 1`, orthog$`7 + 5`, var.equal = FALSE)

colnames(orthog_long) = c("group", "value", "sfx", "SSR")
orthog_long$group <- as.factor(orthog_long$group)
fit_orthog = oneway.test(value ~ group, data = orthog_long, var.equal = FALSE)
# Why is ANOVA not working here??

######## Permutation representations ######


# For all 9 shufflers, 4k reads were randomly sampled. 
reps_Rci <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_E76 <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Bubo <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_m18 <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_m34 <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_iRci <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Krad <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Parm <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Punj <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)

reps_E76rep5 <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_E76rep9 <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Rcirep1_orthog <- ddply(combined_df,.(pos1, pos2, pos3, pos4, pos5),nrow)


# Here, for all 9 shufflers, all reads pooled from both biological replicates

reps_Rci <- ddply(combined_df_Rci,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_E76 <- ddply(combined_df_E76,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Bubo <- ddply(combined_df_Bubo,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_m18 <- ddply(combined_df_m18,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_m34 <- ddply(combined_df_m34,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_iRci <- ddply(combined_df_iRci,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Krad <- ddply(combined_df_Krad,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Parm <- ddply(combined_df_Parm,.(pos1, pos2, pos3, pos4, pos5),nrow)
reps_Punj <- ddply(combined_df_Punj,.(pos1, pos2, pos3, pos4, pos5),nrow)

#The following function can count nr of rows w/ n-digit numbers in V1 of 'reps'.
#Execute in console for when n=1, n=2, n=3, n=4. Manually insert results into an Excel table.
# e.g. count_rows(reps_Rci, "V1")

count_rows <- function(df, col_name){
  sum(nchar(as.character(df[, col_name])) == 1)
}




mod1_reps = table(combined_df$pos1)
mod2_reps = table(combined_df$pos2)
mod3_reps = table(combined_df$pos3)
mod4_reps = table(combined_df$pos4)
mod5_reps = table(combined_df$pos5)
print(mod1_reps[1:2])
print(mod2_reps[1:2])
print(mod4_reps[3:4])
print(mod5_reps[5:6])

# # Rci7
# 
# reps_Rci7 <- ddply(combined_df_Rci7,.(pos1, pos2, pos3, pos4, pos5),nrow)
# 
# mod1_reps = table(combined_df_Rci7$pos1)
# mod2_reps = table(combined_df_Rci7$pos2)
# mod3_reps = table(combined_df_Rci7$pos3)
# mod4_reps = table(combined_df_Rci7$pos4)
# mod5_reps = table(combined_df_Rci7$pos5)
# 
# print(mod1_reps[1:2])
# print(mod2_reps[1:2])
# print(mod4_reps[3:4])
# print(mod5_reps[5:6])
# 
# 
# # Rci9
# 
# reps_m34 <- ddply(combined_df_m34,.(pos1, pos2, pos3, pos4, pos5),nrow)
# 
# mod1_reps = table(combined_df_m34$pos1)
# mod2_reps = table(combined_df_m34$pos2)
# mod3_reps = table(combined_df_m34$pos3)
# mod4_reps = table(combined_df_m34$pos4)
# mod5_reps = table(combined_df_m34$pos5)
# 
# print(mod1_reps[1:2])
# print(mod2_reps[1:2])
# print(mod4_reps[3:4])
# print(mod5_reps[5:6])



# STAT TEST



Rci1_invfreq = as.data.table(Rci1_invfreq)
Rci1_invfreq_long = melt(Rci1_invfreq)

Rci7_invfreq = as.data.table(Rci7_invfreq)
Rci7_invfreq_long = melt(Rci7_invfreq)

Rci9_invfreq = as.data.table(Rci9_invfreq)
Rci9_invfreq_long = melt(Rci9_invfreq)

m18_invfreq = as.data.table(m18_invfreq)
m18_invfreq_long = melt(m18_invfreq)

m34_invfreq = as.data.table(m34_invfreq)
m34_invfreq_long = melt(m34_invfreq)

cols2 = c("event", "group", "count")
colnames(Rci1_invfreq_long) = cols2
colnames(Rci7_invfreq_long) = cols2
colnames(Rci9_invfreq_long) = cols2
colnames(m18_invfreq_long) = cols2
colnames(m34_invfreq_long) = cols2


contingency_table <- dcast(m34_invfreq_long, event ~ group, value.var = "count", fun.aggregate = sum, fill = 0)

contingency_table <- xtabs(count ~ event + group, data = m34_invfreq_long)

chisq_result <- chisq.test(contingency_table)

pairwise_chisq <- function(tab) {
  n_groups <- ncol(tab)
  p_values <- matrix(NA, nrow = n_groups, ncol = n_groups)
  
  for (i in 1:(n_groups - 1)) {
    for (j in (i + 1):n_groups) {
      sub_table <- tab[, c(i, j)]
      chi_squared_result <- chisq.test(sub_table)
      p_values[i, j] <- chi_squared_result$p.value
      p_values[j, i] <- chi_squared_result$p.value
    }
  }
  
  rownames(p_values) <- colnames(p_values) <- colnames(tab)
  return(p_values)
}

# Perform pairwise Chi-squared comparisons
pairwise_p_values <- pairwise_chisq(contingency_table)

# Print the pairwise Chi-squared comparisons
print(pairwise_p_values)

# Apply Bonferroni correction to the pairwise p-values
alpha <- 0.05  # Set your desired alpha level
n_comparisons <- ncol(contingency_table) * (ncol(contingency_table) - 1) / 2
adjusted_alpha <- alpha / n_comparisons


# Print a sentence for each p-value
n_groups <- ncol(contingency_table)
for (i in 1:(n_groups - 1)) {
  for (j in (i + 1):n_groups) {
    p_value <- pairwise_p_values[i, j]
    significant <- ifelse(p_value < adjusted_alpha, "statistically significant", "not statistically significant")
    cat(sprintf("Comparison between %s and %s: p-value = %.4f (%s)\n", colnames(contingency_table)[i], colnames(contingency_table)[j], p_value, significant))
  }
}




#### Extract reads of reference module order and count inversions in each shufflon position ####

reference <- "12345"


extract_digits <- function(x, reference) {
  # Remove non-digit characters using gsub
  x <- gsub("[^1-5]", "", x)
  # Concatenate remaining digits using paste and store the resulting string
  digits <- paste(x, collapse="")
  # Compare string to reference string
  identical(digits, reference)
}

# Apply the extract_digits function to each row of the dataframe
same_rows_Rci <- apply(combined_df_Rci, 1, extract_digits, reference)
same_rows_E76 <- apply(combined_df_E76, 1, extract_digits, reference)
same_rows_Bubo <- apply(combined_df_Bubo, 1, extract_digits, reference)
same_rows_m18 <- apply(combined_df_m18, 1, extract_digits, reference)
same_rows_m34 <- apply(combined_df_m34, 1, extract_digits, reference)
same_rows_iRci <- apply(combined_df_iRci, 1, extract_digits, reference)
same_rows_Krad <- apply(combined_df_Krad, 1, extract_digits, reference)
same_rows_Parm <- apply(combined_df_Parm, 1, extract_digits, reference)
same_rows_Punj <- apply(combined_df_Punj, 1, extract_digits, reference)



# Subset rows of reference module order from original dataframe (by applying logical output from extract_digits to df)
Rci_same <- combined_df_Rci[same_rows_Rci,]
E76_same <- combined_df_E76[same_rows_E76,]
Bubo_same <- combined_df_Bubo[same_rows_Bubo,]
m18_same <- combined_df_m18[same_rows_m18,]
m34_same <- combined_df_m34[same_rows_m34,]
iRci_same <- combined_df_iRci[same_rows_iRci,]
Krad_same <- combined_df_Krad[same_rows_Krad,]
Parm_same <- combined_df_Parm[same_rows_Parm,]
Punj_same <- combined_df_Punj[same_rows_Punj,]


# Randomly pick n reads from larger subsets (normalise subsets according to smallest)

Rci_same_norm = sample_n(Rci_same, 2201)
E76_same_norm = sample_n(E76_same, 2201)
Bubo_same_norm = sample_n(Bubo_same, 2201)
m18_same_norm = sample_n(m18_same, 2201)
m34_same_norm = sample_n(m34_same, 2201)
iRci_same_norm = sample_n(iRci_same, 2201)
Krad_same_norm = sample_n(Krad_same, 2201)
Parm_same_norm = sample_n(Parm_same, 2201)



# Count reps of inverted and uninverted module in each shufflon position
mod1_reps = table(Punj_same$V1)
mod2_reps = table(Punj_same$V2)
mod3_reps = table(Punj_same$V3)
mod4_reps = table(Punj_same$V4)
mod5_reps = table(Punj_same$V5)

print(mod1_reps)
print(mod2_reps)
print(mod3_reps)
print(mod4_reps)
print(mod5_reps)



##### Regression model for mod length vs inv freq ####

# All the values need to be numeric, so instead of SSRs displayed by their names, they were arbitrarily numbered.
# Following produced approximately the same adjusted R^2 value as Excel, whose R^2 = 0.687.

# reg_model2 = lm(formula = inv_freq ~ SSR + mod_len, data = len_vs_invfreqV2)
# model_stats2 = summary(reg_model2)
# model2_coef = coefficients(reg_model2)
# 
# cor(len_vs_invfreqV2$inv_freq, len_vs_invfreqV2$mod_len)
# 
# 
# 
# # Test w/ unadjusted data to see if I obtain same linear regression characteristics
# 
# reg_model3 = lm(formula = inv_freq ~ SSR + mod_len, data = len_vs_invfreq_unadjusted)
# model_stats3 = summary(reg_model3)
# 
# # Test regression when only mod_len and inv freq considered, no SSR
# 
# reg_model4 = lm(formula = inv_freq ~ mod_len, data = len_vs_invfreqV2)
# model_stats4 = summary(reg_model4)


# Regression model with normalised data where each subset contained 2201 reads.

reg_model5 = lm(formula = inv_freq ~ mod_len + SI, data = len_vs_invfreq_norm)
model_stats5 = summary(reg_model5)

# Plot

len_vs_invfreq_norm$asfraction = len_vs_invfreq_norm$inv_freq / 2201

legend = c("Rci constitutive", "E. E76", "B. ubonensis", "Rci m18", "Rci m34", "Rci induced", "K. radicincitans", "P. parmentieri", "P. punjabense")

len_vs_invfreq_norm$SI <- factor(len_vs_invfreq_norm$SI, levels = legend)

# In colour and same shape of points
# scatter <- ggplot(data = len_vs_invfreq_norm, aes(x = mod_len, y = inv_freq, fill = SI)) +
#   geom_point(shape = 21, size = 3, stroke = 0.5) +
#   geom_smooth(aes(group = 1), method = "lm", se = FALSE, color = "black", linewidth = 0.7) +
#   coord_cartesian(xlim = c(0, 600), ylim = c(0, 1250)) +
#   #theme_minimal() +  # Use a minimal theme (customize as needed)
#   labs(x = "module length (bp)", y = "inversion frequency")+ 
#   theme(legend.box.background = element_blank(),
#         panel.background = element_blank(),
#         panel.grid = element_blank(),
#         axis.ticks = element_line(linewidth = 0.5),  # Add ticks on both axes
#         axis.line = element_line(linewidth = 0.5),
#         axis.title = element_text(size = 14),
#         axis.text = element_text(size = 14), 
#         legend.text = element_text(size = 12),
#         legend.key.height = unit(1, "cm")) + 
#   scale_y_continuous(expand = c(0, 0)) +  # Start y-axis from zero
#   scale_x_continuous(expand = c(0, 0)) +
#   scale_fill_manual(values = c("#648FFF", "#FFB000", "#DC267F", "#00D4CC", "#AFABAB", "yellow", "green", "black", "purple"), 
#                     name = "SSR:", 
#                     labels = c("Rci constitutive", "SI-E. E76", "SI-B. ubonensis",
#                                "Rci m18", "Rci m34", "Rci induced", "SI-K. radicincitans", "SI-P. parmentieri", "SI-P. punjabense"))

# Following version of scatter keeps every point black but each group of points has a different shape.

scatter2 <- ggplot(data = len_vs_invfreq_norm, aes(x = mod_len, y = asfraction, shape = SI)) +
  geom_point(size = 3, stroke = 0.5, colour = "black") +
  geom_smooth(aes(group = 1), method = "lm", se = FALSE, color = "black", linewidth = 0.7) +
  coord_cartesian(xlim = c(100, 600), ylim = c(0, 0.5)) +
  #theme_minimal() +  # Use a minimal theme (customize as needed)
  labs(x = "module length (bp)", y = "inversion frequency")+ 
  theme(legend.box.background = element_blank(),
        panel.background = element_blank(),
        panel.grid = element_blank(),
        axis.ticks = element_line(linewidth = 0.5),  # Add ticks on both axes
        axis.line = element_line(linewidth = 0.5),
        axis.title = element_text(size = 14),
        axis.text = element_text(size = 14, color = "black"), 
        legend.text = element_text(size = 14),
        legend.key.height = unit(1, "cm")) + 
  scale_y_continuous(expand = c(0, 0)) +  # Start y-axis from zero
  scale_x_continuous(expand = c(0, 0)) +
  scale_shape_manual(values = 1:9, name = "SI:", 
                     labels = c("Rci constitutive", "SI-E. E76", "SI-B. ubonensis",
                                "Rci m18", "Rci m34", "Rci induced", "SI-K. radicincitans", "SI-P. parmentieri", "SI-P. punjabense"))

#SSR_name = rep(c("Rci", "E. E76", "B. ubonensis", "m18", "m34"), each = 5)
#len_vs_invfreq_unadjusted = cbind(len_vs_invfreq_unadjusted, SSR_name)
#len_vs_invfreqV2 = len_vs_invfreqV2[,1:3]

# custom_colours = c("#648FFF", "#FFB000", "#DC267F", "#00D4CC", "#AFABAB")
# 
# scatter2 <- ggplot()+
#   geom_point(data = len_vs_invfreq_norm, aes(x = mod_len, y = inv_freq, fill = SSR_name), shape = 21, size = 3, stroke = 0.5) +
#   #geom_smooth(method = "lm", se = FALSE, color = "black", linewidth = 0.7, formula = y ~ x) +
#   #coord_cartesian(xlim = c(0, 14.5), ylim = c(0, 1)) +
#   #theme_minimal() +  # Use a minimal theme (customize as needed)
#   labs(x = "inversion frequency", y = "module length (bp)")+
#   theme(legend.box.background = element_blank(),
#         panel.background = element_blank(),
#         panel.grid = element_blank(),
#         axis.ticks = element_line(linewidth = 0.5),  # Add ticks on both axes
#         axis.line = element_line(linewidth = 0.5),
#         axis.title = element_text(size = 14),
#         axis.text = element_text(size = 14),
#         legend.text = element_text(size = 12),
#         legend.key.height = unit(1, "cm")) +
#   #scale_y_continuous(expand = c(0, 0)) +  # Start y-axis from zero
#   #scale_x_continuous(expand = c(0, 0), breaks = custom_x_ticks3) +
#   scale_fill_manual(values = custom_colours,
#                     name = "SSR",
#                     labels = c("Rci", "E. E76", "B. ubonensis", "Rci m18", "Rci m34"))
# 



##### Revised test ######



# Turn categorical variable into factors and set reference level.
len_vs_invfreq_norm$SSR <- relevel(as.factor(len_vs_invfreq_norm$SSR), ref = "Rci")

# Create linear regression model with multiple independent variables (mod length and SSR)
model_new <- lm(inv_freq ~ SI + mod_len, data = len_vs_invfreq_norm)

# Print details of linear regression (p-values, F-test etc.)
print(summary(model_new))

# Gives a single p-value for both mod length and SSR.
anova(model_new)

# Welch's ANOVA to double-check effect of changing SSR on inversion frequency of individual modules

oneway.test(inv_freq ~ SSR, data = len_vs_invfreq_norm, var.equal = FALSE)






##### Shuffling rate for >1 inv ######

# Exclude rows which match:
#   1) unshuffled permutation
#   2) all 9 permutations which can be obtained from 1 inversion only.

exclusion_sets = list(
  c("M1", "M2", "M3", "M4", "M5"),
  c("M1r", "M2", "M3", "M4", "M5"),
  c("M1", "M2r", "M3", "M4", "M5"),
  c("M1", "M2", "M3r", "M4", "M5"),
  c("M1", "M2", "M3", "M4r", "M5"),
  c("M1", "M2", "M3", "M4", "M5r"),
  c("M3r", "M2r", "M1r", "M4", "M5"),
  c("M1", "M4r", "M3r", "M2r", "M5"),
  c("M1", "M2", "M5r", "M4r", "M3r"),
  c("M5r", "M4r", "M3r", "M2r", "M1r")
)

# Function to filter out rows matching exclusion sets
filter_rows <- function(df, exclusion_sets) {
  for (set in exclusion_sets) {
    df <- df[!(df$V1 == set[1] & df$V2 == set[2] & df$V3 == set[3] & df$V4 == set[4] & df$V5 == set[5]), ]
  }
  return(df)
}


# Apply the filtering function
filtered_df_Bubo <- filter_rows(combined_df_Bubo, exclusion_sets)
filtered_df_E76 <- filter_rows(combined_df_E76, exclusion_sets)
filtered_df_iRci <- filter_rows(combined_df_iRci, exclusion_sets)
filtered_df_Krad <- filter_rows(combined_df_Krad, exclusion_sets)
filtered_df_m18 <- filter_rows(combined_df_m18, exclusion_sets)
filtered_df_m34 <- filter_rows(combined_df_m34, exclusion_sets)
filtered_df_Parm <- filter_rows(combined_df_Parm, exclusion_sets)
filtered_df_Punj <- filter_rows(combined_df_Punj, exclusion_sets)
filtered_df_Rci <- filter_rows(combined_df_Rci, exclusion_sets)