ADPr-TAE-analysis / analysis / Script1_Position_specific_mutation.R
Script1_Position_specific_mutation.R
Raw
###################### SCRIPT INFO ######################
# Script to quantify the total percentage of reads with substitution or deletion at a specific position
# Return 2x tables: 
# (1) First one with each row corresponding to one sample -> Combined_table.csv/xlsx
# (2) a second one where replicates of the same condition are grouped together on a single row -> Replicate_table.csv/xlsx
#
# Sample information regarding the replicates (group) and parameters must be reported in the sample sheet.
#
# KNOWN LIMITATION: can't handle multiple samples with different reference sequences

# Set up directories ====
wd_dir <- "."
setwd(wd_dir)

# Load sample sheet (with check for correct function) 
sample_sheet <- read.csv2("Script1_Sample_sheet.csv", check.names = FALSE)

if (ncol(sample_sheet) == 1) { #To reload with read.csv if necessary
  sample_sheet <- read.csv("Script1_Sample_sheet.csv", check.names = FALSE)
}

# Setup output directory
output_folder <- "../outputs/Output_files_Script1"
dir.create(output_folder, recursive = TRUE)
setwd(output_folder)

# Locate input files
raw_data <- "../../data/Input_files_Script1"

# Run script ====
combined_table <- data.frame() #initialize combined result table
sample_files <- list.files(raw_data)[grep("\\.txt$", list.files(raw_data))]

for (samp in sample_files) {
  
  # Load data and parameters for selected sample
  data <- read.delim(paste0(raw_data, "/", samp), check.names = FALSE)
  mutation_pos <- sample_sheet$Mutation_position[sample_sheet$Sample == samp]
  thresh_counts <- sample_sheet$Threshold_read_counts[sample_sheet$Sample == samp]
  
  # Remove reference sequence with deletions
  if (length(grep("-", data$Reference_Sequence)) == 0) {
    ref_seq <- unique(data$Reference_Sequence)
  } else {
    ref_seq <- data$Reference_Sequence[-grep("-", data$Reference_Sequence)]
    ref_seq <- unique(ref_seq)
  }
  
  # Set reference sequence and decompose it per nucleotide
  ref_nt <- substr(ref_seq, start = mutation_pos, stop = mutation_pos)
  
  # Perform mutational analysis
  data$MBE <- substr(data$Aligned_Sequence, start = mutation_pos, stop = mutation_pos) # extract nucleotide in target position
  data <- subset(data, subset = MBE != ref_nt & data$`#Reads` > thresh_counts) # remove unedited read
  
  possible_MBE <- c("A", "T", "C", "G", "-") # list all possible nucleotides
  possible_MBE <- possible_MBE[-grep(ref_nt, possible_MBE)] # remove reference nucleotide from the list
  
  # Create output table
  results <- data.frame(
    Sample_name = gsub(".txt", "", samp),
    Percent_mut1 = sum(subset(data, subset = MBE == possible_MBE[1])$`%Reads`),
    Percent_mut2 = sum(subset(data, subset = MBE == possible_MBE[2])$`%Reads`),
    Percent_mut3 = sum(subset(data, subset = MBE == possible_MBE[3])$`%Reads`),
    Percent_total_substitution = sum(subset(data, subset = MBE != "-")$`%Reads`), # total is excluding deletions ??
    Percent_deletion = sum(subset(data, subset = MBE == "-")$`%Reads`)
  )
  
  column_names <- c( #define column names for the result table
    paste0("Percent_", possible_MBE[1]),
    paste0("Percent_", possible_MBE[2]),
    paste0("Percent_", possible_MBE[3])
  )
  
  colnames(results)[2:4] <- column_names #update column names
  
  combined_table <- rbind(combined_table, results)
  
} # end for loop


# Save output table with each sample separated ====
write.csv(combined_table, "Combined_table.csv", row.names = F) #save as csv file
openxlsx::write.xlsx(combined_table, "Combined_table.xlsx", rowNames = FALSE) #save as xlsx file for GraphPad

# Save output table with replicates combined together ====
replicate_table <- data.frame() #initialize table

# Extract group list and order by decreasing number of replicates, to start for loop with highest number of replicates
groups <- table(sample_sheet$Group)
group_list <- names(groups)[order(groups, decreasing = TRUE)]

for (samp_group in group_list) {
  #Select samples from the same replicate
  samples <- sample_sheet$Sample[sample_sheet$Group == samp_group]
  samples <- gsub(".txt", "", samples)
  
  
  tab <- combined_table[combined_table$Sample_name %in% samples, ]
  tab <- as.data.frame(
    c(
      data.frame(
        Samples = paste(tab$Sample_name, collapse = ", ")
      ),
      unlist(tab[, -1])
    )
  )
  
  #Handle case with only 1 replicate
  if (ncol(tab) == 6) {
    colnames(tab)[-1] <- paste0(
      colnames(tab)[-1],
      "1"
    )
  }
  
  
  replicate_table <- dplyr::bind_rows(replicate_table, tab)
} #end for loop

# Save Replicate tables
write.csv(replicate_table, "Replicate_table.csv", row.names = F) #save as csv file
openxlsx::write.xlsx(replicate_table, "Replicate_table.xlsx", rowNames = FALSE, keepNA = TRUE) #save as xlsx file for GraphPad

###################### END OF SCRIPT ######################