RACE-Seq / 03_visualization / 03_visualization_functions_X1KO.R
03_visualization_functions_X1KO.R
Raw
#functions required for visualization

#read_and_merge_file needs dataframe *.softclips.parsed and table "gene_biotype" in the following format:
#| ensembl_gene_id | gene_biotype | transcript_start | transcript_end | chromosome_name | external_gene_name | strand |
#strand 1 (positive) or -1 (negative)

read_and_merge_file <- function(f) {
  df = read.csv(f, sep ='\t', header = FALSE,na.strings=c("", " ","NA"))
  print("File read")
  colnames(df)[1] <- "id"
  colnames(df)[2] <- "clip5"
  colnames(df)[3] <- "clip3"
  colnames(df)[4] <- "pos"
  colnames(df)[5] <- "chr"
  
  merge_soft_clip_R = sqldf("
    SELECT *
    FROM df d1 JOIN gene_biotype d2
    ON d1.chr = d2.chromosome_name
    AND d1.pos BETWEEN CAST(d2.transcript_start AS int) AND (d2.transcript_end)
  ")
  print("Files merged")
  merge_soft_clip_R$chromosome_name <- NULL
  return(merge_soft_clip_R)
}


#Funktion decode
decode <- function(gene) {
  gene$clip3_tail_len_decoded <- case_when(
    gene$clip3_tail==1000 ~ "other",
    gene$clip3_tail==1003 ~ "poliAU in reads and shorter reads",
    gene$clip3_tail==2000 ~ "no tail",
    gene$clip3_tail==1002 ~ "poliA in reads and shorter reads",
    gene$clip3_tail==1001 ~ "poliU in reads and shorter reads",
    gene$clip3_tail>=10 & gene$clip3_tail<1000 ~ "10+",
    TRUE ~ as.character(gene$clip3_tail)
  )
  gene$tail <- case_when(
    gene$clip3_tail==1000 ~ 0,
    gene$clip3_tail==1003 ~ 1,
    gene$clip3_tail==2000 ~ 0,
    gene$clip3_tail==1002 ~ 1,
    gene$clip3_tail==1001 ~ 1,
    gene$clip3_tail>=10 & gene$clip3_tail<1000 ~ 1,
    TRUE ~ 1
  )
  gene$clip3_tail_len_range <- case_when(
    gene$clip3_tail_len>=1 & gene$clip3_tail_len<=10 ~ "1-10",
    gene$clip3_tail_len>=11 & gene$clip3_tail_len<=20 ~ "11-20",
    gene$clip3_tail_len>=21 & gene$clip3_tail_len<=30 ~ "21-30",
    gene$clip3_tail_len>=31 & gene$clip3_tail_len<=40 ~ "31-40",
    gene$clip3_tail_len>=41 & gene$clip3_tail_len<=50 ~ "41-50",
    gene$clip3_tail_len>=51 & gene$clip3_tail_len<=60 ~ "51-60",
    gene$clip3_tail_len>=61 & gene$clip3_tail_len<2000 ~ "61+",
    gene$clip3_tail_len == 0 ~ "0",
    TRUE ~ as.character(gene$clip3_tail_len)
  )
  return(gene)
}

#Function tail_analysis_for_genes_0t
#tails are read from the reads which 3'end is present in chosen window
#here window has size from transcript_end to transcript_end - 24nt
tail_analysis_for_genes_0t <- function (gene_name, df2) {
  gene <- subset(df2, external_gene_name == gene_name)
  gene$clip3<- ifelse(is.na(gene$clip3),as.character(0),gene$clip3)
  
  gene$clip3_tail <- case_when( #no tail at all
    gene$clip3=="0" ~  case_when( #no tail
      gene$pos == gene$transcript_end ~ 2000,
      gene$pos < gene$transcript_end & gene$pos >= gene$transcript_end-24 ~ 2000, 
      gene$pos != gene$transcript_end ~ 1000
    ),
    str_count(gene$clip3,"T")!=0 & str_count(gene$clip3,"T") == nchar(gene$clip3) ~ case_when(  #U-tail
      gene$pos == gene$transcript_end ~ 1001,
      gene$pos < gene$transcript_end & gene$pos >= gene$transcript_end-24 ~ 1001,
      gene$pos != gene$transcript_end ~ 1000 
    ),
    str_count(gene$clip3,"A")!=0 & str_count(gene$clip3,"A") == nchar(gene$clip3) ~ case_when(  #A-tail
      gene$pos == gene$transcript_end ~ 1002,
      gene$pos < gene$transcript_end & gene$pos >= gene$transcript_end-24 ~ 1002,
      gene$pos != gene$transcript_end ~ 1000 
    ),
    str_count(gene$clip3, regex("[^GC]")) & str_count(gene$clip3, regex("^A")) & str_count(gene$clip3, regex("T$")) & str_count(gene$clip3,"A|T")!=0 & str_count(gene$clip3,"A|T") == nchar(gene$clip3) ~ case_when(  #AU-tail
      gene$pos == gene$transcript_end ~ 1003,
      gene$pos < gene$transcript_end & gene$pos >= gene$transcript_end-24 ~ 1003,
      gene$pos != gene$transcript_end ~ 1000 
    ),
    #other
    TRUE ~ case_when( #heterogenous tail
      gene$pos == gene$transcript_end ~ 1000,
      gene$pos != gene$transcript_end ~ 1000
    )
  )
  gene$clip3_tail_len <- case_when( #no tail at all
    gene$clip3=="0" ~  case_when( #no tail
      gene$pos == gene$transcript_end ~ 0,
      gene$pos < gene$transcript_end & gene$pos >= gene$transcript_end-24 ~ 0, 
      gene$pos != gene$transcript_end ~ 0
    ),
    str_count(gene$clip3,"T")!=0 & str_count(gene$clip3,"T") == nchar(gene$clip3) ~ case_when(  #U-tail
      gene$pos == gene$transcript_end ~ nchar(gene$clip3)+0,
      gene$pos < gene$transcript_end & gene$pos >= gene$transcript_end-24 ~ nchar(gene$clip3)+0,
      gene$pos != gene$transcript_end ~ nchar(gene$clip3)+0 
    ),
    str_count(gene$clip3,"A")!=0 & str_count(gene$clip3,"A") == nchar(gene$clip3) ~ case_when(  #A-tail
      gene$pos == gene$transcript_end ~ nchar(gene$clip3)+0,
      gene$pos < gene$transcript_end & gene$pos >= gene$transcript_end-24 ~ nchar(gene$clip3)+0,
      gene$pos != gene$transcript_end ~ nchar(gene$clip3)+0 
    ),
    str_count(gene$clip3, regex("[^GC]")) & str_count(gene$clip3, regex("^A")) & str_count(gene$clip3, regex("T$")) & str_count(gene$clip3,"A|T")!=0 & str_count(gene$clip3,"A|T") == nchar(gene$clip3) ~ case_when(  #AU-tail
      gene$pos == gene$transcript_end ~ nchar(gene$clip3)+0,
      gene$pos < gene$transcript_end & gene$pos >= gene$transcript_end-24 ~ nchar(gene$clip3)+0,
      gene$pos != gene$transcript_end ~ nchar(gene$clip3)+0 
    ),
    #other
    TRUE ~ case_when( #heterogenous tail
      gene$pos == gene$transcript_end ~ nchar(gene$clip3)+0,
      gene$pos != gene$transcript_end ~ nchar(gene$clip3)+0
    )
  )
  
  gene$clip3_tail_len_UinAU <- str_count(str_extract(gene$clip3, regex("T*$")))
  
  gene <- decode(gene)
  return(gene)
}



#Function for tails length ploting
#tail_code: "1002", polyA; "1003", polyAU; "1001", polyU. gene_name in ""

tail_length_fun <- function(tail_code, gene_name, tail_name){
  len = len_LINE1_NGS4[len_LINE1_NGS4$clip3_tail == tail_code,]
  
  percent_len <- data.frame(no_row=c(nrow(len[len$clip3_tail_len <= 32 & len$sample == "WT_25",]), nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "WT_25",]), nrow(len[len$clip3_tail_len > 64 & len$sample == "WT_25",]),
                                     nrow(len[len$clip3_tail_len <= 32 & len$sample == "WT_26",]), nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "WT_26",]), nrow(len[len$clip3_tail_len > 64 & len$sample == "WT_26",]),
                                     nrow(len[len$clip3_tail_len <= 32 & len$sample == "WT_27",]), nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "WT_27",]), nrow(len[len$clip3_tail_len > 64 & len$sample == "WT_27",]),
                                     nrow(len[len$clip3_tail_len <= 32 & len$sample == "WT_28",]), nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "WT_28",]), nrow(len[len$clip3_tail_len > 64 & len$sample == "WT_28",]),
                                     nrow(len[len$clip3_tail_len <= 32 & len$sample == "WT_29",]), nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "WT_29",]), nrow(len[len$clip3_tail_len > 64 & len$sample == "WT_29",]),
                                     nrow(len[len$clip3_tail_len <= 32 & len$sample == "WT_30",]), nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "WT_30",]), nrow(len[len$clip3_tail_len > 64 & len$sample == "WT_30",]),
                                     nrow(len[len$clip3_tail_len <= 32 & len$sample == "X1KO_17",]), nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "X1KO_17",]), nrow(len[len$clip3_tail_len > 64 & len$sample == "X1KO_17",]),
                                     nrow(len[len$clip3_tail_len <= 32 & len$sample == "X1KO_18",]), nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "X1KO_18",]), nrow(len[len$clip3_tail_len > 64 & len$sample == "X1KO_18",]),
                                     nrow(len[len$clip3_tail_len <= 32 & len$sample == "X1KO_19",]), nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "X1KO_19",]), nrow(len[len$clip3_tail_len > 64 & len$sample == "X1KO_19",]),
                                     nrow(len[len$clip3_tail_len <= 32 & len$sample == "X1KO_20",]), nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "X1KO_20",]), nrow(len[len$clip3_tail_len > 64 & len$sample == "X1KO_20",]),
                                     nrow(len[len$clip3_tail_len <= 32 & len$sample == "X1KO_21",]), nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "X1KO_21",]), nrow(len[len$clip3_tail_len > 64 & len$sample == "X1KO_21",]),
                                     nrow(len[len$clip3_tail_len <= 32 & len$sample == "X1KO_22",]), nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "X1KO_22",]), nrow(len[len$clip3_tail_len > 64 & len$sample == "X1KO_22",])),
                            tail_class=c(rep(c("short (<=32)", "mid (33-64)", "long (>64)"),12)),
                            sample=c(rep("WT_25",3),rep("WT_26",3),rep("WT_27",3),rep("WT_28",3),rep("WT_29",3),rep("WT_30",3),rep("X1KO_17",3),rep("X1KO_18",3),rep("X1KO_19",3),rep("X1KO_20",3),rep("X1KO_21",3),rep("X1KO_22",3)))
  
  p1 = ggplot(percent_len, aes(fill=tail_class, y=no_row, x=sample)) + 
    geom_bar(position="fill", stat="identity") +
    ggtitle(paste(gene_name,tail_name)) + 
    theme(axis.text.x=element_text(angle=45, hjust=1),
          axis.title=element_text(size=14),
          axis.text=element_text(size=12),
          legend.title=element_text(size=14),
          legend.text=element_text(size=12)) +
    scale_fill_manual(values=c("short (<=32)" = "#AED66B", "mid (33-64)" = "#FD8D3C", "long (>64)" = "#6BAED6"), name = "Tail length", breaks = c("long (>64)", "mid (33-64)", "short (<=32)")) +
    ylab("Percentage of tail lengths")
  
  p2 = ggplot(len, aes(x=clip3_tail_len, color=sample)) +
    geom_density(adjust = 2) +
    ggtitle(paste(gene_name,tail_name)) +
    theme(axis.title=element_text(size=14),
          axis.text=element_text(size=12),
          legend.title=element_text(size=14),
          legend.text=element_text(size=12)) +
    xlab(paste(tail_name,"tail length"))
  
  p3 = ggplot(len[len$clip3_tail_len <= 64,], aes(x=clip3_tail_len, color=sample)) +
    geom_density(adjust = 2) +
    ggtitle(paste(gene_name,tail_name)) +
    theme(axis.title=element_text(size=14),
          axis.text=element_text(size=12),
          legend.title=element_text(size=14),
          legend.text=element_text(size=12)) +
    xlab(paste(tail_name,"tail length"))
  
  p4 = ggplot(len, aes(x=clip3_tail_len, y=sample)) +
    geom_density_ridges(aes(fill = sample), scale = 0.9, quantile_lines = TRUE) +
    ggtitle(paste(gene_name,tail_name)) +
    theme(axis.title=element_text(size=14),
          axis.text=element_text(size=12),
          legend.title=element_text(size=14),
          legend.text=element_text(size=12)) +
    xlab(paste(tail_name,"tail length"))
  
  p4_1 = ggplot(len[len$clip3_tail_len <= 64,], aes(x = clip3_tail_len, y = sample)) +
    geom_density_ridges(aes(fill = sample), scale = 0.9, quantile_lines = TRUE) +
    ggtitle(paste(gene_name,tail_name)) +
    theme(axis.title=element_text(size=14),
          axis.text=element_text(size=12),
          legend.title=element_text(size=14),
          legend.text=element_text(size=12)) +
    xlab(paste(tail_name,"tail length"))
  
  p5 = ggplot(len, aes(x=clip3_tail_len, color=sample)) +
    stat_ecdf() +
    ggtitle(paste(gene_name,tail_name)) +
    theme(axis.title=element_text(size=14),
          axis.text=element_text(size=12),
          legend.title=element_text(size=14),
          legend.text=element_text(size=12)) +
    xlab(paste(tail_name,"tail length"))
  
  p5_1 = ggplot(len[len$clip3_tail_len <= 64,], aes(x=clip3_tail_len, color=sample)) +
    stat_ecdf() +
    ggtitle(paste(gene_name,tail_name)) +
    theme(axis.title=element_text(size=14),
          axis.text=element_text(size=12),
          legend.title=element_text(size=14),
          legend.text=element_text(size=12)) +
    xlab(paste(tail_name,"tail length"))
  
  test1 <- c((nrow(len[len$clip3_tail_len <= 32 & len$sample == "WT_25",])*100)/nrow(len[len$sample == "WT_25",]),
             (nrow(len[len$clip3_tail_len <= 32 & len$sample == "WT_26",])*100)/nrow(len[len$sample == "WT_26",]),
             (nrow(len[len$clip3_tail_len <= 32 & len$sample == "WT_27",])*100)/nrow(len[len$sample == "WT_27",]),
             (nrow(len[len$clip3_tail_len <= 32 & len$sample == "WT_28",])*100)/nrow(len[len$sample == "WT_28",]),
             (nrow(len[len$clip3_tail_len <= 32 & len$sample == "WT_29",])*100)/nrow(len[len$sample == "WT_29",]),
             (nrow(len[len$clip3_tail_len <= 32 & len$sample == "WT_30",])*100)/nrow(len[len$sample == "WT_30",])
  )
  
  test2 <- c((nrow(len[len$clip3_tail_len <= 32 & len$sample == "X1KO_17",])*100)/nrow(len[len$sample == "X1KO_17",]),
             (nrow(len[len$clip3_tail_len <= 32 & len$sample == "X1KO_18",])*100)/nrow(len[len$sample == "X1KO_18",]),
             (nrow(len[len$clip3_tail_len <= 32 & len$sample == "X1KO_19",])*100)/nrow(len[len$sample == "X1KO_19",]),
             (nrow(len[len$clip3_tail_len <= 32 & len$sample == "X1KO_20",])*100)/nrow(len[len$sample == "X1KO_20",]),
             (nrow(len[len$clip3_tail_len <= 32 & len$sample == "X1KO_21",])*100)/nrow(len[len$sample == "X1KO_21",]),
             (nrow(len[len$clip3_tail_len <= 32 & len$sample == "X1KO_22",])*100)/nrow(len[len$sample == "X1KO_22",])
  )
  if(mean(test1) != mean(test2)){
    test32 <- t.test(test1,test2)
  } else {
    test32 <- "t.test not performed - equel values"
  }
  
  test1 <- c((nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "WT_25",])*100)/nrow(len[len$sample == "WT_25",]),
             (nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "WT_26",])*100)/nrow(len[len$sample == "WT_26",]),
             (nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "WT_27",])*100)/nrow(len[len$sample == "WT_27",]),
             (nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "WT_28",])*100)/nrow(len[len$sample == "WT_28",]),
             (nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "WT_29",])*100)/nrow(len[len$sample == "WT_29",]),
             (nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "WT_30",])*100)/nrow(len[len$sample == "WT_30",])
  )
  
  test2 <- c((nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "X1KO_17",])*100)/nrow(len[len$sample == "X1KO_17",]),
             (nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "X1KO_18",])*100)/nrow(len[len$sample == "X1KO_18",]),
             (nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "X1KO_19",])*100)/nrow(len[len$sample == "X1KO_19",]),
             (nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "X1KO_20",])*100)/nrow(len[len$sample == "X1KO_20",]),
             (nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "X1KO_21",])*100)/nrow(len[len$sample == "X1KO_21",]),
             (nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "X1KO_22",])*100)/nrow(len[len$sample == "X1KO_22",])
  )
  if(mean(test1) != mean(test2)){
    test33 <- t.test(test1,test2)
  } else {
    test33 <- "t.test not performed - equel values"
  }
  
  test1 <- c((nrow(len[len$clip3_tail_len > 64 & len$sample == "WT_25",])*100)/nrow(len[len$sample == "WT_25",]),
             (nrow(len[len$clip3_tail_len > 64 & len$sample == "WT_26",])*100)/nrow(len[len$sample == "WT_26",]),
             (nrow(len[len$clip3_tail_len > 64 & len$sample == "WT_27",])*100)/nrow(len[len$sample == "WT_27",]),
             (nrow(len[len$clip3_tail_len > 64 & len$sample == "WT_28",])*100)/nrow(len[len$sample == "WT_28",]),
             (nrow(len[len$clip3_tail_len > 64 & len$sample == "WT_29",])*100)/nrow(len[len$sample == "WT_29",]),
             (nrow(len[len$clip3_tail_len > 64 & len$sample == "WT_30",])*100)/nrow(len[len$sample == "WT_30",])
  )
  
  test2 <- c((nrow(len[len$clip3_tail_len > 64 & len$sample == "X1KO_17",])*100)/nrow(len[len$sample == "X1KO_17",]),
             (nrow(len[len$clip3_tail_len > 64 & len$sample == "X1KO_18",])*100)/nrow(len[len$sample == "X1KO_18",]),
             (nrow(len[len$clip3_tail_len > 64 & len$sample == "X1KO_19",])*100)/nrow(len[len$sample == "X1KO_19",]),
             (nrow(len[len$clip3_tail_len > 64 & len$sample == "X1KO_20",])*100)/nrow(len[len$sample == "X1KO_20",]),
             (nrow(len[len$clip3_tail_len > 64 & len$sample == "X1KO_21",])*100)/nrow(len[len$sample == "X1KO_21",]),
             (nrow(len[len$clip3_tail_len > 64 & len$sample == "X1KO_22",])*100)/nrow(len[len$sample == "X1KO_22",])
  )
  if(mean(test1) != mean(test2)){
    test64 <- t.test(test1,test2)
  } else {
    test64 <- "t.test not performed - equel values"
  } 
  
  
  print(p1)
  print("T-test for short tails")
  print(test32)
  print("T-test for mid tails")
  print(test33)
  print("T-test for long tails")
  print(test64)
  print(p2)
  print(p3)
  print(p4)
  print(p4_1)
  print(p5)
  print(p5_1)
  
  if(tail_code == "1003"){
    p6 = ggplot(len[len$clip3_tail_len <= 64,], aes(x=clip3_tail_len_UinAU, color=sample)) +
      geom_density(adjust = 2) +
      ggtitle(paste(gene_name,"U in polyAU")) +
      theme(axis.title=element_text(size=14),
            axis.text=element_text(size=12),
            legend.title=element_text(size=14),
            legend.text=element_text(size=12)) +
      xlab("length of U in polyAU")
    
    p6_1 = ggplot(len[len$clip3_tail_len <= 64,], aes(x=clip3_tail_len_UinAU, y=sample)) +
      geom_density_ridges(aes(fill = sample), scale = 0.9, quantile_lines = TRUE) +
      ggtitle(paste(gene_name,"U in polyAU")) +
      theme(axis.title=element_text(size=14),
            axis.text=element_text(size=12),
            legend.title=element_text(size=14),
            legend.text=element_text(size=12)) +
      xlab("length of U in polyAU")
    
    p6_2 = ggplot(len[len$clip3_tail_len <= 64,], aes(x=clip3_tail_len_UinAU, color=sample)) +
      stat_ecdf() +
      ggtitle(paste(gene_name,"U in polyAU")) +
      theme(axis.title=element_text(size=14),
            axis.text=element_text(size=12),
            legend.title=element_text(size=14),
            legend.text=element_text(size=12)) +
      xlab("length of U in polyAU")
    print(p6)
    print(p6_1)
    print(p6_2)
  }
}

#Function for mean tails length ploting
#tail_code: "1002", polyA; "1003", polyAU; "1001", polyU. gene_name in ""

tail_length_mean_fun <- function(tail_code, gene_name, tail_name){
  len = len_LINE1_NGS4_mean[len_LINE1_NGS4_mean$clip3_tail == tail_code,]
  
  percent_len <- data.frame(no_row=c(nrow(len[len$clip3_tail_len <= 32 & len$sample == "WT",]), nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "WT",]), nrow(len[len$clip3_tail_len > 64 & len$sample == "WT",]),
                                     nrow(len[len$clip3_tail_len <= 32 & len$sample == "X1KO",]), nrow(len[len$clip3_tail_len > 32 & len$clip3_tail_len <= 64 & len$sample == "X1KO",]), nrow(len[len$clip3_tail_len > 64 & len$sample == "X1KO",])),
                            tail_class=c(rep(c("short (<=32)", "mid (33-64)", "long (>64)"),2 )),
                            sample=c(rep("WT",3),rep("X1KO",3)))
  
  p1 = ggplot(percent_len, aes(fill=tail_class, y=no_row, x=sample)) + 
    geom_bar(position="fill", stat="identity") +
    ggtitle(paste(gene_name,tail_name)) + 
    theme(axis.text.x=element_text(angle=45, hjust=1),
          axis.title=element_text(size=14),
          axis.text=element_text(size=12),
          legend.title=element_text(size=14),
          legend.text=element_text(size=12)) +
    scale_fill_manual(values=c("short (<=32)" = "#AED66B", "mid (33-64)" = "#FD8D3C", "long (>64)" = "#6BAED6"), name = "Tail length", breaks = c("long (>64)", "mid (33-64)", "short (<=32)")) +
    ylab("Percentage of tail lengths")
  
  p2 = ggplot(len, aes(x=clip3_tail_len, color=sample)) +
    geom_density(adjust = 2) +
    ggtitle(paste(gene_name,tail_name)) +
    theme(axis.title=element_text(size=14),
          axis.text=element_text(size=12),
          legend.title=element_text(size=14),
          legend.text=element_text(size=12)) +
    xlab(paste(tail_name,"tail length"))
  
  p3 = ggplot(len[len$clip3_tail_len <= 64,], aes(x=clip3_tail_len, color=sample)) +
    geom_density(adjust = 2) +
    ggtitle(paste(gene_name,tail_name)) +
    theme(axis.title=element_text(size=14),
          axis.text=element_text(size=12),
          legend.title=element_text(size=14),
          legend.text=element_text(size=12)) +
    xlab(paste(tail_name,"tail length"))
  
  p4 = ggplot(len, aes(x=clip3_tail_len, y=sample)) +
    geom_density_ridges(aes(fill = sample), scale = 0.9, quantile_lines = TRUE) +
    ggtitle(paste(gene_name,tail_name)) +
    theme(axis.title=element_text(size=14),
          axis.text=element_text(size=12),
          legend.title=element_text(size=14),
          legend.text=element_text(size=12)) +
    xlab(paste(tail_name,"tail length"))
  
  p4_1 = ggplot(len[len$clip3_tail_len <= 64,], aes(x=clip3_tail_len, y=sample)) +
    geom_density_ridges(aes(fill = sample), scale = 0.9, quantile_lines = TRUE) +
    ggtitle(paste(gene_name,tail_name)) +
    theme(axis.title=element_text(size=14),
          axis.text=element_text(size=12),
          legend.title=element_text(size=14),
          legend.text=element_text(size=12)) +
    xlab(paste(tail_name,"tail length"))
  
  p5 = ggplot(len, aes(x=clip3_tail_len, color=sample)) +
    stat_ecdf() +
    ggtitle(paste(gene_name,tail_name)) +
    theme(axis.title=element_text(size=14),
          axis.text=element_text(size=12),
          legend.title=element_text(size=14),
          legend.text=element_text(size=12)) +
    xlab(paste(tail_name,"tail length"))
  
  p5_1 = ggplot(len[len$clip3_tail_len <= 64,], aes(x=clip3_tail_len, color=sample)) +
    stat_ecdf() +
    ggtitle(paste(gene_name,tail_name)) +
    theme(axis.title=element_text(size=14),
          axis.text=element_text(size=12),
          legend.title=element_text(size=14),
          legend.text=element_text(size=12)) +
    xlab(paste(tail_name,"tail length"))
  
  print(p1)
  print(p2)
  print(p3)
  print(p4)
  print(p4_1)
  print(p5)
  print(p5_1)
  
  if(tail_code == "1003"){
    p6 = ggplot(len[len$clip3_tail_len <= 64,], aes(x=clip3_tail_len_UinAU, color=sample)) +
      geom_density(adjust = 2) +
      ggtitle(paste(gene_name,"U in polyAU")) +
      theme(axis.title=element_text(size=14),
            axis.text=element_text(size=12),
            legend.title=element_text(size=14),
            legend.text=element_text(size=12)) +
      xlab("length of U in polyAU")
    
    p6_1 = ggplot(len[len$clip3_tail_len <= 64,], aes(x=clip3_tail_len_UinAU, y=sample)) +
      geom_density_ridges(aes(fill = sample), scale = 0.9, quantile_lines = TRUE) +
      ggtitle(paste(gene_name,"U in polyAU")) +
      theme(axis.title=element_text(size=14),
            axis.text=element_text(size=12),
            legend.title=element_text(size=14),
            legend.text=element_text(size=12)) +
      xlab("length of U in polyAU")
    
    p6_2 = ggplot(len[len$clip3_tail_len <= 64,], aes(x = clip3_tail_len_UinAU, color=sample)) +
      stat_ecdf() +
      ggtitle(paste(gene_name,"U in polyAU")) +
      theme(axis.title=element_text(size=14),
            axis.text=element_text(size=12),
            legend.title=element_text(size=14),
            legend.text=element_text(size=12)) +
      xlab("length of U in polyAU")
    print(p6)
    print(p6_1)
    print(p6_2)
  }
}

#Function for percentage calculation
percents_fun = function(sample_name,column){
  nrow <- nrow(sample_name)
  nrow_woOther <- nrow(sample_name[sample_name$clip3_tail!="1000",])
  nrow_poliU <- nrow(sample_name[sample_name$clip3_tail=="1001",])
  nrow_poliA <- nrow(sample_name[sample_name$clip3_tail=="1002",])
  nrow_poliAU <- nrow(sample_name[sample_name$clip3_tail=="1003",])
  nrow_noTail <- nrow(sample_name[sample_name$clip3_tail=="2000",])
  
  percent_poliU <- nrow_poliU*100/nrow_woOther
  percent_poliA <- nrow_poliA*100/nrow_woOther
  percent_poliAU <- nrow_poliAU*100/nrow_woOther
  percent_noTail <- nrow_noTail*100/nrow_woOther
  
  percent_xxx <- data.frame(percent=c(percent_poliU,percent_poliA,percent_poliAU,percent_noTail),
                            tail=c("U tail","A tail","AU tail","no tail"),
                            sample=c(rep(column,4)))
  return(percent_xxx)
}

#Function for number of reads calculation
nreads_fun = function(sample_name){
  nrow <- nrow(sample_name)
  nrow_woOther <- nrow(sample_name[sample_name$clip3_tail!="1000",])
  nrow_poliU <- nrow(sample_name[sample_name$clip3_tail=="1001",])
  nrow_poliA <- nrow(sample_name[sample_name$clip3_tail=="1002",])
  nrow_poliAU <- nrow(sample_name[sample_name$clip3_tail=="1003",])
  nrow_noTail <- nrow(sample_name[sample_name$clip3_tail=="2000",])
  
  nreads_xxx = data.frame(
    all = nrow,
    discarded = nrow - nrow_woOther,
    poliA = nrow_poliA,
    poliAU = nrow_poliAU,
    poliU = nrow_poliU,
    no_tail = nrow_noTail
  )
  return(nreads_xxx)
}

#Function to percentage of tails in length range calculation
#column in ""
percents_bins_fun <- function(sample_name, column){ 
  nrow <- nrow(sample_name)
  nrow_woOther <- nrow(sample_name[sample_name$clip3_tail!="1000",])
  
  nrow_poliU_1_10 <- nrow(sample_name[sample_name$clip3_tail=="1001" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_poliU_11_20 <- nrow(sample_name[sample_name$clip3_tail=="1001" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_poliU_21_30 <- nrow(sample_name[sample_name$clip3_tail=="1001" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_poliU_31_40 <- nrow(sample_name[sample_name$clip3_tail=="1001" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_poliU_41_50 <- nrow(sample_name[sample_name$clip3_tail=="1001" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_poliU_51_60 <- nrow(sample_name[sample_name$clip3_tail=="1001" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_poliU_61 <- nrow(sample_name[sample_name$clip3_tail=="1001" & sample_name$clip3_tail_len_range=="61+",])
  
  nrow_poliA_1_10 <- nrow(sample_name[sample_name$clip3_tail=="1002" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_poliA_11_20 <- nrow(sample_name[sample_name$clip3_tail=="1002" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_poliA_21_30 <- nrow(sample_name[sample_name$clip3_tail=="1002" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_poliA_31_40 <- nrow(sample_name[sample_name$clip3_tail=="1002" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_poliA_41_50 <- nrow(sample_name[sample_name$clip3_tail=="1002" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_poliA_51_60 <- nrow(sample_name[sample_name$clip3_tail=="1002" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_poliA_61 <- nrow(sample_name[sample_name$clip3_tail=="1002" & sample_name$clip3_tail_len_range=="61+",])  
  
  nrow_poliAU_1_10 <- nrow(sample_name[sample_name$clip3_tail=="1003" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_poliAU_11_20 <- nrow(sample_name[sample_name$clip3_tail=="1003" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_poliAU_21_30 <- nrow(sample_name[sample_name$clip3_tail=="1003" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_poliAU_31_40 <- nrow(sample_name[sample_name$clip3_tail=="1003" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_poliAU_41_50 <- nrow(sample_name[sample_name$clip3_tail=="1003" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_poliAU_51_60 <- nrow(sample_name[sample_name$clip3_tail=="1003" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_poliAU_61 <- nrow(sample_name[sample_name$clip3_tail=="1003" & sample_name$clip3_tail_len_range=="61+",]) 
  
  nrow_noTail <- nrow(sample_name[sample_name$clip3_tail=="2000",])
  
  percent_poliU_1_10 <- nrow_poliU_1_10*100/(nrow_poliU_1_10+nrow_poliA_1_10+nrow_poliAU_1_10)
  percent_poliU_1_10[is.na(percent_poliU_1_10)] <- 0
  percent_poliA_1_10 <- nrow_poliA_1_10*100/(nrow_poliU_1_10+nrow_poliA_1_10+nrow_poliAU_1_10)
  percent_poliA_1_10[is.na(percent_poliA_1_10)] <- 0
  percent_poliAU_1_10 <- nrow_poliAU_1_10*100/(nrow_poliU_1_10+nrow_poliA_1_10+nrow_poliAU_1_10)
  percent_poliAU_1_10[is.na(percent_poliAU_1_10)] <- 0
  
  percent_poliU_11_20 <- nrow_poliU_11_20*100/(nrow_poliU_11_20+nrow_poliA_11_20+nrow_poliAU_11_20)
  percent_poliU_11_20[is.na(percent_poliU_11_20)] <- 0
  percent_poliA_11_20 <- nrow_poliA_11_20*100/(nrow_poliU_11_20+nrow_poliA_11_20+nrow_poliAU_11_20)
  percent_poliA_11_20[is.na(percent_poliA_11_20)] <- 0
  percent_poliAU_11_20 <- nrow_poliAU_11_20*100/(nrow_poliU_11_20+nrow_poliA_11_20+nrow_poliAU_11_20)
  percent_poliAU_11_20[is.na(percent_poliAU_11_20)] <- 0 
  
  percent_poliU_21_30 <- nrow_poliU_21_30*100/(nrow_poliU_21_30+nrow_poliA_21_30+nrow_poliAU_21_30)
  percent_poliU_21_30[is.na(percent_poliU_21_30)] <- 0
  percent_poliA_21_30 <- nrow_poliA_21_30*100/(nrow_poliU_21_30+nrow_poliA_21_30+nrow_poliAU_21_30)
  percent_poliA_21_30[is.na(percent_poliA_21_30)] <- 0
  percent_poliAU_21_30 <- nrow_poliAU_21_30*100/(nrow_poliU_21_30+nrow_poliA_21_30+nrow_poliAU_21_30)
  percent_poliAU_21_30[is.na(percent_poliAU_21_30)] <- 0  
  
  percent_poliU_31_40 <- nrow_poliU_31_40*100/(nrow_poliU_31_40+nrow_poliA_31_40+nrow_poliAU_31_40)
  percent_poliU_31_40[is.na(percent_poliU_31_40)] <- 0
  percent_poliA_31_40 <- nrow_poliA_31_40*100/(nrow_poliU_31_40+nrow_poliA_31_40+nrow_poliAU_31_40)
  percent_poliA_31_40[is.na(percent_poliA_31_40)] <- 0
  percent_poliAU_31_40 <- nrow_poliAU_31_40*100/(nrow_poliU_31_40+nrow_poliA_31_40+nrow_poliAU_31_40)
  percent_poliAU_31_40[is.na(percent_poliAU_31_40)] <- 0
  
  percent_poliU_41_50 <- nrow_poliU_41_50*100/(nrow_poliU_41_50+nrow_poliA_41_50+nrow_poliAU_41_50)
  percent_poliU_41_50[is.na(percent_poliU_41_50)] <- 0
  percent_poliA_41_50 <- nrow_poliA_41_50*100/(nrow_poliU_41_50+nrow_poliA_41_50+nrow_poliAU_41_50)
  percent_poliA_41_50[is.na(percent_poliA_41_50)] <- 0
  percent_poliAU_41_50 <- nrow_poliAU_41_50*100/(nrow_poliU_41_50+nrow_poliA_41_50+nrow_poliAU_41_50)
  percent_poliAU_41_50[is.na(percent_poliAU_41_50)] <- 0
  
  percent_poliU_51_60 <- nrow_poliU_51_60*100/(nrow_poliU_51_60+nrow_poliA_51_60+nrow_poliAU_51_60)
  percent_poliU_51_60[is.na(percent_poliU_51_60)] <- 0
  percent_poliA_51_60 <- nrow_poliA_51_60*100/(nrow_poliU_51_60+nrow_poliA_51_60+nrow_poliAU_51_60)
  percent_poliA_51_60[is.na(percent_poliA_51_60)] <- 0
  percent_poliAU_51_60 <- nrow_poliAU_51_60*100/(nrow_poliU_51_60+nrow_poliA_51_60+nrow_poliAU_51_60)
  percent_poliAU_51_60[is.na(percent_poliAU_51_60)] <- 0
  
  percent_poliU_61 <- nrow_poliU_61*100/(nrow_poliU_61+nrow_poliA_61+nrow_poliAU_61)
  percent_poliU_61[is.na(percent_poliU_61)] <- 0
  percent_poliA_61 <- nrow_poliA_61*100/(nrow_poliU_61+nrow_poliA_61+nrow_poliAU_61)
  percent_poliA_61[is.na(percent_poliA_61)] <- 0
  percent_poliAU_61 <- nrow_poliAU_61*100/(nrow_poliU_61+nrow_poliA_61+nrow_poliAU_61)
  percent_poliAU_61[is.na(percent_poliAU_61)] <- 0
  
  percent_xxx <- data.frame(percent=c(percent_poliU_1_10,percent_poliA_1_10,percent_poliAU_1_10,
                                      percent_poliU_11_20,percent_poliA_11_20,percent_poliAU_11_20,
                                      percent_poliU_21_30,percent_poliA_21_30,percent_poliAU_21_30,
                                      percent_poliU_31_40,percent_poliA_31_40,percent_poliAU_31_40,
                                      percent_poliU_41_50,percent_poliA_41_50,percent_poliAU_41_50,
                                      percent_poliU_51_60,percent_poliA_51_60,percent_poliAU_51_60,
                                      percent_poliU_61,percent_poliA_61,percent_poliAU_61),
                            tail=c(rep(c("U tail","A tail","AU tail"),7)),
                            range=(c(rep("1-10",3),rep("11-20",3),rep("21-30",3),rep("31-40",3),rep("41-50",3),rep("51-60",3),rep("61+",3))),
                            sample=c(rep(column,21)))
  return(percent_xxx)
}

#Function to number of reads in length range calculation
#column in ""
nreads_bins_fun <- function(sample_name, column){ 
  nrow <- nrow(sample_name)
  nrow_woOther <- nrow(sample_name[sample_name$clip3_tail!="1000",])
  
  nrow_poliU_1_10 <- nrow(sample_name[sample_name$clip3_tail=="1001" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_poliU_11_20 <- nrow(sample_name[sample_name$clip3_tail=="1001" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_poliU_21_30 <- nrow(sample_name[sample_name$clip3_tail=="1001" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_poliU_31_40 <- nrow(sample_name[sample_name$clip3_tail=="1001" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_poliU_41_50 <- nrow(sample_name[sample_name$clip3_tail=="1001" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_poliU_51_60 <- nrow(sample_name[sample_name$clip3_tail=="1001" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_poliU_61 <- nrow(sample_name[sample_name$clip3_tail=="1001" & sample_name$clip3_tail_len_range=="61+",])
  
  nrow_poliA_1_10 <- nrow(sample_name[sample_name$clip3_tail=="1002" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_poliA_11_20 <- nrow(sample_name[sample_name$clip3_tail=="1002" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_poliA_21_30 <- nrow(sample_name[sample_name$clip3_tail=="1002" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_poliA_31_40 <- nrow(sample_name[sample_name$clip3_tail=="1002" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_poliA_41_50 <- nrow(sample_name[sample_name$clip3_tail=="1002" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_poliA_51_60 <- nrow(sample_name[sample_name$clip3_tail=="1002" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_poliA_61 <- nrow(sample_name[sample_name$clip3_tail=="1002" & sample_name$clip3_tail_len_range=="61+",])  
  
  nrow_poliAU_1_10 <- nrow(sample_name[sample_name$clip3_tail=="1003" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_poliAU_11_20 <- nrow(sample_name[sample_name$clip3_tail=="1003" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_poliAU_21_30 <- nrow(sample_name[sample_name$clip3_tail=="1003" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_poliAU_31_40 <- nrow(sample_name[sample_name$clip3_tail=="1003" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_poliAU_41_50 <- nrow(sample_name[sample_name$clip3_tail=="1003" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_poliAU_51_60 <- nrow(sample_name[sample_name$clip3_tail=="1003" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_poliAU_61 <- nrow(sample_name[sample_name$clip3_tail=="1003" & sample_name$clip3_tail_len_range=="61+",]) 
  
  nrow_noTail <- nrow(sample_name[sample_name$clip3_tail=="2000",])
  
  nreads_xxx = data.frame(
    poliA_1_10 = nrow_poliA_1_10,
    poliA_11_20 = nrow_poliA_11_20,
    poliA_21_30 = nrow_poliA_21_30,
    poliA_31_40 = nrow_poliA_31_40,
    poliA_41_50 = nrow_poliA_41_50,
    poliA_51_60 = nrow_poliA_51_60,
    poliA_61 = nrow_poliA_61,
    poliAU_1_10 = nrow_poliAU_1_10,
    poliAU_11_20 = nrow_poliAU_11_20,
    poliAU_21_30 = nrow_poliAU_21_30,
    poliAU_31_40 = nrow_poliAU_31_40,
    poliAU_41_50 = nrow_poliAU_41_50,
    poliAU_51_60 = nrow_poliAU_51_60,
    poliAU_61 = nrow_poliAU_61,
    poliU_1_10 = nrow_poliU_1_10,
    poliU_11_20 = nrow_poliU_11_20,
    poliU_21_30 = nrow_poliU_21_30,
    poliU_31_40 = nrow_poliU_31_40,
    poliU_41_50 = nrow_poliU_41_50,
    poliU_51_60 = nrow_poliU_51_60,
    poliU_61 = nrow_poliU_61
  )
  
  nreads_xxx = t(nreads_xxx)
  colnames(nreads_xxx) = column
  
  return(nreads_xxx)
}

#Function to percentage of U in AU tails in length range
#column in ""
percents_AU_bins_fun <- function(sample_name, column){ 
  
  sample_name$clip3_tail_U_len_range <- case_when(
    sample_name$clip3_tail_len_UinAU == 1 ~ "1",
    sample_name$clip3_tail_len_UinAU == 2 ~ "2",
    sample_name$clip3_tail_len_UinAU>=3 & sample_name$clip3_tail_len_UinAU<=4 ~ "3-4",
    sample_name$clip3_tail_len_UinAU>=5 & sample_name$clip3_tail_len_UinAU<=7 ~ "5-7",
    sample_name$clip3_tail_len_UinAU>=8 & sample_name$clip3_tail_len_UinAU<2000 ~ "8+",
    sample_name$clip3_tail_len_UinAU == 0 ~ "0",
    TRUE ~ as.character(sample_name$clip3_tail_len_UinAU)
  )
  
  sample_name <- sample_name[sample_name$clip3_tail == 1003,]
  nrow <- nrow(sample_name)
  
  nrow_1U_1_10 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="1" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_1U_11_20 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="1" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_1U_21_30 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="1" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_1U_31_40 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="1" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_1U_41_50 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="1" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_1U_51_60 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="1" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_1U_61 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="1" & sample_name$clip3_tail_len_range=="61+",])
  
  nrow_2U_1_10 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="2" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_2U_11_20 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="2" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_2U_21_30 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="2" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_2U_31_40 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="2" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_2U_41_50 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="2" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_2U_51_60 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="2" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_2U_61 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="2" & sample_name$clip3_tail_len_range=="61+",])
  
  nrow_34U_1_10 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="3-4" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_34U_11_20 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="3-4" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_34U_21_30 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="3-4" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_34U_31_40 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="3-4" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_34U_41_50 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="3-4" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_34U_51_60 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="3-4" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_34U_61 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="3-4" & sample_name$clip3_tail_len_range=="61+",])
  
  nrow_57U_1_10 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="5-7" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_57U_11_20 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="5-7" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_57U_21_30 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="5-7" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_57U_31_40 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="5-7" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_57U_41_50 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="5-7" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_57U_51_60 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="5-7" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_57U_61 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="5-7" & sample_name$clip3_tail_len_range=="61+",])
  
  nrow_8U_1_10 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="8+" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_8U_11_20 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="8+" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_8U_21_30 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="8+" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_8U_31_40 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="8+" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_8U_41_50 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="8+" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_8U_51_60 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="8+" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_8U_61 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="8+" & sample_name$clip3_tail_len_range=="61+",])
  
  nrow_noTail <- nrow(sample_name[sample_name$clip3_tail==2000,])
  
  nrow_xxx <- data.frame(reads=c(nrow_1U_1_10,nrow_2U_1_10,nrow_34U_1_10,nrow_57U_1_10,nrow_8U_1_10,
                                 nrow_1U_11_20,nrow_2U_11_20,nrow_34U_11_20,nrow_57U_11_20,nrow_8U_11_20,
                                 nrow_1U_21_30,nrow_2U_21_30,nrow_34U_21_30,nrow_57U_21_30,nrow_8U_21_30,
                                 nrow_1U_31_40,nrow_2U_31_40,nrow_34U_31_40,nrow_57U_31_40,nrow_8U_31_40,
                                 nrow_1U_41_50,nrow_2U_41_50,nrow_34U_41_50,nrow_57U_41_50,nrow_8U_41_50,
                                 nrow_1U_51_60,nrow_2U_51_60,nrow_34U_51_60,nrow_57U_51_60,nrow_8U_51_60,
                                 nrow_1U_61,nrow_2U_61,nrow_34U_61,nrow_57U_61,nrow_8U_61),
                         U_range=c(rep(c("1","2","3-4","5-7","8+"),7)),
                         range=(c(rep("1-10",5),rep("11-20",5),rep("21-30",5),rep("31-40",5),rep("41-50",5),rep("51-60",5),rep("61+",5))),
                         sample=c(rep(column,35)))
  
  nrow_xxx = transform(nrow_xxx, percent = ave(reads, range, FUN = prop.table))
  nrow_xxx$percent <- ifelse(is.na(nrow_xxx$percent),0,nrow_xxx$percent)
  
  return(nrow_xxx)
}


#Function to percentage of U in AU tails in length range
#column in ""
percents_UandAU_bins_fun <- function(sample_name, column){ 
  
  sample_name$clip3_tail_U_len_range <- case_when(
    sample_name$clip3_tail_len_UinAU == 1 ~ "1",
    sample_name$clip3_tail_len_UinAU == 2 ~ "2",
    sample_name$clip3_tail_len_UinAU>=3 & sample_name$clip3_tail_len_UinAU<=4 ~ "3-4",
    sample_name$clip3_tail_len_UinAU>=5 & sample_name$clip3_tail_len_UinAU<=7 ~ "5-7",
    sample_name$clip3_tail_len_UinAU>=8 & sample_name$clip3_tail_len_UinAU<2000 ~ "8+",
    sample_name$clip3_tail_len_UinAU == 0 ~ "0",
    TRUE ~ as.character(sample_name$clip3_tail_len_UinAU)
  )
  
  sample_name = sample_name[sample_name$clip3_tail %in% c(1001,1003),]
  nrow <- nrow(sample_name)
  
  nrow_1U_1_10 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="1" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_1U_11_20 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="1" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_1U_21_30 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="1" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_1U_31_40 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="1" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_1U_41_50 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="1" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_1U_51_60 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="1" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_1U_61 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="1" & sample_name$clip3_tail_len_range=="61+",])
  
  nrow_2U_1_10 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="2" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_2U_11_20 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="2" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_2U_21_30 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="2" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_2U_31_40 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="2" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_2U_41_50 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="2" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_2U_51_60 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="2" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_2U_61 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="2" & sample_name$clip3_tail_len_range=="61+",])
  
  nrow_34U_1_10 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="3-4" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_34U_11_20 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="3-4" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_34U_21_30 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="3-4" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_34U_31_40 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="3-4" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_34U_41_50 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="3-4" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_34U_51_60 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="3-4" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_34U_61 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="3-4" & sample_name$clip3_tail_len_range=="61+",])
  
  nrow_57U_1_10 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="5-7" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_57U_11_20 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="5-7" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_57U_21_30 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="5-7" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_57U_31_40 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="5-7" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_57U_41_50 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="5-7" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_57U_51_60 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="5-7" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_57U_61 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="5-7" & sample_name$clip3_tail_len_range=="61+",])
  
  nrow_8U_1_10 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="8+" & sample_name$clip3_tail_len_range=="1-10",])
  nrow_8U_11_20 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="8+" & sample_name$clip3_tail_len_range=="11-20",])
  nrow_8U_21_30 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="8+" & sample_name$clip3_tail_len_range=="21-30",])
  nrow_8U_31_40 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="8+" & sample_name$clip3_tail_len_range=="31-40",])
  nrow_8U_41_50 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="8+" & sample_name$clip3_tail_len_range=="41-50",])
  nrow_8U_51_60 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="8+" & sample_name$clip3_tail_len_range=="51-60",])
  nrow_8U_61 <- nrow(sample_name[sample_name$clip3_tail_U_len_range=="8+" & sample_name$clip3_tail_len_range=="61+",])
  
  nrow_noTail <- nrow(sample_name[sample_name$clip3_tail==2000,])
  
  nrow_xxx <- data.frame(reads=c(nrow_1U_1_10,nrow_2U_1_10,nrow_34U_1_10,nrow_57U_1_10,nrow_8U_1_10,
                                 nrow_1U_11_20,nrow_2U_11_20,nrow_34U_11_20,nrow_57U_11_20,nrow_8U_11_20,
                                 nrow_1U_21_30,nrow_2U_21_30,nrow_34U_21_30,nrow_57U_21_30,nrow_8U_21_30,
                                 nrow_1U_31_40,nrow_2U_31_40,nrow_34U_31_40,nrow_57U_31_40,nrow_8U_31_40,
                                 nrow_1U_41_50,nrow_2U_41_50,nrow_34U_41_50,nrow_57U_41_50,nrow_8U_41_50,
                                 nrow_1U_51_60,nrow_2U_51_60,nrow_34U_51_60,nrow_57U_51_60,nrow_8U_51_60,
                                 nrow_1U_61,nrow_2U_61,nrow_34U_61,nrow_57U_61,nrow_8U_61),
                         U_range=c(rep(c("1","2","3-4","5-7","8+"),7)),
                         range=(c(rep("1-10",5),rep("11-20",5),rep("21-30",5),rep("31-40",5),rep("41-50",5),rep("51-60",5),rep("61+",5))),
                         sample=c(rep(column,35)))
  
  nrow_xxx = transform(nrow_xxx, percent = ave(reads, range, FUN = prop.table))
  nrow_xxx$percent <- ifelse(is.na(nrow_xxx$percent),0,nrow_xxx$percent)
  
  return(nrow_xxx)
}