02_tails_identification/poli_clip_all.R · RACE-Seq

library('biomaRt')
library('stringr')
library('dplyr')
library('sqldf')
library('stringi')
library('optparse')

option_list = list(
  make_option(c("-f", "--file"), type="character", default=NULL,
              help="file name without \".sam\"", metavar="character"),
  make_option(c("-o", "--out"), type="character", default="results.csv",
              help="output file name with full path [default= %default]", metavar="character")
);

opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);

print("Reading file")

if (is.null(opt$file)){
  print_help(opt_parser)
  stop("At least one argument must be supplied (input file).n", call.=FALSE)
}

f=opt$file
filetowrite=opt$out

df_raw = read.csv(f, sep ='\t', header = FALSE,na.strings=c("", " ","NA"))
colnames(df_raw) <- c("id", "tail_seq", "pos","chr", "code")

print("Performing calculations")
#Length
df_raw$lenght = nchar(df_raw$tail_seq)
df_raw$lenght[is.na(df_raw$lenght)] <- 0

#Counting A,C,T,G
df_raw$U_count<- str_count(df_raw$tail_seq , "T")
df_raw$A_count<- str_count(df_raw$tail_seq , "A")
df_raw$G_count<- str_count(df_raw$tail_seq , "G")
df_raw$C_count<- str_count(df_raw$tail_seq , "C")
df_raw$A_count[is.na(df_raw$A_count)] <- 0
df_raw$U_count[is.na(df_raw$U_count)] <- 0
df_raw$C_count[is.na(df_raw$C_count)] <- 0
df_raw$G_count[is.na(df_raw$G_count)] <- 0

#strand
df_raw$strand <- case_when(
  df_raw$code==163 | df_raw$code==89 ~ "-",
  df_raw$code==99 | df_raw$code==73 ~ "+",
  TRUE ~ "other"

)

#tail type
df_raw$tail_type <- case_when(
  (df_raw$G_count + df_raw$C_count) == df_raw$lenght ~ "G/C",
  df_raw$lenght ==0 ~ "no_tail",
  df_raw$U !=0 & df_raw$U==df_raw$lenght ~ "poliU",
  df_raw$A !=0 & df_raw$A==df_raw$lenght ~ "poliA",
  df_raw$A!=0 & df_raw$U !=0 & (df_raw$A + df_raw$U) == df_raw$lenght & stri_sub(df_raw$tail_seq,-1)== "T" ~ "mixed_AU",
  TRUE ~ "other"
)

#Counting last Us
df_raw$last_U <- nchar(stri_extract_last(df_raw$tail_seq, regex=c('T+$')))


################GENES MATCHIG##################
print("Getting gene coordinates using biomaRt")

hsapiens <- useEnsembl(biomart = "genes",
                       dataset = "hsapiens_gene_ensembl")

all_infos <- getBM(attributes=c('ensembl_gene_id',
                                   'gene_biotype',
                                   'transcript_start',
                                   'transcript_end',
                                   'chromosome_name',
                                   'external_gene_name',
                                #"3_utr_start",
                                #"3_utr_end",
                                #"cds_start",
                                #"cds_end",
                                   'strand'),
                      mart = hsapiens)

#strand
all_infos$strand2 <- case_when(
  all_infos$strand ==-1  ~ "-",
  all_infos$strand ==1 ~ "+",
  TRUE ~ "other"
)

merged = sqldf("
    SELECT *
    FROM df_raw d1 JOIN all_infos d2
    ON d1.chr = d2.chromosome_name AND d1.strand == d2.strand2
    AND d1.pos BETWEEN CAST(d2.transcript_start AS int) AND (d2.transcript_end)
  ")

print("Writing output file")
merged[20] <- NULL
merged[20] <- NULL
merged$chromosome_name <- NULL
write.table(merged, filetowrite, sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)

gc()