library('biomaRt') library('stringr') library('dplyr') library('sqldf') library('stringi') library('optparse') option_list = list( make_option(c("-f", "--file"), type="character", default=NULL, help="file name without \".sam\"", metavar="character"), make_option(c("-o", "--out"), type="character", default="results.csv", help="output file name with full path [default= %default]", metavar="character") ); opt_parser = OptionParser(option_list=option_list); opt = parse_args(opt_parser); print("Reading file") if (is.null(opt$file)){ print_help(opt_parser) stop("At least one argument must be supplied (input file).n", call.=FALSE) } f=opt$file filetowrite=opt$out df_raw = read.csv(f, sep ='\t', header = FALSE,na.strings=c("", " ","NA")) colnames(df_raw) <- c("id", "tail_seq", "pos","chr", "code") print("Performing calculations") #Length df_raw$lenght = nchar(df_raw$tail_seq) df_raw$lenght[is.na(df_raw$lenght)] <- 0 #Counting A,C,T,G df_raw$U_count<- str_count(df_raw$tail_seq , "T") df_raw$A_count<- str_count(df_raw$tail_seq , "A") df_raw$G_count<- str_count(df_raw$tail_seq , "G") df_raw$C_count<- str_count(df_raw$tail_seq , "C") df_raw$A_count[is.na(df_raw$A_count)] <- 0 df_raw$U_count[is.na(df_raw$U_count)] <- 0 df_raw$C_count[is.na(df_raw$C_count)] <- 0 df_raw$G_count[is.na(df_raw$G_count)] <- 0 #strand df_raw$strand <- case_when( df_raw$code==163 | df_raw$code==89 ~ "-", df_raw$code==99 | df_raw$code==73 ~ "+", TRUE ~ "other" ) #tail type df_raw$tail_type <- case_when( (df_raw$G_count + df_raw$C_count) == df_raw$lenght ~ "G/C", df_raw$lenght ==0 ~ "no_tail", df_raw$U !=0 & df_raw$U==df_raw$lenght ~ "poliU", df_raw$A !=0 & df_raw$A==df_raw$lenght ~ "poliA", df_raw$A!=0 & df_raw$U !=0 & (df_raw$A + df_raw$U) == df_raw$lenght & stri_sub(df_raw$tail_seq,-1)== "T" ~ "mixed_AU", TRUE ~ "other" ) #Counting last Us df_raw$last_U <- nchar(stri_extract_last(df_raw$tail_seq, regex=c('T+$'))) ################GENES MATCHIG################## print("Getting gene coordinates using biomaRt") hsapiens <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl") all_infos <- getBM(attributes=c('ensembl_gene_id', 'gene_biotype', 'transcript_start', 'transcript_end', 'chromosome_name', 'external_gene_name', #"3_utr_start", #"3_utr_end", #"cds_start", #"cds_end", 'strand'), mart = hsapiens) #strand all_infos$strand2 <- case_when( all_infos$strand ==-1 ~ "-", all_infos$strand ==1 ~ "+", TRUE ~ "other" ) merged = sqldf(" SELECT * FROM df_raw d1 JOIN all_infos d2 ON d1.chr = d2.chromosome_name AND d1.strand == d2.strand2 AND d1.pos BETWEEN CAST(d2.transcript_start AS int) AND (d2.transcript_end) ") print("Writing output file") merged[20] <- NULL merged[20] <- NULL merged$chromosome_name <- NULL write.table(merged, filetowrite, sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) gc()