library("GenomicRanges")
library("DiffBind")
library("DESeq2")
library("limma")
library("ggplot2")
library("ggrepel")
library("RColorBrewer")
library("pheatmap")
library("gridExtra")
library("grid")
library("sva")
############ Modify strings
modify_string = function (x,delimtter,position) {
output_string = unlist(strsplit(as.character(x),split = delimtter))[position]
return (output_string)
}
#Functions involving genomic coordinates
##### To make gr objeject
to_make_gr_object_chr_base_loc = function (chr_base_loc_vec,column_name) {
chr_base_loc = unique(chr_base_loc_vec)
chr_base_loc_gr = GRanges(chr_base_loc)
chr_base_loc_gr$meta_col = chr_base_loc
names(mcols(chr_base_loc_gr)) = column_name
return (chr_base_loc_gr) }
##### to_find overlap between gr
to_overlap_grange_object = function (gr1, gr2, gr1_col, gr2_col) {
query_gr = gr1
subject_gr = gr2
query_col = gr1_col
subject_col = gr2_col
overlap = findOverlaps(query_gr,subject_gr)
query_row_ids = queryHits(overlap)
subject_rowids = subjectHits(overlap)
query_overlap_df = as.data.frame(elementMetadata(query_gr)[query_row_ids,query_col])
subject_overlap_df= as.data.frame(elementMetadata(subject_gr)[subject_rowids, subject_col])
### to find the interesection of overlaps
overlap_width = width(pintersect(query_gr[queryHits(overlap)],subject_gr[subjectHits(overlap )]))
Overlap_df = cbind.data.frame(query_overlap_df,subject_overlap_df)
Overlap_df["width"] = overlap_width
return(Overlap_df)
}
to_find_nearest_dist_grange_object = function (gr1, gr2, gr1_col, gr2_col) {
query_gr = gr1
subject_gr = gr2
query_col = gr1_col
subject_col = gr2_col
overlap = distanceToNearest(query_gr,subject_gr,select="arbitrary")
query_row_ids = queryHits(overlap)
subject_rowids = subjectHits(overlap)
query_overlap_df = as.data.frame(elementMetadata(query_gr)[query_row_ids,query_col])
subject_overlap_df= as.data.frame(elementMetadata(subject_gr)[subject_rowids, subject_col])
dist_df = as.data.frame(elementMetadata(overlap))
Overlap_df = Reduce(f = cbind.data.frame,list(query_overlap_df,subject_overlap_df,dist_df))
return(Overlap_df)
}
to_get_overlap_bw_gr_from_peakids = function (input_genomic_coord1,input_genomic_coord2,input1_colname,input2_colname) {
input_genomic_coord1 = input_genomic_coord1[!(is.na(input_genomic_coord1))]
input_genomic_coord2 = input_genomic_coord2[!(is.na(input_genomic_coord2))]
input1_gr = unique_grobject_metacol(genomic_coord = input_genomic_coord1,metacolumn_name = "Location")
input2_gr = unique_grobject_metacol(genomic_coord = input_genomic_coord2,metacolumn_name= "Location")
overlap_df = to_overlap_grange_object(gr1 = input1_gr,gr2 = input2_gr,
gr1_col = "Location",gr2_col = "Location")
names(overlap_df) = c(input1_colname,input2_colname,"width")
return(overlap_df)
}
########### To get raw read counts from peaks
to_get_raw_counts_consensus_peaks = function (metatable,overlap_vector,output_path,output_file_name_base) {
data_dba = dba(sampleSheet= as.data.frame(metatable))
for (i in overlap_vector) {
output_file_name = paste(output_file_name_base, i, "overlap.txt", sep = "_")
output_file_name_path = file.path(output_path, output_file_name)
raw_reads_count = dba.count(DBA = data_dba ,minOverlap = i,score = DBA_SCORE_READS)
raw_reads_df <- dba.peakset(raw_reads_count, bRetrieve=T, DataType=DBA_DATA_FRAME)
write.table(x = raw_reads_df,file = output_file_name_path,quote = F,sep = "\t",row.names = F,col.names = T)
}
}
############# To get dds object from Deseq2
to_get_dds_rld_object = function (raw_count_file,metatable, experiment_design,output_file_name,output_path) {
raw_count_data = read.table(file = raw_count_file,header = T,sep = "\t")
######### Modifying_table
raw_count_data$Peak_identity = paste(raw_count_data$CHR,":",raw_count_data$START,
"-",raw_count_data$END,sep = "")
rownames(raw_count_data) = raw_count_data$Peak_identity
raw_count_data[,c("Peak_identity","CHR","START","END")] = NULL
raw_count_data_matrix = as.matrix(raw_count_data)
############### Making a dds object
dds = DESeqDataSetFromMatrix(countData = raw_count_data_matrix,
colData = metatable,
design = experiment_design)
############### For visualisation
dds = estimateSizeFactors(dds)
rld <- rlogTransformation(dds, blind=TRUE)
############## Output_file name
output_file_name_ext = paste(output_file_name,"rda",sep = ".")
output_file_name_path = file.path(output_path, output_file_name_ext)
save(list=c("dds","rld"),file = output_file_name_path)
}
##### Differential analysis using DeSeq2
compute_pca <- function(data_mat, ntop,meta_df){
Pvars <- rowVars(data_mat)
select <- order(Pvars, decreasing = TRUE)[seq_len(min(ntop,
length(Pvars)))]
PCA <- prcomp(t(data_mat[select, ]), scale. = F, center = TRUE)
percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)
pca_df = data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
PC3 = PCA$x[,3], PC4 = PCA$x[,4],PC5 = PCA$x[,5],PC6 = PCA$x[,6])
#meta_df = as.data.frame(colData(data_mat))
pca_meta_df = cbind(pca_df, meta_df)
return(list(PCA_meta_df = pca_meta_df,
perc_var = percentVar,
selected_vars = select))
}
make_volcano_plot = function(data,x_col,y_col,y_lim,xlab,ylab,
title_lab,data_annotate) {
colnames(data)[colnames(data)%in%x_col] = "x"
colnames(data)[colnames(data)%in%y_col] = "y"
p1 = ggplot(data = data,mapping = aes(x = x,y = y,alpha = ifelse(y <= 1.3, 0.6,1)))
p2 = p1 +
geom_point(pch = 19, size = 2) +
ylim(y_lim) +
theme_classic(base_size = 12) +
theme(
axis.text = element_text(face = "bold"),
legend.title=element_blank()) +
geom_hline(yintercept= 1.3, color= "black",linetype = "dashed",size = rel(0.4)) +
geom_vline(xintercept= 0, color="black",linetype = "dashed") +
labs(title = title_lab,x = x_lab, y = y_lab)
p2 +
annotate("rect", xmin = -Inf, xmax = 0.0, ymin = -Inf, ymax = +Inf, fill= "#67A9CF",alpha = 0.3) +
annotate("rect", xmin = 0.0, xmax = Inf, ymin = -Inf, ymax = +Inf, fill= col_pallete[1],alpha = 0.3) +
guides(alpha = FALSE) + geom_label_repel(data = data_annotate,aes(x = x,y = y,label = TF), size = 2.5,min.segment.length = 0)
}
########### GO and pathway enrichment
to_get_GO_kegg_enrichment = function (expressedTarget,expressedGenes,pval_thresh = 1,ont = "BP") {
ck <- enrichGO(gene = expressedTarget,
OrgDb='org.Hs.eg.db',
pvalueCutoff= 1,
ont = ont,
keyType= "ENTREZID",
universe=expressedGenes)
ck_df=ck@result
ck_df = ck_df[order(ck_df$p.adjust),]
kk <- enrichKEGG(gene = expressedTarget,
organism = "human",
pvalueCutoff = 1,universe = expressedGenes)
kk_df = kk@result
kk_df = kk_df[order(kk_df$p.adjust),]
return (list(ck_df,kk_df))
}
to_do_fisher_exact_test = function (enrichment_category_groups,input_category_groups_to_be_tested,x,n,m,k,...) {
#k = length of hit list;
#x = overlap of hit list with category of interest;
#m = number of genes in a category of interest;
#n = total no of genes (universe)
data_vector = c(x,k-x,m-x,((n-m)-(k-x)) )
contingency.table = matrix(data_vector,nrow = 2,ncol = 2,byrow = TRUE)
colnames(contingency.table) <- enrichment_category_groups
rownames(contingency.table) <- input_category_groups_to_be_tested
f.test = fisher.test(contingency.table,...)
return (list(f.test,contingency.table))
}
## Sushi plot
## Coloc function