##### R version 3.6.2
library(rGREAT)
library("stringr")
setwd("/g/scb2/zaugg/daga/Autoimmune_all/Scripts/")
############## Functions
modify_string = function (x,delimtter,position) {
output_string = unlist(strsplit(as.character(x),split = delimtter))[position]
return (output_string)
}
########### Function for Great enrichment
to_submit_job = function (file_path_input,file_path_bg,basalPlus = TRUE,twoClosest = FALSE,oneClosest =FALSE) {
input_fh = read.table(file = file_path_input,header = F,sep = "\t")
bg_fh = read.table(file = file_path_bg,header = F,sep = "\t")
names(input_fh) = c("chr","start","stop")
names(bg_fh) = c("chr","start","stop")
if (basalPlus) {
job = submitGreatJob(gr = input_fh,species = "hg38",bg = bg_fh,includeCuratedRegDoms = TRUE,rule = "basalPlusExt")
}
if (twoClosest) {
job = submitGreatJob(gr = input_fh,species = "hg38",bg = bg_fh,includeCuratedRegDoms = TRUE,rule = "twoClosest")
}
if(oneClosest) {
job = submitGreatJob(gr = input_fh,species = "hg38",bg = bg_fh,includeCuratedRegDoms = TRUE,rule = "oneClosest")
}
return (job)
}
############ To get all the enrichment
to_get_enrichment_table_threshold = function (job, param_name,threshold) {
names_ontolgies = availableOntologies(job)
enrich_table_all = getEnrichmentTables(job, ontology = names_ontolgies)
to_subset_enrich_table = function(x,param_name, threshold){
enrich_table_sub = x[x[,param_name] <=threshold,]
return(enrich_table_sub)
}
#enrich_table_thresh = lapply(X = enrich_table_all,FUN = to_subset_enrich_table,param_name = "Hyper_Adjp_BH", threshold = 0.1)
enrich_table_thresh = lapply(X = enrich_table_all,FUN = to_subset_enrich_table,param_name = param_name, threshold = threshold)
enrich_table_thresh_df = Reduce(rbind,enrich_table_thresh)
############# To add ontologies names in the data frame
if(nrow(enrich_table_thresh_df) > 0) {
nrow_list_names = unlist(lapply(enrich_table_thresh,FUN = nrow))
nrow_list_names_sub = nrow_list_names[nrow_list_names > 0]
ont_name_vector = character()
for (i in seq(1, length(nrow_list_names_sub))){
ont_names= names(nrow_list_names_sub)
ont_name_length = nrow_list_names_sub[[ont_names[i]]]
ont_name_rep = rep(ont_names[i],ont_name_length)
ont_name_vector = append(ont_name_vector,ont_name_rep)
}
enrich_table_thresh_df$Ontology_name = ont_name_vector
}
return(list("enrich_list" = enrich_table_thresh,"enrich_df" = enrich_table_thresh_df))
}
########## To get the gene for particluar GO ids
##### Bed file
input_files = list.files("../km_means_data/All_disease_with_JIASF/resNorm_l2FC_cond_any_norm_kmeans_10",pattern = ".*kmean.*bed",full.names = T)
for (file in input_files) {
fn_query = basename(file)
fn_middle = gsub(pattern = "_.bed",replacement = "",modify_string(x = fn_query,delimtter = "norm_",position = 2))
fn_prefix = "resNorm_l2FC_cond_any"
fn_suffix = "greatobj_enrich.RData"
out_fn = paste(fn_prefix,fn_middle,fn_suffix,sep = "_")
enrich_path = "../km_means_data/All_disease_with_JIASF/resNorm_l2FC_cond_any_norm_kmeans_10"
out_fn_path = file.path(enrich_path,out_fn)
bg_path = "../km_means_data/All_disease_with_JIASF/resNorm_l2FC_cond_any_norm_kmeans_10/resNorm_l2FC_cond_any_norm_bedfile.txt"
foreground_path = file
print (c(bg_path,foreground_path,out_fn_path))
job_input = to_submit_job(file_path_input = foreground_path,
file_path_bg= bg_path,basalPlus = TRUE,twoClosest = FALSE,oneClosest =FALSE)
Sys.sleep(60)
enrich_data = to_get_enrichment_table_threshold(job = job_input,param_name = "Hyper_Adjp_BH" ,threshold = 1)
save(list = c("enrich_data","job_input"),file = out_fn_path)
}
################# Function
to_get_top_GO_term_GREAT_enrich_df = function(enrich_data_all_df,input_GO_ont,col_select,top_n){
#input_GO_ont = c("GO Molecular Function","GO Biological Process","GO Cellular Component")
enrich_data_all_df_sub = enrich_data_all_df[enrich_data_all_df$Ontology_name%in%input_GO_ont,]
#### Top select param
#top_GO_select_param = c("Hyper_Adjp_BH")
#col_select = c("ID","name","Hyper_Adjp_BH","Hyper_Fold_Enrichment","Ontology_name")
#top_n = 50
#### Order to select the top
enrich_data_all_df_sub = enrich_data_all_df_sub[order(enrich_data_all_df_sub[,top_GO_select_param]),]
enrich_data_top = enrich_data_all_df_sub[1:top_n,col_select]
### To filter top for signifcant values
enrich_data_top_filter = enrich_data_top[enrich_data_top[,"Hyper_Adjp_BH"] <= 0.05,]
return(list(enrich_data_top_filter,enrich_data_all_df_sub))
}
########## To make the master file per ontology for all the cluster to do enrichment analysis
to_make_master_file_topID_terms_inall_clusters = function (cluster_enrich_file_vector,
top_GO_select_param,col_select,top_n,
ont_name) {
top_GO_list = list()
all_GO_list = list()
for (file in cluster_enrich_file_vector) {
out_path = dirname(file)
filenames = basename(file)
cluster_name = str_extract(filenames, "kmean.*[0-9]+")
#cluster_name_final = gsub(pattern = "kmean20_",replacement = "cluster ",x = cluster_name)
load(file)
### Extract enrich df
enrich_data_all_df = enrich_data[[2]]
cluster_GO = to_get_top_GO_term_GREAT_enrich_df(enrich_data_all_df = enrich_data_all_df,input_GO_ont = ont_name,col_select = col_select,top_n = top_n)
top_GO = cluster_GO[[1]]
all_GO = cluster_GO[[2]]
#### no significant terms
if (nrow(top_GO) > 0) {
top_GO$cluster_name = cluster_name
top_GO_list[[cluster_name]] = top_GO
}
all_GO$cluster_name = cluster_name
all_GO_list[[cluster_name]] = all_GO
rm(list = c("job_input","enrich_data"))
}
### Master file of top GO
top_GO_df_all_cluster = Reduce(f = rbind.data.frame,top_GO_list)
all_GO_df_all_cluster = Reduce(f = rbind.data.frame,all_GO_list)
####### Saving this master file for visulaisation for all ontologies
if (length(ont_name) == 1){
out_fn = paste(ont_name,"top",top_n,"terms_all_cluster.RData",sep = "_")
}else {
out_fn = paste("all","top",top_n,"terms_all_cluster.RData",sep = "_")
}
out_fn_path = file.path(out_path,out_fn)
save(list = c("top_GO_df_all_cluster","all_GO_df_all_cluster"),file = out_fn_path)
}
####### To sapply for individual terms and then all terms
### For top terms in individual ontology
cluster_enrich_files = list.files(path = "../km_means_data/All_disease_with_JIASF/resNorm_l2FC_cond_any_norm_kmeans_10",full.names = T,pattern = "resNorm.*.RData")
load(cluster_enrich_files[3])
input_ont = availableOntologies(job_input)
input_ont_sub = c("GO Molecular Function","GO Biological Process","GO Cellular Component","Ensembl Genes")
rm(list = c("enrich_data","job_input"))
#### Top select param
top_GO_select_param = c("Hyper_Adjp_BH")
col_select = c("ID","name","Hyper_Adjp_BH","Hyper_Fold_Enrichment","Ontology_name")
top_n = 50
sapply(X = input_ont,FUN = to_make_master_file_topID_terms_inall_clusters,
cluster_enrich_file_vector = cluster_enrich_files,top_GO_select_param = top_GO_select_param,col_select = col_select,top_n = top_n)
#### Top terms in all ontology
to_make_master_file_topID_terms_inall_clusters(cluster_enrich_file_vector = cluster_enrich_files,
top_GO_select_param = top_GO_select_param,col_select = col_select,top_n = 50,
ont_name = input_ont_sub)
######### Check
#load("../km_means_data/All_disease_with_JIASF/resNorm_l2FC_cond_any_norm_kmeans_10/all_top_50_terms_all_cluster.RData")
#load("../km_means_data/All_disease_with_JIASF/resNorm_l2FC_cond_any_norm_kmeans_10/GO Biological Process_top_50_terms_all_cluster.RData")
#interleukin_df = all_GO_df_all_cluster[grepl(pattern = "interleukin",x = all_GO_df_all_cluster$name),]
#interleukin_df1 = top_GO_df_all_cluster[grepl(pattern = "interleukin-17",x = top_GO_df_all_cluster$name),]