AD-Enhancer-Remodelling / GREAT_analysis_kmeans_cluster_with_JIASF.R
GREAT_analysis_kmeans_cluster_with_JIASF.R
Raw
##### R version 3.6.2
library(rGREAT)
library("stringr")

setwd("/g/scb2/zaugg/daga/Autoimmune_all/Scripts/")


############## Functions

modify_string = function (x,delimtter,position) {
  
  
  output_string = unlist(strsplit(as.character(x),split = delimtter))[position]
  
  return (output_string)
  
}


########### Function for Great enrichment 
to_submit_job = function (file_path_input,file_path_bg,basalPlus = TRUE,twoClosest = FALSE,oneClosest =FALSE) {
  
  input_fh = read.table(file = file_path_input,header = F,sep = "\t")
  bg_fh = read.table(file = file_path_bg,header = F,sep = "\t")
  
  names(input_fh) = c("chr","start","stop")
  names(bg_fh) = c("chr","start","stop")
  
  if (basalPlus) {
    job = submitGreatJob(gr = input_fh,species = "hg38",bg = bg_fh,includeCuratedRegDoms = TRUE,rule = "basalPlusExt")
  }
  
  if (twoClosest) {
    job = submitGreatJob(gr = input_fh,species = "hg38",bg = bg_fh,includeCuratedRegDoms = TRUE,rule = "twoClosest")
  }
  
  if(oneClosest) {
    job = submitGreatJob(gr = input_fh,species = "hg38",bg = bg_fh,includeCuratedRegDoms = TRUE,rule = "oneClosest")
  }
  
  return (job)
  
}


############ To get all the enrichment 
to_get_enrichment_table_threshold = function (job, param_name,threshold) {
  
  names_ontolgies = availableOntologies(job)
  enrich_table_all = getEnrichmentTables(job, ontology = names_ontolgies)
  
  to_subset_enrich_table = function(x,param_name, threshold){
    
    enrich_table_sub = x[x[,param_name] <=threshold,]
    
    return(enrich_table_sub)
    
  }
  
  #enrich_table_thresh = lapply(X = enrich_table_all,FUN = to_subset_enrich_table,param_name = "Hyper_Adjp_BH", threshold = 0.1)
  enrich_table_thresh = lapply(X = enrich_table_all,FUN = to_subset_enrich_table,param_name = param_name, threshold = threshold)
  enrich_table_thresh_df = Reduce(rbind,enrich_table_thresh)
  
  
  ############# To add ontologies names in the data frame 
  if(nrow(enrich_table_thresh_df) > 0) {
    
    nrow_list_names = unlist(lapply(enrich_table_thresh,FUN = nrow))
    nrow_list_names_sub = nrow_list_names[nrow_list_names > 0]
    
    ont_name_vector = character()
    
    
    for (i in seq(1, length(nrow_list_names_sub))){
      
      ont_names= names(nrow_list_names_sub)
      ont_name_length = nrow_list_names_sub[[ont_names[i]]]
      
      ont_name_rep = rep(ont_names[i],ont_name_length)
      ont_name_vector = append(ont_name_vector,ont_name_rep)
      
    }
    
    enrich_table_thresh_df$Ontology_name = ont_name_vector
  }
  
  return(list("enrich_list" = enrich_table_thresh,"enrich_df" = enrich_table_thresh_df))
  
}


########## To get the gene for particluar GO ids
##### Bed file 
input_files = list.files("../km_means_data/All_disease_with_JIASF/resNorm_l2FC_cond_any_norm_kmeans_10",pattern = ".*kmean.*bed",full.names = T)



for (file in input_files) {
  
  
  fn_query = basename(file)
  
  fn_middle = gsub(pattern = "_.bed",replacement = "",modify_string(x = fn_query,delimtter = "norm_",position = 2))
  fn_prefix = "resNorm_l2FC_cond_any"
  fn_suffix = "greatobj_enrich.RData"
  
  out_fn = paste(fn_prefix,fn_middle,fn_suffix,sep = "_")
  enrich_path = "../km_means_data/All_disease_with_JIASF/resNorm_l2FC_cond_any_norm_kmeans_10"
  out_fn_path = file.path(enrich_path,out_fn)
  
  bg_path = "../km_means_data/All_disease_with_JIASF/resNorm_l2FC_cond_any_norm_kmeans_10/resNorm_l2FC_cond_any_norm_bedfile.txt"
  foreground_path = file
  
  print (c(bg_path,foreground_path,out_fn_path)) 
  
  
  
  job_input = to_submit_job(file_path_input = foreground_path,
                            file_path_bg= bg_path,basalPlus = TRUE,twoClosest = FALSE,oneClosest =FALSE)
  
  Sys.sleep(60)
  
  enrich_data = to_get_enrichment_table_threshold(job = job_input,param_name = "Hyper_Adjp_BH" ,threshold = 1)
  
  save(list = c("enrich_data","job_input"),file = out_fn_path)
  
  
}




################# Function

to_get_top_GO_term_GREAT_enrich_df = function(enrich_data_all_df,input_GO_ont,col_select,top_n){


  #input_GO_ont = c("GO Molecular Function","GO Biological Process","GO Cellular Component")
  enrich_data_all_df_sub = enrich_data_all_df[enrich_data_all_df$Ontology_name%in%input_GO_ont,]

  #### Top select param
  #top_GO_select_param = c("Hyper_Adjp_BH")
  #col_select = c("ID","name","Hyper_Adjp_BH","Hyper_Fold_Enrichment","Ontology_name")
  #top_n = 50

  #### Order to select the top
  enrich_data_all_df_sub = enrich_data_all_df_sub[order(enrich_data_all_df_sub[,top_GO_select_param]),]
  enrich_data_top = enrich_data_all_df_sub[1:top_n,col_select]
  ### To filter top for signifcant values
  enrich_data_top_filter = enrich_data_top[enrich_data_top[,"Hyper_Adjp_BH"] <= 0.05,]

  return(list(enrich_data_top_filter,enrich_data_all_df_sub))

}

########## To make the master file per ontology for all the cluster to do enrichment analysis

to_make_master_file_topID_terms_inall_clusters = function (cluster_enrich_file_vector,
                                                           top_GO_select_param,col_select,top_n,
                                                           ont_name) {

  top_GO_list = list()
  all_GO_list = list()

  for (file in cluster_enrich_file_vector) {

    out_path = dirname(file)
    filenames = basename(file)
    cluster_name = str_extract(filenames, "kmean.*[0-9]+")
    #cluster_name_final = gsub(pattern = "kmean20_",replacement = "cluster ",x = cluster_name)

    load(file)

    ### Extract enrich df
    enrich_data_all_df = enrich_data[[2]]
    cluster_GO = to_get_top_GO_term_GREAT_enrich_df(enrich_data_all_df = enrich_data_all_df,input_GO_ont = ont_name,col_select = col_select,top_n = top_n)

    top_GO =  cluster_GO[[1]]
    all_GO = cluster_GO[[2]]

    #### no significant terms
    if (nrow(top_GO) > 0) {
      top_GO$cluster_name = cluster_name
      top_GO_list[[cluster_name]] = top_GO
    }


    all_GO$cluster_name = cluster_name
    all_GO_list[[cluster_name]] = all_GO

    rm(list = c("job_input","enrich_data"))

  }

  ### Master file of top GO
  top_GO_df_all_cluster = Reduce(f = rbind.data.frame,top_GO_list)
  all_GO_df_all_cluster = Reduce(f = rbind.data.frame,all_GO_list)

  ####### Saving this master file for visulaisation for all ontologies
  if (length(ont_name) == 1){
    out_fn = paste(ont_name,"top",top_n,"terms_all_cluster.RData",sep = "_")
  }else {
    out_fn = paste("all","top",top_n,"terms_all_cluster.RData",sep = "_")
  }


  out_fn_path = file.path(out_path,out_fn)

  save(list = c("top_GO_df_all_cluster","all_GO_df_all_cluster"),file = out_fn_path)

}



####### To sapply for individual terms and then all terms
### For top terms in individual ontology
cluster_enrich_files = list.files(path = "../km_means_data/All_disease_with_JIASF/resNorm_l2FC_cond_any_norm_kmeans_10",full.names = T,pattern = "resNorm.*.RData")

load(cluster_enrich_files[3])
input_ont = availableOntologies(job_input)
input_ont_sub = c("GO Molecular Function","GO Biological Process","GO Cellular Component","Ensembl Genes")

rm(list = c("enrich_data","job_input"))
#### Top select param
top_GO_select_param = c("Hyper_Adjp_BH")
col_select = c("ID","name","Hyper_Adjp_BH","Hyper_Fold_Enrichment","Ontology_name")
top_n = 50


sapply(X = input_ont,FUN = to_make_master_file_topID_terms_inall_clusters,
       cluster_enrich_file_vector = cluster_enrich_files,top_GO_select_param = top_GO_select_param,col_select = col_select,top_n = top_n)

#### Top terms in all ontology
to_make_master_file_topID_terms_inall_clusters(cluster_enrich_file_vector = cluster_enrich_files,
                                               top_GO_select_param = top_GO_select_param,col_select = col_select,top_n = 50,
                                               ont_name = input_ont_sub)





######### Check
#load("../km_means_data/All_disease_with_JIASF/resNorm_l2FC_cond_any_norm_kmeans_10/all_top_50_terms_all_cluster.RData")
#load("../km_means_data/All_disease_with_JIASF/resNorm_l2FC_cond_any_norm_kmeans_10/GO Biological Process_top_50_terms_all_cluster.RData")

#interleukin_df = all_GO_df_all_cluster[grepl(pattern = "interleukin",x = all_GO_df_all_cluster$name),]
#interleukin_df1 = top_GO_df_all_cluster[grepl(pattern = "interleukin-17",x = top_GO_df_all_cluster$name),]