Luise-Seeker-Human-WM-Glia / src / 15_Trajectory_monocle.R
15_Trajectory_monocle.R
Raw
library(dplyr)
library(monocle)
library(Seurat)

#Extract data, phenotype data, and feature data from the SeuratObject

split_sr_by_t <- c("CSC", "CB", "BA4")


run_monocle_split <- function(seur_obj, 
                        split_by = "NULL",
                        split_sr_by,
                        mod_form_string, 
                        save_dir = getwd(),
                        cores_use = 8){
   
  if(split_by != "NULL"){
    
    for(i in 1: length(split_sr_by)){
     
    Idents(seur_obj) <- split_by
    seur_spli_obj <- subset(seur_obj, ident = split_sr_by[i])
    mon_data <- as(as.matrix(seur_spli_obj@assays$RNA@data), 'sparseMatrix')
    pd <- new('AnnotatedDataFrame', data = seur_spli_obj@meta.data)
    f_data <- data.frame(gene_short_name = row.names(mon_data), 
                         row.names = row.names(mon_data))
    fd <- new('AnnotatedDataFrame', data = f_data)
    
    #Construct monocle cds
    ol_cds <- newCellDataSet(mon_data,
                             phenoData = pd,
                             featureData = fd,
                             lowerDetectionLimit = 0.5,
                             expressionFamily = negbinomial.size())
    ol_cds <- estimateSizeFactors(ol_cds)
    ol_cds <- estimateDispersions(ol_cds)
    clustering_oligos <- differentialGeneTest(ol_cds,
                                              fullModelFormulaStr = mod_form_string,
                                              cores = cores_use, verbose = T)
    
    # select genes that are significant at an FDR <10%
    genes_ol_cds <- subset(clustering_oligos, qval < 0.1) 
    
    
    # my_ordering_genes <- row.names(clustering_DEG_genes)[order(clustering_DEG_genes$qval)][1:1000]
    ol_cds <- setOrderingFilter(ol_cds, ordering_genes = genes_ol_cds)
    ol_cds <- reduceDimension(ol_cds, method = 'DDRTree', cores=cores_use, verbose = TRUE)
    ol_cds <- orderCells(ol_cds)
    my_pseudotime_de <- differentialGeneTest(ol_cds,
                                             fullModelFormulaStr = "~sm.ns(Pseudotime)",
                                             cores = cores_use)
    
    saveRDS(ol_cds, paste(save_dir, "/monocle_obj_", split_sr_by[i], ".RDS", sep = ""))
    saveRDS(my_pseudotime_de, paste(save_dir, "/pseudotime_de_", split_sr_by[i], ".RDS", sep = ""))
    }
  }
  print("Done")
}

dir.create("/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/monocle")
run_monocle_split(nad_ol, split_by = "Tissue",
                  split_sr_by = split_sr_by_t,
                  mod_form_string = '~ol_clusters_named', 
                  save_dir = "/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/monocle",
                  cores_use = 8)

# For me this created pseudotimes with the wrong root

ol_cds_csc <-readRDS("/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/monocle/wrong_start/monocle_obj_CSC.RDS")

ol_cds_csc <- orderCells(ol_cds_csc, root_state = 2)

saveRDS(ol_cds_csc,"/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/monocle/monocle_obj_CSC.RDS" )

ol_cds_cb <-readRDS("/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/monocle/wrong_start/monocle_obj_CB.RDS")

ol_cds_cb <- orderCells(ol_cds_cb, root_state = 4)

saveRDS(ol_cds_cb,"/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/monocle/monocle_obj_CB.RDS" )

ol_cds_ba4 <-readRDS("/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/monocle/wrong_start/monocle_obj_BA4.RDS")

ol_cds_ba4 <- orderCells(ol_cds_ba4, root_state = 3)
saveRDS(ol_cds_ba4,"/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/monocle/monocle_obj_BA4.RDS" )


ol_cds<- ol_cds_csc

ol_cds<- ol_cds_ba4



#whole data

nad_ol <- readRDS("/exports/eddie/scratch/lseeker/srt_oligos_and_opcs_LS.RDS")

ol_data <- as(as.matrix(nad_ol@assays$RNA@data), 'sparseMatrix')

pd <- new('AnnotatedDataFrame', data = nad_ol@meta.data)

f_data <- data.frame(gene_short_name = row.names(ol_data), row.names = row.names(ol_data))
fd <- new('AnnotatedDataFrame', data = f_data)

#Construct monocle cds
ol_cds <- newCellDataSet(ol_data,
                            phenoData = pd,
                            featureData = fd,
                            lowerDetectionLimit = 0.5,
                            expressionFamily = negbinomial.size())

pData(ol_cds)


ol_cds <- estimateSizeFactors(ol_cds)
ol_cds <- estimateDispersions(ol_cds)

clustering_oligos <- differentialGeneTest(ol_cds,
                                          fullModelFormulaStr = '~ol_clusters_named',
                                          cores = 1, verbose = T)

# select genes that are significant at an FDR <10%
genes_ol_cds <- subset(clustering_oligos, qval < 0.1) 


# my_ordering_genes <- row.names(clustering_DEG_genes)[order(clustering_DEG_genes$qval)][1:1000]
ol_cds <- setOrderingFilter(ol_cds, ordering_genes = genes_ol_cds)
ol_cds <- reduceDimension(ol_cds, method = 'DDRTree', cores=1, verbose = TRUE)
ol_cds <- orderCells(ol_cds)

saveRDS(ol_cds, "/exports/eddie/scratch/lseeker/ol_cds.RDS")

ol_cds <- readRDS("/exports/eddie/scratch/lseeker/ol_cds.RDS")

my_pseudotime_de <- differentialGeneTest(ol_cds,
                                         fullModelFormulaStr = "~sm.ns(Pseudotime)")

saveRDS(ol_cds, "/exports/eddie/scratch/lseeker/monocle_all_cds.RDS")
saveRDS(my_pseudotime_de, "/exports/eddie/scratch/lseeker/monocle_all_pseudotime.RDS")




plot_cell_trajectory(ol_cds, color_by = "State") + facet_wrap(~State) + 
  scale_colour_manual(values=mycoloursP[20:40])

plot_cell_trajectory(ol_cds, color_by = "State")  + 
  scale_colour_manual(values=mycoloursP[20:40])


plot_cell_trajectory(ol_cds, color_by = "ol_clusters_named") + 
  facet_wrap(~ol_clusters_named) +
  scale_colour_manual(values=mycoloursP[6:40])

#plot_cell_trajectory(ol_cds, color_by = "Tissue") + facet_wrap(~Tissue)
plot_cell_trajectory(ol_cds, color_by = "gender", cell_size = 0.2)+
  scale_colour_manual(values=mycoloursP[10:40])

plot_cell_trajectory(ol_cds, color_by = "Pseudotime")
plot_cell_trajectory(ol_cds, color_by = "ol_clusters_named")+
  scale_colour_manual(values=mycoloursP[6:40])


head(pData(ol_cds))
#the pseudotime and its state is now in the pData

my_pseudotime_de <- differentialGeneTest(ol_cds,
                                         fullModelFormulaStr = "~sm.ns(Pseudotime)",
                                         cores = 18)
top_genes <- my_pseudotime_de %>% arrange(qval) %>% head(10)

top_genes
# select the 10 highest genes and use them for plotting
plot_genes_in_pseudotime(ol_cds[rownames(top_genes)])


plot_genes_in_pseudotime(ol_cds[c("DSCAM","TNR","PTPRZ1","LHFPL3", "PCDH15")])
plot_genes_in_pseudotime(ol_cds[c("SOX6","VCAN","MMP16","FGF14", "CA10")])
plot_genes_in_pseudotime(ol_cds[c("DPYD","COL11A1","TNR","BRINP3", "BCAN")])

plot_genes_in_pseudotime(ol_cds[c("PDGFRA","PCDH15", "PLP1","MBP")], 
                         color_by = "ol_clusters_named") +  
  scale_colour_manual(values=mycoloursP[6:40])


# cluster the top 50 genes that vary as a function of pseudotime
my_pseudotime_de %>% arrange(qval) %>% head(50) %>% select(gene_short_name) -> gene_to_cluster
gene_to_cluster <- gene_to_cluster$gene_short_name
gene_to_cluster <- c(gene_to_cluster, "MAG", "MOG", "PLP1", "SPARC", "RBFOX1", 
                     "OPALIN", "KLK6", "SPARCL1", "SLC22A3", "PAX3", "NELL1", 
                     "AFF3", "LGALS1", "FMN1", "HHIP", "TUBB2A")
my_pseudotime_cluster <- plot_pseudotime_heatmap(ol_cds[gene_to_cluster,],
                                                 num_clusters = 3,
                                                 cores = 8,
                                                 show_rownames = TRUE,
                                                 return_heatmap = TRUE)

my_cluster <- cutree(my_pseudotime_cluster$tree_row, 5)
my_cluster

my_pseudotime_de[names(my_cluster[my_cluster == 1]),"gene_short_name"]



#  I am surprised about how the pseudotime trajectories look like. I expected 
# one cluster on one side of a fork and the other onthe other side. So if this
# trajectory analysis is not picking up on cluster differences, what is driving
# the fork then? 

# A differential gene expression of the two end points may be helpful

ol_cds$subs_pt <- ifelse(ol_cds$State == 1 & ol_cds$Pseudotime > 45, "state1_end_point", "FALSE")
ol_cds$subs_pt <- ifelse(ol_cds$State == 3 & ol_cds$Pseudotime > 28, "state3_end_point", ol_cds$subs_pt)
plot_cell_trajectory(ol_cds, color_by = "subs_pt")+
  scale_colour_manual(values=mycoloursP[36:40])

# extract barcode , pseudotime, state and and subs_pt and merge with seur metadata

mon_df <- data.frame(Barcode = ol_cds$Barcode, 
                     Pseudotime = ol_cds$Pseudotime, 
                     monocle_state = ol_cds$State, 
                     monocle_end_points = ol_cds$subs_pt)

# Do the above for all the different tissues, create master mon_df, bring barcodes
# in same order as in seurat obj, add columns to seurat object metadata. 
# Plot Dimplots that show states for all nuclei and separated by tissue. 
# Do differential gene expression analysis of end states. 


ol_cds_cb<- readRDS("/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/monocle/monocle_obj_CB.RDS")
ol_cds_cb$subs_pt <- ifelse(ol_cds_cb$State == 1 & ol_cds_cb$Pseudotime > 40, "state1_end_point", "FALSE")
ol_cds_cb$subs_pt <- ifelse(ol_cds_cb$State == 5 & ol_cds_cb$Pseudotime > 32, "state5_end_point", ol_cds_cb$subs_pt)
plot_cell_trajectory(ol_cds_cb, color_by = "subs_pt")+
  scale_colour_manual(values=mycoloursP[36:40])

# extract barcode , pseudotime, state and and subs_pt and merge with seur metadata

mon_df_cb <- data.frame(Barcode = ol_cds_cb$Barcode, 
                     Pseudotime = ol_cds_cb$Pseudotime, 
                     monocle_state = ol_cds_cb$State, 
                     monocle_end_points = ol_cds_cb$subs_pt)

#And for BA4

ol_cds_ba4<- readRDS("/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/monocle/monocle_obj_BA4.RDS")
ol_cds_ba4$subs_pt <- ifelse(ol_cds_ba4$State == 1 & ol_cds_ba4$Pseudotime > 46, "state1_end_point", "FALSE")
plot_cell_trajectory(ol_cds_ba4, color_by = "subs_pt")+
  scale_colour_manual(values=mycoloursP[36:40])

# extract barcode , pseudotime, state and and subs_pt and merge with seur metadata

mon_df_ba4 <- data.frame(Barcode = ol_cds_ba4$Barcode, 
                        Pseudotime = ol_cds_ba4$Pseudotime, 
                        monocle_state = ol_cds_ba4$State, 
                        monocle_end_points = ol_cds_ba4$subs_pt)


merged_mon_df <- rbind(mon_df, mon_df_cb, mon_df_ba4)


# sonrt barcodes so that they are the same in this dataframe and in the seurat
# object, so that data can be added to metadata of seurat object

nad_ol <- readRDS("/Users/lseeker/Documents/Work/HumanCellAtlas/srt_oligos_Nadine/srt_oligos_and_opcs_LS.RDS")

summary(nad_ol@meta.data$Barcode == merged_mon_df$Barcode)

nrow(nad_ol@meta.data) == nrow(merged_mon_df)

merged_mon_df <- merged_mon_df[match(nad_ol@meta.data$Barcode,
                                     merged_mon_df$Barcode),]

summary(nad_ol@meta.data$Barcode == merged_mon_df$Barcode)

nad_ol$mon_pseudotime <- merged_mon_df$Pseudotime
nad_ol$monocle_state <- merged_mon_df$monocle_state
nad_ol$monocle_end_points <- merged_mon_df$monocle_end_points

DimPlot(nad_ol, group.by = "monocle_end_points", split.by = "Tissue")


# looking at differential gene expression of end points seems to make more 
# sense with in the tissue than across tissues. 

Idents(nad_ol) <- "Tissue"
csc <- subset(nad_ol, ident = "CSC")

Idents(csc) <- "monocle_end_points"

end1_markers <- FindMarkers(object = csc, ident.1 = "state1_end_point", 
                            ident.2 = "state3_end_point", min.pct = 0.25)

print(x = head(x = end1_markers, n = 5))

end3_markers <- FindMarkers(object = csc, ident.1 = "state3_end_point", 
                            ident.2 = "state1_end_point", min.pct = 0.25, 
                            only.pos = TRUE)

print(x = head(x = end3_markers, n = 5))


fil_end1_mark <- subset(end1_markers, end1_markers$avg_log2FC > 1.5)
fil_end3_mark <- subset(end3_markers, end3_markers$avg_log2FC > 1.5)