--- title: "HCA_slingshot" author: "Luise A. Seeker" date: "22/03/2021" output: html_document --- ```{r} library(ggsci) library(slingshot) library(BUSpaRse) library(tidyverse) library(tidymodels) library(Seurat) library(scales) library(viridis) library(Matrix) library(dplyr) library(here) ``` # Pick colour pallets ```{r, echo = F} mypal <- pal_npg("nrc", alpha = 0.7)(10) mypal2 <-pal_tron("legacy", alpha = 0.7)(7) mypal3 <- pal_lancet("lanonc", alpha = 0.7)(9) mypal4 <- pal_simpsons(palette = c("springfield"), alpha = 0.7)(16) mypal5 <- pal_rickandmorty(palette = c("schwifty"), alpha = 0.7)(6) mypal6 <- pal_futurama(palette = c("planetexpress"), alpha = 0.7)(5) mypal7 <- pal_startrek(palette = c("uniform"), alpha = 0.7)(5) mycoloursP<- c(mypal, mypal2, mypal3, mypal4, mypal5, mypal6, mypal7) show_col(mycoloursP, labels =F) ``` ```{r} nad_ol <- readRDS(here("data", "single_nuc_data", "oligodendroglia", "srt_oligos_and_opcs_LS.RDS")) ``` ```{r} ggplot(nad_ol@meta.data, aes(nCount_RNA, nFeature_RNA, color = ol_clusters_named)) + geom_point(size = 0.5) + scale_color_manual(values = mycoloursP[6:50])+ scale_x_log10() + scale_y_log10() + theme_bw() + # Make points larger in legend guides(color = guide_legend(override.aes = list(size = 3))) + labs(x = "Total UMI counts", y = "Number of genes detected") ``` # Trajectory analysis using SLINGSHOT # Whole dataset ## Slingshot with start cluster OPC_B (which is only in BA4) ```{r} sds <- slingshot(Embeddings(nad_ol, "umap"), clusterLabels = nad_ol$ol_clusters_named, start.clus = "OPC_B", stretch = 0) cl <-nad_ol$ol_clusters_named cols <- mycoloursP[6:40] plot(slingReducedDim(sds), col = cols[cl], pch = 16, cex = 0.5) lines(SlingshotDataSet(sds), lwd = 2, type = 'lineages', col = 'black') ``` ## Slingshot with start cluster OPC_A (which is only in CB & CSC) ```{r} sds_b <- slingshot(Embeddings(nad_ol, "umap"), clusterLabels = nad_ol$ol_clusters_named, start.clus = "OPC_A", stretch = 0) plot(slingReducedDim(sds_b), col = cols[cl], pch = 16, cex = 0.5) lines(SlingshotDataSet(sds_b), lwd = 2, type = 'lineages', col = 'black') ``` The trajectories of the full dataset are identical, regardless of choosing OPC_A or OPC_B as start point ```{r} nc <- 3 pt <- slingPseudotime(sds) nms <- colnames(pt) nr <- ceiling(length(nms)/nc) pal <- viridis(100, end = 0.95) par(mfrow = c(nr, nc)) for (i in nms) { colors <- pal[cut(pt[,i], breaks = 100)] plot(slingReducedDim(sds), col = colors, pch = 16, cex = 0.5, main = i) lines(SlingshotDataSet(sds), lwd = 2, col = 'black', type = 'lineages') } for (i in nms) { colors <- pal[cut(pt[,i], breaks = 100)] plot(slingReducedDim(sds), col = colors, pch = 16, cex = 0.5, main = i) lines(SlingshotDataSet(sds), lwd = 2, col = 'black') } ``` ## Differential gene expression along curves ### Get top highly variable genes ```{r} top_hvg <- HVFInfo(nad_ol) %>% mutate(., bc = rownames(.)) %>% arrange(desc(variance.standardized)) %>% top_n(300, variance.standardized) %>% pull(bc) ``` ### Prepare data for random forest ```{r} dat_use <- t(GetAssayData(nad_ol, slot = "data")[top_hvg,]) dat_use_df <- cbind(slingPseudotime(sds)[,2], dat_use) # Do curve 2, so 2nd columnn colnames(dat_use_df)[1] <- "pseudotime" dat_use_df <- as.data.frame(dat_use_df[!is.na(dat_use_df[,1]),]) ``` ### use tidymodels #### generate training and validation dataset ```{r} set.seed(123) dat_split <- initial_split(dat_use_df) dat_train <- training(dat_split) dat_val <- testing(dat_split) ``` #### train model ```{r} model <- rand_forest(mtry = 200, trees = 1400, min_n = 15, mode = "regression") %>% set_engine("ranger", importance = "impurity", num.threads = 3) %>% fit(pseudotime ~ ., data = dat_train) ``` #### Evaluate model with validation set ```{r} val_results <- dat_val %>% mutate(estimate = predict(model, .[,-1]) %>% pull()) %>% dplyr::select(truth = pseudotime, estimate) ``` #### Identify top 20 genes ```{r} var_imp <- sort(model$fit$variable.importance, decreasing = TRUE) top_genes <- names(var_imp)[1:40] ``` #### Plot top 40 genes ```{r, fig.height = 80, fig.width = 6} par(mfrow = c(3, 2)) for (i in seq_along(top_genes)) { colors <- pal[cut(dat_use[,top_genes[i]], breaks = 100)] plot(slingReducedDim(sds), col = colors, pch = 16, cex = 0.5, main = top_genes[i]) lines(SlingshotDataSet(sds), lwd = 2, col = 'black', type = 'lineages') } ``` # Perform pseudotime analysis separately for each tissue ## BA4 ```{r} Idents(nad_ol)<-"Tissue" ba4<-subset(nad_ol, ident = "BA4") ``` ```{r} sds_ba4 <- slingshot(Embeddings(ba4, "umap"), clusterLabels = ba4$ol_clusters_named, start.clus = "OPC_B", stretch = 0) plot(slingReducedDim(sds_ba4), col = cols[cl], pch = 16, cex = 0.5) lines(SlingshotDataSet(sds_ba4), lwd = 2, type = 'lineages', col = 'black') ``` ### Differential gene expression along curves #### Use random forest ##### Get top highly variable genes ```{r} top_hvg <- HVFInfo(ba4) %>% mutate(., bc = rownames(.)) %>% arrange(desc(variance.standardized)) %>% top_n(300, variance.standardized) %>% pull(bc) ``` ##### Prepare data for random forest ```{r} dat_use <- t(GetAssayData(ba4, slot = "data")[top_hvg,]) dat_use_df <- cbind(slingPseudotime(sds_ba4)[,2], dat_use) # Do curve 2, so 2nd columnn colnames(dat_use_df)[1] <- "pseudotime" dat_use_df <- as.data.frame(dat_use_df[!is.na(dat_use_df[,1]),]) ``` ##### use tidymodels ###### generate training and validation dataset ```{r} set.seed(123) dat_split <- initial_split(dat_use_df) dat_train <- training(dat_split) dat_val <- testing(dat_split) ``` ###### train model ```{r} model <- rand_forest(mtry = 200, trees = 1400, min_n = 15, mode = "regression") %>% set_engine("ranger", importance = "impurity", num.threads = 3) %>% fit(pseudotime ~ ., data = dat_train) ``` ###### Evaluate model with validation set ```{r} val_results <- dat_val %>% mutate(estimate = predict(model, .[,-1]) %>% pull()) %>% dplyr::select(truth = pseudotime, estimate) ``` ###### Identify top 20 genes ```{r} var_imp <- sort(model$fit$variable.importance, decreasing = TRUE) top_genes <- names(var_imp)[1:20] ``` ###### Plot top 20 genes ```{r, fig.height = 20, fig.width = 6} par(mfrow = c(3, 2)) for (i in seq_along(top_genes)) { colors <- pal[cut(dat_use[,top_genes[i]], breaks = 100)] plot(reducedDim(sds_ba4), col = colors, pch = 16, cex = 0.5, main = top_genes[i]) lines(sds, lwd = 2, col = 'black', type = 'lineages') } ``` ## CB ```{r} Idents(nad_ol)<-"Tissue" cb<-subset(nad_ol, ident = "CB") ``` ```{r} sds_cb <- slingshot(Embeddings(cb, "umap"), clusterLabels = cb$ol_clusters_named, start.clus = "OPC_A", stretch = 0) cl_cb <-cb$ol_clusters_named cols_cb <- mycoloursP[6:40] plot(slingReducedDim(sds_cb), col = cols_cb[cl_cb], pch = 16, cex = 0.5) lines(SlingshotDataSet(sds_cb), lwd = 2, type = 'lineages', col = 'black') ``` ### Differential gene expression along curves #### Use random forest ##### Get top highly variable genes ```{r} top_hvg <- HVFInfo(cb) %>% mutate(., bc = rownames(.)) %>% arrange(desc(variance.standardized)) %>% top_n(300, variance.standardized) %>% pull(bc) ``` ##### Prepare data for random forest ```{r} dat_use <- t(GetAssayData(cb, slot = "data")[top_hvg,]) dat_use_df <- cbind(slingPseudotime(sds_cb)[,2], dat_use) # Do curve 2, so 2nd columnn colnames(dat_use_df)[1] <- "pseudotime" dat_use_df <- as.data.frame(dat_use_df[!is.na(dat_use_df[,1]),]) ``` ##### use tidymodels ###### generate training and validation dataset ```{r} set.seed(123) dat_split <- initial_split(dat_use_df) dat_train <- training(dat_split) dat_val <- testing(dat_split) ``` ###### train model ```{r} model <- rand_forest(mtry = 200, trees = 1400, min_n = 15, mode = "regression") %>% set_engine("ranger", importance = "impurity", num.threads = 3) %>% fit(pseudotime ~ ., data = dat_train) ``` ###### Evaluate model with validation set ```{r} val_results <- dat_val %>% mutate(estimate = predict(model, .[,-1]) %>% pull()) %>% dplyr::select(truth = pseudotime, estimate) ``` ###### Identify top 20 genes ```{r} var_imp <- sort(model$fit$variable.importance, decreasing = TRUE) top_genes <- names(var_imp)[1:20] ``` ###### Plot top 20 genes ```{r, fig.height = 20, fig.width = 6} par(mfrow = c(3, 2)) for (i in seq_along(top_genes)) { colors <- pal[cut(dat_use[,top_genes[i]], breaks = 100)] plot(reducedDim(sds_cb), col = colors, pch = 16, cex = 0.5, main = top_genes[i]) lines(sds, lwd = 2, col = 'black', type = 'lineages') } ``` ## CSC ```{r} Idents(nad_ol)<-"Tissue" csc<-subset(nad_ol, ident = "CSC") ``` ```{r} sds_csc <- slingshot(Embeddings(csc, "umap"), clusterLabels = csc$ol_clusters_named, start.clus = "OPC_A", stretch = 0) cl_csc <-csc$ol_clusters_named cols_csc <- mycoloursP[6:40] plot(slingReducedDim(sds_csc), col = cols_csc[cl_csc], pch = 16, cex = 0.5) lines(SlingshotDataSet(sds_csc), lwd = 2, type = 'lineages', col = 'black') ``` ### Differential gene expression along curves #### Use random forest ##### Get top highly variable genes ```{r} top_hvg <- HVFInfo(csc) %>% mutate(., bc = rownames(.)) %>% arrange(desc(variance.standardized)) %>% top_n(300, variance.standardized) %>% pull(bc) ``` ##### Prepare data for random forest ```{r} dat_use <- t(GetAssayData(csc, slot = "data")[top_hvg,]) dat_use_df <- cbind(slingPseudotime(sds_csc)[,2], dat_use) # Do curve 2, so 2nd columnn colnames(dat_use_df)[1] <- "pseudotime" dat_use_df <- as.data.frame(dat_use_df[!is.na(dat_use_df[,1]),]) ``` ##### use tidymodels ###### generate training and validation dataset ```{r} set.seed(123) dat_split <- initial_split(dat_use_df) dat_train <- training(dat_split) dat_val <- testing(dat_split) ``` ###### train model ```{r} model <- rand_forest(mtry = 200, trees = 1400, min_n = 15, mode = "regression") %>% set_engine("ranger", importance = "impurity", num.threads = 3) %>% fit(pseudotime ~ ., data = dat_train) ``` ###### Evaluate model with validation set ```{r} val_results <- dat_val %>% mutate(estimate = predict(model, .[,-1]) %>% pull()) %>% dplyr::select(truth = pseudotime, estimate) ``` ###### Identify top 20 genes ```{r} var_imp <- sort(model$fit$variable.importance, decreasing = TRUE) top_genes <- names(var_imp)[1:20] ``` ###### Plot top 20 genes ```{r, fig.height = 20, fig.width = 6} par(mfrow = c(3, 2)) for (i in seq_along(top_genes)) { colors <- pal[cut(dat_use[,top_genes[i]], breaks = 100)] plot(reducedDim(sds_csc), col = colors, pch = 16, cex = 0.5, main = top_genes[i]) lines(sds, lwd = 2, col = 'black', type = 'lineages') } ``` # Save objects ```{r} dir.create("/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/slingshot_data") saveRDS(sds, "/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/slingshot_data/slingshot_full_data_OPC_B.RDS") saveRDS(sds_b, "/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/slingshot_data/slingshot_full_data_OPC_A.RDS") saveRDS(sds_ba4, "/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/slingshot_data/slingshot_BA4.RDS") saveRDS(sds_cb, "/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/slingshot_data/slingshot_CB.RDS") saveRDS(sds_csc, "/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/slingshot_data/slingshot_CSC.RDS") ``` # Session info ```{r} sessionInfo() ```