src/16_Trajectory_slingshot.Rmd · Luise-Seeker-Human-WM-Glia

---
title: "HCA_slingshot"
author: "Luise A. Seeker"
date: "22/03/2021"
output: html_document
---


```{r}
library(ggsci)
library(slingshot)
library(BUSpaRse)
library(tidyverse)
library(tidymodels)
library(Seurat)
library(scales)
library(viridis)
library(Matrix)
library(dplyr)
library(here)



```
# Pick colour pallets

```{r, echo = F}
mypal <- pal_npg("nrc", alpha = 0.7)(10)
mypal2 <-pal_tron("legacy", alpha = 0.7)(7)
mypal3 <- pal_lancet("lanonc", alpha = 0.7)(9)
mypal4 <- pal_simpsons(palette = c("springfield"), alpha = 0.7)(16)
mypal5 <- pal_rickandmorty(palette = c("schwifty"), alpha = 0.7)(6)
mypal6 <- pal_futurama(palette = c("planetexpress"), alpha = 0.7)(5)
mypal7 <- pal_startrek(palette = c("uniform"), alpha = 0.7)(5)

mycoloursP<- c(mypal, mypal2, mypal3, mypal4, mypal5, mypal6, mypal7)

show_col(mycoloursP, labels =F)
```

```{r}
nad_ol <- readRDS(here("data", 
                       "single_nuc_data", 
                       "oligodendroglia",
                       "srt_oligos_and_opcs_LS.RDS"))
                  
                  

```

```{r}
ggplot(nad_ol@meta.data, aes(nCount_RNA, nFeature_RNA, color = ol_clusters_named)) +
  geom_point(size = 0.5) +
  scale_color_manual(values = mycoloursP[6:50])+
  scale_x_log10() +
  scale_y_log10() +
  theme_bw() +
  # Make points larger in legend
  guides(color = guide_legend(override.aes = list(size = 3))) +
  labs(x = "Total UMI counts", y = "Number of genes detected")

```


# Trajectory analysis using SLINGSHOT


# Whole dataset
## Slingshot with start cluster OPC_B (which is only in BA4)


```{r}
sds <- slingshot(Embeddings(nad_ol, "umap"), 
                 clusterLabels = nad_ol$ol_clusters_named, 
                 start.clus = "OPC_B", stretch = 0)


cl <-nad_ol$ol_clusters_named
cols <- mycoloursP[6:40]

plot(slingReducedDim(sds), col = cols[cl], pch = 16, cex = 0.5) 
lines(SlingshotDataSet(sds), lwd = 2, type = 'lineages', col = 'black') 
```

## Slingshot with start cluster OPC_A (which is only in CB & CSC)

```{r}
sds_b <- slingshot(Embeddings(nad_ol, "umap"), 
                 clusterLabels = nad_ol$ol_clusters_named, 
                 start.clus = "OPC_A", stretch = 0)

plot(slingReducedDim(sds_b), col = cols[cl], pch = 16, cex = 0.5) 
lines(SlingshotDataSet(sds_b), lwd = 2, type = 'lineages', col = 'black') 
```

The trajectories of the full dataset are identical, regardless of choosing
OPC_A or OPC_B as start point

```{r}
nc <- 3
pt <- slingPseudotime(sds)
nms <- colnames(pt)
nr <- ceiling(length(nms)/nc)
pal <- viridis(100, end = 0.95)
par(mfrow = c(nr, nc))
for (i in nms) {
  colors <- pal[cut(pt[,i], breaks = 100)]
  plot(slingReducedDim(sds), col = colors, pch = 16, cex = 0.5, main = i)
  lines(SlingshotDataSet(sds), lwd = 2, col = 'black', type = 'lineages')
}

for (i in nms) {
  colors <- pal[cut(pt[,i], breaks = 100)]
  plot(slingReducedDim(sds), col = colors, pch = 16, cex = 0.5, main = i)
  lines(SlingshotDataSet(sds), lwd = 2, col = 'black')
}

```
## Differential gene expression along curves

### Get top highly variable genes
```{r}
top_hvg <- HVFInfo(nad_ol) %>% 
  mutate(., bc = rownames(.)) %>% 
  arrange(desc(variance.standardized)) %>% 
  top_n(300, variance.standardized) %>% 
  pull(bc)
```

### Prepare data for random forest
```{r}
dat_use <- t(GetAssayData(nad_ol, slot = "data")[top_hvg,])
dat_use_df <- cbind(slingPseudotime(sds)[,2], dat_use) # Do curve 2, so 2nd columnn
colnames(dat_use_df)[1] <- "pseudotime"
dat_use_df <- as.data.frame(dat_use_df[!is.na(dat_use_df[,1]),])
```

### use tidymodels
#### generate training and validation dataset
```{r}
set.seed(123)
dat_split <- initial_split(dat_use_df)
dat_train <- training(dat_split)
dat_val <- testing(dat_split)

```

#### train model
```{r}
model <- rand_forest(mtry = 200, 
                     trees = 1400, 
                     min_n = 15, 
                     mode = "regression") %>%
  set_engine("ranger", 
             importance = "impurity", 
             num.threads = 3) %>%
  fit(pseudotime ~ ., data = dat_train)

```


#### Evaluate model with validation set

```{r}
val_results <- dat_val %>% 
  mutate(estimate = predict(model, .[,-1]) %>% pull()) %>% 
  dplyr::select(truth = pseudotime, estimate)

```


#### Identify top 20 genes
```{r}
var_imp <- sort(model$fit$variable.importance, decreasing = TRUE)
top_genes <- names(var_imp)[1:40]


```


#### Plot top 40 genes
```{r, fig.height = 80, fig.width = 6}
par(mfrow = c(3, 2))
for (i in seq_along(top_genes)) {
  colors <- pal[cut(dat_use[,top_genes[i]], breaks = 100)]
  plot(slingReducedDim(sds), col = colors, 
       pch = 16, cex = 0.5, main = top_genes[i])
  lines(SlingshotDataSet(sds), lwd = 2, col = 'black', type = 'lineages')
}

```





# Perform pseudotime analysis separately for each tissue


## BA4
```{r}
Idents(nad_ol)<-"Tissue"
ba4<-subset(nad_ol, ident = "BA4")

```



```{r}

sds_ba4 <- slingshot(Embeddings(ba4, "umap"), 
                 clusterLabels = ba4$ol_clusters_named, 
                 start.clus = "OPC_B", stretch = 0)


plot(slingReducedDim(sds_ba4), col = cols[cl], pch = 16, cex = 0.5)
lines(SlingshotDataSet(sds_ba4), lwd = 2, type = 'lineages', col = 'black')



```



### Differential gene expression along curves

#### Use random forest
##### Get top highly variable genes
```{r}
top_hvg <- HVFInfo(ba4) %>% 
  mutate(., bc = rownames(.)) %>% 
  arrange(desc(variance.standardized)) %>% 
  top_n(300, variance.standardized) %>% 
  pull(bc)
```

##### Prepare data for random forest
```{r}
dat_use <- t(GetAssayData(ba4, slot = "data")[top_hvg,])
dat_use_df <- cbind(slingPseudotime(sds_ba4)[,2], dat_use) # Do curve 2, so 2nd columnn
colnames(dat_use_df)[1] <- "pseudotime"
dat_use_df <- as.data.frame(dat_use_df[!is.na(dat_use_df[,1]),])
```




##### use tidymodels
###### generate training and validation dataset
```{r}
set.seed(123)
dat_split <- initial_split(dat_use_df)
dat_train <- training(dat_split)
dat_val <- testing(dat_split)

```

###### train model
```{r}
model <- rand_forest(mtry = 200, 
                     trees = 1400, 
                     min_n = 15, 
                     mode = "regression") %>%
  set_engine("ranger", 
             importance = "impurity", 
             num.threads = 3) %>%
  fit(pseudotime ~ ., data = dat_train)

```


###### Evaluate model with validation set

```{r}
val_results <- dat_val %>% 
  mutate(estimate = predict(model, .[,-1]) %>% pull()) %>% 
  dplyr::select(truth = pseudotime, estimate)

```


###### Identify top 20 genes
```{r}
var_imp <- sort(model$fit$variable.importance, decreasing = TRUE)
top_genes <- names(var_imp)[1:20]
```


###### Plot top 20 genes
```{r, fig.height = 20, fig.width = 6}
par(mfrow = c(3, 2))
for (i in seq_along(top_genes)) {
  colors <- pal[cut(dat_use[,top_genes[i]], breaks = 100)]
  plot(reducedDim(sds_ba4), col = colors, 
       pch = 16, cex = 0.5, main = top_genes[i])
  lines(sds, lwd = 2, col = 'black', type = 'lineages')
}

```







## CB
```{r}
Idents(nad_ol)<-"Tissue"
cb<-subset(nad_ol, ident = "CB")

```



```{r}

sds_cb <- slingshot(Embeddings(cb, "umap"), 
                 clusterLabels = cb$ol_clusters_named, 
                 start.clus = "OPC_A", stretch = 0)


cl_cb <-cb$ol_clusters_named
cols_cb <- mycoloursP[6:40]

plot(slingReducedDim(sds_cb), col = cols_cb[cl_cb], pch = 16, cex = 0.5)
lines(SlingshotDataSet(sds_cb), lwd = 2, type = 'lineages', col = 'black')

```




### Differential gene expression along curves

#### Use random forest
##### Get top highly variable genes
```{r}
top_hvg <- HVFInfo(cb) %>% 
  mutate(., bc = rownames(.)) %>% 
  arrange(desc(variance.standardized)) %>% 
  top_n(300, variance.standardized) %>% 
  pull(bc)
```

##### Prepare data for random forest
```{r}
dat_use <- t(GetAssayData(cb, slot = "data")[top_hvg,])
dat_use_df <- cbind(slingPseudotime(sds_cb)[,2], dat_use) # Do curve 2, so 2nd columnn
colnames(dat_use_df)[1] <- "pseudotime"
dat_use_df <- as.data.frame(dat_use_df[!is.na(dat_use_df[,1]),])
```

##### use tidymodels
###### generate training and validation dataset
```{r}
set.seed(123)
dat_split <- initial_split(dat_use_df)
dat_train <- training(dat_split)
dat_val <- testing(dat_split)

```

###### train model
```{r}
model <- rand_forest(mtry = 200, 
                     trees = 1400, 
                     min_n = 15, 
                     mode = "regression") %>%
  set_engine("ranger", 
             importance = "impurity", 
             num.threads = 3) %>%
  fit(pseudotime ~ ., data = dat_train)

```


###### Evaluate model with validation set

```{r}
val_results <- dat_val %>% 
  mutate(estimate = predict(model, .[,-1]) %>% pull()) %>% 
  dplyr::select(truth = pseudotime, estimate)

```


###### Identify top 20 genes
```{r}
var_imp <- sort(model$fit$variable.importance, decreasing = TRUE)
top_genes <- names(var_imp)[1:20]
```


###### Plot top 20 genes
```{r, fig.height = 20, fig.width = 6}
par(mfrow = c(3, 2))
for (i in seq_along(top_genes)) {
  colors <- pal[cut(dat_use[,top_genes[i]], breaks = 100)]
  plot(reducedDim(sds_cb), col = colors, 
       pch = 16, cex = 0.5, main = top_genes[i])
  lines(sds, lwd = 2, col = 'black', type = 'lineages')
}

```





## CSC
```{r}
Idents(nad_ol)<-"Tissue"
csc<-subset(nad_ol, ident = "CSC")

```



```{r}

sds_csc <- slingshot(Embeddings(csc, "umap"), 
                 clusterLabels = csc$ol_clusters_named, 
                 start.clus = "OPC_A", stretch = 0)


cl_csc <-csc$ol_clusters_named
cols_csc <- mycoloursP[6:40]

plot(slingReducedDim(sds_csc), col = cols_csc[cl_csc], pch = 16, cex = 0.5)
lines(SlingshotDataSet(sds_csc), lwd = 2, type = 'lineages', col = 'black')


```




### Differential gene expression along curves

#### Use random forest
##### Get top highly variable genes
```{r}
top_hvg <- HVFInfo(csc) %>% 
  mutate(., bc = rownames(.)) %>% 
  arrange(desc(variance.standardized)) %>% 
  top_n(300, variance.standardized) %>% 
  pull(bc)
```

##### Prepare data for random forest
```{r}
dat_use <- t(GetAssayData(csc, slot = "data")[top_hvg,])
dat_use_df <- cbind(slingPseudotime(sds_csc)[,2], dat_use) # Do curve 2, so 2nd columnn
colnames(dat_use_df)[1] <- "pseudotime"
dat_use_df <- as.data.frame(dat_use_df[!is.na(dat_use_df[,1]),])
```

##### use tidymodels
###### generate training and validation dataset
```{r}
set.seed(123)
dat_split <- initial_split(dat_use_df)
dat_train <- training(dat_split)
dat_val <- testing(dat_split)

```

###### train model
```{r}
model <- rand_forest(mtry = 200, 
                     trees = 1400, 
                     min_n = 15, 
                     mode = "regression") %>%
  set_engine("ranger", 
             importance = "impurity", 
             num.threads = 3) %>%
  fit(pseudotime ~ ., data = dat_train)

```


###### Evaluate model with validation set

```{r}
val_results <- dat_val %>% 
  mutate(estimate = predict(model, .[,-1]) %>% pull()) %>% 
  dplyr::select(truth = pseudotime, estimate)

```


###### Identify top 20 genes
```{r}
var_imp <- sort(model$fit$variable.importance, decreasing = TRUE)
top_genes <- names(var_imp)[1:20]
```


###### Plot top 20 genes
```{r, fig.height = 20, fig.width = 6}
par(mfrow = c(3, 2))
for (i in seq_along(top_genes)) {
  colors <- pal[cut(dat_use[,top_genes[i]], breaks = 100)]
  plot(reducedDim(sds_csc), col = colors, 
       pch = 16, cex = 0.5, main = top_genes[i])
  lines(sds, lwd = 2, col = 'black', type = 'lineages')
}

```




# Save objects
```{r}
dir.create("/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/slingshot_data")

saveRDS(sds, "/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/slingshot_data/slingshot_full_data_OPC_B.RDS")
saveRDS(sds_b, "/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/slingshot_data/slingshot_full_data_OPC_A.RDS")
saveRDS(sds_ba4, "/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/slingshot_data/slingshot_BA4.RDS")
saveRDS(sds_cb, "/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/slingshot_data/slingshot_CB.RDS")
saveRDS(sds_csc, "/Users/lseeker/Documents/Work/HumanCellAtlas/2021_oligos_out/slingshot_data/slingshot_CSC.RDS")

```


# Session info 
```{r}

sessionInfo()
```