scr/oligodendroglia/Integration with Pasca O4+ dataset.Rmd · CorticalOrganoids

---
title: "Integration with Pasca O4+ dataset"
author: "Nina-Lydia Kazakou"
date: "06/07/2021"
output: html_document
---

# Load libraries
```{r}
library(GEOquery)
library(Seurat)
library(tidyverse)
library(ggplot2)
library(dplyr)
library(ggsci)
library(here)
```

# Set the colour pallete 
```{r}
mypal <- pal_npg("nrc", alpha = 0.7)(10)
mypal2 <-pal_tron("legacy", alpha = 0.7)(7)
mypal3 <- pal_lancet("lanonc", alpha = 0.7)(9)
mypal4 <- pal_simpsons(palette = c("springfield"), alpha = 0.7)(16)
mypal5 <- pal_rickandmorty(palette = c("schwifty"), alpha = 0.7)(6)
mypal6 <- pal_futurama(palette = c("planetexpress"), alpha = 0.7)(5)
mypal7 <- pal_startrek(palette = c("uniform"), alpha = 0.7)(5)
mycoloursP<- c(mypal, mypal2, mypal3, mypal4, mypal5, mypal6, mypal7)
```

# Download data 
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE115011
```{r}
getGEOSuppFiles("GSE115011")
```

# Create suerat.object
```{r}
csvfile <- here("Materials", "GSE115011", "GSE115011_marton_all_cells.csv")

#Error duplicate 'row.names' are not allowed
#Save with no row.names
if (exists("csvfile")) {
raw_counts <- read.table(file = csvfile , header = TRUE, sep = ',', row.names = NULL)
}

if (exists("tsvfile")) {
raw_counts <- read.table(file = tsvfile , header = TRUE, sep = '\t', row.names = NULL)
}

#Check the dimension and format
dim(raw_counts)
raw_counts[1:5, 1:3]

# Which names are duplicated?
name_duplicate <- raw_counts[which(duplicated(raw_counts$Gene.name)),1] 
length(name_duplicate) # number of gene name duplicates (52 in this case)

# Which are exactly all the raw duplicated?
row_duplicate <- raw_counts[which(duplicated(raw_counts)),1]
length(row_duplicate) # number of exactly all raw duplicated (10)

# You might want to save these name_duplicate and row_duplicate genes

# Delete the ones that are exactly the same 
# and change name to the ones duplicated but with different values.
raw_counts <- unique(raw_counts)
dim(raw_counts) #Should have (10) less than before
Gene_name <- make.unique(as.character(raw_counts$Gene.name))

# Reassign the rownames setted to NULL unil now to the new Gene names
rownames(raw_counts) <- Gene_name
raw_counts <- raw_counts[,-1] # get rid of old names


exp_GSE115011 <- CreateSeuratObject(counts = raw_counts)
exp_GSE115011
```

# Find variable features
```{r fig.width=8, fig.height=6, fig.fullwidth=TRUE}
exp_GSE115011 <- FindVariableFeatures(exp_GSE115011, 
                               selection.method = "vst", 
                               nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(exp_GSE115011), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(exp_GSE115011, cols = mycoloursP[4:5])
plot2 <- LabelPoints(plot = plot1, 
                     points = top10, 
                     repel = TRUE)
plot2 
```

# Scale all genes
```{r}
all_genes <- rownames(exp_GSE115011)
exp_GSE115011 <- ScaleData(exp_GSE115011, features = all_genes)
```

# Linear dimension reduction
```{r}
exp_GSE115011 <- RunPCA(exp_GSE115011, 
                       features = VariableFeatures(object = exp_GSE115011), 
                       npcs = 100)

ElbowPlot(exp_GSE115011, ndims = 100)

exp_GSE115011 <- FindNeighbors(exp_GSE115011, dims = 1:80)
exp_GSE115011 <- FindClusters(exp_GSE115011, resolution = 0.5)

exp_GSE115011 <- RunUMAP(exp_GSE115011, dims = 1:80)

exp_GSE115011 <- RunTSNE(exp_GSE115011, dims = 1:80, check_duplicates = FALSE)

```

```{r fig.width=6, fig.height=5, fig.fullwidth=TRUE}
# Plots
DimPlot(exp_GSE115011, reduction = "umap", label = T, cols = mycoloursP[6:40], pt.size = 1) & NoAxes()

DimPlot(exp_GSE115011, reduction = "tsne", label = T, cols = mycoloursP[6:40], pt.size = 1) & NoAxes()
```

```{r fig.width=6, fig.height=15, fig.fullwidth=TRUE}
FeaturePlot(exp_GSE115011, features = c("OLIG1", "OLIG2", "SOX10", "PDGFRA", "PCDH15", "PTPRZ1",
                                        "CSPG4", "GPR17", "MYRF", "MYC", "BMP4", "NKX2-2", "MAL", 
                                        "MOG", "MBP", "PLP1", "CNP", "BCAN", "NEU4"), ncol = 2) & NoAxes()
```

```{r fig.width=6, fig.height=8, fig.fullwidth=TRUE}
FeaturePlot(exp_GSE115011, reduction = "umap", features = c("OPALIN", "KLK6", "RBFOX1", "SPARC", 
                                                            "SPARCL1", "DLL1", "EGFR", "GFAP"), ncol = 2) & NoAxes()
```

```{r}
saveRDS(exp_GSE115011, here("Materials", "GSE115011", file = "exp_GSE115011.rds"))
```

# Load objects object or take from the environment 
```{r}
exp_GSE115011 <- readRDS(here("Materials", "GSE115011", "exp_GSE115011.rds"))

alt.co.oligos <- readRDS(here("data", "Oligos_only", "alt.co.oligos.rds"))
head(alt.co.oligos@meta.data,3)

# CleanUp the meta.data
keep.metadata.co <- c("orig.ident", "Cells", "Sample", "Phase", "nFeature_RNA", "detected", "sum", "total", "sizeFactor", "scDblFinder.class", "percent.mito", "RNA_snn_res.1", "scSorter_predID", "Origins", "nCount_RNA", "log10GenesPerUMI", "Chemistry", "ManualClusters", "ClusterID")
alt.co.oligos@meta.data <- alt.co.oligos@meta.data[, keep.metadata.co]
head(alt.co.oligos@meta.data, 2)
# Indicate that the counts and features are from the previous object with all the celltypes

# Note: I want to keep columns present in both objects with the same names, i.e. "orig.ident", "nCount_RNA", "nFeature_RNA", "Cells", "percent.mito"
alt.co.oligos$NK_detected <- alt.co.oligos$detected
alt.co.oligos$NK_total <- alt.co.oligos$total
alt.co.oligos$NK_Sample <- alt.co.oligos$Sample
alt.co.oligos$NK_Phase <- alt.co.oligos$Phase
alt.co.oligos$NK_sum <- alt.co.oligos$sum
alt.co.oligos$NK_sizeFactor <- alt.co.oligos$sizeFactor
alt.co.oligos$NK_scDblFinder.class <- alt.co.oligos$scDblFinder.class
alt.co.oligos$NK_RNA_snn_res.1 <- alt.co.oligos$RNA_snn_res.1
alt.co.oligos$NK_scSorter_predID <- alt.co.oligos$scSorter_predID
alt.co.oligos$NK_OriginPopulation <- alt.co.oligos$Origins
alt.co.oligos$NK_log10GenesPerUMI <- alt.co.oligos$log10GenesPerUMI
alt.co.oligos$NK_Chemistry <- alt.co.oligos$Chemistry
alt.co.oligos$NK_ManualClusters <- alt.co.oligos$ManualClusters
alt.co.oligos$NK_ClusterID <- alt.co.oligos$ClusterID


alt.co.oligos$detected <- NULL
alt.co.oligos$total <- NULL
alt.co.oligos$Sample <- NULL
alt.co.oligos$Phase <- NULL
alt.co.oligos$sum <- NULL
alt.co.oligos$sizeFactor <- NULL
alt.co.oligos$scDblFinder.class <- NULL
alt.co.oligos$RNA_snn_res.1 <- NULL
alt.co.oligos$scSorter_predID <- NULL
alt.co.oligos$Origins <- NULL
alt.co.oligos$log10GenesPerUMI <- NULL
alt.co.oligos$Chemistry <- NULL
alt.co.oligos$ManualClusters <- NULL
alt.co.oligos$ClusterID <- NULL

head(alt.co.oligos)

keep.metadata.exp_GSE115011 <- c("orig.ident", "nFeature_RNA", "nCount_RNA")
exp_GSE115011@meta.data <- exp_GSE115011@meta.data[, keep.metadata.exp_GSE115011]

# Calculate the percentage of mitochondrial genes
exp_GSE115011[["percent.mito"]] <- PercentageFeatureSet(exp_GSE115011, pattern = "^MT-")
# Label the cells origin
exp_GSE115011@meta.data$Cells <- "exp_GSE115011"
head(exp_GSE115011)
```

# Add a dataset column
```{r}
exp_GSE115011@meta.data$dataset <- "Marton et al., 2019"

alt.co.oligos@meta.data$dataset <- "Kazakou"
```

# Integrate datasets
```{r}
# Create a list of objects
srt_list <- list(alt.co.oligos, exp_GSE115011)

# normalize and identify variable features for each dataset independently
srt_list <- lapply(X = srt_list, FUN = function(x) {
    x <- NormalizeData(x)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

# select features that are repeatedly variable across datasets for integration
var_features <- SelectIntegrationFeatures(object.list = srt_list)

# create anchors to integrate the datasets
anchors <- FindIntegrationAnchors(object.list = srt_list, anchor.features = var_features, k.anchor = 5, k.filter = NA) #for small datasets  

# create an 'integrated' data assay 
ol_comb <- IntegrateData(anchorset = anchors, dims = 1:20)
```

# Run the standard workflow for visualisation & clustering
```{r}
DefaultAssay(ol_comb) <- "integrated"
Idents(ol_comb) <- "dataset"

# Scale data 
all_genes <- rownames(ol_comb)
ol_comb <- ScaleData(ol_comb, features = all_genes)

# Dimentionality reduction
ol_comb <- RunPCA(ol_comb, npcs = 30, verbose = FALSE)

# Choose the mumber of PCs
ElbowPlot(ol_comb, ndims = 50) #20

# Clustering
ol_comb <- RunUMAP(ol_comb, reduction = "pca", dims = 1:20)
ol_comb <- FindNeighbors(ol_comb, reduction = "pca", dims = 1:20)
ol_comb <- FindClusters(ol_comb, resolution = c(0.05, 0.1, 0.3, 0.5, 0.7, 0.8, 1.0))

colnames(ol_comb@meta.data)
```

# Visualization 
```{r}
DimPlot(ol_comb, pt.size = 1, cols = mycoloursP[5:40]) & NoAxes()
DimPlot(ol_comb, pt.size = 1, group.by = "dataset", cols = mycoloursP[5:40]) & NoAxes()
DimPlot(ol_comb, group.by = "dataset", split.by = "dataset", pt.size = 1, cols = mycoloursP[5:40]) & NoAxes()

DimPlot(ol_comb, group.by = "NK_RNA_snn_res.1", pt.size = 1, cols = mycoloursP[15:40], label = TRUE) & NoAxes() & NoLegend()
DimPlot(ol_comb, group.by = "NK_RNA_snn_res.1", split.by = "dataset", pt.size = 1, cols = mycoloursP[15:40], label = TRUE) & NoAxes() & NoLegend()
```

```{r fig.width=8, fig.height=10, fig.fullwidth=TRUE}
DefaultAssay(ol_comb) <- "RNA"

FeaturePlot(ol_comb, reduction = "umap", pt.size = 1, features = c("PDGFRA", "MBP", "PLP1", "MAG", "MOG", "MYRF", "GPR17", "EGFR", "SPARC"), ncol = 2) & NoAxes()
```


```{r}
saveRDS(ol_comb, here("Materials", "GSE115011", file = "ol_comb_alt.co.oligos_pasca.rds"))
```


# Analyse integrated datasets
```{r}
# Generate DimPlots for all the different resolutions
dir.create(here("Materials", "GSE115011", "Integration_Analysis", "DimPlots_allRes"))

plot_list_func <- function(seur_obj,
                           col_pattern, 
                           plot_cols, 
                           clust_lab = TRUE,
                           label_size = 8,
                           num_col = 4,
                           save_dir = getwd(),
                           width=7,
                           height=5){
  extr_res_col <- grep(pattern = col_pattern, names(seur_obj@meta.data))
  res_names <- names(seur_obj@meta.data[extr_res_col])
  # gtools function, sorts gene_names alphanumeric:
  res_names <- mixedsort(res_names) 
  plot_l <-list()
  for(i in 1: length(res_names)){
    pdf(paste0(save_dir, 
               res_names[i], "_umap.pdf"), width=width, height=height)
    dim_plot <- DimPlot(seur_obj, 
                        reduction = "umap", 
                        cols= plot_cols,
                        group.by = res_names[i],
                        label = clust_lab,
                        label.size = label_size) + NoLegend()
    print(dim_plot)
    dev.off()
    print(dim_plot)
  }
}

plot_list_func(seur_obj = ol_comb,
               col_pattern = "integrated_snn_res.",
               plot_cols = mycoloursP[6:40],
               save_dir = here("Materials", "GSE115011", "Integration_Analysis", "DimPlots_allRes"))

```

## Run approximation models
# Load libraries
```{r}
library(SingleCellExperiment)
library(devtools)
library(Matrix)
library(scales)
library(clustree)
library(gtools)
library(cluster)
library(bluster)
```

```{r fig.width=9, fig.height=10, fig.fullwidth=TRUE}
 # Plot a ClusterTree
dir.create(here("Materials", "GSE115011", "Integration_Analysis", "ClusterTree"))

#For ploting all the resolutions
clust_tree <- clustree(ol_comb, prefix = "integrated_snn_res.", edge_arrow=FALSE) # Maybe res.0.6?

print(clust_tree)
```

Notes: In the lowest resolution (0.0.5), Cluster 0 seems to be progenitor cells, while cluster 1 corresponds to mature OL.
```{r}
Idents(ol_comb) <- "integrated_snn_res.0.05"

DefaultAssay(ol_comb) <- "RNA"

DimPlot(ol_comb)

FeaturePlot(ol_comb, features = c("OLIG2", "SOX10", "PDGFRA", "MBP"), label = T)
```

```{r fig.width=10, fig.height=8, fig.fullwidth=TRUE}
 # Approximate Silhouette Plots
dir.create(here("Materials", "GSE115011", "Integration_Analysis", "Silhouette_Plots"))
dir.create(here("Materials", "GSE115011", "Integration_Analysis", "ClusterPurity_Plots"))

ol_comb_sce <- as.SingleCellExperiment(ol_comb)

#1.Approximate Silhouette
approx_sil <- function(sce_obj, reduction = "PCA",
                           col_pattern = "integrated_snn_res.", 
                           plot_cols, 
                           clust_lab = TRUE,
                           label_size = 8,
                           save_dir = getwd(),
                           width=7,
                           height=5){
  res_col <- grep(pattern = col_pattern, names(colData(sce_obj)))
  names_col <- names(colData(sce_obj))[res_col]
  # gtools function, sorts gene_names alphanumeric:
  names_col <- mixedsort(names_col)
  met_dat <- as.data.frame(colData(ol_comb_sce))
  for(i in 1: length(names_col)){
    clust <- met_dat[[names_col[i]]]
    clust_int <- as.integer(paste0(clust))
    
    sil_approx <- approxSilhouette(reducedDim(sce_obj, reduction), 
                                   clusters = clust_int)
    sil_data <- as.data.frame(sil_approx)
    sil_data$closest <- factor(ifelse(sil_data$width > 0, clust_int, sil_data$other))
    sil_data$cluster <- factor(clust_int)
    
    apr_sil_plot <-ggplot(sil_data, aes(x=cluster, y=width, colour=closest)) + 
    ggbeeswarm::geom_quasirandom(method="smiley") + theme_bw(20) +
      xlab(names_col[i])
    
    
  pdf(paste0(save_dir, 
               names_col[i], "_sil.pdf"), width=width, height=height)
  
 
  print(apr_sil_plot)
  
  dev.off()
  
  print(apr_sil_plot)
  }
  print("Done")
}

approx_sil(sce_obj = ol_comb_sce, 
                          col_pattern = "integrated_snn_res.",
                          plot_cols = mycoloursP[6:40],
                          save_dir =here("Materials", "GSE115011", "Integration_Analysis", "Silhouette_Plots"))


#2.Cluster Purity
clu_pure <- function(sce_obj, reduction = "PCA",
                           col_pattern = "integrated_snn_res.", 
                           plot_cols, 
                           clust_lab = TRUE,
                           label_size = 8,
                           save_dir = getwd(),
                           width=7,
                           height=5){
  res_col <- grep(pattern = col_pattern, names(colData(sce_obj)))
  names_col <- names(colData(sce_obj))[res_col]
  # gtools function, sorts gene_names alphanumeric:
  names_col <- mixedsort(names_col)
  met_dat <- as.data.frame(colData(ol_comb_sce))
  for(i in 1: length(names_col)){
    clust <- met_dat[[names_col[i]]]
    clust_int <- as.integer(paste0(clust))
    
    pure <- neighborPurity(reducedDim(sce_obj, reduction), clusters = clust_int)
    pure_data <- as.data.frame(pure)
    pure_data$maximum <- factor(pure_data$maximum)
    pure_data$cluster <- factor(clust_int)
    
    
    pure_plot <- ggplot(pure_data, aes(x=cluster, y=purity, colour=maximum)) + 
      ggbeeswarm::geom_quasirandom(method="smiley") +
      theme_bw(20) +
      xlab(names_col[i])
    
    
    pdf(paste0(save_dir, 
               names_col[i], "_sil.pdf"), width=width, height=height)
    
    print(pure_plot)
  
    dev.off()
    
    print(pure_plot)
  }
  print("Done")
}

clu_pure(sce_obj = ol_comb_sce, 
                          col_pattern = "integrated_snn_res.",
                          plot_cols = mycoloursP[6:40],
                          save_dir = here("Materials", "GSE115011", "Integration_Analysis", "ClusterPurity_Plots"))
```

Notes: The OL population in the ClusterTree seems to be stable in one cluster until resolution 0.7. After that a new mix of progenitors and OLs takes place. Based on the Purity plots, maybe resolution 0.5 would be better though.

# Cell Cycle Annotation
```{r}
# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat.  
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes

ol_comb <- CellCycleScoring(ol_comb, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)

head(ol_comb@meta.data)

DimPlot(ol_comb, group.by = "Phase", cols = mycoloursP, label = T)
DimPlot(ol_comb, split.by = "Phase", group.by = "dataset", cols = mycoloursP)
```

# Cluster Annotation for resultion 0.7
```{r}
Idents(ol_comb) <- "integrated_snn_res.0.7"
DimPlot(ol_comb, cols = mycoloursP, label = TRUE, pt.size = 1) & NoLegend()
DimPlot(ol_comb, cols = mycoloursP[10:40], group.by = "dataset") 
```


```{r fig.width=12, fig.height=5, fig.fullwidth=TRUE}
DimPlot(ol_comb, cols = mycoloursP, split.by = "NK_Sample", group.by = "integrated_snn_res.0.7", label = TRUE, pt.size = 1) & NoLegend()
```

```{r}
# Check that all samples contribute to the formed clusters
IDsPerCluster_ol_comb_0.7 <- as.data.frame(tapply(ol_comb@meta.data$dataset, 
                                      ol_comb@meta.data$integrated_snn_res.0.7, 
                                      function(x) length(unique(x)) ))
names(IDsPerCluster_ol_comb_0.7) <- "NumberOfIDs_ol_comb_0.7"
IDsPerCluster_ol_comb_0.7$NumberOfIDs_ol_comb_0.7 <- rownames(IDsPerCluster_ol_comb_0.7)
IDsPerCluster_ol_comb_0.7$Cluster <- rownames(IDsPerCluster_ol_comb_0.7)
IDsPerCluster_ol_comb_0.7

# Identify the number of cells per cluster
n_cells_ol_comb_0.7 <- FetchData(ol_comb, 
                     vars = c("integrated_snn_res.0.7", "dataset")) %>%
  dplyr::count(integrated_snn_res.0.7, dataset) %>%
  tidyr::spread(integrated_snn_res.0.7, n)

n_cells_ol_comb_0.7

dir.create(here("Materials", "GSE115011", "Integration_Analysis", "Integrated_Res.0.7"))

write.csv(n_cells_ol_comb_0.7, here("Materials", "GSE115011", "Integration_Analysis", "Integrated_Res.0.7", file = "no.ofCellsperCluster_integrated_snn_res.0.7.csv"))
```


```{r}
DefaultAssay(ol_comb) <- "RNA"
```


```{r fig.width=7, fig.height=6, fig.fullwidth=TRUE}
VlnPlot(ol_comb, features = c("OLIG1", "OLIG2", "SOX10", "PPP1R16B"), cols = mycoloursP, pt.size = 0.2, ncol = 2)
# Clusters 1 & 7 are the OL clusters
# Clusters 2 & 5 seem to be really early?
```

```{r fig.width=7, fig.height=6, fig.fullwidth=TRUE}
VlnPlot(ol_comb, features = c("PDGFRA", "PCDH15", "SOX6"), cols = mycoloursP, pt.size = 0.2, ncol = 2)
# Clusters 0, 3, 4, & maybe 6 are progenitor cells
# Could Cluster 2 be COPs? and maybe 6 too?
```

```{r fig.width=7, fig.height=8, fig.fullwidth=TRUE}
VlnPlot(ol_comb, features = c("GPR17", "MYC", "NKX2-2", "MYRF", "CLDN11"), cols = mycoloursP, pt.size = 0.2, ncol = 2)
# Clusters 0 & 6 seem to be something like COPs
```


```{r fig.width=7, fig.height=8, fig.fullwidth=TRUE}
VlnPlot(ol_comb, features = c("MBP", "CNP", "PLP1", "MOG", "MAG", "OSP", "MPZ"), cols = mycoloursP, pt.size = 0.2, ncol = 2)
# Clusters 4 & 6 are really MBP positive
```

```{r fig.width=7, fig.height=6, fig.fullwidth=TRUE}
VlnPlot(ol_comb, features = c("DLL1", "DLL3", "EGFR"), cols = mycoloursP, pt.size = 0.2, ncol = 2)
```


```{r fig.width=7, fig.height=6, fig.fullwidth=TRUE}
VlnPlot(ol_comb, features = c("HOPX", "SPARCL1", "SPARC"), pt.size = 0.2, ncol = 2, cols = mycoloursP)
```
 
```{r fig.width=7, fig.height=3, fig.fullwidth=TRUE}
VlnPlot(ol_comb, features = c("GFAP", "SOX9"), cols = mycoloursP, pt.size = 0.2)
```

```{r fig.width=7, fig.height=3, fig.fullwidth=TRUE}
VlnPlot(ol_comb, features = c("MKI67", "TOP2A"), cols = mycoloursP, pt.size = 0.2)
```


```{r fig.width=7, fig.height=12, fig.fullwidth=TRUE}
FeaturePlot(ol_comb, features = c("OPALIN", "RBFOX1", "SPARC", "NELL1", "PAX3", "FMN1", "LAMA2", "SLC22A3"), pt.size = 0.7, ncol = 2, label = TRUE)
VlnPlot(ol_comb, features = c("OPALIN", "RBFOX1", "SPARC", "NELL1", "PAX3", "FMN1", "LAMA2", "SLC22A3"), pt.size = 0.2, cols = mycoloursP, ncol = 2)
```


```{r fig.width=15, fig.height=8, fig.fullwidth=TRUE}
DotPlot(ol_comb, features = c("OLIG1", "OLIG2", "SOX10", "PPP1R16B", "PDGFRA", "PCDH15", "SOX6", "GPR17", "MYC", "NKX2-2", "MYRF", "CLDN11", "MBP", "CNP", "PLP1", "MOG", "MAG", "MPZ", "DLL1", "DLL3", "EGFR","HOPX", "SPARCL1", "SPARC", "GFAP", "SOX9", "OPALIN", "RBFOX1", "NELL1", "PAX3", "FMN1", "LAMA2", "SLC22A3"), cols = mycoloursP) + FontSize(7)
```

Notes: Cluster 7: Maturest OL (FMN1+), Cluster 1: Mature OL , Cluster 2: Pri-OPCs, Cluster 5: OAPC, Cluster 0: Typical OPCs, Cluster 4:?, Cluster 6?:

# DE analysis 

```{r}
# All resolutions
int_res_all_mark <- function(seur_obj, 
                             int_cols,
                             only_pos = TRUE,
                             min_pct = 0.25,
                             logfc_threshold = 0.25,
                             fil_pct_1 = 0.25,
                             fil_pct_2 = 0.6,
                             avg_log = 1.2,
                             save_dir = getwd(),
                             test_use = "MAST"
                             ){
  for(i in 1:length(int_cols)){
    Idents(seur_obj) <- int_cols[i]
    all_mark <- FindAllMarkers(seur_obj, 
                               only.pos = only_pos, 
                               min.pct = min_pct, 
                               logfc.threshold = logfc_threshold,
                               test.use = test_use)
    fil_mark<- subset(all_mark, 
                      all_mark$pct.1 > fil_pct_1 & 
                        all_mark$pct.2 < fil_pct_2 )
    
    write.csv(all_mark, paste(save_dir, "/all_mark", int_cols[i], ".csv", sep = "" ))
    write.csv(fil_mark, paste(save_dir, "/fil_mark", int_cols[i], ".csv", sep = "" ))
    
  }
}

dir.create(here("Materials", "GSE115011", "Integration_Analysis", "DE_Analysis"))

int_res_all_mark(ol_comb, 
                 int_cols = c("integrated_snn_res.0.05",
                              "integrated_snn_res.0.1",
                              "integrated_snn_res.0.3",
                              "integrated_snn_res.0.5",
                              "integrated_snn_res.0.7",
                              "integrated_snn_res.0.8",
                              "integrated_snn_res.1.0"),
                 save_dir = here("Materials", "GSE115011", "Integration_Analysis", "DE_Analysis"))
```

```{r}
fil.de.markers.integrated_snn_res.0.7 <- read.csv(here("Materials", "GSE115011", "Integration_Analysis", "DE_Analysis", "fil_markintegrated_snn_res.0.7.csv"))
head(fil.de.markers.integrated_snn_res.0.7)

 # Identify the top10 genes
top10_integrated_snn_res.0.7 <- fil.de.markers.integrated_snn_res.0.7 %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC) #subset for the top10 genes
top10_integrated_snn_res.0.7

 # Calculate the cluster averages to plot those as well
cluster_averages_integrated_snn_res.0.7 <- AverageExpression(ol_comb, group.by = "integrated_snn_res.0.7",
                                      return.seurat = TRUE)

cluster_averages_integrated_snn_res.0.7@meta.data$cluster <- factor(levels(as.factor(ol_comb@meta.data$integrated_snn_res.0.7)), 
                                             levels = c(0, 1, 2, 3, 4, 5, 6, 7))
```

```{r fig.width=7, fig.height=12, fig.fullwidth=TRUE}
DefaultAssay(ol_comb) <- "integrated"

DoHeatmap(ol_comb, features = top10_integrated_snn_res.0.7$gene, label = TRUE, group.by = "integrated_snn_res.0.7", group.colors = mycoloursP[6:40], draw.lines = FALSE, size = 3)

DoHeatmap(object = cluster_averages_integrated_snn_res.0.7, features = top10_integrated_snn_res.0.7$gene, label = TRUE, group.by = "cluster",  group.colors = mycoloursP[6:40], draw.lines = F)
```

# Cluster Annotation
```{r}
ol_comb@meta.data$ClusterID <- ol_comb@meta.data$integrated_snn_res.0.7
colnames(ol_comb@meta.data)

current.ids <- c("0", "1", "2", "3", "4", "5", "6", "7")
new.ids <- c("OPC", "mOL1", "pri-OPC", "Cyc", "imOL", "OAPC", "other OPC", "mOL2")
ol_comb@meta.data$ClusterID <- plyr::mapvalues(x = ol_comb@meta.data$ClusterID, from = current.ids, to = new.ids)

head(ol_comb@meta.data)
```


```{r}
saveRDS(ol_comb, here("Materials", "GSE115011", file = "ol_comb_alt.co.oligos_pasca.rds"))
```

# Plots
```{r}
Idents(ol_comb) <- "ClusterID"

DimPlot(ol_comb, split.by = "dataset", cols = mycoloursP[12:40], pt.size = 1, label = FALSE) & NoAxes()
```

```{r fig.width=7, fig.height=12, fig.fullwidth=TRUE}
DefaultAssay(ol_comb) <- "integrated"

cluster_averages_ClusterID <- AverageExpression(ol_comb, group.by = "ClusterID",
                                      return.seurat = TRUE)

cluster_averages_ClusterID@meta.data$HM <- factor(levels(as.factor(ol_comb@meta.data$ClusterID)), 
                                             levels = c("OPC", "mOL1", "pri-OPC", "Cyc", "imOL", "OPAC", "other OPC", "mOL2"))

DoHeatmap(object = cluster_averages_ClusterID, features = top10_integrated_snn_res.0.7$gene, label = TRUE, group.by = "HM",  group.colors = mycoloursP[6:40], draw.lines = F)

saveRDS(cluster_averages_ClusterID, here("Materials", "GSE115011", file = "cluster_averages_ClusterID.rds"))
```

```{r}
sessionInfo()
```