--- title: '[clem] Resolution Definition' author: "Nina-Lydia Kazakou" date: '2022-07-18' output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` Here, I will try to determine the best clustering resolution for my dataset. I will use a few different methods to help me define the best resolution, but I will also check for expression patterns. # Set-up ### Load libraries ```{r load_libraries, message=FALSE, warning=FALSE} library(SingleCellExperiment) library(Seurat) library(tidyverse) library(Matrix) library(scales) library(cowplot) library(RCurl) library(ggplot2) library(edgeR) library(dplyr) library(ggsci) library(here) library(gtools) ``` ### Colour Palette ```{r load_palette} mypal1 <- pal_npg("nrc", alpha = 0.7)(10) mypal2 <- pal_tron("legacy", alpha = 0.7)(7) mypal3 <- pal_lancet("lanonc", alpha = 0.7)(9) mypal4 <- pal_simpsons(palette = c("springfield"), alpha = 0.7)(16) mypal5 <- pal_rickandmorty(palette = c("schwifty"), alpha = 0.7)(6) mypal6 <- pal_futurama(palette = c("planetexpress"), alpha = 0.7)(5) mypal7 <- pal_startrek(palette = c("uniform"), alpha = 0.7)(5) mycoloursP<- c(mypal1, mypal2, mypal3, mypal4) ``` ### Load object ```{r load_object} srt <- readRDS(here("Data", "Human_R_Analysis", "Clemastine_DMSO", "clem_srt.rds")) ``` # Generate & Save DimPlots for all the calculated resolutions: ```{r generate_umap, eval=FALSE} plot_list_func <- function(seur_obj, col_pattern, plot_cols, clust_lab = TRUE, label_size = 8, num_col = 4, save_dir = getwd(), width=7, height=5){ extr_res_col <- grep(pattern = col_pattern, names(seur_obj@meta.data)) res_names <- names(seur_obj@meta.data[extr_res_col]) # gtools function, sorts gene_names alphanumeric: res_names <- mixedsort(res_names) plot_l <-list() for(i in 1: length(res_names)){ pdf(paste0(save_dir, res_names[i], "_umap.pdf"), width=width, height=height) dim_plot <- DimPlot(seur_obj, reduction = "umap", cols= plot_cols, group.by = res_names[i], label = clust_lab, label.size = label_size) + NoLegend() print(dim_plot) dev.off() print(dim_plot) } } dir.create(here("outs", "filter_70", "Human_R_Analysis", "Clemastine_DMSO", "DimPlots_AllRes")) dir.create(here("outs", "filter_70", "Human_R_Analysis", "Clemastine_DMSO", "DimPlots_AllRes", "UMAP")) plot_list_func(seur_obj = srt, col_pattern = "originalexp_snn_res.", plot_cols = mycoloursP[1:40], save_dir = here("outs", "filter_70", "Human_R_Analysis", "Clemastine_DMSO", "DimPlots_AllRes", "UMAP")) ``` ```{r generate_tSNE, eval=FALSE} plot_list_func <- function(seur_obj, col_pattern, plot_cols, clust_lab = TRUE, label_size = 8, num_col = 4, save_dir = getwd(), width=7, height=5){ extr_res_col <- grep(pattern = col_pattern, names(seur_obj@meta.data)) res_names <- names(seur_obj@meta.data[extr_res_col]) # gtools function, sorts gene_names alphanumeric: res_names <- mixedsort(res_names) plot_l <-list() for(i in 1: length(res_names)){ pdf(paste0(save_dir, res_names[i], "_tsne.pdf"), width=width, height=height) dim_plot <- DimPlot(seur_obj, reduction = "tsne", cols= plot_cols, group.by = res_names[i], label = clust_lab, label.size = label_size) + NoLegend() print(dim_plot) dev.off() print(dim_plot) } } dir.create(here("outs", "filter_70", "Human_R_Analysis", "Clemastine_DMSO", "DimPlots_AllRes", "tSNE")) plot_list_func(seur_obj = srt, col_pattern = "originalexp_snn_res.", plot_cols = mycoloursP[1:40], save_dir = here("outs", "filter_70", "Human_R_Analysis", "Clemastine_DMSO", "DimPlots_AllRes", "tSNE")) ``` # ClusterTree ttps://cran.r-project.org/web/packages/clustree/vignettes/clustree.html A clustering tree is different in that it visualises the relationships between at a range of resolutions. To build a clustering tree I need to look at how cells move as the clustering resolution is increased. Each cluster forms a node in the tree and edges are constructed by considering the cells in a cluster at a lower resolution (say k=2) that end up in a cluster at the next highest resolution (say k=3). By connecting clusters in this way we can see how clusters are related to each other, which are clearly distinct and which are unstable. ```{r message=FALSE, warning=FALSE} library(clustree) ``` ```{r} combined_clust_res <- cbind(k.0.1 = srt@meta.data$originalexp_snn_res.0.01, k.0.3 = srt@meta.data$originalexp_snn_res.0.03, k.0.5 = srt@meta.data$originalexp_snn_res.0.05, k.0.7 = srt@meta.data$originalexp_snn_res.0.07, k.1 = srt@meta.data$originalexp_snn_res.0.1, k.2 = srt@meta.data$originalexp_snn_res.0.2, k.3 = srt@meta.data$originalexp_snn_res.0.3, k.4 = srt@meta.data$originalexp_snn_res..0.4, k.5 = srt@meta.data$originalexp_snn_res.0.5, k.6 = srt@meta.data$originalexp_snn_res..0.6, k.7 = srt@meta.data$originalexp_snn_res..0.7, k.8 = srt@meta.data$originalexp_snn_res.0.8, k.9 = srt@meta.data$originalexp_snn_res.0.9, k.10 = srt@meta.data$originalexp_snn_res.1) ``` ```{r basic_clustertree, fig.height=10, fig.width=12, message=FALSE, warning=FALSE} clustree(combined_clust_res, prefix="k.", edge_arrow=FALSE) ``` ```{r eval=FALSE} dir.create(here("outs", "filter_70", "Human_R_Analysis", "Clemastine_DMSO", "ClusterTree")) clust_tree_01 <- clustree(combined_clust_res, prefix="k.", edge_arrow=FALSE) # Step 1: Call the pdf command to start the plot pdf(file = here("outs", "filter_70", "Human_R_Analysis", "Clemastine_DMSO", "ClusterTree", "ClusterTree_01.pdf"), width = 10, height = 10) # Step 2: Create the plot with R code print(clust_tree_01) # Step 3: Run dev.off() to create the file! dev.off() ``` In this plot, the size of each node is related to the number of samples in each cluster and the color indicates the clustering resolution. The edges are coloured according to the number of samples they represent and the transparency shows the incoming node proportion, the number of samples in the edge divided by the number of samples in the node it points to. ```{r clustree, fig.height=10, fig.width=10} clustree(srt, prefix = paste0("originalexp_snn_res."), exprs = c("data", "counts", "scale.data"), assay = NULL ) ``` ```{r eval=FALSE} clust_tree_02 <- clustree(srt, prefix = paste0("originalexp_snn_res."), exprs = c("data", "counts", "scale.data"), assay = NULL ) pdf(file = here("outs", "filter_70", "Human_R_Analysis", "Clemastine_DMSO", "ClusterTree", "ClusterTree_02.pdf"), width = 10, height = 10) print(clust_tree_02) dev.off() ``` In originalexp_snn_res.0.01 there is only 1 cluster, indicating the clustering start, while in originalexp_snn_res.0.03 I can already see two clusters (which are also spatially distinct in the UMAP), which probably represents the separation of progenitor and mature cells. It looks like the highest complexity resolution I could pick is originalexp_snn_res.1; this resolution does not look too complex either, although there seems to be a cell population exchange between clusters that branch separately very early on. Given the fact that the dataset contains only 710 cells, I will probably opt for a less complex resolution. originalexp_snn_res.0.4, originalexp_snn_res.0.5 & originalexp_snn_res.0.6 seem to be very similar, although having a reasonable level of complexion, thus I will examine originalexp_snn_res.0.4. I will also examine a more complex resolution, originalexp_snn_res.0.7. # Sihlouette; Silhouette coefficient The silhouette analysis measures how well an observation is clustered and it estimates the average distance between clusters. The silhouette plot displays a measure of how close each point in one cluster is to points in the neighboring clusters. For each observation i, the silhouette width sil is calculated as follows: - For each observation i, calculate the average dissimilarity ai between i and all other points of the cluster to which i belongs. - For all other clusters C, to which i does not belong, calculate the average dissimilarity d(i,C) of i to all observations of C. The smallest of these d(i,C) is defined as bi=minCd(i,C). The value of bi can be seen as the dissimilarity between i and its neighbor cluster, i.e., the nearest one to which it does not belong. - Finally the silhouette width of the observation i is defined by the formula: Si=(bi)/max(ai,bi). Silhouette width can be interpreted as follow: - Observations with a large Si (almost 1) are very well clustered. - A small Si (around 0) means that the observation lies between two clusters. - Observations with a negative Si are probably placed in the wrong cluster. ## Approximate Silhouette ApproxSil is useful for large datasets. It is used to to select the most stable clusters based on stats. It performs the calculations on the PC coordinates. The silhouette width is a general-purpose method for evaluating the separation between clusters but requires calculating the average distance between pairs of observations within or between clusters. ```{r} library(bluster) ``` ###### For this model to work, I will need to remove the lowest resolution, since there are not more that one clusters and thus the algorithm won't be able to calculate and plot the differences: ```{r remove_originalexp_snn_res.0.01} srt@meta.data$originalexp_snn_res.0.01 <- NULL ``` ###### I will also need to transform my srt to an sce: ```{r srt_to_sce} sce <- as.SingleCellExperiment(srt) ``` ```{r} clust <- as.integer(paste0(colData(sce)$originalexp_snn_res.0.03)) sil_approx <- approxSilhouette(reducedDim(sce, "PCA"), clusters=clust) ``` **approxSilhouette** approximates the average distances for faster computation in large datasets. For a given observation, let \tilde D be the approximate average distance to all cells in cluster X. This is defined as the square root of the sum of: *The squared distance from the current observation to the centroid of cluster X. This is most accurate when the observation is distant to X relative to the latter's variation.* The summed variance of all variables across observations in cluster X. This is most accurate when the observation lies close to the close to the centroid of X. This is also equivalent to the root-square-mean distance from the current observation to all cells in X. The approximate silhouette width for each cell can then be calculated with the relevant two values of \tilde D, computed by setting X to the cluster of the current cell or the closest other cluster. ```{r ApproxSil_width_example} sil_data <- as.data.frame(sil_approx) sil_data$closest <- factor(ifelse(sil_data$width > 0, clust, sil_data$other)) sil_data$cluster <- factor(clust) ggplot(sil_data, aes(x=cluster, y=width, colour=closest)) + ggbeeswarm::geom_quasirandom(method="smiley") + theme_bw(20) + xlab("Cluster (resolution 0.05)") ``` ```{r ApproxSil_purity_example} pure_0_1 <- neighborPurity(reducedDim(sce, "PCA"), clusters=clust) pure_data <- as.data.frame(pure_0_1) pure_data$maximum <- factor(pure_data$maximum) pure_data$cluster <- factor(clust) ggplot(pure_data, aes(x=cluster, y=purity, colour=maximum)) + ggbeeswarm::geom_quasirandom(method="smiley") + theme_bw(20) + xlab("Cluster (resolution 0.05)") ``` ```{r ApproxSil_width, eval=FALSE} approx_sil <- function(sce_obj, reduction = "PCA", col_pattern = "originalexp_snn_res.", plot_cols, clust_lab = TRUE, label_size = 8, save_dir = getwd(), width=7, height=5){ res_col <- grep(pattern = col_pattern, names(colData(sce_obj))) names_col <- names(colData(sce_obj))[res_col] # gtools function, sorts gene_names alphanumeric: names_col <- mixedsort(names_col) met_dat <- as.data.frame(colData(sce)) for(i in 1: length(names_col)){ clust <- met_dat[[names_col[i]]] clust_int <- as.integer(paste0(clust)) sil_approx <- approxSilhouette(reducedDim(sce_obj, reduction), clusters = clust_int) sil_data <- as.data.frame(sil_approx) sil_data$closest <- factor(ifelse(sil_data$width > 0, clust_int, sil_data$other)) sil_data$cluster <- factor(clust_int) apr_sil_plot <-ggplot(sil_data, aes(x=cluster, y=width, colour=closest)) + ggbeeswarm::geom_quasirandom(method="smiley") + theme_bw(20) + xlab(names_col[i]) pdf(paste0(save_dir, "_", names_col[i], "_AppoxSil_width.pdf"), width=width, height=height) print(apr_sil_plot) dev.off() print(apr_sil_plot) } print("Done") } ``` ```{r eval=FALSE} dir.create(here("outs", "filter_70", "Human_R_Analysis", "Clemastine_DMSO", "ApproxSil_width_Plots")) approx_sil(sce_obj = sce, col_pattern = "originalexp_snn_res.", plot_cols = mycoloursP[6:40], save_dir = here("outs", "filter_70", "Human_R_Analysis", "Clemastine_DMSO", "ApproxSil_width_Plots")) ``` ```{r ApproxSil_purity, eval=FALSE} clu_pure <- function(sce_obj, reduction = "PCA", col_pattern = "originalexp_snn_res.", plot_cols, clust_lab = TRUE, label_size = 8, save_dir = getwd(), width=7, height=5){ res_col <- grep(pattern = col_pattern, names(colData(sce_obj))) names_col <- names(colData(sce_obj))[res_col] # gtools function, sorts gene_names alphanumeric: names_col <- mixedsort(names_col) met_dat <- as.data.frame(colData(sce)) for(i in 1: length(names_col)){ clust <- met_dat[[names_col[i]]] clust_int <- as.integer(paste0(clust)) pure <- neighborPurity(reducedDim(sce_obj, reduction), clusters = clust_int) pure_data <- as.data.frame(pure) pure_data$maximum <- factor(pure_data$maximum) pure_data$cluster <- factor(clust_int) pure_plot <- ggplot(pure_data, aes(x=cluster, y=purity, colour=maximum)) + ggbeeswarm::geom_quasirandom(method="smiley") + theme_bw(20) + xlab(names_col[i]) pdf(paste0(save_dir, "_", names_col[i], "_AppoxSil_purity.pdf"), width=width, height=height) print(pure_plot) dev.off() print(pure_plot) } print("Done") } ``` ```{r eval=FALSE} dir.create(here("outs", "filter_70", "Human_R_Analysis", "Clemastine_DMSO", "ApproxSil_purity_Plots")) clu_pure(sce_obj = sce, col_pattern = "originalexp_snn_res.", plot_cols = mycoloursP[6:40], save_dir = here("outs", "filter_70", "Human_R_Analysis", "Clemastine_DMSO", "ApproxSil_purity_Plots")) ``` # Cluster marker expression ## Examine originalexp_snn_res.0.4 ```{r res.0.6} Idents(srt) <- "originalexp_snn_res.0.4" ``` ```{r Oligodendroglia} oligodendroglia <- c("OLIG1", "OLIG2", "SOX10") ``` ```{r Oligodendroglia_FeaturePlot} FeaturePlot(srt, reduction = "umap", features = oligodendroglia, pt.size = 1, label = TRUE, ncol = 2) FeaturePlot(srt, reduction = "tsne", features = oligodendroglia, pt.size = 1, label = TRUE, ncol = 2) ``` ```{r Oligodendroglia_VlnPlot} VlnPlot(srt, features = oligodendroglia, pt.size = 0.01, cols = mycoloursP, ncol = 2) ``` ```{r OPCs} OPCs <- c("PDGFRA", "BCAN", "PCDH15", "PTGDS", "PTPRZ1", "CSPG4", "GPR17") ``` ```{r OPCs_FeaturePlot, fig.height=8, fig.width=8} FeaturePlot(srt, reduction = "umap", features = OPCs, pt.size = 1, label = TRUE, ncol = 2) FeaturePlot(srt, reduction = "tsne", features = OPCs, pt.size = 1, label = TRUE, ncol = 2) ``` ```{r OPCs_VlnPlot, fig.height=8, fig.width=8} VlnPlot(srt, features = OPCs, pt.size = 0.01, cols = mycoloursP, ncol = 2) ``` ```{r COPs} COPs <- c("NKX2-2", "MYC", "ETV1", "MYT1", "SEMA5B", "SYT1", "BMPER", "EPHB1") ``` ```{r COPs_FeaturePlot, fig.height=8, fig.width=8} FeaturePlot(srt, reduction = "umap", features = COPs, pt.size = 1, label = TRUE, ncol = 2) FeaturePlot(srt, reduction = "tsne", features = COPs, pt.size = 1, label = TRUE, ncol = 2) ``` ```{r COPs_VlnPlot, fig.height=8, fig.width=8} VlnPlot(srt, features = COPs, pt.size = 0.01, cols = mycoloursP, ncol = 2) ``` ```{r Oligodendroytes} oligos <- c("MBP", "CNP", "MAG", "MOG", "MOBP", "PLP1") ``` ```{r Oligodendroytes_FeaturePlot, fig.height=8, fig.width=8} FeaturePlot(srt, reduction = "umap", features = oligos, pt.size = 1, label = TRUE, ncol = 2) FeaturePlot(srt, reduction = "tsne", features = oligos, pt.size = 1, label = TRUE, ncol = 2) ``` ```{r Oligodendrocytes_VlnPlot, fig.height=8, fig.width=8} VlnPlot(srt, features = oligos, pt.size = 0.01, cols = mycoloursP, ncol = 2) ``` So far, it looks like all captured cells are oligodendroglia. However, I will check for other glia types. ```{r Astrocytes} astro <- c("GJA1", "AQP4", "GLUL", "SOX9", "GFAP", "NDRG2", "APOE", "FGFR3", "SPARC", "S100B", "SPON1", "SPARCL1", "SLC1A3") ``` ```{r Astrocytes_FeaturePlot, fig.height=14, fig.width=8} FeaturePlot(srt, reduction = "umap", features = astro, pt.size = 1, label = TRUE, ncol = 2) FeaturePlot(srt, reduction = "tsne", features = astro, pt.size = 1, label = TRUE, ncol = 2) ``` ```{r Astrocytes_VlnPlot, fig.height=14, fig.width=8} VlnPlot(srt, features = astro, pt.size = 0.01, cols = mycoloursP, ncol = 2) ``` ```{r Microglia} micro <- c("CX3CR1", "P2RY12") ``` ```{r Microglia_FeaturePlot} FeaturePlot(srt, reduction = "umap", features = micro, pt.size = 1, label = TRUE, ncol = 2) FeaturePlot(srt, reduction = "tsne", features = micro, pt.size = 1, label = TRUE, ncol = 2) ``` ```{r Microglia_VlnPlot} VlnPlot(srt, features = micro, pt.size = 0.01, cols = mycoloursP, ncol = 2) ``` ##### Plot all markers together: ```{r fig.height=8, fig.width=25} DotPlot(srt, group.by = "originalexp_snn_res.0.6", features = c(oligodendroglia, oligos, OPCs, COPs, astro)) & FontSize(8) ``` ## Examine originalexp_snn_res.0.7 ```{r res.0.7} Idents(srt) <- "originalexp_snn_res.0.7" ``` ```{r} FeaturePlot(srt, reduction = "umap", features = oligodendroglia, pt.size = 1, label = TRUE, ncol = 2) FeaturePlot(srt, reduction = "tsne", features = oligodendroglia, pt.size = 1, label = TRUE, ncol = 2) ``` ```{r} VlnPlot(srt, features = oligodendroglia, pt.size = 0.01, cols = mycoloursP, ncol = 2) ``` ```{r OPCs_FeaturePlot_res.0.7, fig.height=8, fig.width=8} FeaturePlot(srt, reduction = "umap", features = OPCs, pt.size = 1, label = TRUE, ncol = 2) FeaturePlot(srt, reduction = "tsne", features = OPCs, pt.size = 1, label = TRUE, ncol = 2) ``` ```{r OPCs_VlnPlot_res.0.7, fig.height=8, fig.width=8} VlnPlot(srt, features = OPCs, pt.size = 0.01, cols = mycoloursP, ncol = 2) ``` ```{r COPs_FeaturePlot_res.0.7, fig.height=8, fig.width=8} FeaturePlot(srt, reduction = "umap", features = COPs, pt.size = 1, label = TRUE, ncol = 2) FeaturePlot(srt, reduction = "tsne", features = COPs, pt.size = 1, label = TRUE, ncol = 2) ``` ```{r COPs_VlnPlot_res.0.7, fig.height=8, fig.width=8} VlnPlot(srt, features = COPs, pt.size = 0.01, cols = mycoloursP, ncol = 2) ``` ```{r Oligodendroytes_FeaturePlot_res.0.7, fig.height=8, fig.width=8} FeaturePlot(srt, reduction = "umap", features = oligos, pt.size = 1, label = TRUE, ncol = 2) FeaturePlot(srt, reduction = "tsne", features = oligos, pt.size = 1, label = TRUE, ncol = 2) ``` ```{r Oligodendrocytes_VlnPlot_res.0.7, fig.height=8, fig.width=8} VlnPlot(srt, features = oligos, pt.size = 0.01, cols = mycoloursP, ncol = 2) ``` ##### Plot all markers together: ```{r fig.height=8, fig.width=25} DotPlot(srt, group.by = "originalexp_snn_res.0.7", features = c(oligodendroglia, oligos, OPCs, COPs, astro)) & FontSize(8) ``` The difference between originalexp_snn_res.0.4 and originalexp_snn_res.0.7 seems to be that cluster 2 breaks into two smaller clusters. Based on the above marker plots, cluster 2 seems to be a progenitor population and since I am mostly interested in oligodendrocytes, I will continue the analysis with originalexp_snn_res.0.4. ##### SessionInfo <details> <summary> Click to expand </summary> ```{r} sessionInfo() ``` </details>