MonolayerCultures / src / Integrations / PrimaryCulturedRodentOligodendroglia / Int_RC17_ol_PrimaryCultured_Mouse_ol.Rmd
Int_RC17_ol_PrimaryCultured_Mouse_ol.Rmd
Raw
---
title: "Int_RC17_ol_PrimaryCulturedMouseOligodendroglia"
author: "Nina-Lydia Kazakou"
date: "24/04/2021"
output: html_document
---

This original script for integrating RC17_ol with primary cultured rodent oligodendroglia is edited on the 05/04/2022. The edits include mostly changing the patways to the original folders containing the data in order to produce a final report and include this analysis into the total monolayer oligodendroglia final analysis. 
# Set-up

### Load libraries 
```{r message=FALSE, warning=FALSE}
library(Seurat)
library(here)
library(ggsci)
library(ggplot2)
library(dplyr)
library(biomaRt)
library(SingleCellExperiment)
library(scater)
library(RColorBrewer)
library(corrplot)
```

### Load colour palets
```{r}
mypal <- pal_npg("nrc", alpha = 0.7)(10)
mypal2 <-pal_tron("legacy", alpha = 0.7)(7)
mypal3<-pal_lancet("lanonc", alpha = 0.7)(9)
mypal4<-pal_simpsons(palette = c("springfield"), alpha = 0.7)(16)
mypal5 <- pal_rickandmorty(palette = c("schwifty"), alpha = 0.7)(6)
mypal6 <- pal_futurama(palette = c("planetexpress"), alpha = 0.7)(5)
mypal7 <- pal_startrek(palette = c("uniform"), alpha = 0.7)(5)
mycoloursP<- c(mypal, mypal2, mypal3, mypal4)
```

For integrating the int_mus with the human dataset, gene names have to be the same.
For converting int_mus nomenclature to human gene names, I use biomaRt and filter out genes or which no human nomenclature exists. 

# Convert mouse gene names to human genes names
### Use library "biomaRT"

```{r eval=FALSE}
dir.create(here("data", "Processed", "Integrations", "PrimaryCulturedRodentOligodendroglia"))
dir.create(here("data", "Processed", "Integrations", "PrimaryCulturedRodentOligodendroglia", "Rodent_CellRangerOutputs"))
```

```{r eval=FALSE}
listMarts()
ensembl <- useMart("ensembl")
ensembl_human <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
ensembl_mouse <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")

# Load raw_counts or take from environment
raw_counts <- Read10X(data.dir = here("data", "Processed", "Integrations", "PrimaryCulturedRodentOligodendroglia", "Rodent_CellRangerOutputs"))

mouse_genes <- rownames(raw_counts)
matched_genes <- getLDS(attributes = c("mgi_symbol", "chromosome_name"),
       filters = "mgi_symbol", values = mouse_genes, mart = ensembl_mouse,
       attributesL = c("hgnc_symbol", 
                       "chromosome_name", 
                       "start_position", "end_position") , 
       martL = ensembl_human)

matched_genes <- subset(matched_genes, matched_genes$HGNC.symbol != "")

subset_boul <- rownames(raw_counts) %in% matched_genes$MGI.symbol
raw_counts <- raw_counts[subset_boul,]

subs_boul_2 <- matched_genes$MGI.symbol %in% rownames(raw_counts)
  
# match the order of matched_genes$MGI.symbol wieh rownames(raw_counts)
matched_genes <- matched_genes[match(rownames(raw_counts), 
                                     matched_genes$MGI.symbol),]

rownames(raw_counts) <- make.unique(rownames(raw_counts))
matched_genes$MGI.symbol <- make.unique(matched_genes$MGI.symbol)
matched_genes$HGNC.symbol <- make.unique(matched_genes$HGNC.symbol)

summary(rownames(raw_counts) == matched_genes$MGI.symbol)


rownames(raw_counts) <- matched_genes$HGNC.symbol

int_mus <- CreateSeuratObject(counts = raw_counts)
```

```{r eval=FALSE}
# check how mt genes are called in the int_mus genome
int_mus[["percent.mito"]] <- PercentageFeatureSet(int_mus, pattern = "^MT-")
head(int_mus@meta.data)

VlnPlot(int_mus, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"), ncol = 3)
FeatureScatter(int_mus, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
```

# Re-apply previous analysis for mouse_oligodendroglia
### QC
```{r eval=FALSE, sce}
# Create SingleCellExperiment object 
mouse_sce <- as.SingleCellExperiment(int_mus)
```


### FeatureQC
```{r eval=FALSE, featureQC}
keep_feature <- rowSums(counts(mouse_sce) > 0) > 1
mouse_sce <- mouse_sce[keep_feature, ]
nrow(mouse_sce) #17762

mouse_sce <- addPerFeatureQC(mouse_sce)
colnames(rowData(mouse_sce))
```


### CellQC
```{r eval=FALSE, cellQC}
# Identify mitochondrial genes
is_mito <- grepl("^mt-", rownames(mouse_sce))
mouse_sce <- addPerCellQC(mouse_sce, subsets = list(mt = is_mito))
outlier_mt <- isOutlier(mouse_sce$subsets_mt_percent, type = "higher")
```

```{r eval=FALSE, lib_low}
df <- addPerCellQC(mouse_sce, subsets = list(mt = is_mito))


qc.lib.lower <- isOutlier(df$sum, log=TRUE, type="lower")
lowUMI<-as.numeric(attr(qc.lib.lower, "thresholds")[2])
if(lowUMI == Inf){
lowUMI <-0
}
if(min(df$sum)>= lowUMI){
print("Filter for low UMI counts are too permissive. Apply manual set threshold. Default is 100")
qc.lib.lower<- df$sum<20
lowUMI<-20
}else{
print("Calculated thresholds seem to filter for low UMI counts")
}
```

```{reval=FALSE, }
qc.lib.low <- df$sum < 3000
```

```{r eval=FALSE, lib_high}
qc.lib.higher <-isOutlier(df$sum, log=TRUE, type="higher")
higherUMI<-as.numeric(attr(qc.lib.higher, "thresholds")[2])
if(max(df$sum)<= higherUMI){
  print("Filter for high UMI counts too permissive. Apply manual set threshold.")
  qc.lib.higher<- df$sum > 30000
  higherUMI<-30000
}else{
  print("Calculated thresholds seem to filter for high UMI counts")
}

```
```{r eval=FALSE}
attr(qc.lib.higher, "thresholds")
```

```{r eval=FALSE, epxr_low}
qc.nexprs.lower <- isOutlier(df$detected, log=TRUE, type="lower")
lowerFeatures<-as.numeric(attr(qc.nexprs.lower, "thresholds")[2])
if(min(df$detected)>= lowerFeatures | lowerFeatures < 10){
  print("Filter for high gene counts too permissive. Apply manual set threshold.")
  qc.nexprs.lower<-(df$detected)<10
  lowerFeatures<-10
}else{
  print("Calculated thresholds seem to filter for high gene counts")
}
```

```{r eval=FALSE}
attr(qc.nexprs.lower, "thresholds")
```

```{r eval=FALSE, expr_high}
qc.nexprs.higher <- isOutlier(df$detected, log=TRUE, type="higher")
higherFeatures<-as.numeric(attr(qc.nexprs.higher, "thresholds")[2])
if(max(df$detected)<= higherFeatures){
  print("Filter for high gene counts too permissive. Apply manual set threshold.")
  qc.nexprs.higher<-(df$detected)> 8000
  higherFeatures<-8000
}else{
  print("Calculated thresholds seem to filter for high gene counts")
}
```

```{r eval=FALSE}
attr(qc.nexprs.higher, "thresholds")
```


```{r eval=FALSE}
# total
outlier <- qc.lib.low |  qc.nexprs.lower | qc.lib.higher | qc.nexprs.higher | outlier_mt

mouse_sce$outlier <- outlier  
```


### Subset data based on QC
```{r eval=FALSE}
filter_sce <- mouse_sce[,!mouse_sce$outlier]
```

### Normazlise data
```{r eval=FALSE}
int_mus <- as.Seurat(filter_sce)
```

### Populate metadata
```{r eval=FALSE}
int_mus@meta.data$ident <- NULL
int_mus@meta.data$label <- NULL 

# 10x kit version
int_mus@meta.data$Chemistry <- "v2"
```

```{r eval=FALSE}
int_mus <- NormalizeData(int_mus, normalization.method = "LogNormalize", scale.factor = 10000)
```

### Find Variable Features
```{r eval=FALSE}
int_mus <- FindVariableFeatures(int_mus, 
                                     selection.method = "vst", 
                                     nfeatures = 2000)
```


### Scale Data
```{r eval=FALSE}
all_genes <- rownames(int_mus)
int_mus <- ScaleData(int_mus, features = all_genes)
```

### Dimensionality Reduction
```{r eval=FALSE}
int_mus <- RunPCA(int_mus, 
                       features = VariableFeatures(object = int_mus), 
                       npcs = 20)
int_mus <- FindNeighbors(int_mus, dims = 1:15)
int_mus <- FindClusters(int_mus, resolution = 0.8)
int_mus <- RunUMAP(int_mus, dims = 1:15)
```

```{r eval=FALSE}
DimPlot(int_mus, group.by = "RNA_snn_res.0.8", cols = mycoloursP, pt.size = 0.8, label = TRUE) & NoLegend()
```


### Annotate metadata
```{r eval=FALSE}
int_mus@meta.data$ident <- NULL
```

```{r eval=FALSE}
Idents(int_mus) <- "RNA_snn_res.0.8"

int_mus@meta.data$ClusterID <- int_mus@meta.data$RNA_snn_res.0.8

current.ids <- c(0,1,2,3 ,4,5,6,7,8,9,10,11)
new.ids <- c("m.mOL1", "m.mOL2", "m.OPC1", "m.CyP1", "m.mOL3", "m.Outer_Radial_Glia", "m.Astrocyte", "m.COP", "m.OPC2", "m.Cyp2", "m.Pericyte", "m.Neurons")
int_mus@meta.data$ClusterID <- plyr::mapvalues(x = int_mus@meta.data$ClusterID, from = current.ids, to = new.ids)
```

```{r eval=FALSE}
head(int_mus@meta.data, 2)
```

```{r eval=FALSE}
saveRDS(int_mus, here("data", "Processed", "Integrations", "PrimaryCulturedRodentOligodendroglia", "PrimaryCulturedMouseOL_with_HumanGeneAnnotation"))
```


# Integration with RC17_ol

### Load data
```{r}
RC17_ol <- readRDS(here("data", "Processed", "Oligodendroglia", "hES-derived_monolayerOL.rds"))

int_mus <- readRDS(here("data", "Processed", "Integrations", "PrimaryCulturedRodentOligodendroglia", "PrimaryCulturedMouseOL_with_HumanGeneAnnotation"))
```

```{r}
DimPlot(RC17_ol, group.by = "ClusterID", cols = mycoloursP[6:40], pt.size = 0.8, label = TRUE) & NoLegend()

DimPlot(int_mus, group.by = "ClusterID", cols = mycoloursP, pt.size = 0.8, label = TRUE) & NoLegend()
```
### Subset mouse oligodendroglia for non-oligo clusters
```{r}
Idents(int_mus) <- "ClusterID"

int_mus@meta.data$Dataset <- "Primary_Mouse_Cultured_Oligodendroglia"

int_mus <- subset(int_mus, idents = c("m.mOL1", "m.mOL2", "m.mOL3", "m.OPC1", "m.OPC2", "m.COP", "m.CyP1", "m.Cyp2"))
```


## CCA
```{r eval=FALSE, Integrate_Datasets}
# Create a list of objects
srt_list <- list(RC17_ol, int_mus)

# normalize and identify variable features for each dataset independently
srt_list<- lapply(X = srt_list, FUN = function(x) {
    x <- NormalizeData(x)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

# select features that are repeatedly variable across datasets for integration
var_features <- SelectIntegrationFeatures(object.list = srt_list)

# create anchors to integrate the datasets
anchors <- FindIntegrationAnchors(object.list = srt_list, anchor.features = var_features)   

# create an 'integrated' data assay 
ol_comb <- IntegrateData(anchorset = anchors, dims = 1:20)
```

```{r eval=FALSE, Run_Standard_Workflow}
DefaultAssay(ol_comb) <- "integrated"
Idents(ol_comb) <- "Dataset"

# Scale data 
all_genes <- rownames(ol_comb)
ol_comb <- ScaleData(ol_comb, features = all_genes)

# Dimentionality reduction
ol_comb <- RunPCA(ol_comb, npcs = 30, verbose = FALSE)

# Choose the mumber of PCs
ElbowPlot(ol_comb, ndims = 50) #20

# Clustering
ol_comb <- RunUMAP(ol_comb, reduction = "pca", dims = 1:20)
ol_comb <- FindNeighbors(ol_comb, reduction = "pca", dims = 1:20)
ol_comb <- FindClusters(ol_comb, resolution = 0.5)
```

```{r eval=FALSE}
saveRDS(ol_comb, here("data", "Processed", "Integrations", "PrimaryCulturedRodentOligodendroglia", "RC17_ol_Cultured_Mouse_ol.rds"))
```

```{r}
ol_comb <- readRDS(here("data", "Processed", "Integrations", "PrimaryCulturedRodentOligodendroglia", "RC17_ol_Cultured_Mouse_ol.rds"))
```

## Visualization
```{r fig.height=4, fig.width=8}
DimPlot(ol_comb, pt.size = 0.8, cols = c(mycoloursP[6], mycoloursP[45]), group.by = "Dataset") 
```

```{r fig.height=4, fig.width=10}
DimPlot(ol_comb, pt.size = 0.8, cols = c(mycoloursP[6], mycoloursP[45]), group.by = "Dataset", split.by = "Dataset") & NoLegend()
```

```{r fig.height=4, fig.width=9}
DimPlot(ol_comb, pt.size = 0.8, cols = mycoloursP[5:40], group.by = "ClusterID")
```

```{r fig.height=4, fig.width=10}
DimPlot(ol_comb, pt.size = 0.8, cols = mycoloursP[5:40], group.by = "ClusterID", split.by = "Dataset", label = FALSE) & NoLegend()
DimPlot(ol_comb, pt.size = 0.8, cols = mycoloursP[5:40], group.by = "ClusterID", split.by = "Dataset", label = TRUE, label.size = 3) & NoLegend()
```


## Label Transfer

```{r}
lt_anchors <- FindTransferAnchors(reference = RC17_ol , query = int_mus, dims = 1:30)

predictions <- TransferData(anchorset = lt_anchors, refdata = RC17_ol$ClusterID,  dims = 1:30)

lt_int_mus <- AddMetaData(int_mus, metadata = predictions)
```

```{r fig.width=10, fig.height=6, fig.fullwidth=TRUE}
DimPlot(lt_int_mus, group.by = "predicted.id", split.by = "predicted.id", cols = mycoloursP[3:40], ncol = 2, pt.size = 1)
```

# Correlation between ol ID and predicted ID
```{r}
met_dat <- lt_int_mus@meta.data
cor_df <- data.frame(orig_id = met_dat$ClusterID,
                     pred_id = met_dat$predicted.id)
cor_df$pred_id <- as.factor(cor_df$pred_id)
cor_df$pred_id <- factor(cor_df$pred_id, levels = c("OPC_1", "pri-OPC", "OPC_2", "MAO", "OAPC", "OLIGO_1", "CyP_1", "Cyp_2", "OLIGO_2", "OPC_3"))
cor_matr <- matrix(data= 0, 
                   nrow = length(levels(cor_df$orig_id)), 
                   ncol = length(levels(cor_df$pred_id)))
rownames(cor_matr) <- levels(cor_df$orig_id)
colnames(cor_matr) <- levels(cor_df$pred_id)
for(i in 1: nrow(cor_df)){
  name1 <- as.character(cor_df[i, 1])
  name2 <- as.character(cor_df[i, 2])
  cor_matr[name1, name2] <- cor_matr[name1, name2] + 1
}
max(cor_matr)
centr_cor_matr <- cor_matr/ max(cor_matr)
corrplot(centr_cor_matr)
```


```{r}
sessionInfo()
```