src/57_mouse_integration_3_ds.Rmd · Luise-Seeker-Human-WM-Glia

---
title: "integration_mice_2"
author: "Luise A. Seeker"
date: "07/03/2022"
output: html_document
---

```{r}
library(GEOquery)
library(Seurat)
library("biomaRt")
library(dplyr)
library(here)
library(ggsci)
library(RColorBrewer)
library(corrplot)
```

```{r, echo = F}
mypal <- pal_npg("nrc", alpha = 0.7)(10)
mypal2 <-pal_tron("legacy", alpha = 0.7)(7)
mypal3 <- pal_lancet("lanonc", alpha = 0.7)(9)
mypal4 <- pal_simpsons(palette = c("springfield"), alpha = 0.7)(16)
mypal5 <- pal_rickandmorty(palette = c("schwifty"), alpha = 0.7)(6)
mypal6 <- pal_futurama(palette = c("planetexpress"), alpha = 0.7)(5)
mypal7 <- pal_startrek(palette = c("uniform"), alpha = 0.7)(5)

mycoloursP<- c(mypal, mypal2, mypal3, mypal4, mypal5, mypal6, mypal7)

```


```{r}
getGEOSuppFiles("GSE75330")

# read in meta data of Marques et al 2017
load(here::here("data", 
          "downloaded_datasets", 
          "ScienceandDevCelldata", 
          "Science2016Paper",
          "Sciencematricesanno.Rdata"))

head(anno_science)

exp_GSE75330 <- CreateSeuratObject(counts = emat_science, meta.data = anno_science)

saveRDS(exp_GSE75330, here::here("data", "downloaded_datasets",
                                 "Marques", "Marques17_science.RDS"))

# Find variable features

exp_GSE75330 <- FindVariableFeatures(exp_GSE75330, 
                               selection.method = "vst", 
                               nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(exp_GSE75330), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(exp_GSE75330)
plot2 <- LabelPoints(plot = plot1, 
                     points = top10, 
                     repel = TRUE)
plot1 + plot2


#scale all genes (if there is a batch effect I could add vars to regress here)

all_genes <- rownames(exp_GSE75330)
exp_GSE75330 <- ScaleData(exp_GSE75330, features = all_genes)


#linear dimensional reduction

exp_GSE75330 <- RunPCA(exp_GSE75330, 
                       features = VariableFeatures(object = exp_GSE75330), 
                       npcs = 100)
ElbowPlot(exp_GSE75330, ndims = 100)
```

```{r}

exp_GSE75330 <- FindNeighbors(exp_GSE75330, dims = 1:25)
exp_GSE75330 <- FindClusters(exp_GSE75330, resolution = 0.5)



exp_GSE75330 <- RunUMAP(exp_GSE75330, dims = 1:25)

exp_GSE75330 <- RunTSNE(exp_GSE75330, dims = 1:25, check_duplicates = FALSE)


DimPlot(exp_GSE75330, reduction = "umap", label = T, cols = mycoloursP, 
        pt.size = 0.5)


FeaturePlot(exp_GSE75330, features = c("Klk6", "Sparc", "Sparcl1"))

DimPlot(exp_GSE75330, reduction = "tsne", label = T, cols = mycoloursP, 
        pt.size = 0.5)

```
```{r}

DimPlot(exp_GSE75330, reduction = "tsne", label = T, cols = mycoloursP, 
        pt.size = 0.5, group.by = "cell_class")

```


```{r}

DimPlot(exp_GSE75330, reduction = "tsne", label = F, cols = mycoloursP, 
        pt.size = 0.5, group.by = "tissue")

```

```{r, fig.width= 8, fig.height= 8}

FeaturePlot(exp_GSE75330, reduction = "tsne", 
            features = c("Rbfox1", "Sparc", "Sparcl1", "Opalin"))

```

```{r, fig.width= 8, fig.height= 8}

FeaturePlot(exp_GSE75330, reduction = "tsne", features = c("Klk6", "Sparc", 
                                                           "Sparcl1"))

```

```{r, fig.width= 9, fig.height= 27}

FeaturePlot(exp_GSE75330, reduction = "tsne", 
            features = c("Pdgfra", "Cspg4", "Ptprz1", "Pcdh15", 
                         "Vcan", "Sox6", "Gpr17", "Neu4", "Bmp4", "Nkx2-2", 
                         "Tbx18", "Vtn", "Lum", "Col1a2", "Tcf7l2", "Casr",
                         "Ctps", "Klk6", "Tcf7l2", "Mal", "Mog", "Plp1", 
                         "Opalin", "Serinc5", "Apod"), ncol =3)


```


```{r}
# For integrating the mouse with the human dataset, gene names have to be the same.
# for converting mouse nomenclature to human gene names, I use biomaRt and filter
# out genes or which no human nomenclature exists
# use library("biomaRt")
listMarts()
ensembl <- useMart("ensembl")
ensembl_human <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
ensembl_mouse <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")


mouse_genes <- rownames(exp_GSE75330)
matched_genes <- getLDS(attributes = c("mgi_symbol", "chromosome_name"),
       filters = "mgi_symbol", values = mouse_genes, mart = ensembl_mouse,
       attributesL = c("hgnc_symbol", 
                       "chromosome_name", 
                       "start_position", "end_position") , 
       martL = ensembl_human)
```


```{r}
matched_genes <- subset(matched_genes, matched_genes$HGNC.symbol != "")

subset_boul <- rownames(exp_GSE75330) %in% matched_genes$MGI.symbol
exp_GSE75330 <- exp_GSE75330[subset_boul,]

subs_boul_2 <- matched_genes$MGI.symbol %in% rownames(exp_GSE75330)
  
# match the order of matched_genes$MGI.symbol wieh rownames(emat_science)
matched_genes <- matched_genes[match(rownames(exp_GSE75330), 
                                     matched_genes$MGI.symbol),]
```

```{r}
#rownames(exp_GSE75330) <- make.unique(rownames(exp_GSE75330))
matched_genes$MGI.symbol <- make.unique(matched_genes$MGI.symbol)
matched_genes$HGNC.symbol <- make.unique(matched_genes$HGNC.symbol)

summary(rownames(exp_GSE75330) == matched_genes$MGI.symbol)

counts <- as.matrix(exp_GSE75330@assays$RNA@counts)

summary(rownames(counts) == matched_genes$MGI.symbol)

rownames(counts) <- paste(matched_genes$HGNC.symbol)

int_mus <- CreateSeuratObject(counts = counts, meta.data = exp_GSE75330@meta.data)

#check how mt genes are called in the mouse genome
int_mus[["percent.mt"]] <- PercentageFeatureSet(int_mus, pattern = "^mt-")

#VlnPlot(int_mus, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
FeatureScatter(int_mus, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
```

```{r}

int_mus <- NormalizeData(int_mus, normalization.method = "LogNormalize", scale.factor = 10000)


# Find variable features

int_mus <- FindVariableFeatures(int_mus, 
                                     selection.method = "vst", 
                                     nfeatures = 2000)

top10 <- head(VariableFeatures(int_mus), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(int_mus)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2
```


```{r}
#scale all genes (if there is a batch effect I could add vars to regress here)

all_genes <- rownames(int_mus)
int_mus <- ScaleData(int_mus, features = all_genes)


#linear dimensional reduction

int_mus <- RunPCA(int_mus, 
                       features = VariableFeatures(object = int_mus), 
                       npcs = 100)
int_mus <- FindNeighbors(int_mus, dims = 1:25)
int_mus <- FindClusters(int_mus, resolution = 0.5)
int_mus <- RunUMAP(int_mus, dims = 1:25)
int_mus <- RunTSNE(int_mus, dims = 1:25, check_duplicates = FALSE)


DimPlot(int_mus, reduction = "umap", label = T, cols = mycoloursP, 
        pt.size = 0.5)
```


```{r}
DimPlot(int_mus, reduction = "tsne", label = T, cols = mycoloursP, 
        pt.size = 2)

```

```{r}
DimPlot(int_mus, reduction = "tsne", label = T, cols = mycoloursP, 
        group.by = "cell_class")

```

```{r}
FeaturePlot(int_mus, reduction = "tsne", 
            features = c("RBFOX1", "SPARC", "SPARCL1", "OPALIN"))

```

```{r, fig.width= 9, fig.height= 27}
genes <- c("Pdgfra", "Cspg4", "Ptprz1", "Pcdh15", 
  "Vcan", "Sox6", "Gpr17", "Neu4", "Bmp4", "Nkx2-2", 
  "Tbx18", "Vtn", "Lum", "Col1a2", "Tcf7l2", "Casr",
  "Ctps", "Klk6", "Tcf7l2", "Mal", "Mog", "Plp1", 
  "Opalin", "Serinc5", "Apod")

genes <- toupper(genes)


FeaturePlot(int_mus, reduction = "tsne", 
            features = genes, ncol = 3)

```

```{r}
int_mus@meta.data$dataset <- "Marques"

```



Read in data from Sathyamurthy et al. 

```{r}
emat_science_S <- as.matrix(read.table(file = here::here("data",
                                     "downloaded_datasets",
                                     "Sathyamurthy",
                                     "GSE103892_Expression_Count_Matrix.txt"), 
                         header = TRUE, sep = '\t', row.names = 1))


met_dat_S <- read.table(file = here::here("data",
                                     "downloaded_datasets",
                                     "Sathyamurthy",
                                     "GSE103892_Sample_Cell_Cluster_Information.txt"), 
                         header = TRUE, sep = '\t', row.names = 1)



```

Use BiomaRt to change mouse gene symbols to human gene symbols
```{r}

mouse_genes_S <- rownames(emat_science_S)
matched_genes_S <- getLDS(attributes = c("mgi_symbol", "chromosome_name"),
       filters = "mgi_symbol", values = mouse_genes_S, mart = ensembl_mouse,
       attributesL = c("hgnc_symbol", 
                       "chromosome_name", 
                       "start_position", "end_position") , 
       martL = ensembl_human)

matched_genes_S <- subset(matched_genes_S, matched_genes_S$HGNC.symbol != "")

subset_boul <- rownames(emat_science_S) %in% matched_genes_S$MGI.symbol
emat_science_S <- emat_science_S[subset_boul,]

subs_boul_2 <- matched_genes_S$MGI.symbol %in% rownames(emat_science_S)
  
# match the order of matched_genes$MGI.symbol wieh rownames(emat_science)
matched_genes_S <- matched_genes_S[match(rownames(emat_science_S), 
                                     matched_genes_S$MGI.symbol),]

rownames(emat_science_S) <- make.unique(rownames(emat_science_S))
matched_genes_S$MGI.symbol <- make.unique(matched_genes_S$MGI.symbol)
matched_genes_S$HGNC.symbol <- make.unique(matched_genes_S$HGNC.symbol)

summary(rownames(emat_science_S) == matched_genes_S$MGI.symbol)


rownames(emat_science_S) <- matched_genes_S$HGNC.symbol




exp_GSE103892 <- CreateSeuratObject(counts = emat_science_S, 
                                    meta.data = met_dat_S)
# Subset for oligos

Idents(exp_GSE103892) <- "cell.type"

exp_GSE103892_ol <- subset(exp_GSE103892, ident = c("Oligos", "OPC"))

#check how mt genes are called in the mouse genome
exp_GSE103892_ol[["percent.mt"]] <- PercentageFeatureSet(exp_GSE103892_ol, 
                                                         pattern = "^MT-")

#VlnPlot(exp_GSE103892_ol, 
#        features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), 
#        ncol = 3)

FeatureScatter(exp_GSE103892_ol, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA")

exp_GSE103892_ol <- NormalizeData(exp_GSE103892_ol, 
                               normalization.method = "LogNormalize", 
                               scale.factor = 10000)

```



```{r}
# Find variable features

exp_GSE103892_ol <- FindVariableFeatures(exp_GSE103892_ol, 
                                     selection.method = "vst", 
                                     nfeatures = 2000)

top10 <- head(VariableFeatures(exp_GSE103892_ol), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(exp_GSE103892_ol)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2

```

```{r}

#scale all genes (if there is a batch effect I could add vars to regress here)

all_genes <- rownames(exp_GSE103892_ol)
exp_GSE103892_ol <- ScaleData(exp_GSE103892_ol, features = all_genes)


#linear dimensional reduction

exp_GSE103892_ol <- RunPCA(exp_GSE103892_ol, 
                       features = VariableFeatures(object = exp_GSE103892_ol), 
                       npcs = 100)

exp_GSE103892_ol <- FindNeighbors(exp_GSE103892_ol, dims = 1:25)
exp_GSE103892_ol <- FindClusters(exp_GSE103892_ol, resolution = 0.5)
exp_GSE103892_ol <- RunUMAP(exp_GSE103892_ol, dims = 1:25)
exp_GSE103892_ol <- RunTSNE(exp_GSE103892_ol, dims = 1:25, 
                            check_duplicates = FALSE)


DimPlot(exp_GSE103892_ol, reduction = "umap", label = T, cols = mycoloursP, 
        pt.size = 0.5)

DimPlot(exp_GSE103892_ol, reduction = "umap", label = T, cols = mycoloursP, 
        pt.size = 0.5, group.by = "cell.type")

```

```{r, fig.width = 9, fig.height=9}
FeaturePlot(exp_GSE103892_ol, features = c("MBP", "PLP1", "MAG", "RBFOX1",
                                           "OPALIN", "SPARC", "PDGFRA"), ncol = 3)
```


```{r}

exp_GSE103892_ol@meta.data$dataset <- "Sathyamurthy"

```


# read in Dev cell data



```{r}
emat_dev_c <- readRDS(file = here::here("data",
                                     "downloaded_datasets",
                                     "ScienceandDevCelldata",
                                     "DevCellPaper",
                                     "ematE13P7.rds"))


met_dat_dev_c <- readRDS(file = here::here("data",
                                     "downloaded_datasets",
                                     "ScienceandDevCelldata",
                                     "DevCellPaper",
                                     "annoE13P7.rds"))



```
Use BiomaRt to change mouse gene symbols to human gene symbols
```{r}

mouse_genes_S <- rownames(emat_dev_c)
matched_genes_S <- getLDS(attributes = c("mgi_symbol", "chromosome_name"),
       filters = "mgi_symbol", values = mouse_genes_S, mart = ensembl_mouse,
       attributesL = c("hgnc_symbol", 
                       "chromosome_name", 
                       "start_position", "end_position") , 
       martL = ensembl_human)

matched_genes_S <- subset(matched_genes_S, matched_genes_S$HGNC.symbol != "")

subset_boul <- rownames(emat_dev_c) %in% matched_genes_S$MGI.symbol
emat_dev_c <- emat_dev_c[subset_boul,]

subs_boul_2 <- matched_genes_S$MGI.symbol %in% rownames(emat_dev_c)
  
# match the order of matched_genes$MGI.symbol wieh rownames(emat_science)
matched_genes_S <- matched_genes_S[match(rownames(emat_dev_c), 
                                     matched_genes_S$MGI.symbol),]

rownames(emat_dev_c) <- make.unique(rownames(emat_dev_c))
matched_genes_S$MGI.symbol <- make.unique(matched_genes_S$MGI.symbol)
matched_genes_S$HGNC.symbol <- make.unique(matched_genes_S$HGNC.symbol)

summary(rownames(emat_dev_c) == matched_genes_S$MGI.symbol)


rownames(emat_dev_c) <- matched_genes_S$HGNC.symbol




dev_cell_seur <- CreateSeuratObject(counts = emat_dev_c, 
                                    meta.data = met_dat_dev_c)
# Subset for oligos

Idents(dev_cell_seur) <- "FinalClusters"

dev_cell_seur_ol <- subset(dev_cell_seur, ident = c("OPC1a", "OPC1b", "OPC2",
                                                    "NFOL", "COP"))
# The above removes vascular cells, cycling cells and neural progenitor cells


#check how mt genes are called in the mouse genome
dev_cell_seur_ol[["percent.mt"]] <- PercentageFeatureSet(dev_cell_seur_ol, 
                                                         pattern = "^MT-")

#VlnPlot(dev_cell_seur_ol, 
#        features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), 
#        ncol = 3)

FeatureScatter(dev_cell_seur_ol, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA")

dev_cell_seur_ol <- NormalizeData(dev_cell_seur_ol, 
                               normalization.method = "LogNormalize", 
                               scale.factor = 10000)

```



```{r}
# Find variable features

dev_cell_seur_ol <- FindVariableFeatures(dev_cell_seur_ol, 
                                     selection.method = "vst", 
                                     nfeatures = 2000)

top10 <- head(VariableFeatures(dev_cell_seur_ol), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(dev_cell_seur_ol)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2

```

```{r}

#scale all genes (if there is a batch effect I could add vars to regress here)

all_genes <- rownames(dev_cell_seur_ol)
dev_cell_seur_ol <- ScaleData(dev_cell_seur_ol, features = all_genes)


#linear dimensional reduction

dev_cell_seur_ol <- RunPCA(dev_cell_seur_ol, 
                       features = VariableFeatures(object = dev_cell_seur_ol), 
                       npcs = 100)
dev_cell_seur_ol <- FindNeighbors(dev_cell_seur_ol, dims = 1:25)
dev_cell_seur_ol <- FindClusters(dev_cell_seur_ol, resolution = 0.5)
dev_cell_seur_ol <- RunUMAP(dev_cell_seur_ol, dims = 1:25)
dev_cell_seur_ol <- RunTSNE(dev_cell_seur_ol, dims = 1:25, 
                            check_duplicates = FALSE)


DimPlot(dev_cell_seur_ol, reduction = "umap", label = T, cols = mycoloursP, 
        pt.size = 0.5, group.by = "FinalClusters")

```


```{r, fig.height= 12, fig.width=6}
FeaturePlot(dev_cell_seur_ol, features = c("MBP", "PLP1", "MAG", "RBFOX1",
                                           "OPALIN", "SPARC"), ncol=2)
```


```{r}

dev_cell_seur_ol@meta.data$dataset <- "Marques_2018"

```

####
# read in Florridia et al. 2021 data



```{r}
load(here::here("data",
                "downloaded_datasets",
                "Floriddia2021",
                "GSE128525_SeuratObjSpCrdGEO.Rdata"))   

DimPlot(oligos.integrated, group.by = "confidentclusters")



```
Use BiomaRt to change mouse gene symbols to human gene symbols
```{r}

mouse_genes_S <- rownames(oligos.integrated)
matched_genes_S <- getLDS(attributes = c("mgi_symbol", "chromosome_name"),
       filters = "mgi_symbol", values = mouse_genes_S, mart = ensembl_mouse,
       attributesL = c("hgnc_symbol", 
                       "chromosome_name", 
                       "start_position", "end_position") , 
       martL = ensembl_human)

matched_genes_S <- subset(matched_genes_S, matched_genes_S$HGNC.symbol != "")

oligos.integrated_matr <- oligos.integrated@assays$RNA
subset_boul <- rownames(oligos.integrated_matr) %in% matched_genes_S$MGI.symbol

oligos.integrated_matr <- oligos.integrated_matr[subset_boul,]


subs_boul_2 <- matched_genes_S$MGI.symbol %in% rownames(oligos.integrated_matr)
  
# match the order of matched_genes$MGI.symbol wieh rownames(emat_science)
matched_genes_S <- matched_genes_S[match(rownames(oligos.integrated_matr), 
                                     matched_genes_S$MGI.symbol),]

rownames(oligos.integrated_matr) <- make.unique(rownames(oligos.integrated_matr))
matched_genes_S$MGI.symbol <- make.unique(matched_genes_S$MGI.symbol)
matched_genes_S$HGNC.symbol <- make.unique(matched_genes_S$HGNC.symbol)

summary(rownames(oligos.integrated_matr) == matched_genes_S$MGI.symbol)


rownames(oligos.integrated_matr) <- matched_genes_S$HGNC.symbol




flor_data <- CreateSeuratObject(counts = oligos.integrated_matr, 
                                    meta.data = oligos.integrated@meta.data)
# Subset for oligos

Idents(flor_data) <- "confidentclusters"

flor_data_ol <- subset(flor_data, ident = c("OPC", "COP", "NFOL1",
                                                    "NFOL2", "MFOL1",
                                            "MFOL2", "MOL1", "MOL2",
                                            "MOL3", "MOL4", "MOL5", "MOL6"))
# The above removes vascular cells, cycling cells and neural progenitor cells


#check how mt genes are called in the mouse genome
flor_data_ol[["percent.mt"]] <- PercentageFeatureSet(flor_data_ol, 
                                                         pattern = "^MT-")

#VlnPlot(flor_data_ol, 
#        features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), 
#        ncol = 3)

FeatureScatter(flor_data_ol, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA")

flor_data_ol <- NormalizeData(flor_data_ol, 
                               normalization.method = "LogNormalize", 
                               scale.factor = 10000)

```



```{r}
# Find variable features

flor_data_ol <- FindVariableFeatures(flor_data_ol, 
                                     selection.method = "vst", 
                                     nfeatures = 2000)

top10 <- head(VariableFeatures(flor_data_ol), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(flor_data_ol)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2

```

```{r}

#scale all genes (if there is a batch effect I could add vars to regress here)

all_genes <- rownames(flor_data_ol)
flor_data_ol <- ScaleData(flor_data_ol, features = all_genes)


#linear dimensional reduction

flor_data_ol <- RunPCA(flor_data_ol, 
                       features = VariableFeatures(object = flor_data_ol), 
                       npcs = 100)
flor_data_ol <- FindNeighbors(flor_data_ol, dims = 1:25)
flor_data_ol <- FindClusters(flor_data_ol, resolution = 0.5)
flor_data_ol <- RunUMAP(flor_data_ol, dims = 1:25)
flor_data_ol <- RunTSNE(flor_data_ol, dims = 1:25, 
                            check_duplicates = FALSE)


DimPlot(flor_data_ol, reduction = "umap", label = T, cols = mycoloursP, 
        pt.size = 0.5, group.by = "confidentclusters")

```


```{r, fig.height= 12, fig.width=6}
FeaturePlot(flor_data_ol, features = c("MBP", "PLP1", "MAG", "RBFOX1",
                                           "OPALIN", "SPARC"), ncol=2)
```


```{r}

flor_data_ol@meta.data$dataset <- "Floriddia_2020"

```

read in human dataset or take from the environment
```{r} 
nad_ol <- readRDS(here::here("data",
                       "single_nuc_data",
                       "oligodendroglia",
                       "srt_oligos_and_opcs_LS.RDS"))

nad_ol@meta.data$dataset <- "Seeker"


```

# Label transfer

## 1) Marques et al 2018
```{r}


lt_anchors <- FindTransferAnchors(reference = dev_cell_seur_ol , query = nad_ol, 
    dims = 1:30)
predictions <- TransferData(anchorset = lt_anchors, refdata = dev_cell_seur_ol$FinalClusters, 
    dims = 1:30)

names(predictions)[1] <- "Marques2018_label_trans"

lt_nad_ol <- AddMetaData(nad_ol, metadata = predictions)

```

```{r}
DimPlot(lt_nad_ol, group.by = "Marques2018_label_trans")

```
```{r}
DimPlot(lt_nad_ol, group.by = "Marques2018_label_trans", 
        split.by = "Marques2018_label_trans")

```
# Correlation between ol ID and predicted ID
```{r}

met_dat <- lt_nad_ol@meta.data

cor_df_1 <- data.frame(nad_ol_id = met_dat$ol_clusters_named, 
                     nat_dev_id = met_dat$Marques2018_label_trans)

cor_df_1$nat_dev_id <- as.factor(cor_df_1$nat_dev_id)
cor_df_1$nat_dev_id <- factor(cor_df_1$nat_dev_id, levels = c("OPC1a",
                                                              "OPC1b",
                                                              "COP",
                                                              "NFOL"))

cor_matr_1 <- matrix(data= 0, 
                   nrow = length(levels(cor_df_1$nad_ol_id)), 
                   ncol = length(levels(cor_df_1$nat_dev_id)))

rownames(cor_matr_1) <- levels(cor_df_1$nad_ol_id)
colnames(cor_matr_1) <- levels(cor_df_1$nat_dev_id)

for(i in 1: nrow(cor_df_1)){
  name1 <- as.character(cor_df_1[i, 1])
  name2 <- as.character(cor_df_1[i, 2])
  cor_matr_1[name1, name2] <- cor_matr_1[name1, name2] + 1
}


```



```{r, fig.width = 8, fig.height = 10}
corrplot(cor_matr_4, type = "full",method = "circle", 
  mar = c(1, 0, 1, 0),
  tl.col = "black",
  is.corr = FALSE,
  col = COL1('Blues'),
  cl.cex = 1.5,
  tl.cex = 2, 
  cl.pos="b")


corrplot(cor_matr_4, type = "full",method = "circle", 
  mar = c(1, 0, 1, 0),
  tl.col = "black",
  is.corr = FALSE,
  col = COL1('Blues'),
  cl.cex = 1.5,
  tl.cex = 2, 
  cl.pos="r")


```
## 2) Marques et al 2016
```{r}


lt_anchors <- FindTransferAnchors(reference = int_mus , query = nad_ol, 
    dims = 1:30)
predictions <- TransferData(anchorset = lt_anchors, refdata = int_mus$cell_class, 
    dims = 1:30)

names(predictions)[1] <- "Marques2016_label_trans"

lt_nad_ol_2 <- AddMetaData(lt_nad_ol, metadata = predictions)

```

```{r}
DimPlot(lt_nad_ol_2, group.by = "Marques2016_label_trans")

```

```{r}
DimPlot(lt_nad_ol_2, group.by = "Marques2016_label_trans", 
        split.by = "Marques2016_label_trans", ncol = 5)

```
# Correlation between ol ID and predicted ID
```{r}

met_dat <- lt_nad_ol_2@meta.data

cor_df_2 <- data.frame(nad_ol_id = met_dat$ol_clusters_named, 
                     science_id = met_dat$Marques2016_label_trans)

cor_df_2$science_id <- as.factor(cor_df_2$science_id)
cor_df_2$science_id <- factor(cor_df_2$science_id, levels = c("OPC",
                                                        "COP",
                                                        "NFOL1",
                                                        "NFOL2",
                                                        "MOL2",
                                                        "MOL3",
                                                        "MOL4",
                                                        "MOL5",
                                                        "PPR"))

cor_matr_2 <- matrix(data= 0, 
                   nrow = length(levels(cor_df_2$nad_ol_id)), 
                   ncol = length(levels(cor_df_2$science_id)))

rownames(cor_matr_2) <- levels(cor_df_2$nad_ol_id)
colnames(cor_matr_2) <- levels(cor_df_2$science_id)

for(i in 1: nrow(cor_df_2)){
  name1 <- as.character(cor_df_2[i, 1])
  name2 <- as.character(cor_df_2[i, 2])
  cor_matr_2[name1, name2] <- cor_matr_2[name1, name2] + 1
}


```


```{r, fig.width = 8, fig.height = 10}
corrplot(cor_matr_2, type = "full",method = "circle", 
  mar = c(1, 0, 1, 0),
  tl.col = "black",
  is.corr = FALSE,
  col = COL1('Blues'),
  cl.cex = 1.5,
  tl.cex = 2, 
  cl.pos="b")


corrplot(cor_matr_2, type = "full",method = "circle", 
  mar = c(1, 0, 1, 0),
  tl.col = "black",
  is.corr = FALSE,
  col = COL1('Blues'),
  cl.cex = 1.5,
  tl.cex = 2, 
  cl.pos="r")


```

## 3) Sathyamurthy
```{r}


lt_anchors <- FindTransferAnchors(reference = exp_GSE103892_ol , query = nad_ol, 
    dims = 1:30)
predictions <- TransferData(anchorset = lt_anchors, refdata = exp_GSE103892_ol$cell.type, 
    dims = 1:30)

names(predictions)[1] <- "Sathyamurthy_label_trans"

lt_nad_ol_3 <- AddMetaData(lt_nad_ol_2, metadata = predictions)

```

```{r}
DimPlot(lt_nad_ol_3, group.by = "Sathyamurthy_label_trans")

```

```{r}
DimPlot(lt_nad_ol_3, group.by = "Sathyamurthy_label_trans", 
        split.by = "Sathyamurthy_label_trans", ncol = 5)

```
# Correlation between ol ID and predicted ID
```{r}

met_dat <- lt_nad_ol_3@meta.data

cor_df <- data.frame(nad_ol_id = met_dat$ol_clusters_named, 
                     Sathyamurthy_id = met_dat$Sathyamurthy_label_trans)

cor_df$Sathyamurthy_id <- as.factor(cor_df$Sathyamurthy_id)
cor_df$Sathyamurthy_id <- factor(cor_df$Sathyamurthy_id, levels = c("OPC",
                                                        "Oligos"))

cor_matr <- matrix(data= 0, 
                   nrow = length(levels(cor_df$nad_ol_id)), 
                   ncol = length(levels(cor_df$Sathyamurthy_id)))

rownames(cor_matr) <- levels(cor_df$nad_ol_id)
colnames(cor_matr) <- levels(cor_df$Sathyamurthy_id)

for(i in 1: nrow(cor_df)){
  name1 <- as.character(cor_df[i, 1])
  name2 <- as.character(cor_df[i, 2])
  cor_matr[name1, name2] <- cor_matr[name1, name2] + 1
}


```


```{r, fig.width = 8, fig.height = 10}
corrplot(cor_matr, type = "full",method = "circle", 
  mar = c(1, 0, 1, 0),
  tl.col = "black",
  is.corr = FALSE,
  col = COL1('Blues'),
  cl.cex = 1.5,
  tl.cex = 2, 
  cl.pos="b")


corrplot(cor_matr, type = "full",method = "circle", 
  mar = c(1, 0, 1, 0),
  tl.col = "black",
  is.corr = FALSE,
  col = COL1('Blues'),
  cl.cex = 1.5,
  tl.cex = 2, 
  cl.pos="r")

corrplot(cor_matr, type = "full",method = "circle", 
  mar = c(1, 0, 1, 0),
  tl.col = "black",
  is.corr = FALSE,
  col = COL1('Blues'),
  cl.cex = 0.5,
  tl.cex = 2, 
  cl.pos="r")

corrplot(cor_matr, type = "full",method = "circle", 
  mar = c(1, 0, 1, 0),
  tl.col = "black",
  is.corr = FALSE,
  col = COL1('Blues'),
  cl.cex = 0.5,
  tl.cex = 2, 
  cl.pos="n")


```
# All combined in one matrix
```{r}
comb_matr <- cbind(cor_matr, cor_matr_1, cor_matr_2)

corrplot(comb_matr, type = "full",method = "circle", 
  mar = c(1, 0, 1, 0),
  tl.col = "black",
  is.corr = FALSE,
  col = COL1('Blues'),
  cl.cex = 0.5,
  tl.cex = 2, 
  cl.pos="r")
```
# Label transfer

## 4) Florridia et al 2020
```{r}

flor_data_ol$dataset<-"Florridia"
lt_anchors <- FindTransferAnchors(reference = flor_data_ol , query = nad_ol, 
    dims = 1:30)
predictions <- TransferData(anchorset = lt_anchors, 
                            refdata = flor_data_ol$confidentclusters, 
                            dims = 1:30)

names(predictions)[1] <- "Florridia2020_label_trans"

lt_nad_ol_4 <- AddMetaData(lt_nad_ol_3, metadata = predictions)

```

```{r}
DimPlot(lt_nad_ol_4, group.by = "Florridia2020_label_trans")

```



# Correlation between ol ID and predicted ID
```{r}

met_dat <- lt_nad_ol_4@meta.data

cor_df_4 <- data.frame(nad_ol_id = met_dat$ol_clusters_named, 
                     flor_id = met_dat$Florridia2020_label_trans)

#THis is not including immature NFOLs, see if I can include them in the 
# matrix
cor_df_4$flor_id <- as.factor(cor_df_4$flor_id)
cor_df_4$flor_id <- factor(cor_df_4$flor_id, levels = c("OPC",
                                                        "COP",
                                                        "MOL2",
                                                        "MOL5"))

levels_flor <- c("OPC", "COP", "NFOL1", "NFOL2", "MFOL1", "MFOL2", "MOL1",  
                 "MOL2", "MOL3",  "MOL4",  "MOL5",  "MOL6")  
  


cor_matr_4 <- matrix(data = 0, 
                   nrow = length(levels(cor_df_4$nad_ol_id)), 
                   ncol = length(levels_flor))

rownames(cor_matr_4) <- levels(cor_df_4$nad_ol_id)
colnames(cor_matr_4) <- levels_flor

for(i in 1: nrow(cor_df_4)){
  name1 <- as.character(cor_df_4[i, 1])
  name2 <- as.character(cor_df_4[i, 2])
  cor_matr_4[name1, name2] <- cor_matr_4[name1, name2] + 1
}


```

```{r}
COL1(sequential = c("Oranges", "Purples", "Reds", "Blues", "Greens", 
                    "Greys", "OrRd", "YlOrRd", "YlOrBr", "YlGn"), n = 200)

COL2(diverging = c("RdBu", "BrBG", "PiYG", "PRGn", "PuOr", "RdYlBu"), n = 200)

```

```{r, fig.width = 8, fig.height = 10}
corrplot(cor_matr, type = "full",method = "circle", 
  mar = c(1, 0, 1, 0),
  tl.col = "black",
  is.corr = FALSE,
  col = COL1('Blues'),
  cl.cex = 1.5,
  tl.cex = 2, 
  cl.pos="b")


corrplot(cor_matr, type = "full",method = "circle", 
  mar = c(1, 0, 1, 0),
  tl.col = "black",
  is.corr = FALSE,
  col = COL1('Blues'),
  cl.cex = 1.5,
  tl.cex = 2, 
  cl.pos="r")


```

Integrate

```{r}
# Integrate
ol_anchors <- FindIntegrationAnchors(object.list = list(lt_nad_ol_4, 
                                                        exp_GSE103892_ol, 
                                                        int_mus,
                                                        #dev_cell_seur_ol, 
                                                        flor_data_ol),
                                     dims = 1:20) 

ol_comb <- IntegrateData(anchorset = ol_anchors, dims = 1:20)

DefaultAssay(ol_comb) <- "integrated"

# Run the standard workflow for visualization and clusterin


all_genes <- rownames(ol_comb)
ol_comb <- ScaleData(ol_comb, features = all_genes)
ol_comb <- RunPCA(ol_comb, npcs = 30, verbose = FALSE)
# t-SNE and Clustering
ol_comb <- RunUMAP(ol_comb, reduction = "pca", dims = 1:20)
ol_comb <- FindNeighbors(ol_comb, reduction = "pca", dims = 1:20)
ol_comb <- FindClusters(ol_comb, resolution = 0.5)


# Visualization
ol_comb@meta.data$Tissue <- ifelse(ol_comb@meta.data$Tissue == "NA", 
                                   paste("Mouse"),
                                   ol_comb@meta.data$Tissue)

DimPlot(ol_comb, reduction = "umap", group.by = "dataset")
DimPlot(ol_comb, reduction = "umap", group.by = "dataset", split.by = "dataset")
DimPlot(ol_comb, reduction = "umap", label = TRUE)

```

```{r}

DefaultAssay(ol_comb) <- "RNA"
FeaturePlot(ol_comb, features = c("PLP1", "MAG", "MOG", "OPALIN", "PDGFRA"))

FeaturePlot(ol_comb, features = c("SPARC"))
FeaturePlot(ol_comb, features = c("SPARCL1"))
FeaturePlot(ol_comb, features = c("RBFOX1"))
```

```{r}
DimPlot(ol_comb, reduction = "umap", group.by = "Tissue", 
        cols = mycoloursP[1:15])
```

```{r}
DimPlot(ol_comb, reduction = "umap", group.by = "Tissue", 
        split.by = "Tissue",
        cols = mycoloursP[1:15])
```

```{r}
DimPlot(ol_comb, reduction = "umap", group.by = "dataset", 
        cols = mycoloursP[1:15])
```

```{r}
DimPlot(ol_comb, reduction = "umap", group.by = "dataset", 
        split.by = "dataset",
        cols = mycoloursP[1:15])
```

```{r}
FeaturePlot(ol_comb, features = "SPARC")
FeaturePlot(ol_comb, features = "RBFOX1")
FeaturePlot(ol_comb, features = "OPALIN")

FeaturePlot(ol_comb, features = "SPARC", split.by = "dataset")
FeaturePlot(ol_comb, features = "RBFOX1", split.by = "dataset")

DimPlot(ol_comb, reduction = "umap", group.by = "Tissue", split.by = "Tissue")


FeaturePlot(ol_comb, features = c("SPARC", "RBFOX1", "OPALIN"))

FeaturePlot(ol_comb, features = "SPARC")

```

```{r}

pt <- table(Idents(ol_comb), ol_comb$dataset)
pt <- as.data.frame(pt)
pt$Var1 <- as.character(pt$Var1)

ggplot(pt, aes(x = Var2, y = Freq, fill = Var1)) +
  theme_bw(base_size = 15) +
  geom_col(position = "fill", width = 0.5) +
  xlab("Sample") +
  ylab("Proportion") +
  scale_fill_manual(values = mycoloursP) +
  theme(legend.title = element_blank())

```
```{r}
sat <- subset(pt, pt$Var2 == "Sathyamurthy")
sat_total <- sum(sat$Freq)
sat_total
```


Save integrated dataset
```{r}
save_directory <- here::here("data", 
                       "integrated")

dir.create(save_directory)

saveRDS(ol_comb, here::here(save_directory, 
                       "int_Marques_Sathyamurthy_Florridia.RDS"))
```

```{r}

cols_rearranged <- c("#FF410DB2", "#7E6148B2", "#B09C85B2", 
                     "#DC0000B2", "#6EE2FFB2", "#F7C530B2", 
                     "#95CC5EB2", "#D0DFE6B2", "#F79D1EB2", 
                     "#8491B4B2", "#91D1C2B2")
```

```{r}
DimPlot(ol_comb, group.by = "ol_clusters_named", 
        cols = cols_rearranged, label = TRUE) + 
  NoLegend()

```

```{r}
DimPlot(ol_comb, group.by = "ol_clusters_named", 
        cols = cols_rearranged, label = TRUE) + 
  NoLegend()

```


```{r}
comb_matr <- cbind(cor_matr_1, cor_matr_2, cor_matr_3, cor_matr_4)


```


```{r, fig.width = 8, fig.height = 10}
corrplot(comb_matr, type = "full",method = "circle", 
  mar = c(1, 0, 1, 0),
  tl.col = "black",
  is.corr = FALSE,
  col = COL1('Blues'),
  cl.cex = 1.5,
  tl.cex = 2, 
  cl.pos="b")


corrplot(comb_matr, type = "full",method = "circle", 
  mar = c(1, 0, 1, 0),
  tl.col = "black",
  is.corr = FALSE,
  col = COL1('Blues'),
  cl.cex = 1.5,
  tl.cex = 2, 
  cl.pos="r")

write.csv(comb_matr, here::here("outs",
                                "oligos",
                                "integration_lt",
                                "Marq16_Marq18_Flor20_Sathy_corr.csv"))
```

```{r}

sessionInfo()
```