Luise-Seeker-Human-WM-Glia / src / 07_red_dim_clust_roughAnnot_cluster_QC_1.Rmd
07_red_dim_clust_roughAnnot_cluster_QC_1.Rmd
Raw
---
title: "07_Dim_red_Rough_annot"
author: "Luise A. Seeker"
date: "05/02/2021"
output: html_document
---

```{r}
library(Seurat)
library(ggplot2)
library(ggsci)
library("scales")
library(SingleCellExperiment)
library(BiocSingular)
library(scDblFinder)
library(scran)
library(scater)
library(RCurl)
library(AnnotationHub)


```

Pick colour paletts

```{r}

mypal <- pal_npg("nrc", alpha = 0.7)(10)
mypal2 <-pal_tron("legacy", alpha = 0.7)(7)
mypal3 <- pal_lancet("lanonc", alpha = 0.7)(9)
mypal4 <- pal_simpsons(palette = c("springfield"), alpha = 0.7)(16)
mypal5 <- pal_rickandmorty(palette = c("schwifty"), alpha = 0.7)(6)
mypal6 <- pal_futurama(palette = c("planetexpress"), alpha = 0.7)(5)
mypal7 <- pal_startrek(palette = c("uniform"), alpha = 0.7)(5)

mycoloursP<- c(mypal, mypal2, mypal3, mypal4, mypal5, mypal6, mypal7)

show_col(mycoloursP, labels =F)
```


```{r}

seur_comb <- readRDS("/Users/lseeker/Documents/Work/HumanCellAtlas/splice_control_out/datasets/04_scran_normalised/combined_SCE/combined_seur_norm_raw.RDS")

```



#the dataset has been normalised before, so it is not notmalised again. Instead, variable features are searched:
```{r}
seur_comb <- FindVariableFeatures(seur_comb, selection.method = "vst", 
                                  nfeatures = 2000)
```


# Identify the 10 most highly variable genes
```{r}
top10 <- head(VariableFeatures(seur_comb), 10)

```

# plot variable features with and without labels
```{r, fig.width=10, fig.height=5, fig.fullwidth=TRUE}
plotA <- VariableFeaturePlot(seur_comb, cols = mycoloursP[4:5])
plotB <- LabelPoints(plot = plotA, points = top10, repel = TRUE)
plotA + plotB
```

#scale all genes (if there is a batch effect I could add vars to regress here)
```{r}
all_genes <- rownames(seur_comb)
seur_comb <- ScaleData(seur_comb, features = all_genes)
```

#linear dimensional reduction
```{r}
seur_comb <- RunPCA(seur_comb, features = VariableFeatures(object = seur_comb), 
                    npcs = 100)
```

#visualise
```{r, fig.width=10, fig.height=8, fig.fullwidth=TRUE}
VizDimLoadings(seur_comb, dims = 1:2, reduction = "pca")
```

```{r}
DimPlot(seur_comb, reduction = "pca", cols= mycoloursP[22:40])
```

```{r}
DimHeatmap(seur_comb, dims = 1, cells = 500, balanced = TRUE)
```

```{r, fig.width=10, fig.height=55, fig.fullwidth=TRUE}
DimHeatmap(seur_comb, dims = 1:50, cells = 500, balanced = TRUE)
```
Dimensional reduction

```{r}
ElbowPlot(seur_comb, ndims = 100)
```


Horn's paralell analysis
```{r}

comb_sce<- as.SingleCellExperiment(seur_comb)

var_feat<-VariableFeatures(seur_comb)

set.seed(100010)
horn <- PCAtools::parallelPCA(logcounts(comb_sce)[var_feat,],
    BSPARAM=BiocSingular::IrlbaParam(), niters=10)
horn$n

var_horn <- horn$original$variance

plot(var_horn, type="b", log="y", pch=16,
     xlab = "Principal components",
     ylab = "Variance explained")
permuted <- horn$permuted
for (i in seq_len(ncol(permuted))) {
    points(permuted[,i], col="grey80", pch=16)
    lines(permuted[,i], col="grey80", pch=16)
}
abline(v=horn$n, col="red")

```


```{r}
seur_comb <- FindNeighbors(seur_comb, dims = 1:25)
seur_comb <- FindClusters(seur_comb, resolution = 0.5)
```


```{r}
seur_comb <- RunUMAP(seur_comb, dims = 1:25)

seur_comb <- RunTSNE(seur_comb, dims = 1:25)



#Idents(seur_comb)<-"seurat_clusters"
DimPlot(seur_comb, reduction = "umap", label = T, cols = mycoloursP, 
        pt.size = 0.5)
```

```{r}
DimPlot(seur_comb, reduction = "tsne", label = T, cols = mycoloursP, 
        pt.size = 0.5)
```




```{r, fig.width=7, fig.height=4, fig.fullwidth=TRUE}
DimPlot(seur_comb, reduction = "pca", label = F, cols = mycoloursP, 
        pt.size = 2)
```



```{r}
seur_comb <- FindClusters(seur_comb, resolution = 0.8 )

Idents(seur_comb) <- "RNA_snn_res.0.8"
DimPlot(object = seur_comb, reduction = "umap", label=T, cols = mycoloursP)

DimPlot(object = seur_comb, reduction = "umap", label=F, cols = mycoloursP)
```


```{r}
Idents(seur_comb) <- "RNA_snn_res.0.8"
DimPlot(object = seur_comb, reduction = "tsne", label=T, cols = mycoloursP)

DimPlot(object = seur_comb, reduction = "tsne", label=F, cols = mycoloursP)
```



```{r}
Idents(seur_comb) <- "RNA_snn_res.0.8"


DimPlot(object = seur_comb, reduction = "pca", label=F, cols = mycoloursP)
```

How many individuals contribute to every cluster:
```{r}
IDsPerCluster <- as.data.frame(tapply(seur_comb@meta.data$caseNO, 
                                    seur_comb@meta.data$RNA_snn_res.0.8, 
                                    function(x) length(unique(x)) ))

names(IDsPerCluster) <- "NumberOfIDs"
IDsPerCluster$RNA_snn_res.0.8 <- rownames(IDsPerCluster)
IDsPerCluster$Cluster <- rownames(IDsPerCluster)

IDsPerCluster
```

```{r}

met_data <- seur_comb@meta.data

met_data$Cluster <- met_data$RNA_snn_res.0.8

merged_dat <- merge(met_data, IDsPerCluster, by = "Cluster", all = TRUE)

merged_dat$Barcode == met_data$Barcode

barcode_oder <- met_data$Barcode

merged_dat <- merged_dat[match(barcode_oder, merged_dat$Barcode),]

merged_dat$Barcode == met_data$Barcode

id_vector <- merged_dat$NumberOfIDs
seur_comb$CountsPerCluster_res_0_8 <- id_vector


#seur_comb@meta.data$Cluster_0_5<-seur_comb@meta.data$RNA_snn_res.0.5


DimPlot(seur_comb, reduction = "umap", label = F, cols = mycoloursP[3:40],
        group.by = "CountsPerCluster_res_0_8")



```


```{r}
seur_comb@meta.data$IdCountGroup <- 
  ifelse(seur_comb@meta.data$CountsPerCluster_res_0_8 < 10, "< 10", 
         ifelse(seur_comb@meta.data$CountsPerCluster_res_0_8 == 20, 20, 
                paste(seur_comb@meta.data$CountsPerCluster_res_0_8)))

Idents(seur_comb)<-"IdCountGroup"
DimPlot(seur_comb, reduction = "umap", label = F, cols = mycoloursP[20:30])

```


How many samples contribute to each cluster


```{r}
samples_per_cluster <- as.data.frame(tapply(seur_comb@meta.data$process_number, 
                                    seur_comb@meta.data$RNA_snn_res.0.8, 
                                    function(x) length(unique(x)) ))

names(samples_per_cluster) <- "samples_per_cluster"
samples_per_cluster$RNA_snn_res.0.8 <- rownames(samples_per_cluster)
samples_per_cluster$Cluster <- rownames(samples_per_cluster)

samples_per_cluster
```

```{r}


merged_dat <- merge(merged_dat, samples_per_cluster, by = "Cluster", all = TRUE)

#merged_dat$Barcode == met_data$Barcode

barcode_oder <- met_data$Barcode

merged_dat <- merged_dat[match(barcode_oder, merged_dat$Barcode),]

merged_dat$Barcode == met_data$Barcode

sample_vector <- merged_dat$samples_per_cluster


seur_comb$samplesPerCluster_res_0_8 <- sample_vector


#seur_comb@meta.data$Cluster_0_5<-seur_comb@meta.data$RNA_snn_res.0.5


DimPlot(seur_comb, reduction = "umap", label = F, cols = mycoloursP[8:50],
        group.by = "samplesPerCluster_res_0_8")



```


```{r}
seur_comb@meta.data$qurtile_sample__per_cl_group

seur_comb@meta.data$sample_count_per_cl_group <- 
  ifelse(seur_comb@meta.data$samplesPerCluster_res_0_8 < 10, "< 10", 
         ifelse(seur_comb@meta.data$samplesPerCluster_res_0_8 == 58, "58 (All)",
                ifelse(between(seur_comb@meta.data$samplesPerCluster_res_0_8, 
                               10, 29), "10 - 29",
                       ifelse(between(seur_comb@meta.data$samplesPerCluster_res_0_8, 
                               30, 39), "30 - 39",
                              ifelse(between(seur_comb@meta.data$samplesPerCluster_res_0_8, 
                               40, 49), "40 - 49",
                               "50 - 57")))))


DimPlot(seur_comb, reduction = "umap", label = F, cols = mycoloursP[14:50],
        group.by = "sample_count_per_cl_group")

```

```{r}
DimPlot(seur_comb, reduction = "umap", label = F, cols = mycoloursP[20:30],
        group.by = "sample_count_per_cl_group",
        split.by = "sample_count_per_cl_group")

```



Relevel 10 X batch for plotting in order
```{r}

seur_comb$X10XBatch <- as.factor(seur_comb$X10XBatch)
seur_comb$X10XBatch <- factor(seur_comb$X10XBatch, levels = c("1",
                                                              "2",
                                                              "3",
                                                              "4",
                                                              "5",
                                                              "6",
                                                              "7",
                                                              "8",
                                                              "9",
                                                              "10"))
Idents(seur_comb)<-"X10XBatch"
mycolours<- mycoloursP[1:7]



DimPlot(seur_comb, reduction = "umap", cols = mycoloursP)

```

```{r, fig.width=7, fig.height=5, fig.fullwidth=TRUE}
Idents(seur_comb)<-"caseNO"


DimPlot(seur_comb, reduction = "umap", cols = mycoloursP)
```


```{r}

Idents(seur_comb)<-"SequencingPool"
mycolours<- mycoloursP[1:6]

DimPlot(seur_comb, reduction = "umap", cols = mycoloursP)
```



```{r}
Idents(seur_comb)<-"gender"
mycolours<- mycoloursP[1:2]

DimPlot(seur_comb, reduction = "umap", cols = mycoloursP)
```


```{r}
seur_comb@meta.data$AgeGroup<-ifelse(seur_comb@meta.data$Age < 50, "Young", "Old")

Idents(seur_comb)<-"AgeGroup"

DimPlot(seur_comb, reduction = "umap", cols = mycoloursP[10:11])
```


```{r}
seur_comb@meta.data$RIN<-ifelse(seur_comb@meta.data$RINvalue == "", "NULL", 
                                seur_comb@meta.data$RINvalue)
Idents(seur_comb)<-"RIN"


DimPlot(seur_comb, reduction = "umap", cols = mycoloursP[6:40])

```


```{r}

Idents(seur_comb)<-"UMI_QC"
mycolours<- mycoloursP[25:26]

DimPlot(seur_comb, reduction = "umap", cols = mycolours)

```

```{r}

Idents(seur_comb)<-"CauseOfDeath_category"


DimPlot(seur_comb, reduction = "umap", cols = mycoloursP[20:27])

```

Doublet markers
```{r}

DimPlot(seur_comb, reduction = "umap", 
        label = F, 
        group.by = "scDblFinder_class",
        cols = c(mycoloursP[25] , mycoloursP[26]),
        pt.size = 0.7)

```

```{r}

DimPlot(seur_comb, reduction = "umap", 
        label = F, 
        group.by = "scDblFinder_class",
        split.by = "scDblFinder_class",
        cols = c(mycoloursP[25] , mycoloursP[26]),
        pt.size = 0.7)

```





```{r}

seur_comb@meta.data$ageSex <- ifelse(seur_comb@meta.data$gender == 
                                       "M" & 
                                       seur_comb@meta.data$AgeGroup == 
                                       "Old", 
                                     "Old men",
                                     ifelse(seur_comb@meta.data$gender == 
                                              "M" & 
                                              seur_comb@meta.data$AgeGroup == 
                                              "Young", 
                                            "Young men",
                                            ifelse(seur_comb@meta.data$gender == 
                                                     "F" & 
                                                     seur_comb@meta.data$AgeGroup == 
                                                     "Young", "Young women",
                                                 "Old women")))

Idents(seur_comb)<-"ageSex"

DimPlot(seur_comb, reduction = "umap", label = F, cols = mycoloursP[15:42])

```


```{r}
seur_comb <- FindClusters(seur_comb, resolution = 0.1)

DimPlot(seur_comb, reduction = "umap", label = T, cols = mycoloursP)
```










The following markers were used for cell types: 
neurons: SNAP25, STMN2,RBFOX3 and GABRB2;
inhibitory neurons: GAD1, GAD2 and SLC32A1; 
excitatory neurons: SATB2, SLC17A7 and SLC17A6; 
astrocytes: GJA1, AQP4, GLUL, SOX9, NDRG2, GFAP, ALDH1A1, ALDH1L1 and VIM; 
endothelial: CLDN5, VTN and ICAM2; 
pericytes: PDGFRB and NOTCH3
macrophage/microglia: SPI1, MRC1, TMEM119, CX3CR1 and A1F1; 
OPCs: PDGFRA, CSPG4, GPR17, PTPRZ1, OLIG1, OLIG2, PCDH15, PTGDS,
oligodendrocytes: PLP1, CNP, MAG, MOG, MOBP, MBP, SOX10


```{r}
FeaturePlot(seur_comb, features = "SNAP25") 

FeaturePlot(seur_comb, features = "GABRB2") 

FeaturePlot(seur_comb, features = "PLP1") 

FeaturePlot(seur_comb, features = "CNP") 

FeaturePlot(seur_comb, features = "GJA1") 

FeaturePlot(seur_comb, features = "AQP4") 

FeaturePlot(seur_comb, features = "CD74") 

FeaturePlot(seur_comb, features = "CX3CR1") 

FeaturePlot(seur_comb, features = "CLDN5") 

FeaturePlot(seur_comb, features = "ICAM2") 

FeaturePlot(seur_comb, features = "PDGFRB") 

FeaturePlot(seur_comb, features = "NOTCH3") 

FeaturePlot(seur_comb, features = "PDGFRA") 

FeaturePlot(seur_comb, features = "PCDH15") 

FeaturePlot(seur_comb, features = "GAD1") 

FeaturePlot(seur_comb, features = "GAD2") 

FeaturePlot(seur_comb, features = "SLC32A1") 

FeaturePlot(seur_comb, features = "SATB2") 

FeaturePlot(seur_comb, features = "SLC17A7") 

FeaturePlot(seur_comb, features = "SLC17A6")






```

Above are only a few marker genes. More can be found in a separate markdown file
with a respective descriptive name.




```{r}

seur_comb$rough_annot <- seur_comb$RNA_snn_res.0.1

Idents(seur_comb) <- "rough_annot"

seur_comb <-  RenameIdents(
  seur_comb, 
  "0" = "Oligo", 
  "1" = "RELN+ neurons", 
  "2" = "Endothelial-Pericyte",
  "3" = "Microglia_Macrophages",
  "4" = "Neuron_In",
  "5" = "OPC", 
  "6" = "Neuron_Ex",
  "7" = "Astrocyte", 
  "8" = "Endothelial-Pericyte", 
  "9" = "Unidendtified",
  "10" = "Neuron",
  "11" = "Unidendtified",
  "12" = "Neuron",
  "13" = "Unidendtified",
  "14" = "Neuron"
)




```

```{r}
seur_comb$rough_annot <- Idents(seur_comb)

```

```{r, fig.width=7, fig.height=6, fig.fullwidth=TRUE}

DimPlot(seur_comb, 
        label = T, 
        cols = mycoloursP, 
        repel = T, 
        group.by = "rough_annot")
```






```{r, fig.width=7, fig.height=6, fig.fullwidth=TRUE}
DimPlot(seur_comb, 
        label = F, 
        cols = mycoloursP, 
        group.by = "rough_annot")


```





During our analysis we identified several samples with lower quality based on 
either their high total nuclei count, low mean UMI count or large proportion 
of spliced compared to unspliced mRNAs. We will mark those samples in the 
metadata

```{r}
sample_qc_failed <- c(27, 8, 43, 63, 5, 2, 69, 45, 66, 50)

seur_comb$sample_qc_failed <- ifelse(seur_comb$process_number 
                                     %in% sample_qc_failed, 
                                     "TRUE", "FALSE")
```

```{r}
DimPlot(seur_comb, group.by = "sample_qc_failed", cols = mycoloursP)

DimPlot(seur_comb, 
        group.by = "sample_qc_failed", 
        split.by = "sample_qc_failed", 
        cols = c(mycoloursP[6], mycoloursP[5]  ))
```

```{r}

DimPlot(seur_comb, 
        group.by = "QC_UMI", 
        split.by = "QC_UMI", 
        cols = mycoloursP[16:17])
```



```{r}
summary(as.factor(seur_comb$sample_qc_failed))


```








save annotatedData 

```{r}


dir.create("/Users/lseeker/Documents/Work/HumanCellAtlas/splice_control_out/datasets/05_Annotated/allCelltypes")

saveRDS(seur_comb, "/Users/lseeker/Documents/Work/HumanCellAtlas/splice_control_out/datasets/05_Annotated/allCelltypes/HCA_rough_annotated_all.RDS")


```


Save a master cell metadata that includes all the cellcycle scores and 
doublet scores etc. and also a file including the barcodes of included oligodendrocytes.

```{r}
met_data <- seur_comb@meta.data

barcode_df <- data.frame(barcode = oligos$Barcode)


write.csv(barcode_df, "/Users/lseeker/Documents/Work/HumanCellAtlas/git_repos/AW_scRNAseq/data/HCA_oligodendroglia_barcodes.csv" )

write.csv(met_data, "/Users/lseeker/Documents/Work/HumanCellAtlas/git_repos/AW_scRNAseq/data/HCA_master_nuclei_metadata.csv" )
```

```{r}


sessionInfo()

```