scr/AllCells/ScaterQC.Rmd · CorticalOrganoids

---
title: "Organoids_QC"
author: "Nina-Lydia Kazakou"
date: "18 May 2021"
output: html_document
---

# load libraries 
```{r}
library(SingleCellExperiment)
library(Seurat)
library(scater)
library(scran)
library(devtools)
library(dplyr)
library(ggsci)
library(tidyverse)
library(Matrix)
library(scales)
library(here)
```

# set working directory
```{r}
setwd("/exports/eddie/scratch/s1241040/BO/Ananlysis/")
```

# Load fastq files & Create Seurat Object
```{r}
T1 <- Read10X("/exports/eddie/scratch/s1241040/BO/CellRanger_outputs/14462WApool01__BO_wk8/outs/filtered_feature_bc_matrix/")
T2 <- Read10X("/exports/eddie/scratch/s1241040/BO/CellRanger_outputs/14462WApool01__BO_wk16/outs/filtered_feature_bc_matrix/")
T3 <- Read10X("/exports/eddie/scratch/s1241040/BO/CellRanger_outputs/14462WApool01__BO_wk21//outs/filtered_feature_bc_matrix/")
T4 <- Read10X("/exports/eddie/scratch/s1241040/BO/CellRanger_outputs/14462WApool01__BO_wk25///outs/filtered_feature_bc_matrix/")

wk8 <- CreateSeuratObject(counts = T1, project = "Cortical_Organoids_wk8")
wk16 <- CreateSeuratObject(counts = T2, project = "Cortical_Organoids_wk16")
wk21 <- CreateSeuratObject(counts = T3, project = "Cortical_Organoids_wk21")
wk25 <- CreateSeuratObject(counts = T4, project = "Cortical_Organoids_wk25")

CO.merge <- merge(x = wk8, y = c(wk16, wk21, wk25), add.cell.id = c("wk8", "wk16", "wk21", "wk25"), project = "Cortical Brain Organoids")
View(CO.merge@meta.data)

sparse.size <- object.size(CO.merge)
sparse.size
#705283216 bytes
```

# Populate the metadata
```{r}
# Add number of genes per UMI for each cell to metadata.filt
CO.merge$log10GenesPerUMI <- log10(CO.merge$nFeature_RNA) / log10(CO.merge$nCount_RNA)

# Calculate the percentage of mitochondrial genes
CO.merge[["percent.mito"]] <- PercentageFeatureSet(CO.merge, pattern = "^MT-")

# Calculate the percentage of ribosomal genes
CO.merge[["percent.ribo"]] <- PercentageFeatureSet(CO.merge, pattern = "^RP[SL]")

# Add 10x kit chemistry version 
chem <- rep("v3.1", ncol(CO.merge))
CO.merge <- AddMetaData(CO.merge, chem, col.name = "Chemistry")

## Create a data.frame to add a few more meta.data, so that there is no confusion that I will bind in the end with my object
metadata.filt <- CO.merge@meta.data

# Add a Cells column
metadata.filt$Cells <- rownames(metadata.filt)

# Add a column specific to each Sample
metadata.filt$Sample <- NA
t1 <- grepl("^wk8_", metadata.filt$Cells)
t2 <- grepl("^wk16_", metadata.filt$Cells)
t3 <- grepl("^wk21_", metadata.filt$Cells)
t4 <- grepl("^wk25_", metadata.filt$Cells)

metadata.filt$Sample[which(t1)] <- "wk8"
metadata.filt$Sample[which(t2)] <- "wk16"
metadata.filt$Sample[which(t3)] <- "wk21"
metadata.filt$Sample[which(t4)] <- "wk25"

# Megre data.frame with Seurat Object
CO.merge@meta.data <- metadata.filt
View(CO.merge@meta.data)

```

# Set the colour pallete 
```{r}
mypal <- pal_npg("nrc", alpha = 0.7)(10)
mypal2 <-pal_tron("legacy", alpha = 0.7)(7)
mypal3 <- pal_lancet("lanonc", alpha = 0.7)(9)
mypal4 <- pal_simpsons(palette = c("springfield"), alpha = 0.7)(16)
mypal5 <- pal_rickandmorty(palette = c("schwifty"), alpha = 0.7)(6)
mypal6 <- pal_futurama(palette = c("planetexpress"), alpha = 0.7)(5)
mypal7 <- pal_startrek(palette = c("uniform"), alpha = 0.7)(5)
mycoloursP<- c(mypal, mypal2, mypal3, mypal4, mypal5, mypal6, mypal7)
show_col(mycoloursP, labels =F)
```


# Do quick QC VlnPlots  
```{r}
VlnPlot(CO.merge, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"), ncol = 3, pt.size = 0.02, cols = mycoloursP)
```

# Save merged seurat.object
```{r}
saveRDS(CO.merge, here("data", file = "CO.merge.object.rds"))
```

# Filter out unwanted genes
```{r}
# Generate SCE object
co.sce <- as.SingleCellExperiment(CO.merge)

# Check the number of genes
nrow(co.sce)
#36601

keep_feature <- rowSums(counts(co.sce) > 0) >= 10
co.sce <- co.sce[keep_feature, ]
nrow(co.sce)
#22735
```

# Filter out low quality cells
```{r}
df <- perCellQCMetrics(co.sce, subsets=list(Mito=grep("MT-", rownames(co.sce))))
df

df$Cell.id <- rownames(df)

df$Timepoint <- NA
C1 <- grepl("^wk8_", df$Cell.id)
C2 <- grepl("^wk16_", df$Cell.id)
C3 <- grepl("^wk21_",df$Cell.id)
C4 <- grepl("^wk25_", df$Cell.id)

df$Timepoint[which(C1)] <- "wk8"
df$Timepoint[which(C2)] <- "wk16"
df$Timepoint[which(C3)] <- "wk21"
df$Timepoint[which(C4)] <- "wk25"

head(df)
```

```{r}
# Add metrics to the object meta.data
co.sce <- addPerCellQC(co.sce, subsets=list(Mito=grep("MT-", rownames(co.sce))))
colnames(colData(co.sce))
```

```{r}
# Plot data
plotColData(co.sce, x = "sum", y="detected", colour_by="Sample") + ggtitle("Total Cells")
plotColData(co.sce, x = "Sample", y="sum") + ggtitle("Total Count")
plotColData(co.sce, x = "Sample", y="detected") + ggtitle("Detected Features")
```

# Detect low quality cells automatically & check applied thresholds
```{r}
# Low UMI Counts
qc.lib.lower <- isOutlier(df$sum, log=TRUE, type="lower")
lowUMI <- as.numeric(attr(qc.lib.lower, "thresholds")[2])
if(lowUMI == Inf){
lowUMI <-0
}
if(min(df$sum)>= lowUMI){
print("Filter for low UMI counts are too permissive. Apply manual set threshold. Default is 100")
qc.lib.lower<- df$sum<20
lowUMI<-20
}else{
print("Calculated thresholds seem to filter for low UMI counts")
}
```


```{r}
# High UMI Counts
qc.lib.higher <-isOutlier(df$sum, log=TRUE, type="higher")
higherUMI <-  as.numeric(attr(qc.lib.higher, "thresholds")[2])
if(max(df$sum)<= higherUMI){
  print("Filter for high UMI counts too permissive. Apply manual set threshold.")
  qc.lib.higher<- df$sum > 30000
  higherUMI<-30000
}else{
  print("Calculated thresholds seem to filter for high UMI counts")
}
```

```{r}
attr(qc.lib.higher, "thresholds")
```


```{r}
# Low Gene Counts
qc.nexprs.lower <- isOutlier(df$detected, log=TRUE, type="lower")
lowerFeatures <- as.numeric(attr(qc.nexprs.lower, "thresholds")[2])
if(min(df$detected)>= lowerFeatures | lowerFeatures < 10){
  print("Filter for high gene counts too permissive. Apply manual set threshold.")
  qc.nexprs.lower<-(df$detected)<10
  lowerFeatures<-10
}else{
  print("Calculated thresholds seem to filter for high gene counts")
}
```

```{r}
attr(qc.nexprs.lower, "thresholds")
```


```{r}
# High Gene Counts 
qc.nexprs.higher <- isOutlier(df$detected, log=TRUE, type="higher")
higherFeatures <- as.numeric(attr(qc.nexprs.higher, "thresholds")[2])
if(max(df$detected)<= higherFeatures){
  print("Filter for high gene counts too permissive. Apply manual set threshold.")
  qc.nexprs.higher<-(df$detected)> 8000
  higherFeatures<-8000
}else{
  print("Calculated thresholds seem to filter for high gene counts")
}
```


```{r}
# Mitochondrial Genes 
qc.mito <- isOutlier(df$subsets_Mito_percent, type="higher")
attr(qc.mito, "thresholds")
percMitoThres<- as.numeric(attr(qc.mito, "thresholds"))
```

# Apply manual thresholds where the is.Outlier was too permissive 
```{r}
# Low UMI Counts; use the default which is 100
qc.lib.low <- df$sum < 100

# High Gene Counts; apply different thresholds for separate samples
qc.nexprs.high.wk8 <- (df$detected > 3000 & df$Timepoint == "wk8")
qc.nexprs.high.other <- (df$detected > 7500 & df$Timepoint %in% c("wk16", "wk21", "wk25"))

sum(qc.nexprs.high.wk8) #9034
sum(qc.nexprs.high.other) #5410

qc.nexprs.high <- cbind(qc.nexprs.high.other, qc.nexprs.high.wk8)
unique(qc.nexprs.high)
```

# Create a vector that contains all the low quality cells
```{r}
discard <- qc.lib.low | qc.lib.higher | qc.nexprs.lower | qc.nexprs.high.wk8 | qc.nexprs.high.other | qc.mito
```

# Create output file with thresholds
```{r}
filterDF <- DataFrame(LibSize_low = sum(qc.lib.low),
                      LibSize_low_threshold = lowUMI,  
                      LibSize_high = sum(qc.lib.higher), 
                      LibSize_high_threshold = higherUMI, 
                      NExprs_low = sum(qc.nexprs.lower), 
                      NExprs_low_threshold = lowerFeatures,
                      NExprs_high = sum(qc.nexprs.high), 
                      NExprs_high_threshold = higherFeatures,
                      MitoPerc = sum(qc.mito), 
                      MitoPeec_threshold = percMitoThres,
                      Total = sum(discard))

write.csv(filterDF, here("outs", file = "cell_qc_threshold_table.csv"))
```

# Add the data to the object meta.data (optional)
```{r}
new.dataframe <- data.frame(low_lib_size = qc.lib.low, 
                     large_lib_size = qc.lib.higher,
                     low_n_features = qc.nexprs.lower, 
                     high_n_features_wk8 = qc.nexprs.high.wk8,
                     high_n_features_other = qc.nexprs.high.other,
                     high_subsets_mito_percent = qc.mito, 
                     discard = discard)

colData(co.sce) <- cbind(colData(co.sce), new.dataframe)
```

# Add a column to the object meta.data with the cells that failed QC
```{r}
co.sce$ScaterQC_failed <- new.dataframe$discard                                                                          
colnames(colData(co.sce))
```

# Plot Cells
```{r}
plotColData(co.sce, x = "sum", y="detected", colour_by="Sample") + ggtitle("Total Cells") 
plotColData(co.sce, x = "Sample", y="sum", colour_by="ScaterQC_failed") + ggtitle("Total Count")
plotColData(co.sce, x = "Sample", y="detected", colour_by="ScaterQC_failed") + ggtitle("Detected Features")
plotColData(co.sce, x="Sample", y="subsets_Mito_percent", colour_by="ScaterQC_failed") + ggtitle("Mito percent")
plotColData(co.sce, x="sum", y="subsets_Mito_percent", colour_by="ScaterQC_failed", other_fields = "Sample") + facet_wrap(~Sample)
```

```{r}
 UMIplot <- plotColData(co.sce, 
                         x="Sample", 
                         y="sum", 
                         colour_by="ScaterQC_failed") +  
    scale_y_log10() + 
    ggtitle("Total UMI count per cell") + 
    guides(fill=guide_legend(title= "Failed Scater QC"))


FeaturePlotQC <- plotColData(co.sce, 
                               x="Sample", 
                               y="detected", 
                               colour_by="ScaterQC_failed") +
    scale_y_log10() + 
    ggtitle("Detected total features per cell") + 
    guides(fill=guide_legend(title= "Failed Scater QC"))
  
  
  
  MitoPlot<-plotColData(co.sce, 
                        x="Sample", 
                        y="percent.mito", 
                        colour_by="ScaterQC_failed") + 
    ggtitle("Percent mitochondrial genes per cell") + 
    guides(fill=guide_legend(title= "Failed Scater QC"))


myQCPlotList<- list()
myQCPlotList[[1]]<-UMIplot
myQCPlotList[[2]]<-FeaturePlotQC
myQCPlotList[[3]]<-MitoPlot

mp <-  multiplot(plotlist = myQCPlotList, cols = 1)

dir.create(here("outs", "QC"))

pdf(here("outs", "QC", file = "Scater_Filterplot.pdf"), width = 7, height = 6)

print(mp)

dev.off()
```

```{r}
scatter <- plotColData(co.sce, 
                          x = "sum", 
                          y = "percent.mito", 
                          colour_by = "ScaterQC_failed") 
  
pdf(here("outs", "QC", file = "ScatterPlot.pdf"), width = 5, height = 5)
  
print(scatter)
  
dev.off()
```

# Create a new sce without the low quality cells
```{r}
filtered.co.sce <- co.sce[,!new.dataframe$discard]
```

```{r}
saveRDS(filtered.co.sce, here("data", file = "filtered.co.sce.rds"))
```