Scripts/Mixed_Species_Analysis/MixedSpeciesQC.Rmd · Chimeras

---
title: "QC of mixed populations"
author: "Nina-Lydia Kazakou"
date: "02/02/2022"
output: html_document
---

# Set-up
```{r output-code, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r set-up, message=FALSE, warning=FALSE}
library(SingleCellExperiment)
library(Seurat)
library(scater)
library(scran)
library(devtools)
library(dplyr)
library(ggsci)
library(tidyverse)
library(Matrix)
library(scales)
library(here)
```

I want to generate a srt objects that contain the nCounts and the nFeatures, as well as the metadata from the cell_spieces for all the different filters. 

```{r test with one sample}
Met_A_species_80 <- readRDS(here("Processed/filter_80/19880WApool01__Metformin_A_S1cell_species.rds"))

Met_A_counts <- Read10X("/Nina/Chimeras/Data/STARsolo/19880WApool01__Metformin_A_S1/Solo.out/Gene/filtered/")
head(Met_A_counts)

tot_Met_A_counts <- Matrix::colSums(Met_A_counts)
summary(tot_Met_A_counts)

srt_Met_A <- CreateSeuratObject(counts = Met_A_counts, min.cells = 3)

srt_Met_A <- AddMetaData(srt_Met_A, metadata = Met_A_species_80$barcode, col.name = "barcode")
srt_Met_A <- AddMetaData(srt_Met_A, metadata = Met_A_species_80$n_mouse_umi, col.name = "n_mouse_umi")
srt_Met_A <- AddMetaData(srt_Met_A, metadata = Met_A_species_80$tot_umi, col.name = "tot_umi")
srt_Met_A <- AddMetaData(srt_Met_A, metadata = Met_A_species_80$prop_mouse, col.name = "prop_mouse")
srt_Met_A <- AddMetaData(srt_Met_A, metadata = Met_A_species_80$prop_human, col.name = "prop_human")
srt_Met_A <- AddMetaData(srt_Met_A, metadata = Met_A_species_80$species, col.name = "species")

head(srt_Met_A, 5)

dir.create(here("Processed", "filter_80", "srt_objects"))

saveRDS(srt_Met_A, here("Processed", "filter_80", "srt_objects", file = "srt_Met_A.rds"))
```



```{r create srt objects}
my_folders <- c("filter_70/", "filter_80/", "filter_90/")

for (f in 1:length(my_folders)){
  my_files = list.files(paste0("Processed/", my_folders[f]))[1:10]
  my_files_sub = my_files
  str_sub(my_files_sub, -16, -1) = ""
  
  dir.create(here("Processed", my_folders[f], "srt_objects"))
  
  for (i in 1:length(my_files)){
    rds = readRDS(paste0("Processed/", my_folders[f], my_files[i]))
    counts = Read10X(paste0("/Nina/Chimeras/Data/STARsolo/", my_files_sub[i], "/Solo.out/Gene/filtered/"))
    
    srt <- CreateSeuratObject(counts = counts, min.cells = 3)
    srt <- AddMetaData(srt, metadata = rds$barcode, col.name = "barcode")
    srt <- AddMetaData(srt, metadata = rds$n_mouse_umi, col.name = "n_mouse_umi")
    srt <- AddMetaData(srt, metadata = rds$tot_umi, col.name = "tot_umi")
    srt <- AddMetaData(srt, metadata = rds$prop_mouse, col.name = "prop_mouse")
    srt <- AddMetaData(srt, metadata = rds$prop_human, col.name = "prop_human")
    srt <- AddMetaData(srt, metadata = rds$species, col.name = "species")
    
    saveRDS(srt, here("Processed", my_folders[f], "srt_objects", file = paste0("srt_", my_files_sub[i], ".rds")))
  }
  
  rm(my_files, my_files_sub)
}
```

I want to create a merge srt object containing all the samples and cell_spieces info for all filters.

```{r merge srt}
memory.limit(30000)

srt_folders <- c("filter_70/srt_objects", "filter_80/srt_objects", "filter_90/srt_objects")
filter_70 = list()
filter_80 = list()
filter_90 = list()
lists = list(filter_70, filter_80, filter_90); rm(filter_70, filter_80, filter_90)
names(lists) = c("filter_70", "filter_80", "filter_90")
srt_combined = list()

 for (l in 1:length(srt_folders)){
  my_files = list.files(paste0("Processed/", srt_folders[l]))
  
  for (n in 1:length(my_files)){
  lists[[l]][[n]] <- readRDS(paste0("Processed/", srt_folders[l], "/", my_files[n]))
  }

  srt_combined[[l]] <- merge(lists[[l]][[1]], c(lists[[l]][[2]], lists[[l]][[3]], lists[[l]][[4]], lists[[l]][[5]], lists[[l]][[6]], lists[[l]][[7]], lists[[l]][[8]], lists[[l]][[9]], lists[[l]][[10]]), add.cell.ids = c("Clemastine_A", "Clemastine_B", "Clemastine_C", "ddH2O_A", "ddH2O_B", "DMSO_A", "DMSO_B", "Metformin_A", "Metformin_B", "Metformin_C"), project = "Chimeras_Mixed_Analysis")
  
  saveRDS(srt_combined[[l]], here("Processed", srt_folders[l],  file = paste0("srt_combined_", names(lists)[l], ".rds")))
}

names(srt_combined) = names(lists)

```

I want to create a combine_sce object for each cell_species theshold.

```{r}
my_folders <- c("filter_70", "filter_80", "filter_90")
srt_folders <- c("filter_70/srt_objects/", "filter_80/srt_objects/", "filter_90/srt_objects/")

for (b in 1:length(my_folders)){
  dir.create(here("Processed", my_folders[b], "sce_8080combined_object"))

  for (a in 1:length(srt_folders)){
  my_file <- list.files(paste0("Processed/", srt_folders[a]))[11]

    for (i in 1:length(my_file)){
  
      rds = readRDS(paste0("Processed/", srt_folders[a], my_file[i]))
      rds = AddMetaData(rds, metadata = my_folders[a], col.name = "SpeciesThreshold")
      sce = as.SingleCellExperiment(rds)
      saveRDS(rds, here("Processed", my_folders[b], "sce_8080combined_object", file = paste0("sce_8080combined_", my_folders[b], ".rds")))
   }
 }
}
```


5
*#Filter_70*
  
```{r}
sce_70 <- readRDS(here("Processed", "filter_70", "sce_combined_object", file = "sce_combined_filter_70.rds"))
```

# QC

I will use the scater package to add some quality information per cell. This computes for each cell some useful metrics such as the number of umi counts (library size), the number of detected genes and the percentage of mitochondiral genes.

I will use the automatic isOutlier function to determine which values in a numeric vector are outliers based on the median absolute deviation (MAD). Any cell with >3MADs is marked as an outlier. When using this function with low number a log transformation is added, that prevents negative thresholds.

```{r identifyMitoGenes}
is_mito <- grepl("GRCh38-MT-|mm10---Mt-", rownames(sce_70))

table(is_mito)["TRUE"]
```


```{r addQC}
sce_70 <- addPerCellQC(sce_70, subsets = list(Mito = grep("GRCh38-MT-|mm10---Mt-", rownames(sce_70))))

outlier_lib_low <- isOutlier(sce_70$total, log = TRUE, type = "lower")

outlier_expr_low <- isOutlier(sce_70$detected, log = TRUE, type = "lower")

outlier_lib_high <- isOutlier(sce_70$total, type = "higher")

outlier_expr_high <-  isOutlier(sce_70$detected, type = "higher")

# outlier_mt <- isOutlier(sce_70$subsets_mt_percent, type = "higher")

outlier <- outlier_lib_low | outlier_expr_low | outlier_lib_high | outlier_expr_high #| outlier_mt
```

```{r}
summary_outlier <- data.frame(
  lib_size_high = c(sum(outlier_lib_high),
                    attr(outlier_lib_high, "thresholds")[2]),
  expression_high = c(sum(outlier_expr_high),
                      attr(outlier_expr_high, "thresholds")[2]),
  lib_size_low = c(sum(outlier_lib_low),
                   attr(outlier_lib_low, "thresholds")[1]),
  expression_low = c(sum(outlier_expr_low),
                     attr(outlier_expr_low, "thresholds")[1]),
  #mt_pct = c(sum(outlier_mt), attr(outlier_mt, "thresholds")[2]),
  total = c(sum(outlier), NA)
)

row.names(summary_outlier) <- c("Cells filtered", "Threshold")

dir.create(here("outs", "filter_70", "ScaterQC"))
write.csv(summary_outlier, here("outs", "filter_70", "ScaterQC", "autofilter_summary.csv"))

sce_70$outlier <- outlier

summary_outlier <- read.csv(here("outs", "filter_70", "ScaterQC", "autofilter_summary.csv"))
summary_outlier
```

## Plots Before QC

Visualize data distribution

```{r ViolinPlots}
# Total Count
plotColData(sce_70, x = "species", y = "sum", colour_by = "outlier") +
  ggtitle("Total count") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

plotColData(sce_70, x = "species", y = "sum", colour_by = "outlier") +
  scale_y_log10() + ggtitle("Total count log scale") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Genes
plotColData(sce_70, x = "species", y = "detected", colour_by = "outlier") +
  scale_y_log10() + ggtitle("Detected Genes") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
```

```{r Histograms}
hist(sce_70$total,breaks = 100)
hist(sce_70$detected, breaks = 100)
```
In the graphs above, we can see:
  in the x axis: the total number of umi (library size) & the number of detected genes per cell
in the y axis: the number of cells for each measure


```{r ScatterPlots}
plotColData(sce_70, x = "sum", y = "detected", colour_by = "outlier")
plotColData(sce_70, x = "sum", y = "detected", colour_by = "species")
```



```{r RemoveObjects}
rm(sce_70, summary_outlier, is_mito, outlier, outlier_expr_high, outlier_expr_low, outlier_lib_high, outlier_lib_low)
```




*#Filter_80*

```{r}
sce_80 <- readRDS(here("Processed", "filter_80", "sce_combined_object", file = "sce_combined_filter_80.rds"))
```

# QC

I will use the scater package to add some quality information per cell. This computes for each cell some useful metrics such as the number of umi counts (library size), the number of detected genes and the percentage of mitochondiral genes.

I will use the automatic isOutlier function to determine which values in a numeric vector are outliers based on the median absolute deviation (MAD). Any cell with >3MADs is marked as an outlier. When using this function with low number a log transformation is added, that prevents negative thresholds.

```{r identifyMitoGenes}
is_mito <- grepl("GRCh38-MT-|mm10---Mt-", rownames(sce_80))

table(is_mito)["TRUE"]
```


```{r addQC}
sce_80 <- addPerCellQC(sce_80, subsets = list(Mito = grep("GRCh38-MT-|mm10---Mt-", rownames(sce_80))))

outlier_lib_low <- isOutlier(sce_80$total, log = TRUE, type = "lower")

outlier_expr_low <- isOutlier(sce_80$detected, log = TRUE, type = "lower")

outlier_lib_high <- isOutlier(sce_80$total, type = "higher")

outlier_expr_high <-  isOutlier(sce_80$detected, type = "higher")

# outlier_mt <- isOutlier(sce_80$subsets_mt_percent, type = "higher")

outlier <- outlier_lib_low | outlier_expr_low | outlier_lib_high | outlier_expr_high #| outlier_mt
```

```{r}
summary_outlier <- data.frame(
    lib_size_high = c(sum(outlier_lib_high),
                      attr(outlier_lib_high, "thresholds")[2]),
    expression_high = c(sum(outlier_expr_high),
                        attr(outlier_expr_high, "thresholds")[2]),
    lib_size_low = c(sum(outlier_lib_low),
                     attr(outlier_lib_low, "thresholds")[1]),
    expression_low = c(sum(outlier_expr_low),
                       attr(outlier_expr_low, "thresholds")[1]),
    #mt_pct = c(sum(outlier_mt), attr(outlier_mt, "thresholds")[2]),
    total = c(sum(outlier), NA)
  )

row.names(summary_outlier) <- c("Cells filtered", "Threshold")

dir.create(here("outs", "filter_80", "ScaterQC"))
write.csv(summary_outlier, here("outs", "filter_80", "ScaterQC", "autofilter_summary.csv"))

sce_80$outlier <- outlier

summary_outlier <- read.csv(here("outs", "filter_80", "ScaterQC", "autofilter_summary.csv"))
summary_outlier
```

## Plots Before QC

Visualize data distribution

```{r ViolinPlots}
# Total Count
plotColData(sce_80, x = "species", y = "sum", colour_by = "outlier") +
  ggtitle("Total count") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

plotColData(sce_80, x = "species", y = "sum", colour_by = "outlier") +
  scale_y_log10() + ggtitle("Total count log scale") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Genes
plotColData(sce_80, x = "species", y = "detected", colour_by = "outlier") +
  scale_y_log10() + ggtitle("Detected Genes") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
```

```{r Histograms}
hist(sce_80$total,breaks = 100)
hist(sce_80$detected, breaks = 100)
```
In the graphs above, we can see:
in the x axis: the total number of umi (library size) & the number of detected genes per cell
in the y axis: the number of cells for each measure


```{r ScatterPlots}
plotColData(sce_80, x = "sum", y = "detected", colour_by = "outlier")
plotColData(sce_80, x = "sum", y = "detected", colour_by = "species")
```


````{r RemoveObjects}
rm(sce_80, summary_outlier, is_mito, outlier, outlier_expr_high, outlier_expr_low, outlier_lib_high, outlier_lib_low)
```





*#Filter_90*
  
```{r}
sce_90 <- readRDS(here("Processed", "filter_90", "sce_combined_object", file = "sce_combined_filter_90.rds"))
```

# QC

I will use the scater package to add some quality information per cell. This computes for each cell some useful metrics such as the number of umi counts (library size), the number of detected genes and the percentage of mitochondiral genes.

I will use the automatic isOutlier function to determine which values in a numeric vector are outliers based on the median absolute deviation (MAD). Any cell with >3MADs is marked as an outlier. When using this function with low number a log transformation is added, that prevents negative thresholds.

```{r identifyMitoGenes}
is_mito <- grepl("GRCh38-MT-|mm10---Mt-", rownames(sce_90))

table(is_mito)["TRUE"]
```


```{r addQC}
sce_90 <- addPerCellQC(sce_90, subsets = list(Mito = grep("GRCh38-MT-|mm10---Mt-", rownames(sce_90))))

outlier_lib_low <- isOutlier(sce_90$total, log = TRUE, type = "lower")

outlier_expr_low <- isOutlier(sce_90$detected, log = TRUE, type = "lower")

outlier_lib_high <- isOutlier(sce_90$total, type = "higher")

outlier_expr_high <-  isOutlier(sce_90$detected, type = "higher")

# outlier_mt <- isOutlier(sce_90$subsets_mt_percent, type = "higher")

outlier <- outlier_lib_low | outlier_expr_low | outlier_lib_high | outlier_expr_high #| outlier_mt
```

```{r}
summary_outlier <- data.frame(
  lib_size_high = c(sum(outlier_lib_high),
                    attr(outlier_lib_high, "thresholds")[2]),
  expression_high = c(sum(outlier_expr_high),
                      attr(outlier_expr_high, "thresholds")[2]),
  lib_size_low = c(sum(outlier_lib_low),
                   attr(outlier_lib_low, "thresholds")[1]),
  expression_low = c(sum(outlier_expr_low),
                     attr(outlier_expr_low, "thresholds")[1]),
  #mt_pct = c(sum(outlier_mt), attr(outlier_mt, "thresholds")[2]),
  total = c(sum(outlier), NA)
)

row.names(summary_outlier) <- c("Cells filtered", "Threshold")

dir.create(here("outs", "filter_90", "ScaterQC"))
write.csv(summary_outlier, here("outs", "filter_90", "ScaterQC", "autofilter_summary.csv"))

sce_90$outlier <- outlier

summary_outlier <- read.csv(here("outs", "filter_90", "ScaterQC", "autofilter_summary.csv"))
summary_outlier
```

## Plots Before QC

Visualize data distribution

```{r ViolinPlots}
# Total Count
plotColData(sce_90, x = "species", y = "sum", colour_by = "outlier") +
  ggtitle("Total count") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

plotColData(sce_90, x = "species", y = "sum", colour_by = "outlier") +
  scale_y_log10() + ggtitle("Total count log scale") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Genes
plotColData(sce_90, x = "species", y = "detected", colour_by = "outlier") +
  scale_y_log10() + ggtitle("Detected Genes") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
```

```{r Histograms}
hist(sce_90$total,breaks = 100)
hist(sce_90$detected, breaks = 100)
```
In the graphs above, we can see:
  in the x axis: the total number of umi (library size) & the number of detected genes per cell
in the y axis: the number of cells for each measure


```{r ScatterPlots}
plotColData(sce_90, x = "sum", y = "detected", colour_by = "outlier")
plotColData(sce_90, x = "sum", y = "detected", colour_by = "species")
```



```{r sessionInfo}
sessionInfo()
```