src/63_comparative_QC.Rmd · Luise-Seeker-Human-WM-Glia

---
title: "Data_coparative_QC"
author: "Luise A. Seeker"
date: "09/06/2022"
output: html_document
---

```{r}
library(Seurat)
library(dplyr)
library(scater)
library(scran)
library(ggsci)
library(scales)
library(here)
library(gridExtra)
library(ggbeeswarm)
```

```{r, echo = F}
mypal <- pal_npg("nrc", alpha = 0.7)(10)
mypal2 <-pal_tron("legacy", alpha = 0.7)(7)
mypal3 <- pal_lancet("lanonc", alpha = 0.7)(9)
mypal4 <- pal_simpsons(palette = c("springfield"), alpha = 0.7)(16)
mypal5 <- pal_rickandmorty(palette = c("schwifty"), alpha = 0.7)(6)
mypal6 <- pal_futurama(palette = c("planetexpress"), alpha = 0.7)(5)
mypal7 <- pal_startrek(palette = c("uniform"), alpha = 0.7)(5)

mycoloursP<- c(mypal, mypal2, mypal3, mypal4, mypal5, mypal6, mypal7)

```


```{r}
seur <- readRDS(here("data",
                    "single_nuc_data",
                    "all_cell_types",
                    "srt_fine_anno_01.RDS"))

seur$dataset <- "Seeker"
```

```{r}
jaekel <- readRDS(here("data",
          "downloaded_datasets",
          "Jaekel_Agirre_2019",
          "Nat19_ALLCELLTYPES_v3.rds"))

jaekel$total_percent_mito <- jaekel$percent.mito * 100
jaekel$dataset <- "Jaekel_Agirre_2019"
```

```{r}

AD_matrix <- Read10X((here("data",
                    "downloaded_datasets",
                    "Mathys_2019",
                    "notfiltered")), gene.column = 2)
#the barcodes had a weird '' at the end and we needed to get rid of this
colnames(AD_matrix) <- gsub("'","",colnames(AD_matrix))
metadata <- read.table((here("data",
                    "downloaded_datasets",
                    "Mathys_2019",
                    "notfiltered", "metadata.tsv")), header = T, sep = ",")
#the rownames must be the barcodes
rownames(metadata) <- metadata$Cells

#and then it worked
#In this step we do filter for genes and UMI counts but not mt genes - try to make this consistent with how I processed my own data 
HumanAD <- CreateSeuratObject(counts = AD_matrix, min.cells = 3, min.features  = 200, project = "10X_AD", assay = "RNA", names.field = 1, names.delim = ".", meta.data = metadata)
dim(HumanAD)

HumanAD$total_percent_mito <- PercentageFeatureSet(HumanAD, pattern = "^MT-")

HumanAD$dataset <- "Mathys_2019"
# I think this one needs QCed according to the original paper. 
```


```{r}
nagy <- Matrix::readMM(here("data",
                    "downloaded_datasets", "Nagy2020",
                    "GSE144136_GeneBarcodeMatrix_Annotated.mtx"))

cell_names_nagy <- read.csv((here("data",
                    "downloaded_datasets", "Nagy2020",
                    "GSE144136_CellNames.csv")))

gene_names_nagy <- read.csv((here("data",
                    "downloaded_datasets", "Nagy2020",
                    "GSE144136_GeneNames.csv")))

rownames(nagy) <- gene_names_nagy$x

colnames(nagy) <- cell_names_nagy$x

nagy_seur <- CreateSeuratObject(counts = nagy)

# It appears like mitochondrial genes have been removed from the Nagy et al. dataset
nagy_seur$total_percent_mito <- PercentageFeatureSet(nagy_seur, pattern = "^MT-")

nagy_seur$dataset <- "Nagy_2020"
```

# merge datasets

```{r}


merged_dat <- merge(x = seur, y = c(jaekel, HumanAD, nagy_seur))

Idents(merged_dat) <- "dataset"

# Visualize QC metrics as a violin plot
VlnPlot(merged_dat, features = c("nFeature_RNA", 
                                 "nCount_RNA", 
                                 "total_percent_mito"), ncol = 3, 
        pt.size = 0)
```
```{r, eval = FALSE}
dir.create(here("data",
           "single_nuc_data",
           "merged_QC_data"))

saveRDS(merged_dat, here("data",
                    "single_nuc_data",
                    "merged_QC_data",
                    "merged_data.RDS"))

```

# Mathys et al did the QC in a complicated way 
They do not report a cutoff for mitochondrial percentage, but that they 
retained 75060 nuclei in their dataset, which are 1166 less than are in the
data provided. I decided to remove the 1166 nuclei with the worst 
mitochondrial percentage. 

```{r}

met_dat <- HumanAD@meta.data

met_dat <- met_dat %>%
  arrange_(~ desc(total_percent_mito)) %>%
  top_n(n = 1166)


met_dat[1166, 24]

HumanAD <- subset(HumanAD, subset = total_percent_mito < 37.70651)

```

```{r}


merged_dat <- merge(x = seur, y = c(jaekel, HumanAD, nagy_seur))

Idents(merged_dat) <- "dataset"

# Visualize QC metrics as a violin plot

plot1 <- VlnPlot(merged_dat, features = "nFeature_RNA", 
        pt.size = 0) + scale_fill_manual(values= mycoloursP[25:50])+ 
  theme_minimal()+NoLegend() + 
  theme(axis.text.x = element_text(angle = 90)) +xlab("")+ 
  ggtitle("gene count")

plot2 <- VlnPlot(merged_dat, features = "nCount_RNA", 
        pt.size = 0) + scale_fill_manual(values= mycoloursP[25:50])+ 
  theme_minimal()+NoLegend() + 
  theme(axis.text.x = element_text(angle = 90))+xlab("")+ 
  ggtitle("UMI count")

plot3 <- VlnPlot(merged_dat, features = "total_percent_mito", 
        pt.size = 0) + scale_fill_manual(values= mycoloursP[25:50])+ 
  theme_minimal()+NoLegend() + 
  theme(axis.text.x = element_text(angle = 90))+xlab("")+ 
  ggtitle("mitochondrial genes")


grid.arrange(plot1, plot2, plot3, ncol = 3)


```
```{r}
merged_dat$dataset <- factor(merged_dat$dataset, levels = c("Seeker", 
                                                            "Jaekel_Agirre_2019",
                                                            "Mathys_2019",
                                                            "Nagy_2020"))


```

# Comparison gene count

## Normality test
```{r}

#shapiro.test(merged_dat$nFeature_RNA)
set.seed(0)
ks.test(merged_dat$nFeature_RNA, 'pnorm')

qqnorm(merged_dat$nFeature_RNA)
qqline(merged_dat$nFeature_RNA)


qqnorm(log(merged_dat$nFeature_RNA))
qqline(log(merged_dat$nFeature_RNA))

#shapiro.test(log(merged_dat$nFeature_RNA))
set.seed(0)
ks.test(log(merged_dat$nFeature_RNA), 'pnorm')

hist(log(merged_dat$nFeature_RNA))

```
## Linear model
```{r}
lin_mod <- lm(log(merged_dat$nFeature_RNA) ~ merged_dat$dataset)
summary(lin_mod)
anova(lin_mod)
```

See if residuals of the model are normally distributed
```{r}
hist(residuals(lin_mod))
set.seed(0)
ks.test(residuals(lin_mod), 'pnorm')
qqnorm(residuals(lin_mod))
qqline(residuals(lin_mod))
```


# Comparison UMI count

```{r}

#shapiro.test(merged_dat$nFeature_RNA)
set.seed(0)
ks.test(merged_dat$nCount_RNA, 'pnorm')

qqnorm(merged_dat$nCount_RNA)
qqline(merged_dat$nCount_RNA)


qqnorm(log(merged_dat$nCount_RNA))
qqline(log(merged_dat$nCount_RNA))

#shapiro.test(log(merged_dat$nFeature_RNA))
set.seed(0)
ks.test(log(merged_dat$nCount_RNA), 'pnorm')

hist(log(merged_dat$nCount_RNA))

```



# Run linear model
```{r}

lin_mod <- lm(log(merged_dat$nCount_RNA) ~ merged_dat$dataset)
summary(lin_mod)
```
# Comparison mitochondrial gene percentage

```{r}

#shapiro.test(merged_dat$nFeature_RNA)
set.seed(0)
ks.test(merged_dat$total_percent_mito, 'pnorm')

qqnorm(merged_dat$total_percent_mito)
qqline(merged_dat$total_percent_mito)



#shapiro.test(log(merged_dat$nFeature_RNA))
set.seed(0)
ks.test(log(merged_dat$nCount_RNA), 'pnorm')

hist(merged_dat$total_percent_mito)


hist(sqrt(merged_dat$total_percent_mito))

```


```{r}

lin_mod <- lm(merged_dat$total_percent_mito ~ merged_dat$dataset)
summary(lin_mod)


bin_mod <- glm(merged_dat$total_percent_mito/100 ~ merged_dat$dataset,
                family = binomial(link="logit"))
summary(bin_mod)



```


```{r}
hist(resid(lin_mod))

```

# Comparison of number of nuclei per sample
```{r}
seeker <- dplyr::count(merged_dat@meta.data, merged_dat@meta.data$uniq_id) 
seeker$dataset <- "Seeker"
seeker_uniq <- seeker[!duplicated(seeker$`merged_dat@meta.data$uniq_id`),]
jaekel <- dplyr::count(merged_dat@meta.data, merged_dat@meta.data$Sample) 
jaekel$dataset <- "Jaekel_Agirre_2019"
jaekel_uniq <- jaekel[!duplicated(jaekel$`merged_dat@meta.data$Sample`),]
mathys <- dplyr::count(merged_dat@meta.data, merged_dat@meta.data$subject)
mathys$dataset <- "Mathys_2019"
mathys_uniq <- mathys[!duplicated(mathys$`merged_dat@meta.data$subject`),]


Idents(merged_dat) <-"dataset"

nagy<- subset(merged_dat, idents = "Nagy_2020")




#find out samples for Nagy et al. 

cell_info<- gsub('.{17}$', '', cell_names_nagy$x)
head(cell_info)

sample_info <- substr(cell_info, 
                      nchar(cell_info) - 9,
                      nchar(cell_info))

nagy$sample_info <- sample_info

nagy_sample_count <- dplyr::count(nagy@meta.data, nagy@meta.data$sample_info) 
nagy_sample_count$dataset <- "Nagy_2020"

nagy_uniq <- nagy_sample_count[!duplicated(nagy_sample_count$`nagy@meta.data$sample_info`),]
nagy_uniq$dataset <- "Nagy_2020"

names(seeker_uniq)[1] <- "sample_id"
names(jaekel_uniq)[1] <- "sample_id"
names(mathys_uniq)[1] <- "sample_id"
names(nagy_uniq)[1] <- "sample_id"

df <- rbind(seeker_uniq, 
            jaekel_uniq, 
            mathys_uniq, 
            nagy_uniq)

df<- subset(df, df$sample_id != "NA")


plot4 <- ggplot(df, aes(x = dataset, y = n, fill = dataset))+ geom_violin() + 
  scale_fill_manual(values= mycoloursP[25:50])+ theme_minimal() +NoLegend() + 
  theme(axis.text.x = element_text(angle = 90)) + ylab("") +
  xlab("") + 
  ggtitle("nuclei count")


grid.arrange(plot1, plot2, plot3, plot4, ncol= 4)
```

```{r}
df$dataset <- factor(df$dataset, levels = c("Seeker",
                                            "Jaekel_Agirre_2019",
                                            "Mathys_2019",
                                            "Nagy_2020"))

lin_mod <- lm(log(df$n) ~ df$dataset)
summary(lin_mod)



```
```{r}

shapiro.test(df$n)
set.seed(0)
ks.test(df$n, 'pnorm')

qqnorm(df$n)
qqline(df$n)


qqnorm(log(df$n))
qqline(log(df$n))

shapiro.test(log(df$n))
set.seed(0)
ks.test(log(df$n), 'pnorm')

hist(log(df$n))

```


```{r, eval = FALSE}
saveRDS(merged_dat, here("data",
                    "single_nuc_data",
                    "merged_QC_data",
                    "merged_data_mathys_mito_fil.RDS"))

```


# Quality metrics in our data

```{r}
ggplot(seur@meta.data, 
       aes(x = seur$PMI, y = seur$RINvalue, col = seur$caseNO, 
           shape = seur$Tissue, size = 2)) + 
  geom_point() + 
  scale_colour_manual(values= mycoloursP[25:50]) + 
  theme_minimal()+ xlab("Post mortem interval")+
  ylab ("RIN value") + NoLegend()


```
For legends
```{r}
ggplot(seur@meta.data, 
       aes(x = seur$PMI, y = seur$RINvalue, shape = seur$Tissue)) + 
  geom_point() + 
  scale_colour_manual(values= mycoloursP[25:50]) + 
  theme_minimal()+ xlab("Post mortem interval")+
  ylab ("RIN value") 


```

```{r}
ggplot(seur@meta.data, 
       aes(x = seur$PMI, y = seur$RINvalue, col = seur$Tissue)) + 
  geom_point() + 
  scale_colour_manual(values= mycoloursP[25:50]) + 
  theme_minimal()+ xlab("Post mortem interval")+
  ylab ("RIN value") 


```

```{r}

met_dat <- seur@meta.data

dat<- aggregate(seur$nCount_RNA, list(seur$PMI), FUN=mean) 
names(dat) <- c("PMI", "mean_nCount_RNA")

met_dat_pmi <- merge(met_dat, dat, by = "PMI")
met_dat_pmi$PMI <- as.factor(met_dat_pmi$PMI)

qc_1 <- ggplot(met_dat_pmi, 
      mapping = aes(x = PMI, 
           y = nCount_RNA, 
           col = caseNO)) + 
  geom_point() + 
  stat_summary(geom = "point", fun = "mean", colour = "red", size = 4)+
  theme_minimal()+ 
  xlab("Post mortem interval")+  
  theme(axis.text.x = element_text(angle = 45))+
  scale_colour_manual(values= mycoloursP[25:50])+
  ylab ("Gene count") + NoLegend() 
  
qc_1 
```

```{r}
qc_2 <- ggplot(seur@meta.data, 
       aes(x = as.factor(seur$PMI), y = seur$nFeature_RNA, col = seur$caseNO)) + 
  geom_point()  + theme_minimal()+ xlab("Post mortem interval")+ 
  theme(axis.text.x = element_text(angle = 45))+
  stat_summary(geom = "point", fun = "mean", colour = "red", size = 4)+
  scale_colour_manual(values= mycoloursP[25:50])+
  ylab ("UMI count") + NoLegend()

qc_2
```
```{r}
qc_3 <- ggplot(seur@meta.data, 
       aes(x = as.factor(seur$PMI), y = seur$total_percent_mito, col = seur$caseNO)) + 
  geom_point() + theme_minimal()+ xlab("Post mortem interval")+ 
  stat_summary(geom = "point", fun = "mean", colour = "red", size = 4)+
  theme(axis.text.x = element_text(angle = 45))+
  scale_colour_manual(values= mycoloursP[25:50])+
  ylab ("Mitochondrial gene percentage") + NoLegend()

qc_3
```
```{r}
qc_4 <- ggplot(seur@meta.data, 
       aes(x = seur$RIN, y = seur$nFeature_RNA, col = seur$caseNO)) + 
  geom_point() + theme_minimal()+ xlab("RIN value")+ 
  stat_summary(geom = "point", fun = "mean", colour = "red", size = 4)+
  theme(axis.text.x = element_text(angle = 45))+
  scale_color_manual(values= mycoloursP[25:50])+
  ylab ("Gene count") + NoLegend()

qc_4
```

```{r}
qc_5 <- ggplot(seur@meta.data, 
       aes(x = seur$RIN, y = seur$nCount_RNA, col = seur$caseNO)) + 
  geom_point() + theme_minimal()+ xlab("RIN value")+ 
  stat_summary(geom = "point", fun = "mean", colour = "red", size = 4)+
  theme(axis.text.x = element_text(angle = 45))+
  scale_color_manual(values= mycoloursP[25:50])+
  ylab ("UMI count") + NoLegend()

qc_5
```



```{r}
qc_6 <- ggplot(seur@meta.data, 
       aes(x = seur$RIN, y = seur$total_percent_mito, col = seur$caseNO)) + 
  geom_point() + theme_minimal()+ xlab("RIN value")+
  scale_color_manual(values= mycoloursP[25:50])+
  theme(axis.text.x = element_text(angle = 45))+
  stat_summary(geom = "point", fun = "mean", colour = "red", size = 4)+
  ylab ("Mitochondrial gene percentage") + NoLegend()

qc_6
```

```{r}
grid.arrange(qc_1, qc_2, qc_3, qc_4, qc_5, qc_6, ncol = 3)

```

```{r}
merged_dat <- NormalizeData(merged_dat)
merged_dat <- FindVariableFeatures(merged_dat, 
                                   selection.method = "vst", nfeatures = 2000)
all_genes <- rownames(merged_dat)
merged_dat <- ScaleData(merged_dat, features = all_genes)
merged_dat <- RunPCA(merged_dat, features = VariableFeatures(object = merged_dat))
ElbowPlot(merged_dat)


merged_dat <- FindNeighbors(merged_dat, dims = 1:10)
merged_dat <- FindClusters(merged_dat, resolution = 0.5)
merged_dat <- RunUMAP(merged_dat, dims = 1:10)
DimPlot(merged_dat, reduction = "umap")
```

```{r}

sessionInfo()
```